# Creating the Dataset for Original Exploratory Analysis
This notebook reads in the CSV files for public schools in the 2013-2014, 2014-2015, 2015-2016, and 2016-2017 years to create a dataset containing the school-level factors for the 2016-2017 years with the inclusion of school performance indicators for the previous years.  

In [1]:
#import required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(color_codes=True)

import warnings
warnings.filterwarnings("ignore")

Read in full datasets from school years 2013-2014, 2014-2015, 2015-2016, 2016-2017.

In [2]:
import os
cwd = os.getcwd()
cwd = cwd + '/'

In [3]:
#csv path
path = cwd + "2014/publicSchools2014.csv"
publicSchools14 = pd.read_csv(path, low_memory = False)
print(publicSchools14.info())

path = cwd + "2015/publicSchools2015.csv"
publicSchools15 = pd.read_csv(path, low_memory = False)
print(publicSchools15.info())

path = cwd + "2016/publicSchools2016.csv"
publicSchools16 = pd.read_csv(path, low_memory = False)
print(publicSchools16.info())

path = cwd + "2017/publicSchools2017.csv"
publicSchools17 = pd.read_csv(path, low_memory = False)
print(publicSchools17.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2561 entries, 0 to 2560
Columns: 365 entries, vphone_ad to WhitePct
dtypes: float64(331), int64(3), object(31)
memory usage: 7.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2585 entries, 0 to 2584
Columns: 362 entries, vphone_ad to WhitePct
dtypes: float64(315), int64(3), object(44)
memory usage: 7.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2599 entries, 0 to 2598
Columns: 362 entries, vphone_ad to WhitePct
dtypes: float64(318), int64(3), object(41)
memory usage: 7.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 2616
Columns: 385 entries, vphone_ad to Number_Industry_Recognized_Crede
dtypes: float64(340), int64(3), object(42)
memory usage: 7.7+ MB
None


### Low Performing School Definition:

“Low-performing schools are those that receive a school performance grade of D or F and a school growth score of "met expected growth" or "not met expected growth" as defined by G.S. 115C-83.15.” (G.S. 115C-105.37(a)), and

http://www.dpi.state.nc.us/schooltransformation/low-performing/

In [4]:
#create new column that equals 1 if low Performing
publicSchools14['LPS'] = np.where(publicSchools14['SPG Grade'].isin(['F', 'D'])&
                            publicSchools14['EVAAS Growth Status'].isin(['NotMet', 'Met']),
                            1, 0)

publicSchools15['LPS'] = np.where(publicSchools15['SPG Grade'].isin(['F', 'D'])&
                            publicSchools15['EVAAS Growth Status'].isin(['NotMet', 'Met']),
                            1, 0)

publicSchools16['LPS'] = np.where(publicSchools16['SPG Grade'].isin(['F', 'D'])&
                            publicSchools16['EVAAS Growth Status'].isin(['NotMet', 'Met']),
                            1, 0)

publicSchools17['LPS'] = np.where(publicSchools17['SPG Grade'].isin(['F', 'D'])&
                            publicSchools17['EVAAS Growth Status'].isin(['NotMet', 'Met']),
                            1, 0)


In [5]:
## Get the unit_codes of the low performing schools for each year and create new columns in the 2017 dataset
## to reflect the years of low performance
unit_codes_14 = publicSchools14['unit_code'][publicSchools14.LPS == 1]
unit_codes_15 = publicSchools15['unit_code'][publicSchools15.LPS == 1]
unit_codes_16 = publicSchools16['unit_code'][publicSchools16.LPS == 1]
unit_codes_17 = publicSchools17['unit_code'][publicSchools17.LPS == 1]

In [6]:
##Creat columns in publicSchools17 dataset
publicSchools17['LPS_14'] = np.where(publicSchools17['unit_code'].isin(unit_codes_14), 1, 0)
print('************** 2014 **************')
print('Num Public Schools Low Performing 2013-14: ', len(unit_codes_14))
print('Num Schools Still Running in 2017: ', len(publicSchools17.LPS_14[publicSchools17.LPS_14 ==1]))
print('Difference: ', (len(unit_codes_14) - len(publicSchools17.LPS_14[publicSchools17.LPS_14 ==1])))

publicSchools17['LPS_15'] = np.where(publicSchools17['unit_code'].isin(unit_codes_15), 1, 0)
print('************** 2015 **************')
print('Num Public Schools Low Performing 2014-15: ', len(unit_codes_15))
print('Num Schools Still Running in 2017: ', len(publicSchools17.LPS_15[publicSchools17.LPS_15 ==1]))
print('Difference: ', (len(unit_codes_15) - len(publicSchools17.LPS_15[publicSchools17.LPS_15 ==1])))

publicSchools17['LPS_16'] = np.where(publicSchools17['unit_code'].isin(unit_codes_16), 1, 0)
print('************** 2016 **************')
print('Num Public Schools Low Performing 2015-16: ', len(unit_codes_16))
print('Num Schools Still Running in 2017: ', len(publicSchools17.LPS_16[publicSchools17.LPS_16 ==1]))
print('Difference: ', (len(unit_codes_16) - len(publicSchools17.LPS_16[publicSchools17.LPS_16 ==1])))

publicSchools17['LPS_17'] = np.where(publicSchools17['unit_code'].isin(unit_codes_17), 1, 0)
print('************** 2014 **************')
print('Num Public Schools Low Performing 2016-17: ', len(unit_codes_17))
print('Num Schools Still Running in 2017: ', len(publicSchools17.LPS_17[publicSchools17.LPS_17 ==1]))
print('Difference: ', (len(unit_codes_17) - len(publicSchools17.LPS_17[publicSchools17.LPS_17 ==1])))

************** 2014 **************
Num Public Schools Low Performing 2013-14:  615
Num Schools Still Running in 2017:  600
Difference:  15
************** 2015 **************
Num Public Schools Low Performing 2014-15:  580
Num Schools Still Running in 2017:  572
Difference:  8
************** 2016 **************
Num Public Schools Low Performing 2015-16:  488
Num Schools Still Running in 2017:  485
Difference:  3
************** 2014 **************
Num Public Schools Low Performing 2016-17:  505
Num Schools Still Running in 2017:  505
Difference:  0


In [7]:
publicSchools17 = publicSchools17.drop(columns = 'LPS', axis=1)

In [8]:
### Create columns in 2017 dataset for EVAAS growth score and SPG Grade
mergCols = ['SPG Grade', 'SPG Score', 'EVAAS Growth Status','EVAAS Growth Score', 'unit_code']
merge14 = publicSchools14[mergCols]
merge15 = publicSchools15[mergCols]
merge16 = publicSchools16[mergCols]
publicSchools17 = publicSchools17.merge(merge14,how='left',on='unit_code', suffixes=('', '_14'))
publicSchools17 = publicSchools17.merge(merge15,how='left',on='unit_code', suffixes=('', '_15'))
publicSchools17 = publicSchools17.merge(merge16,how='left',on='unit_code', suffixes=('', '_16'))
print(publicSchools17.columns)

Index(['vphone_ad', 'year', 'unit_code', 'street_ad', 'scity_ad', 'state_ad',
       'szip_ad', 'type_cd', 'closed_ind', 'new_ind',
       ...
       'EVAAS Growth Status_14', 'EVAAS Growth Score_14', 'SPG Grade_15',
       'SPG Score_15', 'EVAAS Growth Status_15', 'EVAAS Growth Score_15',
       'SPG Grade_16', 'SPG Score_16', 'EVAAS Growth Status_16',
       'EVAAS Growth Score_16'],
      dtype='object', length=401)


## Number of times the school has been low performing

In [9]:
## Times low performing 
RLPS_series = []
for index, row in publicSchools17.iterrows():
    if row['LPS_14'] + row['LPS_15'] + row['LPS_16'] + row['LPS_17']  == 1:
        RLPS_series.append(1)
    elif row['LPS_14'] + row['LPS_15'] + row['LPS_16'] + row['LPS_17']  == 2:
        RLPS_series.append(2)
    elif row['LPS_14'] + row['LPS_15'] + row['LPS_16'] + row['LPS_17']  == 3:
        RLPS_series.append(3)
    elif row['LPS_14'] + row['LPS_15'] + row['LPS_16'] + row['LPS_17']  == 4:
        RLPS_series.append(4)
    else: 
        RLPS_series.append(0)
print(np.unique(RLPS_series))
publicSchools17['RLPS'] = RLPS_series

[0 1 2 3 4]


In [10]:
## Check Work: 
school_LPS_1 = publicSchools17.unit_code[publicSchools17['RLPS'] == 1]
school_LPS_2 = publicSchools17.unit_code[publicSchools17['RLPS'] == 2]
school_LPS_3 = publicSchools17.unit_code[publicSchools17['RLPS'] == 3]
school_LPS_4 = publicSchools17.unit_code[publicSchools17['RLPS'] == 4]
school_LPS_0 = publicSchools17.unit_code[publicSchools17['RLPS'] == 0]


print('Number of Schools Low Performing One Year: ', (school_LPS_1.count()))
print('Number of Schools Low Performing Two Years: ', (school_LPS_2.count()))
print('Number of Schools Low Performing Three Years: ', (school_LPS_3.count()))
print('Number of Schools Low Performing Four Years: ', (school_LPS_4.count()))
print('Number of Schools Low Performing Never: ', (school_LPS_0.count()))

Number of Schools Low Performing One Year:  263
Number of Schools Low Performing Two Years:  227
Number of Schools Low Performing Three Years:  203
Number of Schools Low Performing Four Years:  209
Number of Schools Low Performing Never:  1718


In [11]:
outputDir = cwd
#Save the interim dataset to a .csv file
publicSchools17.to_csv(outputDir + 'PublicSchools17_EDA.csv', sep=',', index=False)