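# Creating the RLPS Dataset for 2017

This Notebook: 
 - Reads in our All Public Schools data for school years 2013-14 through 2016-17
 - Creates a new column for low performing schools in each year, and counts how many of those years each 2017 school was low performing (RLPS)
 - Cleans up the 2017 dataset for ML capabilities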

In [2]:
#import required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
sns.set(color_codes=True)

import warnings
warnings.filterwarnings("ignore")

Read in full datasets from school years 2013-2014, 2014-2015, 2015-2016, 2016-2017.

In [3]:
#csv path
# NOTE(review): dataDir is an absolute, machine-specific location; point it at your local copy.
dataDir = "/Users/Olivia/SMUDS/Capstone/B/Analysis"

# Load one extract per year from <dataDir>/<year>/publicSchools<year>.csv.
yearlyFrames = {}
for year in (2014, 2015, 2016, 2017):
    path = "{0}/{1}/publicSchools{1}.csv".format(dataDir, year)
    yearlyFrames[year] = pd.read_csv(path, low_memory = False)
    # DataFrame.info() prints its own summary and returns None, so wrapping it
    # in print() only added a stray "None" line after every summary.
    yearlyFrames[year].info()

# Keep the per-year names that the rest of the notebook uses.
publicSchools14 = yearlyFrames[2014]
publicSchools15 = yearlyFrames[2015]
publicSchools16 = yearlyFrames[2016]
publicSchools17 = yearlyFrames[2017]

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2561 entries, 0 to 2560
Columns: 365 entries, vphone_ad to WhitePct
dtypes: float64(331), int64(3), object(31)
memory usage: 7.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2585 entries, 0 to 2584
Columns: 362 entries, vphone_ad to WhitePct
dtypes: float64(315), int64(3), object(44)
memory usage: 7.1+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2599 entries, 0 to 2598
Columns: 362 entries, vphone_ad to WhitePct
dtypes: float64(318), int64(3), object(41)
memory usage: 7.2+ MB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 2616
Columns: 385 entries, vphone_ad to Number_Industry_Recognized_Crede
dtypes: float64(340), int64(3), object(42)
memory usage: 7.7+ MB
None


In [3]:
# NOTE: this filter is intentionally disabled, so all school types stay in the
# yearly frames here. The type_cd == 'P' selection is applied later, when the
# schoolData frame is built from publicSchools17.
# #take out charter schools
# publicSchools17 = publicSchools17[publicSchools17['type_cd'] == 'P']
# publicSchools16 = publicSchools16[publicSchools16['type_cd'] == 'P']
# publicSchools15 = publicSchools15[publicSchools15['type_cd'] == 'P']
# publicSchools14 = publicSchools14[publicSchools14['type_cd'] == 'P']

### Low Performing School Definition:

“Low-performing schools are those that receive a school performance grade of D or F and a school growth score of "met expected growth" or "not met expected growth" as defined by G.S. 115C-83.15.” (G.S. 115C-105.37(a))

http://www.dpi.state.nc.us/schooltransformation/low-performing/

In [4]:
#create new column that equals 1 if low Performing
# LPS is 1 when the SPG grade is D or F and the EVAAS growth status is
# 'Met' or 'NotMet'; otherwise it is 0. The same rule is applied to every year.
for yearlyFrame in (publicSchools14, publicSchools15, publicSchools16, publicSchools17):
    gradeIsDorF = yearlyFrame['SPG Grade'].isin(['F', 'D'])
    growthMetOrNotMet = yearlyFrame['EVAAS Growth Status'].isin(['NotMet', 'Met'])
    yearlyFrame['LPS'] = np.where(gradeIsDorF & growthMetOrNotMet, 1, 0)


In [5]:
## Collect the unit_code of every school flagged as low performing (LPS == 1)
## in each year; these are used to build per-year columns in the 2017 dataset.
unit_codes_14 = publicSchools14.loc[publicSchools14['LPS'] == 1, 'unit_code']
unit_codes_15 = publicSchools15.loc[publicSchools15['LPS'] == 1, 'unit_code']
unit_codes_16 = publicSchools16.loc[publicSchools16['LPS'] == 1, 'unit_code']
unit_codes_17 = publicSchools17.loc[publicSchools17['LPS'] == 1, 'unit_code']

In [6]:
##Create columns in publicSchools17 dataset
## For each year, LPS_<yy> is 1 when the 2017 school's unit_code was low performing
## in that year. The report compares that year's low performing count with how many
## of those schools are still present in the 2017 data.
lpsYears = [
    ('14', '2014', '2013-14', unit_codes_14),
    ('15', '2015', '2014-15', unit_codes_15),
    ('16', '2016', '2015-16', unit_codes_16),
    # The banner for this year previously printed "2014" by mistake.
    ('17', '2017', '2016-17', unit_codes_17),
]

for suffix, banner, schoolYear, yearCodes in lpsYears:
    flagColumn = 'LPS_' + suffix
    publicSchools17[flagColumn] = np.where(publicSchools17['unit_code'].isin(yearCodes), 1, 0)
    stillRunning = len(publicSchools17[flagColumn][publicSchools17[flagColumn] == 1])
    print('************** ' + banner + ' **************')
    print('Num Public Schools Low Performing ' + schoolYear + ': ', len(yearCodes))
    print('Num Schools Still Running in 2017: ', stillRunning)
    print('Difference: ', (len(yearCodes) - stillRunning))

************** 2014 **************
Num Public Schools Low Performing 2013-14:  615
Num Schools Still Running in 2017:  600
Difference:  15
************** 2015 **************
Num Public Schools Low Performing 2014-15:  580
Num Schools Still Running in 2017:  572
Difference:  8
************** 2016 **************
Num Public Schools Low Performing 2015-16:  488
Num Schools Still Running in 2017:  485
Difference:  3
************** 2014 **************
Num Public Schools Low Performing 2016-17:  505
Num Schools Still Running in 2017:  505
Difference:  0


In [7]:
# The single-year LPS flag is superseded by the per-year LPS_14 ... LPS_17 columns.
publicSchools17 = publicSchools17.drop(columns=['LPS'])

## Number of times the school has been low performing

In [8]:
## Times low performing
## RLPS is the number of years (0 to 4) in which a school was flagged low performing.
lpsFlagColumns = ['LPS_14', 'LPS_15', 'LPS_16', 'LPS_17']

# Each flag is 0 or 1, so the row-wise sum is the count of low performing years.
# This replaces a per-row iterrows() loop that mapped each possible sum to itself.
RLPS_series = publicSchools17[lpsFlagColumns].sum(axis=1).tolist()
print(np.unique(RLPS_series))
publicSchools17['RLPS'] = RLPS_series

[0 1 2 3 4]


In [9]:
## Check Work: 
## Group the 2017 unit codes by RLPS value and report the size of each group.
rlpsValues = publicSchools17['RLPS']
school_LPS_1 = publicSchools17.loc[rlpsValues == 1, 'unit_code']
school_LPS_2 = publicSchools17.loc[rlpsValues == 2, 'unit_code']
school_LPS_3 = publicSchools17.loc[rlpsValues == 3, 'unit_code']
school_LPS_4 = publicSchools17.loc[rlpsValues == 4, 'unit_code']
school_LPS_0 = publicSchools17.loc[rlpsValues == 0, 'unit_code']

# count() reports the non-null unit codes in each group.
print('Number of Schools Low Performing One Year: ', school_LPS_1.count())
print('Number of Schools Low Performing Two Years: ', school_LPS_2.count())
print('Number of Schools Low Performing Three Years: ', school_LPS_3.count())
print('Number of Schools Low Performing Four Years: ', school_LPS_4.count())
print('Number of Schools Low Performing Never: ', school_LPS_0.count())

Number of Schools Low Performing One Year:  263
Number of Schools Low Performing Two Years:  227
Number of Schools Low Performing Three Years:  203
Number of Schools Low Performing Four Years:  209
Number of Schools Low Performing Never:  1715


In [10]:
#Save the interim dataset to a .csv file
# NOTE(review): outputDir is an absolute, machine-specific path.
outputDir = '/Users/Olivia/SMUDS/Capstone/B/Analysis/RLPS/data/'
interimCsvPath = outputDir + 'publicSchools17EDA.csv'
publicSchools17.to_csv(interimCsvPath, index=False)

## Create Machine Learning Dataset for 2017

## Define Missing Data Thresholds

In [11]:
# Work on an independent copy so the cleaning steps below leave publicSchools17 untouched.
schoolData = publicSchools17.copy(deep=True)

In [12]:
#Missing Data Threshold (Per Column)
#Fraction of missing values at or above which a column is considered too sparse.
missingThreshold = 0.95

#Unique Value Threshold (Per Column)
#Columns with at least uniqueThreshold unique values are deleted prior to one-hot encoding.
#(each unique value becomes a new column during one-hot encoding)
uniqueThreshold = 25

print('*********Start: Beginning Column and Row Counts********************************************')
schoolData.info(verbose=False)

#Select only public schools (type_cd 'P') with at least one student, as charter schools
#are missing data for many columns.
#.copy() makes the filtered result an independent frame, so the column assignment below
#is not a chained assignment on a slice (SettingWithCopyWarning).
schoolData = schoolData[(schoolData['type_cd'] == 'P') & (schoolData['student_num'] > 0)].copy()

print('\r\n*********After: Selecting Only Public School Campuses**********************************')
schoolData.info(verbose=False)

#Save primary key
unit_code = schoolData['unit_code']
#Store zip code with object dtype so it is handled as a nominal (non-numeric) column.
#Note: this changes the column dtype only; the values are not converted to str.
schoolData['szip_ad'] = schoolData['szip_ad'].astype('object')

*********Start: Beginning Column and Row Counts********************************************
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2617 entries, 0 to 2616
Columns: 390 entries, vphone_ad to RLPS
dtypes: float64(340), int64(8), object(42)
memory usage: 7.8+ MB

*********After: Selecting Only Public School Campuses**********************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 390 entries, vphone_ad to RLPS
dtypes: float64(340), int64(8), object(42)
memory usage: 7.3+ MB


### Impute Missing Racial Composition Data

In [13]:
#Get Student Body Racial Composition Fields: race / minority columns that are percentages
raceColumns = schoolData.filter(regex='Indian|Asian|Hispanic|Black|White|PacificIsland|TwoOrMore|Minority')
raceCompositionFields = raceColumns.filter(regex='Pct').columns

#Rows with at least one missing racial composition value before imputation
rowsBefore = schoolData[raceCompositionFields].isnull().any(axis=1).sum()

def fillWithGroupMean(column):
    """Fill the missing values of one column within a district group with that group's mean."""
    return column.fillna(column.mean())

#Update missing race values with the district average when available (No district averages for charter schools)
schoolData[raceCompositionFields] = (
    schoolData.groupby('District Name')[raceCompositionFields].transform(fillWithGroupMean)
)

#Review dataset contents after Racial Composition Imputation
print('*********After: Updating Missing Racial Compostion Values****************************')
stillMissing = schoolData[raceCompositionFields].isnull().any(axis=1)
rowsAfter = stillMissing.sum()
rowsUpdated = rowsBefore - rowsAfter
print ('Rows Updated / Imputed: ', rowsUpdated)
print('\r\nTotal Rows Missing Racial Compositions By District Name')
print(schoolData['District Name'][stillMissing].value_counts())

*********After: Updating Missing Racial Compostion Values****************************
Rows Updated / Imputed:  0

Total Rows Missing Racial Compositions By District Name
Series([], Name: District Name, dtype: int64)


### Remove Columns with the same data in all rows

In [14]:
#Remove any fields that have the same value in all rows (NaN is counted as a value)
UniqueValueCounts = schoolData.nunique(dropna=False)
SingleValueCols = UniqueValueCounts.loc[UniqueValueCounts.eq(1)].index
schoolData = schoolData.drop(columns=SingleValueCols)

#Review dataset contents after drops
print('*********After: Removing columns with the same value in every row.*******************')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(SingleValueCols))
print('Columns:', list(SingleValueCols))

*********After: Removing columns with the same value in every row.*******************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 265 entries, vphone_ad to RLPS
dtypes: float64(221), int64(5), object(39)
memory usage: 5.0+ MB

Columns Deleted:  125
Columns: ['year', 'state_ad', 'type_cd', 'closed_ind', 'new_ind', 'super_nm', 'State_Name', 'stem', 'total_expense_num', 'salary_expense_pct', 'benefits_expense_pct', 'services_expense_pct', 'supplies_expense_pct', 'instruct_equip_exp_pct', 'other_expense_pct', 'federal_perpupil_num', 'local_perpupil_num', 'state_perpupil_num', 'lea_benefits_expense_pct', 'lea_other_expense_pct', 'st_total_expense_num', 'st_salary_expense_pct', 'st_benefits_expense_pct', 'st_services_expense_pct', 'st_supplies_expense_pct', 'st_instruct_equip_exp_pct', 'st_other_expense_pct', 'st_federal_perpupil_num', 'st_local_perpupil_num', 'st_state_perpupil_num', 'building_expense_pct', 'lea_building_expense_pct', 'st_building_exp

## Remove Columns with Unique Data In All Rows

In [15]:
#Remove any fields whose number of distinct non-null values equals the row count
schoolDataRecordCt = len(schoolData.index)
UniqueValueCounts = schoolData.nunique()
AllUniqueValueCols = UniqueValueCounts.loc[UniqueValueCounts.eq(schoolDataRecordCt)].index
schoolData = schoolData.drop(columns=AllUniqueValueCols)

#Review dataset contents after drops
print('*********After: Removing columns with unique values in every row.*******************')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(AllUniqueValueCols))

*********After: Removing columns with unique values in every row.*******************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 263 entries, vphone_ad to RLPS
dtypes: float64(221), int64(5), object(37)
memory usage: 5.0+ MB

Columns Deleted:  2


### Remove identifying columns (unit code, school code) 

In [16]:
#Repeat the all-unique check: drop any field that has a distinct value in every row.
#This pass also prints the names of the columns it removes.
schoolDataRecordCt = len(schoolData.index)
UniqueValueCounts = schoolData.nunique()
AllUniqueValueCols = UniqueValueCounts.loc[UniqueValueCounts.eq(schoolDataRecordCt)].index
schoolData = schoolData.drop(columns=AllUniqueValueCols)

#Review dataset contents after drops
print('*********After: Removing columns with unique values in every row.*******************')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(AllUniqueValueCols))
print('Columns: ', AllUniqueValueCols)

*********After: Removing columns with unique values in every row.*******************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 263 entries, vphone_ad to RLPS
dtypes: float64(221), int64(5), object(37)
memory usage: 4.9+ MB

Columns Deleted:  0
Columns:  Index([], dtype='object')


### Remove columns with all null values

In [17]:
#Remove any empty fields: columns whose null count equals the number of rows
NullValueCounts = schoolData.isnull().sum()
schoolDataRecordCt = len(schoolData.index)
NullValueCols = NullValueCounts.loc[NullValueCounts.eq(schoolDataRecordCt)].index
schoolData = schoolData.drop(columns=NullValueCols)

#Review dataset contents after empty field drops
print('*********After: Removing columns with null / blank values in every row.*************')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(NullValueCols))
print ('Columns: ', list(NullValueCols))

*********After: Removing columns with null / blank values in every row.*************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 263 entries, vphone_ad to RLPS
dtypes: float64(221), int64(5), object(37)
memory usage: 4.9+ MB

Columns Deleted:  0
Columns:  []


## Data Types

In [18]:
#Isolate continuous and categorical data types
#Each sD_* frame holds the schoolData columns of one kind, as selected at this point:
#bool dtype -> boolean, object dtype -> nominal, everything else -> continuous.
columnDtypes = schoolData.dtypes
isBooleanColumn = columnDtypes == bool
isNominalColumn = columnDtypes == object

sD_boolean = schoolData.loc[:, isBooleanColumn]
sD_nominal = schoolData.loc[:, isNominalColumn]
sD_continuous = schoolData.loc[:, ~isBooleanColumn & ~isNominalColumn]

print ("Boolean Columns: ", sD_boolean.shape[1])
print ("Nominal Columns: ", sD_nominal.shape[1])
print ("Continuous Columns: ", sD_continuous.shape[1])
print ("Columns Accounted for: ", sD_boolean.shape[1] + sD_nominal.shape[1] + sD_continuous.shape[1])

Boolean Columns:  0
Nominal Columns:  37
Continuous Columns:  226
Columns Accounted for:  263


### Convert Booleans to 1s and 0s

In [19]:
#Find flag columns: exactly two distinct values (NaN counts as one) and one of them is 'Y'
hasY = []
for x in schoolData.columns:
    distinctValues = schoolData[x].unique()
    # Compare each distinct value with the scalar 'Y'. The previous test,
    # `['Y'] in array`, checked a list against an ndarray and only worked
    # through NumPy broadcasting; it also combined Python bools with bitwise `&`.
    containsY = any(isinstance(value, str) and value == 'Y' for value in distinctValues)
    if containsY and len(distinctValues) == 2:
        hasY.append(x)
print(hasY)

for x in hasY:
    # Map flag fields to integers: 'Y' -> 1, missing -> 0
    schoolData[x] = schoolData[x].map({'Y':1, np.nan:0})

# Recode ESEA status into descriptive labels; missing values become 'Non_Esea'.
schoolData['esea_status'] = schoolData['esea_status'].map({'P':'Esea_Pass', 'F':'Esea_Fail', np.nan:'Non_Esea'})
# Graduation project status: 'Y' -> 1, 'N' or missing -> 0.
schoolData['Grad_project_status'] = schoolData['Grad_project_status'].map({'Y':1, 'N':0, np.nan:0})

#Boolean Columns
# The flags above are mapped to 1/0 values rather than bool, so this selects
# only columns whose dtype is bool.
sD_boolean = schoolData.loc[:, (schoolData.dtypes == bool) ]
print ("Boolean Columns: ", sD_boolean.shape[1])

['title1_type_cd', 'clp_ind', 'focus_clp_ind', 'summer_program_ind', 'asm_no_spg_ind', 'no_data_spg_ind']
Boolean Columns:  0


### Eliminate columns with < 5% data

In [20]:
#Eliminate continuous columns whose missing-value count reaches missingThreshold of the rows
#(only the columns captured in sD_continuous are examined)
NullValueCounts = sD_continuous.isnull().sum()
schoolDataRecordCt = len(sD_continuous.index)
missingValueLimit = missingThreshold * schoolDataRecordCt
NullValueCols = NullValueCounts.loc[NullValueCounts.ge(missingValueLimit)].index
schoolData = schoolData.drop(columns=NullValueCols)

#Review dataset contents after empty field drops
print('*********After: Removing columns with >= missingThreshold % of missing values******')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(NullValueCols))
print ('\r\nColumns: ', list(NullValueCols))

*********After: Removing columns with >= missingThreshold % of missing values******
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 254 entries, vphone_ad to RLPS
dtypes: float64(212), int64(12), object(30)
memory usage: 4.8+ MB

Columns Deleted:  9

Columns:  ['13_Size', 'ib_participation_pct', 'ib_pct_4_or_above', 'LEP_Limited English Proficiency_ENROLL_sch_pct', 'MA_Asian_ENROLL_sch_pct', 'MAN_American Indian_ENROLL_sch_pct', 'MM_Multiracial_ENROLL_sch_pct', 'WDIS_Students With Disabilities_ENROLL_sch_pct', 'EDUC_Concentrator_Ct']


### Delete columns with 25 or more unique values before one hot encoding

In [21]:
#Delete categorical columns with >= uniqueThreshold unique values
#(Each unique value becomes a column during one-hot encoding)
oneHotUniqueValueCounts = schoolData[sD_nominal.columns].nunique()
oneHotUniqueValueCols = oneHotUniqueValueCounts.loc[oneHotUniqueValueCounts.ge(uniqueThreshold)].index
schoolData = schoolData.drop(columns=oneHotUniqueValueCols)

#Review dataset contents one hot high unique value drops
print('*********After: Removing columns with >= uniqueThreshold unique values***********')
schoolData.info(verbose=False)
print ('\r\nColumns Deleted: ', len(oneHotUniqueValueCols))
print ('\r\nColumns : ', list(oneHotUniqueValueCols))

*********After: Removing columns with >= uniqueThreshold unique values***********
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 240 entries, category_cd to RLPS
dtypes: float64(212), int64(12), object(16)
memory usage: 4.5+ MB

Columns Deleted:  14

Columns :  ['vphone_ad', 'street_ad', 'scity_ad', 'szip_ad', 'url_ad', 'grade_range_cd', 'cover_letter_ad', 'Lea_Name', 'url', 'District Name', 'School Name', 'grades_BYOD', 'grades_1_to_1_access', 'SRC_Grades_Devices_Sent_Home']


### One hot encode categorical variables

In [22]:
#Isolate remaining categorical variables (object dtype columns)
begColumnCt = len(schoolData.columns)
sD_nominal = schoolData.loc[:, (schoolData.dtypes == object)]

#one hot encode categorical variables
#get_dummies expects a list-like of column names, so pass the labels of the nominal
#columns rather than the nominal DataFrame itself. drop_first=True removes the first
#level of each variable.
schoolData = pd.get_dummies(data=schoolData, 
                       columns=sD_nominal.columns, drop_first=True)

#Determine change in column count
endColumnCt = len(schoolData.columns)
columnsAdded = endColumnCt - begColumnCt

#Review dataset contents one hot high unique value drops
print ('Columns To One-Hot Encode: ', len(sD_nominal.columns), list(sD_nominal.columns))
print('\r\n*********After: Adding New Columns Via One-Hot Encoding*************************')
schoolData.info(verbose=False)
print ('\r\nNew Columns Created Via One-Hot Encoding: ', columnsAdded)

Columns To One-Hot Encode:  16 ['category_cd', 'calendar_type_txt', 'sna_pgm_type_cd', 'school_type_txt', 'calendar_only_txt', 'esea_status', 'SBE District', 'SPG Grade', 'Reading SPG Grade', 'Math SPG Grade', 'EVAAS Growth Status', 'State Gap Compared', 'Category_Cd', 'Byod', '_1_to_1_access', 'SRC_devices_sent_home']

*********After: Adding New Columns Via One-Hot Encoding*************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 283 entries, title1_type_cd to SRC_devices_sent_home_Yes
dtypes: float64(212), int64(12), uint8(59)
memory usage: 4.3 MB

New Columns Created Via One-Hot Encoding:  43


### Impute remaining missing values to zero

In [23]:
#Allow long tables to display in full so every column with missing values is listed
pd.set_option('display.max_rows', 1000)

print('\r\n*********The Remaining Missing Values Below will be set to Zero!*************************')

#Check for Missing values: one row per column, keeping only columns that still have nulls
nullCountTable = schoolData.isnull().sum().reset_index()
nullCountTable.columns = ['Variable Name', 'Number Missing Values']
missing_values = nullCountTable[nullCountTable['Number Missing Values'].gt(0)]
missing_values


*********The Remaining Missing Values Below will be set to Zero!*************************


Unnamed: 0,Variable Name,Number Missing Values
10,00_Size,1143
11,01_Size,1144
12,02_Size,1142
13,03_Size,1149
14,04_Size,1139
15,05_Size,1170
16,06_Size,1853
17,07_Size,1884
18,08_Size,1883
19,09_Size,1985


In [24]:
#Replace all remaining NaN with 0
schoolData = schoolData.fillna(value=0)

#Check for Missing values after final imputation (the table is expected to be empty)
nullCountTable = schoolData.isnull().sum().reset_index()
nullCountTable.columns = ['Variable Name', 'Number Missing Values']
missing_values = nullCountTable[nullCountTable['Number Missing Values'].gt(0)]
missing_values

Unnamed: 0,Variable Name,Number Missing Values


## Scale Expression of Percentages

In [25]:
# Percentage-like columns are identified by name.
PercentageFields = schoolData.filter(regex='pct|Pct|percent|Percent').columns

# A column whose maximum exceeds 1 is divided by 100; other columns are left unchanged.
converted = [field for field in PercentageFields if schoolData[field].max() > 1]
for field in converted:
    schoolData[field] = schoolData[field] / 100

print('Number of Percentage Fields:', len(PercentageFields))
print('Percentage Fields Converted to between 0 and 1: ', len(converted))

Number of Percentage Fields: 117
Percentage Fields Converted to between 0 and 1:  10


## Correlated Features

In [26]:
# calculate the absolute pairwise correlation matrix
corr_matrix  = schoolData.corr().abs()

# Select upper triangle of correlation matrix (k=1 excludes the diagonal), so each
# pair of fields is considered once.
# The builtin bool is used because the np.bool alias was deprecated and later
# removed from NumPy.
upper = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))

# Find index of feature columns with correlation greater than 0.95
to_drop = [column for column in upper.columns if any(upper[column] > 0.95)]

In [27]:
#Get the (row, column) positions of all correlation values > 95%
highRows, highCols = np.where(upper > 0.95)

#Look up every matching correlation value in one positional step.
#This replaces a Python loop that used chained indexing, upper[col][row].
corr = upper.values[highRows, highCols]

#Display all field combinations with > 95% correlation
#Field1 is the column label and Field2 the row label of each matching cell.
cf = pd.DataFrame({
    'Field1': upper.columns[highCols],
    'Field2': upper.index[highRows],
    'Correlation': corr,
})

print ('There are ', str(len(cf['Field1'])), ' field correlations > 95%.')
cf

There are  101  field correlations > 95%.


Unnamed: 0,Field1,Field2,Correlation
0,SPG Grade_I,no_data_spg_ind,1.0
1,class_teach_num,student_num,0.967407
2,lea_class_teach_num,lea_avg_student_num,0.970002
3,st_class_teach_num,st_avg_student_num,0.999101
4,st_advance_dgr_pct,st_avg_student_num,0.985325
5,01_Size,00_Size,0.951996
6,08_Size,07_Size,0.973609
7,English II_Size,Biology_Size,0.967887
8,Overall Achievement Score,SPG Score,0.988052
9,Math SPG Score,Reading SPG Score,0.975327


In [28]:
# Report how many fields will be removed, then display their names.
dropCount = len(to_drop)
print('Dropping the following ', dropCount, ' highly correlated fields.')
to_drop

Dropping the following  62  highly correlated fields.


['01_Size',
 '08_Size',
 'English II_Size',
 'Math SPG Score',
 'Overall Achievement Score',
 'Reading Score',
 'Math Score',
 'Biology Score',
 'ACT Score',
 '4-Year Cohort Graduation Rate Score',
 'ReadingGr10_pTarget_PctMet',
 'ReadingGr3-8_pTarget_PctMet',
 'SciGr11_pTarget_PctMet',
 'ap_ib_courses',
 'lea_total_specialized_courses',
 'lea_cte_courses',
 'st_total_specialized_courses',
 'st_ap_ib_courses',
 'st_cte_courses',
 'st_univ_college_courses',
 'F_Female_ENROLL_sch_pct',
 'M_Male_ENROLL_sch_pct',
 'lea_avg_age_media_collection',
 'class_teach_num',
 'lea_class_teach_num',
 'lea_lateral_teach_pct',
 'st_tchyrs_4thru10_pct',
 'st_tchyrs_11plus_pct',
 'st_class_teach_num',
 'st_advance_dgr_pct',
 'st_lateral_teach_pct',
 '0-3 Years_Exp_Pct_Tch',
 '10+ Years_Exp_Pct_Tch',
 '4-10 Years_Exp_Pct_Tch',
 'Not Demostrated_TCHR_Standard 3_Pct',
 'Not Demostrated_TCHR_Standard 5_Pct',
 'AsianPct',
 'BlackPct',
 'HispanicPct',
 'IndianMalePct',
 'IndianPct',
 'MinorityPct',
 'WhiteFema

In [29]:
#Summary of the frame before the drop
print('\r\n*********Before: Dropping Highly Correlated Fields*************************************')
schoolData.info(verbose=False)

# Remove every field listed in to_drop
schoolData = schoolData.drop(columns=to_drop)

#Summary of the frame after the drop
print('\r\n*********After: Dropping Highly Correlated Fields**************************************')
schoolData.info(verbose=False)


*********Before: Dropping Highly Correlated Fields*************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 283 entries, title1_type_cd to SRC_devices_sent_home_Yes
dtypes: float64(212), int64(12), uint8(59)
memory usage: 4.3 MB

*********After: Dropping Highly Correlated Fields**************************************
<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Columns: 221 entries, title1_type_cd to SRC_devices_sent_home_Yes
dtypes: float64(167), int64(12), uint8(42)
memory usage: 3.5 MB


In [30]:
# NOTE(review): outputDir is an absolute, machine-specific path.
outputDir = '/Users/Olivia/SMUDS/Capstone/B/Analysis/RLPS/data/'
finalCsvPath = outputDir + 'RLPS2017Classification' + '_ML.csv'

#Restore the unit_code before saving (the saved Series is aligned to schoolData by index)
schoolData['unit_code'] = unit_code

#Save the final dataset to a .csv file
schoolData.to_csv(finalCsvPath, index=False)

In [31]:
# Print the full per-column summary (verbose=True lists every column and its dtype)
# of the final dataset.
print('*********FINAL DATASET DETAILS*********************************************************\r\n')
schoolData.info(verbose=True)

*********FINAL DATASET DETAILS*********************************************************

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2443 entries, 2 to 2616
Data columns (total 222 columns):
title1_type_cd                                                  int64
clp_ind                                                         int64
focus_clp_ind                                                   int64
summer_program_ind                                              int64
asm_no_spg_ind                                                  int64
no_data_spg_ind                                                 int64
student_num                                                     float64
lea_avg_student_num                                             float64
st_avg_student_num                                              float64
Grad_project_status                                             int64
00_Size                                                         float64
02_Size                 