Consolidate and clean enrollment data from California Department of Education - Data included in repository also available at: https://www.cde.ca.gov/ds/sd/sd/

In [32]:
import pandas as pd
import warnings
import numpy as np
warnings.filterwarnings('ignore')

Enrollment - clean and standardize

In [33]:
# Build one per-school enrollment summary for each yearly file
# (enr4.txt ... enr18.txt) and cache it as a pickle next to the raw data.
for i in range(4, 19):
    raw = pd.read_csv('Data/Enrollment/enr{}.txt'.format(i), sep="\t", header=0)

    # Row-level counts: a row's ENR_TOTAL is credited to ENR_WHITE only when
    # ETHNIC == 7 and to ENR_MALE only when GENDER == "M"; otherwise it is 0.
    counts = pd.DataFrame({
        'CDS_CODE': raw['CDS_CODE'],
        'ENR_TOTAL': raw['ENR_TOTAL'],
        'ENR_WHITE': np.where(raw['ETHNIC'] == 7, raw['ENR_TOTAL'], 0),
        'ENR_MALE': np.where(raw['GENDER'] == "M", raw['ENR_TOTAL'], 0),
    })

    # Broadcast each school's sums back onto all of its rows ...
    sums = counts.groupby(['CDS_CODE'])[['ENR_TOTAL', 'ENR_WHITE', 'ENR_MALE']].transform('sum')

    # ... then keep the first row per school, so there is one record per CDS_CODE.
    # The "_y" column names are what the consolidation step expects to find.
    summary = pd.DataFrame({
        'CDS_CODE': raw['CDS_CODE'],
        'ENR_TOTAL_y': sums['ENR_TOTAL'],
        'ENR_WHITE_y': sums['ENR_WHITE'],
        'ENR_MALE_y': sums['ENR_MALE'],
        'year': i,
    }).drop_duplicates(subset='CDS_CODE')
    summary.to_pickle("Data/Enrollment/California_Enrollment_{}.pkl".format(i))

    # Release the per-year frames before the next file is loaded.
    del raw, counts, sums, summary

# Stack the cached per-year summaries (years 4..18) into one long table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# in one pass instead of re-copying the accumulated rows on every iteration.
enrollment = pd.concat(
    [
        pd.read_pickle("Data/Enrollment/California_Enrollment_{}.pkl".format(i))
        for i in range(4, 19)
    ]
)

# One row per (school, year), ordered by school then year, with a fresh index.
enrollment = enrollment.sort_values(by=['CDS_CODE', 'year']).reset_index(drop=True)

# The cached pickles store the per-school sums under "_y"-suffixed names;
# give them descriptive names for the rest of the notebook.
enrollment = enrollment.rename(columns={"ENR_TOTAL_y": "ANN_TOTAL_ENROLL", "ENR_WHITE_y": "WHITE_ENROLL", "ENR_MALE_y": "MALE_ENROLL"})

# Cast the code to a string and left-pad with zeros to 14 characters so it
# lines up with the other tables that are merged on CDS_CODE.
enrollment['CDS_CODE'] = enrollment['CDS_CODE'].astype(str).str.zfill(14)

English Language Learner - ELL students - standardize and clean

In [34]:
# Build one per-school English-learner summary for each yearly file
# (elsch4.txt ... elsch18.txt) and cache it as a pickle.
for i in range(4, 19):
    raw_ell = pd.read_csv('Data/EL/elsch{}.txt'.format(i), sep="\t", header=0)

    # Sum TOTAL_EL within each school, broadcast back onto every row of that school.
    school_totals = raw_ell.groupby(['CDS'])['TOTAL_EL'].transform('sum')

    # Keep the first row per school; "TOTAL_EL_y" is the name the
    # consolidation step expects for the per-school sum.
    ell_summary = pd.DataFrame({
        'CDS': raw_ell['CDS'],
        'TOTAL_EL_y': school_totals,
        'year': i,
    }).drop_duplicates(subset='CDS')
    ell_summary.to_pickle("Data/EL/California_ELL_{}.pkl".format(i))

    # Release the per-year frames before the next file is loaded.
    del raw_ell, school_totals, ell_summary

# Stack the cached per-year ELL summaries (years 4..18) into one long table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# in one pass instead of re-copying the accumulated rows on every iteration.
ell = pd.concat(
    [
        pd.read_pickle("Data/EL/California_ELL_{}.pkl".format(i))
        for i in range(4, 19)
    ]
)

# One row per (school, year), ordered by school then year, with a fresh index.
ell = ell.sort_values(by=['CDS', 'year']).reset_index(drop=True)

# Descriptive name for the per-school sum, and the same key name ("CDS_CODE")
# that the other tables use.
ell = ell.rename(columns={"TOTAL_EL_y": "ANN_ELL", "CDS": "CDS_CODE"})

# Cast the code to a string and left-pad with zeros to 14 characters so it
# lines up with the other tables that are merged on CDS_CODE.
ell['CDS_CODE'] = ell['CDS_CODE'].astype(str).str.zfill(14)

Poverty - standardize and clean

In [35]:
def _zero_padded_code(codes, width):
    """Cast a numeric code column to int, then to a string left-padded with zeros to ``width`` characters."""
    return codes.astype(int).astype(str).str.zfill(width)


# Build one per-school FRPM summary for each yearly workbook
# (frpm4.xls ... frpm18.xls) and cache it as a pickle.
for i in range(4, 19):
    frpm = pd.read_excel('Data/Poverty/frpm{}.xls'.format(i), sheet_name='frpm', header=0)

    # Rows without a school or district code cannot be keyed, so drop them.
    frpm = frpm.dropna(subset=['School Code', 'District Code'])

    # 14-character key: 2-digit county + 5-digit district + 7-digit school.
    frpm = frpm.assign(
        CDS_CODE=(
            _zero_padded_code(frpm['County Code'], 2)
            + _zero_padded_code(frpm['District Code'], 5)
            + _zero_padded_code(frpm['School Code'], 7)
        )
    )

    # Sum 'Total FRPM' within each school, broadcast back onto every row of that school.
    school_totals = frpm.groupby(['CDS_CODE'])['Total FRPM'].transform('sum')

    # Keep the first row per school; "Total FRPM_y" is the name the
    # consolidation step expects for the per-school sum.
    frpm_summary = pd.DataFrame({
        'CDS_CODE': frpm['CDS_CODE'],
        'Total FRPM_y': school_totals,
        'year': i,
    }).drop_duplicates(subset='CDS_CODE')
    frpm_summary.to_pickle("Data/Poverty/California_poverty_{}.pkl".format(i))

    # Release the per-year frames before the next workbook is loaded.
    del frpm, school_totals, frpm_summary

# Stack the cached per-year FRPM summaries (years 4..18) into one long table.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# in one pass instead of re-copying the accumulated rows on every iteration.
poverty = pd.concat(
    [
        pd.read_pickle("Data/Poverty/California_poverty_{}.pkl".format(i))
        for i in range(4, 19)
    ]
)

# One row per (school, year), ordered by school then year, with a fresh index.
poverty = poverty.sort_values(by=['CDS_CODE', 'year']).reset_index(drop=True)

# Descriptive name for the per-school FRPM sum.
poverty = poverty.rename(columns={"Total FRPM_y": "ANN_FRPM"})

Merge sets

In [36]:
# Outer-join the three per-school, per-year tables so that a (CDS_CODE, year)
# pair present in any one of them is kept.
join_keys = ['CDS_CODE', 'year']
merge = enrollment.merge(ell, how='outer', on=join_keys)
merge2 = merge.merge(poverty, how='outer', on=join_keys)

In [37]:
# Keep only rows with a positive enrollment total. Rows whose total is missing
# (NaN) fail the comparison and are dropped as well.
has_enrollment = merge2['ANN_TOTAL_ENROLL'].gt(0)
merge_demo = merge2[has_enrollment]

In [38]:
# The intermediate joins are no longer needed once merge_demo exists.
del merge, merge2

Merge with Master

In [39]:
# Load the school directory and use the same key name as the other tables.
master_list = (
    pd.read_excel('Data/SchoolDirectory/pubschls.xls', sheet_name='schools', header=0)
    .rename(columns={"CDSCode": "CDS_CODE"})
)

# NOTE(review): the row at position 18065 is removed by hard-coded position.
# Presumably it is a non-school row in this particular workbook — confirm, and
# re-check the position whenever pubschls.xls is refreshed.
row_to_drop = master_list.index[18065]
master_list = master_list.drop(index=row_to_drop)

In [40]:
# Mailing-address, contact and administrator columns are not used downstream.
unused_columns = [
    'State', 'MailStreet', 'MailStrAbr', 'MailCity', 'MailZip', 'MailState', 'Phone',
    'Ext', 'WebSite', 'AdmFName1', 'AdmLName1', 'AdmEmail1', 'AdmFName2', 'AdmLName2',
    'AdmEmail2', 'AdmFName3', 'AdmLName3',
]

# Drop them, then store the key as a string left-padded with zeros to
# 14 characters so it lines up with the tables it will be merged with.
master_list = (
    master_list
    .drop(columns=unused_columns)
    .assign(CDS_CODE=lambda directory: directory['CDS_CODE'].astype(str).str.zfill(14))
)

In [41]:
# Attach the directory attributes to every (school, year) row. The outer join
# keeps schools that appear on only one side.
All_Schools_Analysis = master_list.merge(merge_demo, how='outer', on=['CDS_CODE'])

In [42]:
# Latest year in which each school appears. 'max' is used instead of 'last'
# so the result does not depend on the current row order: the frame is only
# sorted by (CDS_CODE, year) in a later cell.
All_Schools_Analysis['last_year'] = All_Schools_Analysis.groupby('CDS_CODE')['year'].transform('max')

# 1 when the school still has data in a later year, else 0 (a missing year also yields 0).
All_Schools_Analysis['open_next_year'] = np.where(All_Schools_Analysis['last_year'] > All_Schools_Analysis['year'], 1, 0)

# 1 when the school's latest year is more than one year ahead, else 0.
All_Schools_Analysis['open_two_years'] = np.where(All_Schools_Analysis['last_year'] > All_Schools_Analysis['year']+1, 1, 0)

In [43]:
# Order the panel by school, then chronologically within each school.
sort_keys = ['CDS_CODE', 'year']
All_Schools_Analysis = All_Schools_Analysis.sort_values(sort_keys)

In [44]:
# Keep rows that have a school name (the School column comes from the master
# list, so it is null for codes missing from the directory) and a positive
# enrollment total.
in_directory = All_Schools_Analysis['School'].notna()
has_students = All_Schools_Analysis['ANN_TOTAL_ENROLL'] > 0
All_Schools_Analysis = All_Schools_Analysis[in_directory & has_students]

In [45]:
# Persist the cleaned all-schools panel for later use.
All_Schools_Analysis.to_pickle("Data/SchoolDirectory/All_Schools_Analysis.pkl")

Clean missing values of ELL and FRPM enrollment for schools with multiple years of data

In [25]:
# Charter schools with a positive enrollment total.
# .copy() makes the subset an independent frame: later cells add columns to
# it, which on a filtered slice triggers pandas' SettingWithCopyWarning
# (silenced by the global warnings filter in this notebook).
is_charter = All_Schools_Analysis['Charter'] == "Y"
has_students = All_Schools_Analysis['ANN_TOTAL_ENROLL'] > 0
Charter_Schools_Analysis = All_Schools_Analysis.loc[is_charter & has_students].copy()

In [26]:
# Express each group count as a share of the school's total enrollment that year.
total_enrollment = Charter_Schools_Analysis['ANN_TOTAL_ENROLL']
share_sources = [
    ('Pct_White', 'WHITE_ENROLL'),
    ('Pct_Male', 'MALE_ENROLL'),
    ('Pct_ELL', 'ANN_ELL'),
    ('Pct_FRPM', 'ANN_FRPM'),
]
for share_col, count_col in share_sources:
    Charter_Schools_Analysis[share_col] = Charter_Schools_Analysis[count_col] / total_enrollment

In [27]:
# Rows where the ELL / FRPM share could not be computed, kept for inspection.
ell_is_missing = Charter_Schools_Analysis['Pct_ELL'].isna()
frpm_is_missing = Charter_Schools_Analysis['Pct_FRPM'].isna()
missing_ell1 = Charter_Schools_Analysis[ell_is_missing]
missing_FRPM1 = Charter_Schools_Analysis[frpm_is_missing]

In [28]:
# Per-school mean of each share across that school's rows (missing values are
# skipped by the mean), broadcast back onto every row of the school.
school_means = Charter_Schools_Analysis.groupby('CDS_CODE')[['Pct_ELL', 'Pct_FRPM']].transform('mean')
Charter_Schools_Analysis['Avg_ELL'] = school_means['Pct_ELL']
Charter_Schools_Analysis['Avg_FRPM'] = school_means['Pct_FRPM']

In [29]:
# Where a share is missing, fall back to the school's own average.
# The value stays missing if the school has no year with data.
for share_col, mean_col in (('Pct_ELL', 'Avg_ELL'), ('Pct_FRPM', 'Avg_FRPM')):
    Charter_Schools_Analysis[share_col + '2'] = Charter_Schools_Analysis[share_col].fillna(
        Charter_Schools_Analysis[mean_col]
    )

In [30]:
# Replace the original share columns with their filled versions and discard
# the helper averages.
Charter_Schools_Analysis = (
    Charter_Schools_Analysis
    .drop(columns=['Pct_ELL', 'Pct_FRPM', 'Avg_FRPM', 'Avg_ELL'])
    .rename(columns={'Pct_ELL2': 'Pct_ELL', 'Pct_FRPM2': 'Pct_FRPM'})
)

In [31]:
# Persist the cleaned charter-school panel for later use.
Charter_Schools_Analysis.to_pickle("Data/SchoolDirectory/CDE_data.pkl")