In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from my_functions import combokey_converter

%matplotlib inline
sns.set_style('whitegrid')
plt.rc('axes', titlesize = 14, titleweight = 'bold', labelweight = 'bold')

# <font color = green> I. Column Info </font>

#  Column info for crdc_1516 
<div class="alert alert-block alert-info"><b>Contains 111 Fields</b></div>

In [2]:
crdc_cols = pd.read_csv('../filtered_data/00_crdc_1516_initial_layout.csv')

In [3]:
with pd.option_context('display.max_colwidth', 150, 'display.max_rows', 125):
    display(crdc_cols.drop('Module', axis = 1).set_index('Field_Name'))

Unnamed: 0_level_0,Field_Description
Field_Name,Unnamed: 1_level_1
LEA_STATE,District State Abbreviation
LEA_STATE_NAME,District State Name
LEAID,7 Digit LEAID District Identification Code
LEA_NAME,District Name
SCHID,5 Digit School Identification Code
SCH_NAME,School Name
COMBOKEY,7 Digit LEAID District Identification Code+5 Digit School Identification Code
JJ,"Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility"
SCH_GRADE_PS,Grades with Students Enrolled: Preschool
SCH_GRADE_KG,Grades with Students Enrolled: Kindergarten


In [4]:
len(crdc_cols.index)

111

# Column info for nces_1516
<div class="alert alert-block alert-info"><b>17 Fields</b></div>

In [5]:
nces_cols = pd.read_csv('../filtered_data/01_nces_1516_initial_ccd_layout.csv')

In [6]:
"""Replace \n literals with commas for readability"""
# Turn embedded newlines into ", " so each categorical list reads on one line.
# Non-string cells (e.g. NaN for variables without categorical values) are
# passed through unchanged.
nces_cols['Categorical Values'] = nces_cols['Categorical Values'].apply(
    lambda x: x.replace('\n', ', ') if isinstance(x, str) else x
)

In [7]:
with pd.option_context('display.max_colwidth', 350, 'display.max_rows', 25):
    display(nces_cols[['Variable Name', 'Description', 'Categorical Values']])

Unnamed: 0,Variable Name,Description,Categorical Values
0,LEAID,NCES Agency Identification Number,
1,LEA_NAME,LEA Name,
2,SCHID,NCES school identifier,
3,STABR,State Abreviation,
4,SCH_NAME,School name,
5,TITLEI,Title I Eligible School. This flag indicates whether a school is eligible for participation in either TAS or SWP program authorized by Title I of Public Law 103-382.,"No, Yes, Missing, Not applicable, -9-Suppressed"
6,SCH_TYPE_TEXT,School type (description),"Alternative Education School, Regular School, Special Education School, Vocational Education School,"
7,SCH_TYPE,School type (code),"1 = Regular school, 2 = Special education school, 3 = Vocational school, 4 = Other/alternative school, 5 = Reportable program (new code starting in 2007–08),"
8,LEVEL,School level,"1 = Primary (low grade = PK through 03; high grade = PK through 08), 2 = Middle (low grade = 04 through 07; high grade = 04 through 09), 3 = High (low grade = 07 through 12; high grade = 12 only), 4 = Other (any other configuration not falling within the above three categories;including ungraded), N = Not applicable, ,"
9,VIRTUAL,Virtual School Status,"Missing, No, Yes"


In [8]:
len(nces_cols.index)

17

# <font color = green> II. Data Cleaning/Joining </font>

# crdc_1516 Data
<div class="alert alert-block alert-info"><b> 96,360 Schools before any filtering <br>
111 Fields (Matches the crdc_cols)</b></div>
<br><br>
Used combokey_converter.convert to create a csv-compatible "COMBOKEY"

In [9]:
# Read LEAID as text rather than letting pandas infer a numeric dtype: some
# district IDs are alphanumeric (e.g. '06CC107' appears further down).
# The builtin `object` replaces `np.object`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (it raises AttributeError there).
crdc_1516 = pd.read_csv(
    '../filtered_data/00_crdc_1516_initial.csv',
    dtype={'LEAID': object},
)

In [10]:
crdc_1516['COMBOKEY'] = combokey_converter.convert(crdc_1516, 'LEAID', 'SCHID')

In [11]:
crdc_1516.head()

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F
0,AL,ALABAMA,100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,='010000201705',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
1,AL,ALABAMA,100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,='010000201706',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
2,AL,ALABAMA,100002,Alabama Youth Services,1876,Alabama Youth Services,='010000201876',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
3,AL,ALABAMA,100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,='010000299995',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
4,AL,ALABAMA,100005,Albertville City,870,Albertville Middle School,='010000500870',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9


In [12]:
len(crdc_1516.index)

96360

In [13]:
len(crdc_1516.columns)

111

# nces_1516 Data
<div class="alert alert-block alert-info"><b> The nces_1516 Data was recorded in separate files (each with different numbers of schools), so I will have to join the separate files to avoid corruption/loss of data. </b><br>
    <u>Files</u><br>
    1. Characteristics <br>
    2. Directory <br>
    3. Geographic <br>
</div><div class = 'alert alert-block alert-info'>
Like the crdc data, the combokey field was generated using my combokey_converter.convert function.<br></div>

<div class="alert alert-block alert-warning">
1. **100232 Initial Schools**<br><br>
2. **After first inner join (Directory and Characteristics) --> 100232 schools**<br>
Note: I ran a check to ensure that all of the matching combokeys have matching school names -- 100% identical.<br><br>
3. **After second inner join (above_combined and Geographic) --> 100096 schools; 100087 after culling**<br> Note:  I ran the same check to ensure that all of the schools matched, and nearly 9000 came back as non-matching.  I then compared only the first character of each of the two name fields (case-insensitive), and only 9 schools came back as non-matching.  After close examination, I decided to cull these 9 schools.<br></div><div class = 'alert alert-block alert-warning'>
**CSV saved to '../filtered_data/01_nces_1516_initial_ccd.csv'**

In [14]:
nces_1516_characteristics = pd.read_csv('../filtered_data/01_nces_1516_initial_school_characteristics.csv')

In [15]:
nces_1516_characteristics['combokey'] = combokey_converter.convert(nces_1516_characteristics, 'LEAID', 'SCHID')

In [16]:
len(nces_1516_characteristics.index)

100232

In [17]:
nces_1516_directory = pd.read_csv('../filtered_data/01_nces_1516_initial_school_directory.csv')

In [18]:
nces_1516_directory['combokey'] = combokey_converter.convert(nces_1516_directory, 'LEAID', 'SCHID')

**First Join:  Directory + Characteristics**

In [19]:
nces_1516 = nces_1516_characteristics.set_index('combokey').join(nces_1516_directory.set_index('combokey'), how = 'inner', lsuffix = 'dir_')

In [20]:
len(nces_1516.index)

100232

In [21]:
len(nces_1516[nces_1516.SCH_NAME == nces_1516.SCH_NAMEdir_].index)

100232

In [22]:
nces_1516 = nces_1516.drop(['LEAIDdir_', 'SCHIDdir_', 'SCH_NAMEdir_'], axis = 1)

**Second Join: combined + geo**

In [23]:
# Keep LOCALE15 as text instead of letting pandas infer a numeric dtype.
# The builtin `object` replaces `np.object`, an alias that was deprecated in
# NumPy 1.20 and removed in 1.24 (it raises AttributeError there).
nces_1516_geo = pd.read_csv(
    '../filtered_data/01_nces_1516_initial_geographic.csv',
    dtype={'LOCALE15': object},
)

In [24]:
nces_1516_geo['combokey'] = combokey_converter.convert(nces_1516_geo, 'LEAID', 'SCHID')

In [25]:
nces_1516_test = nces_1516.join(nces_1516_geo.set_index('combokey'), how = 'inner', rsuffix = 'dir_')

In [26]:
len(nces_1516_test.index)

100096

In [27]:
"""How many schools have matching School Names between CRDC and NCES?"""
len(nces_1516_test[nces_1516_test.SCH_NAME == nces_1516_test.NAME].index)

91091

In [28]:
def name_checker(sch1, sch2):
    """Flag whether two school names disagree on their first character.

    The comparison is case-insensitive and looks only at the very first
    character of each name (not at the whole first word).

    Parameters
    ----------
    sch1, sch2 : str
        The two school names to compare.

    Returns
    -------
    int
        0 if both names are non-empty strings whose first characters match;
        1 otherwise. A name that is empty or not a string (e.g. NaN/None)
        counts as a non-match because a match cannot be confirmed.
    """
    # Guard first: NaN/None used to raise AttributeError and an empty string
    # IndexError, either of which aborts the whole row-wise ``apply``.
    if not (isinstance(sch1, str) and isinstance(sch2, str)):
        return 1
    if not sch1 or not sch2:
        return 1

    return 0 if sch1.lower()[0] == sch2.lower()[0] else 1

nces_1516_test['no_match_name'] = nces_1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['NAME']), axis = 1)
nces_1516_test[nces_1516_test.no_match_name == 1][['NAME', 'SCH_NAME']]

Unnamed: 0_level_0,NAME,SCH_NAME
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1
='051266001562',HYLTON JUNIOR HIGH SCHOOL,LAKESIDE JUNIOR HIGH SCHOOL
='090147001810',Stowe - Early Learning Center (S,EPS PK STEAM Academy
='090171001700',Alternative High School Programs,Greenwich Alternative High School
='090192001616',STEM Magnet School at Dwight,Betances STEM Magnet School
='090279000148',Hyde School of Health Science an,Cortlandt V.R. Creed Health and Sport Sciences...
='090279001543',Helene Grant Headstart,Dr. Mayo Early Childhood School
='090279001585',Katherine Brennan/Clarence Roger,Brennan Rogers School
='090351201476',Education Connection Special Edu,GFLC/ACCESS School
='090423001808',Hatton Preschool Program,Southington Public Schools Preschool Program a...


In [29]:
nces_1516_full = nces_1516_test[nces_1516_test.no_match_name == 0].drop(['LEAIDdir_', 'SCHIDdir_', 'no_match_name', 'NAME'], axis = 1)

In [30]:
nces_1516_full.head()

Unnamed: 0_level_0,TITLEI,LEAID,LEA_NAME,STABR,SCHID,SCH_NAME,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
='010000200277',-9,100002,Alabama Youth Services,AL,277,Sequoyah Sch - Chalkville Campus,Alternative Education School,4,3,No,7,12,Jefferson County,21,33.673661,-86.628755
='010000201667',-9,100002,Alabama Youth Services,AL,1667,Camps,Alternative Education School,4,3,No,7,12,Autauga County,41,32.521681,-86.530132
='010000201670',-9,100002,Alabama Youth Services,AL,1670,Det Ctr,Alternative Education School,4,3,No,7,12,Clarke County,41,31.938444,-87.750529
='010000201705',-9,100002,Alabama Youth Services,AL,1705,Wallace Sch - Mt Meigs Campus,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.374812,-86.08236
='010000201706',-9,100002,Alabama Youth Services,AL,1706,McNeel Sch - Vacca Campus,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.583385,-86.710058


In [31]:
len(nces_1516_full.index)

100087

In [32]:
# nces_1516_full.to_csv('../filtered_data/01_nces_1516_initial_combined_ccd.csv')

# NCES (combined) and CRDC join
<div class="alert alert-block alert-warning">Out of the 96360 schools in the crdc1516 dataset, <b>3861</b> schools did not have a matching Combokey. These non-matching schools were kept in the dataset.<br><br>

Using the name checker function from above, another <b>182</b> schools were found to have School Names whose first characters did not match between the NCES and CRDC sets.  Erring on the side of caution, these schools were indiscriminately culled.<br><br>

**Final school count in the combined dataset:  96178**</div>
<div class = 'alert alert-block alert-info'>Dataset saved to '03_crdc_nces_1516_raw_combined.csv'</div>

In [33]:
crdc_nces1516_test = crdc_1516.set_index('COMBOKEY').join(nces_1516_full, how = 'left', rsuffix=('_'))

In [34]:
crdc_nces1516_test[crdc_nces1516_test.SCH_NAME_.isnull()].LEAID.count()

3861

In [35]:
def name_checker(sch1, sch2):
    """Compare the first characters of two school names, tolerating a missing one.

    This variant is applied after a left join, where the second name is NaN
    for rows that found no partner.

    Parameters
    ----------
    sch1 : str
        School name from the left-hand dataset.
    sch2 : str or missing
        School name from the joined dataset; NaN/None when the join found
        no match for this row.

    Returns
    -------
    int
        2 if ``sch2`` is missing (anything that is not a string),
        0 if the first characters of both names match (case-insensitive),
        1 otherwise, including when either name is an empty string.
    """
    # Anything that is not a string (NaN of any float type, None, ...) means
    # "no joined record". The old ``type(sch2) == float`` test only caught
    # plain Python floats and crashed on everything else.
    if not isinstance(sch2, str):
        return 2

    # An empty or non-string name cannot confirm a match (previously IndexError).
    if not isinstance(sch1, str) or not sch1 or not sch2:
        return 1

    return 0 if sch1.lower()[0] == sch2.lower()[0] else 1

crdc_nces1516_test['no_match_name'] = crdc_nces1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['SCH_NAME_']), axis = 1)

In [36]:
"""How many schools don't have matching Schools Names (from the CRDC and NCES datasets, respectively)"""
len(crdc_nces1516_test[crdc_nces1516_test.no_match_name == 1][['SCH_NAME', 'SCH_NAME_']].index)

182

In [37]:
crdc_nces_1516 = crdc_nces1516_test[crdc_nces1516_test.no_match_name != 1].drop(['LEA_NAME_', 'LEAID_', 'SCHID_', 'SCH_NAME_', 'no_match_name'], axis = 1)

In [38]:
len(crdc_nces_1516.index)

96178

In [39]:
crdc_nces_1516 = crdc_nces_1516.fillna('Missing')

In [40]:
# crdc_nces_1516.to_csv('../filtered_data/03_crdc_nces_1516_raw_combined.csv')

# <font color = green> IV. Filtration </font>

# Select Non-[Juvenile Justice, Special Education, and Alternative Schools]
<div class = 'alert alert-block alert-info'>Schools that answered 'No' to each of those three questions on the CRDC Survey.<br><br> 
I also used a keyword filter to remove any remaining "Juvenile Justice"-esque Institutions.</div>
<div class = 'alert alert-block alert-warning'>**90448** Schools Remain</div>

In [41]:
filter1_crdc_nces_1516 = crdc_nces_1516[(crdc_nces_1516.JJ == 'No') & (crdc_nces_1516.SCH_STATUS_ALT == 'No') & (crdc_nces_1516.SCH_STATUS_SPED == 'No')]

In [42]:
def jj_keyword_remove(name, kws=('behavioral', 'juvenile', 'correction')):
    """Decide whether a name survives the juvenile-justice keyword filter.

    Despite the "remove" in its name, the function returns a *keep* flag, so
    it can be used directly as a boolean mask.

    Parameters
    ----------
    name : str
        School or district name to inspect.
    kws : iterable of str, optional
        Keywords that disqualify a name. Matching is a case-insensitive
        substring test, so 'correction' also hits 'Corrections'.

    Returns
    -------
    bool
        False if any keyword occurs in ``name``; True otherwise (including
        for an empty name).
    """
    # Normalise once instead of once per keyword.
    lowered = name.lower()
    return not any(kw.lower() in lowered for kw in kws)

# Apply the keyword filter to the school name first, then to the district
# (LEA) name; a row is dropped as soon as either name is flagged.
for name_col in ('SCH_NAME', 'LEA_NAME'):
    keep_mask = filter1_crdc_nces_1516[name_col].apply(jj_keyword_remove)
    filter1_crdc_nces_1516 = filter1_crdc_nces_1516[keep_mask]

In [43]:
len(filter1_crdc_nces_1516.index)

90448

# Select Schools with Lowest Grade (9-12) or Highest Grade (12) or Ungraded HS-students
<div class = 'alert alert-block alert-info'>I wrote functions that take in the grade enrollment flags from the CRDC (because there are no null values for the Grade Enrollment Flags) and determine: (1) if a school enrolls students in any of grades 9-12 (used as a proxy for its lowest grade being 9 or above), (2) if it contains ungraded HS-aged students, and (3) if a school enrolls 12th graders (used as a proxy for its highest grade being 12th).<br><br>While NCES CCD does have columns for lowest and highest grades, there were many null values, so filtering on them may have been too aggressive.</div>
<div class = 'alert alert-block alert-warning'>**21606** Schools Remain</div>

In [44]:
"""How many missing values in the grade columns?"""
np.sum(filter1_crdc_nces_1516.SCH_GRADE_G01.isnull())

0

In [45]:
def lowest_grade_9orAbove(g9, g10, g11, g12, ug, ug_hs):
    """Label a school from its CRDC high-school and ungraded enrollment flags.

    Parameters
    ----------
    g9, g10, g11, g12 : str
        'Yes'/'No' enrollment flags for grades 9 through 12.
    ug : str
        'Yes' if the school reports ungraded students.
    ug_hs : str
        'Yes' if the ungraded students include high-school-aged students.

    Returns
    -------
    str
        'UG-Yes' / 'UG-No' when ``ug`` is 'Yes', depending on ``ug_hs``;
        the graded flags are not consulted in that case.
        Otherwise 'Yes' if at least one of grades 9-12 is flagged, else 'No'.

    NOTE(review): despite the name, grades below 9 are never inspected, so a
    school spanning e.g. K-12 is also labelled 'Yes'.
    """
    # Ungraded schools are classified purely by the HS-aged detail flag.
    if ug == 'Yes':
        return 'UG-Yes' if ug_hs == 'Yes' else 'UG-No'

    offers_hs_grade = any(flag == 'Yes' for flag in (g9, g10, g11, g12))
    return 'Yes' if offers_hs_grade else 'No'

In [46]:
def highest_grade_12(g12):
    """Report whether a school's grade-12 enrollment flag is set.

    Parameters
    ----------
    g12 : str
        The CRDC grade-12 enrollment flag.

    Returns
    -------
    str
        'Yes' when ``g12`` equals 'Yes'; 'No' for any other value.
    """
    return 'Yes' if g12 == 'Yes' else 'No'

In [47]:
filter2_crdc_nces_1516 = filter1_crdc_nces_1516.copy()
filter2_crdc_nces_1516['Low_Grade_Above9'] = filter2_crdc_nces_1516.apply(lambda row: lowest_grade_9orAbove(row['SCH_GRADE_G09'], row['SCH_GRADE_G10'], row['SCH_GRADE_G11'], row['SCH_GRADE_G12'], row['SCH_GRADE_UG'], row['SCH_UGDETAIL_HS']), axis = 1)

In [48]:
# Only the grade-12 flag is needed, so map that single column instead of
# iterating over whole rows.
grade_12_flags = filter2_crdc_nces_1516['SCH_GRADE_G12']
filter2_crdc_nces_1516['High_Grade_12'] = grade_12_flags.apply(highest_grade_12)

In [49]:
"""Breakdown of Schools with a Lowest Grade Above 9 (or with Ungrade HS-aged Students)"""
filter2_crdc_nces_1516.Low_Grade_Above9.value_counts()

No        66886
Yes       19771
UG-No      2294
UG-Yes     1497
Name: Low_Grade_Above9, dtype: int64

In [50]:
"""Breakdown of Schools with a Highest Grade of 12"""
filter2_crdc_nces_1516.High_Grade_12.value_counts()

No     70094
Yes    20354
Name: High_Grade_12, dtype: int64

In [51]:
# Keep a school if it is labelled as offering grades 9-12 ('Yes'), as ungraded
# with HS-aged students ('UG-Yes'), or as enrolling 12th graders.
hs_by_low_grade = filter2_crdc_nces_1516['Low_Grade_Above9'].isin(['Yes', 'UG-Yes'])
has_grade_12 = filter2_crdc_nces_1516['High_Grade_12'] == 'Yes'
filter2_crdc_nces_1516 = filter2_crdc_nces_1516[hs_by_low_grade | has_grade_12]

In [52]:
len(filter2_crdc_nces_1516.index)

21606

# Remove Virtual Schools
<div class = 'alert alert-block alert-info'>
1. Remove any Schools that reported 'Yes' to the Virtual Schools Question<br>
2. Remove Schools that have certain keywords that likely indicate an online school
</div>
<div class = 'alert alert-block alert-warning'>**21209** Schools Remain</div>

In [53]:
filter2_crdc_nces_1516.VIRTUAL.value_counts()

No         17408
Missing     3867
Yes          331
Name: VIRTUAL, dtype: int64

In [54]:
filter3_crdc_nces_1516 = filter2_crdc_nces_1516[filter2_crdc_nces_1516.VIRTUAL != 'Yes']

In [55]:
len(filter3_crdc_nces_1516.index)

21275

In [56]:
def any_missed_virtuals(name, kws=('virtual', 'cyber', 'electronic', 'internet', 'online', 'distance')):
    """Decide whether a school name survives the virtual-school keyword filter.

    The return value is a *keep* flag: it is False for names that look like
    an online school, so it can be used directly as a boolean mask.

    Parameters
    ----------
    name : str
        School name to inspect.
    kws : iterable of str, optional
        Keywords that suggest an online school. Matching is a
        case-insensitive substring test.

    Returns
    -------
    bool
        False if any keyword occurs in ``name``; True otherwise (including
        for an empty name).
    """
    # Normalise once instead of once per keyword.
    lowered = name.lower()
    return not any(kw.lower() in lowered for kw in kws)

# Drop schools whose name contains one of the online-school keywords.
not_virtual_by_name = filter3_crdc_nces_1516['SCH_NAME'].apply(any_missed_virtuals)
filter3_crdc_nces_1516 = filter3_crdc_nces_1516[not_virtual_by_name]

In [57]:
len(filter3_crdc_nces_1516.index)

21209

# Remove schools reported as elementary, middle, or "Other"
<div class = 'alert alert-block alert-info'>Even with the Lowest/Highest Grade filter, I wanted to ensure that no non-typical high schools (as reported by the NCES's LEVEL Field) are retained.  The Other category is perhaps the most important to cull here, as many of the very, very large charter-type schools are listed in this category.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**17542** Schools Remain</div>

In [58]:
filter4_crdc_nces_1516 = filter3_crdc_nces_1516.copy()

In [59]:
filter4_crdc_nces_1516.LEVEL.value_counts()

3          16348
4           3214
Missing     1194
2            307
1             76
N             70
Name: LEVEL, dtype: int64

In [60]:
filter4_crdc_nces_1516 = filter4_crdc_nces_1516[(filter4_crdc_nces_1516.LEVEL == 'Missing') | (filter4_crdc_nces_1516.LEVEL == '3')]

In [61]:
len(filter4_crdc_nces_1516.index)

17542

# Select Schools reported as Regular
<div class = 'alert alert-block alert-info'>Removed Schools with a SCH_TYPE that was not 1 (Regular).  Culls additional "Special Education", "Vocational", and "Alternative/Other" schools.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**16451** Schools Remain</div>

In [62]:
filter5_crdc_nces_1516 = filter4_crdc_nces_1516.copy()

In [63]:
filter5_crdc_nces_1516.SCH_TYPE.value_counts()

1.0        15257
Missing     1194
4.0          744
3.0          332
2.0           15
Name: SCH_TYPE, dtype: int64

In [64]:
filter5_crdc_nces_1516 = filter5_crdc_nces_1516[(filter5_crdc_nces_1516.SCH_TYPE == 'Missing') | (filter5_crdc_nces_1516.SCH_TYPE == 1)]

In [65]:
len(filter5_crdc_nces_1516.index)

16451

# <font color = green> V. Dealing with Missing Values </font>
<div class = 'alert alert-cell alert-info'> With nearly 1200 schools missing NCES data, including schools from prominent districts like "NEW YORK CITY PUBLIC SCHOOLS" and "Green Dot Public Schools," it is important to try to recover as many of these schools as possible.
<br><br>
The problem that I found was that the CRDC lumped a number of school districts together; therefore, the combokeys of schools in these districts do not match those of the NCES.
</div>
<div class = 'alert alert-cell alert-info'>
**I tried a number of methods to try to properly join these missing schools:**<br>
- Using only the school name:  This had difficulties because there are many schools that share the same name, so when a join is implemented, these schools are given all of the values of the other schools (i.e. it creates a lot of duplicate values).<br>
- Using the NCES data from 2013:  This was also problematic, as most of the schools that were missing in this dataset had the same problem in the 2013-2014 dataset.<br>
- Using the District and the name together:  This also suffered from the fact that the CRDC data combines some school districts; therefore, the names of the districts still did not match up.<br>
- **Finally, I used a combination of the name of the school and the state:  There were only a handful of duplicated name-and-state keys among the schools with missing values.**<br><br>
</div>
<div class = 'alert alert-cell alert-warning'>
**821 (out of 1194)** Missing Schools were recovered using this method </div>
<div class = 'alert alert-cell alert-info'>
I performed the same (nces-provided field)-filtration steps on the recovered data.  Then, I hand-removed duplicate values by checking the original filtered data for matching records. </div>
<div class = 'alert alert-cell alert-warning'>
**453** Recovered High Schools Total  </div>

In [66]:
"""Which districts had the most missing schools?"""
with pd.option_context('display.max_rows', 1200):
    display(filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False))

LEA_NAME
NEW YORK CITY PUBLIC SCHOOLS                                                               615
Green Dot Public Schools                                                                    11
NORMAN                                                                                       9
Peters Township SD                                                                           6
Dept. of Svs. for Children Youth & Their Families                                            5
Boston                                                                                       4
Ombudsman Educational Services Ltd. a subsidiary of Educ 2                                   4
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES                                      4
Cherokee County                                                                              3
NASSAU BOCES                                                                                 3
WINDSOR SCHOOL DISTRICT                  

In [67]:
filter5_missing_leas = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False)

In [68]:
# filter5_missing_leas.to_csv('../filtered_data/04_inital_filter_missing_LEAs.csv')

In [69]:
"""How many missing schools?"""
filter5_missing_schools = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing']
len(filter5_missing_schools.index)

1194

In [70]:
# filter5_missing_schools.to_csv('../filtered_data/04_intital_filter_missing_schools.csv')

**Manipulate missing schools and original NCES data --> join**

In [71]:
filter5_schname_state = filter5_missing_schools.copy()

In [72]:
filter5_schname_state = filter5_schname_state.reset_index()

In [73]:
filter5_schname_state['SCH_NAME'] = filter5_schname_state['SCH_NAME'].apply(lambda x: x.lower())
filter5_schname_state['SCH_NAME_ST_NUM'] = filter5_schname_state.SCH_NAME + filter5_schname_state.LEA_STATE

In [74]:
"""How many duplicate schools in the filter5 dataset?"""
filter5_schname_state.groupby('SCH_NAME_ST_NUM')['SCH_NAME_ST_NUM'].count().sort_values(ascending = False).head(10)

SCH_NAME_ST_NUM
performance learning centerGA                   2
community collaborative charterCA               2
harlem village academies highNY                 2
yuba city charterCA                             1
foreign language academy of global studiesNY    1
flint river programGA                           1
flushing high schoolNY                          1
flushing international high schoolNY            1
food and finance high schoolNY                  1
fordham high school for the artsNY              1
Name: SCH_NAME_ST_NUM, dtype: int64

In [75]:
filter5_schname_state[filter5_schname_state.SCH_NAME_ST_NUM == 'performance learning centerGA']

Unnamed: 0,COMBOKEY,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Low_Grade_Above9,High_Grade_12,SCH_NAME_ST_NUM
334,='130129003727',GA,GEORGIA,1301290,Cobb County,3727,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes,performance learning centerGA
346,='130270003728',GA,GEORGIA,1302700,Harris County,3728,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes,performance learning centerGA


In [76]:
nces_1516_schname_state = nces_1516_full.copy()

In [77]:
nces_1516_schname_state = nces_1516_schname_state.reset_index()

In [78]:
nces_1516_schname_state['SCH_NAME'] = nces_1516_schname_state['SCH_NAME'].apply(lambda x: x.lower())
nces_1516_schname_state['SCH_NAME_ST_NUM'] = nces_1516_schname_state.SCH_NAME + nces_1516_schname_state.STABR

In [79]:
"""Join the NCES and filter5 datasets on the SCH_NAME_ST_NUM column"""
schname_combined = filter5_schname_state.set_index('SCH_NAME_ST_NUM').join(nces_1516_schname_state.set_index('SCH_NAME_ST_NUM'), how = 'left', rsuffix = '_')

In [80]:
"""How many schools have duplicated values?"""
schname_combined.SCH_NAME_.value_counts().sort_values(ascending = False).head(10)

tarrant co j j a e p                              6
community collaborative charter                   4
accelerated achievement academy                   2
university high                                   2
beacon high school                                2
hart el                                           2
performance learning center                       2
life academy high school for film and music       1
world view high school                            1
holcombe l rucker school of community research    1
Name: SCH_NAME_, dtype: int64

In [81]:
"""How may more schools were matched?"""
len(schname_combined[schname_combined.SCH_NAME_.notnull()].index)

821

In [82]:
"""How mnay schools still did not have a match?"""
len(schname_combined[schname_combined.SCH_NAME_.isnull()].index)

384

In [122]:
"""How mnay schools still did not have a match?"""
with pd.option_context('display.max_rows', 400):
    display(schname_combined[schname_combined.SCH_NAME_.isnull()])

Unnamed: 0_level_0,COMBOKEY,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_TYPE_TEXT_,SCH_TYPE_,LEVEL_,VIRTUAL_,GSLO_,GSHI_,NMCNTY15_,LOCALE15_,LAT1516_,LON1516_
SCH_NAME_ST_NUM,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""""kenneth """"honey"""" rubenstein center""""WV",='540051199999',WV,WEST VIRGINIA,5400511,INSTITUTIONAL EDUCATIONAL PROGRAMS,99999,"""""kenneth """"honey"""" rubenstein center""""",No,No,No,...,,,,,,,,,,
academic enrichment centerOH,='390450499999',OH,OHIO,3904504,Westerville City,99999,academic enrichment center,No,No,No,...,,,,,,,,,,
academy of arts and sciences del mar middle & high 6-12CA,='06CC10700001',CA,CALIFORNIA,06CC107,Academy of Arts and Sciences Del Mar Middle &...,1,academy of arts and sciences del mar middle &...,No,No,No,...,,,,,,,,,,
academy of arts and sciences el cajon middle and high 6-12CA,='06CC11900001',CA,CALIFORNIA,06CC119,Academy of Arts and Sciences El Cajon Middle ...,1,academy of arts and sciences el cajon middle ...,No,No,No,...,,,,,,,,,,
academy of arts and sciences oxnard & venturaCA,='06CC12500001',CA,CALIFORNIA,06CC125,Academy of Arts and Sciences Oxnard & Ventura,1,academy of arts and sciences oxnard & ventura,No,No,Yes,...,,,,,,,,,,
academy of arts and sciences thousand oaks and simiCA,='06CC13500001',CA,CALIFORNIA,06CC135,Academy of Arts and Sciences Thousand Oaks an...,1,academy of arts and sciences thousand oaks an...,No,No,Yes,...,,,,,,,,,,
academy of earth and space science (panther)NJ,='341269099999',NJ,NEW JERSEY,3412690,Paterson Public School District,99999,academy of earth and space science (panther),No,No,No,...,,,,,,,,,,
ace academyGA,='130111002690',GA,GEORGIA,1301110,Cherokee County,2690,ace academy,No,No,No,...,,,,,,,,,,
acld tillotson schoolPA,='420077999999',PA,PENNSYLVANIA,4200779,ACLD TILLOTSON SCHOOL,99999,acld tillotson school,No,No,No,...,,,,,,,,,,
administrative officesKS,='201299099998',KS,KANSAS,2012990,Wichita,99998,administrative offices,No,Yes,No,...,,,,,,,,,,


In [123]:
# schname_combined[schname_combined.SCH_NAME_.isnull()].to_csv('../filtered_data/04_final_missing.csv')

## Performing filters on the recovered schools

**Reformat the Columns** -- Need to make sure that the recovered schools dataset's columns match the original filtered dataset's columns (required for concatenating the two sets properly)

In [83]:
recovered_schools = schname_combined.copy()
recovered_schools = recovered_schools.fillna("Missing")

In [84]:
recovered_schools = recovered_schools[recovered_schools['SCH_NAME_'] != "Missing"]

In [85]:
"""Drop original nces columns (the ones with missing values)"""    
# Drop the original NCES columns (the ones holding "Missing") together with
# the identifier/name columns brought in from the NCES side of the join.
recovered_schools = recovered_schools.drop(['TITLEI', 'STABR', 'SCH_TYPE_TEXT', 'SCH_TYPE', 'LEVEL', 'VIRTUAL', 'GSLO', 'GSHI', 
                                            'NMCNTY15', 'LOCALE15', 'LAT1516', 'LON1516', 'combokey',
                                            'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_'], axis = 1)
# Rename the joined NCES columns so they take the place of the columns dropped
# above (necessary for a proper concatenation later). Only the single trailing
# '_' join suffix is removed; `strip('_')` would also eat leading underscores
# and runs of underscores at either end of a column name.
recovered_schools = recovered_schools.rename(lambda x: x[:-1] if x.endswith('_') else x, axis = 'columns')
recovered_schools = recovered_schools.set_index('COMBOKEY')


In [86]:
"""Do the columns between the original filtered set and recovered missing values set match"""
print(len(recovered_schools.columns.values))
print(len(filter5_crdc_nces_1516.columns.values))

124
124


**Non-Virtual Schools**

In [87]:
recovered_schools_filter1 = recovered_schools.copy()

In [88]:
# Build the mask from the frame being filtered (not from `recovered_schools`),
# so the cell does not depend on the two frames sharing an identical index and
# stays safe to re-run after `recovered_schools_filter1` has been narrowed.
recovered_schools_filter1 = recovered_schools_filter1[recovered_schools_filter1.VIRTUAL != 'Yes']

In [89]:
"""How many schools remain?"""
len(recovered_schools_filter1.index)

808

**NCES-Reported High Schools**

In [90]:
recovered_schools_filter2 = recovered_schools_filter1.copy()

In [91]:
recovered_schools_filter2 = recovered_schools_filter2[(recovered_schools_filter2.LEVEL == '3')]

In [92]:
"""How many schools remain?"""
len(recovered_schools_filter2.index)

493

**NCES-Reported Regular**

In [93]:
recovered_schools_filter3 = recovered_schools_filter2.copy()

In [94]:
recovered_schools_filter3 = recovered_schools_filter3[recovered_schools_filter3.SCH_TYPE == 1]

In [95]:
"""How many schools remain?"""
len(recovered_schools_filter3.index)

457

**Clean Duplicate Values**

In [96]:
recovered_schools_filter3.groupby('SCH_NAME')['SCH_NAME'].count().sort_values(ascending = False).head(4)

SCH_NAME
beacon high school                          2
performance learning center                 2
university high                             2
world academy for total community health    1
Name: SCH_NAME, dtype: int64

In [97]:
"""Dealing with Beacon"""
with pd.option_context('display.max_columns', 100):
    display(recovered_schools_filter3[recovered_schools_filter3.SCH_NAME.str.startswith('beacon')])

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,SCH_GRADE_G02,SCH_GRADE_G03,SCH_GRADE_G04,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_UG,SCH_UGDETAIL_HS,SCH_STATUS_SPED,SCH_STATUS_MAGNET,SCH_STATUS_CHARTER,SCH_STATUS_ALT,SCH_ENR_HI_M,SCH_ENR_HI_F,SCH_ENR_AM_M,SCH_ENR_AM_F,SCH_ENR_AS_M,SCH_ENR_AS_F,SCH_ENR_HP_M,SCH_ENR_HP_F,SCH_ENR_BL_M,SCH_ENR_BL_F,SCH_ENR_WH_M,SCH_ENR_WH_F,SCH_ENR_TR_M,SCH_ENR_TR_F,TOT_ENR_M,TOT_ENR_F,SCH_ENR_LEP_M,SCH_ENR_LEP_F,SCH_ENR_IDEA_M,SCH_ENR_IDEA_F,SCH_DUAL_IND,SCH_DUALENR_HI_M,SCH_DUALENR_HI_F,...,SCH_APENR_AS_F,SCH_APENR_HP_M,SCH_APENR_HP_F,SCH_APENR_BL_M,SCH_APENR_BL_F,SCH_APENR_WH_M,SCH_APENR_WH_F,SCH_APENR_TR_M,SCH_APENR_TR_F,TOT_APENR_M,TOT_APENR_F,SCH_APENR_LEP_M,SCH_APENR_LEP_F,SCH_APENR_IDEA_M,SCH_APENR_IDEA_F,SCH_IBENR_IND,SCH_IBENR_HI_M,SCH_IBENR_HI_F,SCH_IBENR_AM_M,SCH_IBENR_AM_F,SCH_IBENR_AS_M,SCH_IBENR_AS_F,SCH_IBENR_HP_M,SCH_IBENR_HP_F,SCH_IBENR_BL_M,SCH_IBENR_BL_F,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F,Low_Grade_Above9,High_Grade_12,TITLEI,STABR,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='362058000592',NY,NEW YORK,3620580,NEW YORK CITY PUBLIC SCHOOLS,592,beacon high school,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,No,92,191,0,8,23,77,0,2,53,125,281,407,17,29,466,839,2,2,64,43,No,-9,-9,...,17,0,0,5,11,32,65,2,0,58,125,0,0,2,2,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,UG-Yes,Yes,No,NY,Regular School,1,3,No,9,12,New York County,11,40.7612,-73.9952
='362058000592',NY,NEW YORK,3620580,NEW YORK CITY PUBLIC SCHOOLS,592,beacon high school,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,Yes,Yes,Yes,No,No,No,No,92,191,0,8,23,77,0,2,53,125,281,407,17,29,466,839,2,2,64,43,No,-9,-9,...,17,0,0,5,11,32,65,2,0,58,125,0,0,2,2,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,UG-Yes,Yes,No,NY,Regular School,1,3,No,9,12,Dutchess County,21,41.5145,-73.9635


In [98]:
# Same prefix lookup in the filtered set; the match is case-sensitive, hence the upper-case 'BEACON'.
filter5_crdc_nces_1516.loc[lambda frame: frame['SCH_NAME'].str.startswith('BEACON')]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='360414000177',NY,NEW YORK,3604140,BEACON CITY SCHOOL DISTRICT,177,BEACON HIGH SCHOOL,No,No,No,No,...,3,No,9,12,Dutchess County,21,41.5145,-73.9635,UG-Yes,Yes
='362058000592',NY,NEW YORK,3620580,NEW YORK CITY PUBLIC SCHOOLS,592,BEACON HIGH SCHOOL,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,UG-Yes,Yes
='440000800297',RI,RHODE ISLAND,4400008,Beacon Charter School,297,BEACON Charter School,No,No,No,No,...,3,No,9,12,Providence County,21,42.0134,-71.502,Yes,Yes


In [99]:
"""Beacon High School in Dutchess County is already in the filter5 dataset -- Remove"""
# Work on a copy, then drop only the row that is BOTH named 'beacon high school'
# AND matched to Dutchess County; every other row is kept.
recovered_schools_filter4 = recovered_schools_filter3.copy().loc[
    lambda frame: ~(
        frame['SCH_NAME'].eq('beacon high school')
        & frame['NMCNTY15'].eq('Dutchess County')
    )
]

In [100]:
"""Dealing with Performance Learning Center"""
with pd.option_context('display.max_columns', 100):
    # Column display limit is raised to 100 for this output only.
    display(recovered_schools_filter3.loc[lambda frame: frame['SCH_NAME'].str.startswith('performance')])

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,SCH_GRADE_G02,SCH_GRADE_G03,SCH_GRADE_G04,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_UG,SCH_UGDETAIL_HS,SCH_STATUS_SPED,SCH_STATUS_MAGNET,SCH_STATUS_CHARTER,SCH_STATUS_ALT,SCH_ENR_HI_M,SCH_ENR_HI_F,SCH_ENR_AM_M,SCH_ENR_AM_F,SCH_ENR_AS_M,SCH_ENR_AS_F,SCH_ENR_HP_M,SCH_ENR_HP_F,SCH_ENR_BL_M,SCH_ENR_BL_F,SCH_ENR_WH_M,SCH_ENR_WH_F,SCH_ENR_TR_M,SCH_ENR_TR_F,TOT_ENR_M,TOT_ENR_F,SCH_ENR_LEP_M,SCH_ENR_LEP_F,SCH_ENR_IDEA_M,SCH_ENR_IDEA_F,SCH_DUAL_IND,SCH_DUALENR_HI_M,SCH_DUALENR_HI_F,...,SCH_APENR_AS_F,SCH_APENR_HP_M,SCH_APENR_HP_F,SCH_APENR_BL_M,SCH_APENR_BL_F,SCH_APENR_WH_M,SCH_APENR_WH_F,SCH_APENR_TR_M,SCH_APENR_TR_F,TOT_APENR_M,TOT_APENR_F,SCH_APENR_LEP_M,SCH_APENR_LEP_F,SCH_APENR_IDEA_M,SCH_APENR_IDEA_F,SCH_IBENR_IND,SCH_IBENR_HI_M,SCH_IBENR_HI_F,SCH_IBENR_AM_M,SCH_IBENR_AM_F,SCH_IBENR_AS_M,SCH_IBENR_AS_F,SCH_IBENR_HP_M,SCH_IBENR_HP_F,SCH_IBENR_BL_M,SCH_IBENR_BL_F,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F,Low_Grade_Above9,High_Grade_12,TITLEI,STABR,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='130129003727',GA,GEORGIA,1301290,Cobb County,3727,performance learning center,No,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,No,-9,No,No,No,No,11,14,0,0,0,2,0,0,23,17,29,14,2,2,65,49,2,0,10,4,Yes,0,0,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,Yes,Yes,Yes,GA,Regular School,1,3,No,9,12,Richmond County,12,33.4739,-81.9974
='130270003728',GA,GEORGIA,1302700,Harris County,3728,performance learning center,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,Yes,No,-9,No,No,No,No,2,2,0,0,0,0,0,0,11,8,32,32,2,2,47,44,0,0,-2,-2,Yes,0,2,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,Yes,Yes,Yes,GA,Regular School,1,3,No,9,12,Richmond County,12,33.4739,-81.9974


In [101]:
# Same prefix lookup in the filtered set (case-sensitive match on 'Performance').
filter5_crdc_nces_1516.loc[lambda frame: frame['SCH_NAME'].str.startswith('Performance')]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='130129003727',GA,GEORGIA,1301290,Cobb County,3727,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='130270003728',GA,GEORGIA,1302700,Harris County,3728,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='130438004221',GA,GEORGIA,1304380,Richmond County,4221,Performance Learning Center,No,No,No,No,...,3,No,9,12,Richmond County,12,33.4739,-81.9974,Yes,Yes
='370297002842',NC,NORTH CAROLINA,3702970,Charlotte-Mecklenburg Schools,2842,Performance Learning Center,No,No,No,No,...,3,No,9,12,Mecklenburg County,11,35.2951,-80.7957,Yes,Yes


In [102]:
"""Both of the performance learning centers here actually matched to a different 'performance learning center' record;
therefore, they should both be removed"""
# Keep every row whose name is not exactly 'performance learning center'.
recovered_schools_filter4 = recovered_schools_filter4.loc[
    lambda frame: frame['SCH_NAME'].ne('performance learning center')
]

In [103]:
"""Dealing with university high"""
with pd.option_context('display.max_columns', 100):
    # Column display limit is raised to 100 for this output only.
    display(recovered_schools_filter3.loc[lambda frame: frame['SCH_NAME'].str.startswith('university high')])

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,SCH_GRADE_G02,SCH_GRADE_G03,SCH_GRADE_G04,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_UG,SCH_UGDETAIL_HS,SCH_STATUS_SPED,SCH_STATUS_MAGNET,SCH_STATUS_CHARTER,SCH_STATUS_ALT,SCH_ENR_HI_M,SCH_ENR_HI_F,SCH_ENR_AM_M,SCH_ENR_AM_F,SCH_ENR_AS_M,SCH_ENR_AS_F,SCH_ENR_HP_M,SCH_ENR_HP_F,SCH_ENR_BL_M,SCH_ENR_BL_F,SCH_ENR_WH_M,SCH_ENR_WH_F,SCH_ENR_TR_M,SCH_ENR_TR_F,TOT_ENR_M,TOT_ENR_F,SCH_ENR_LEP_M,SCH_ENR_LEP_F,SCH_ENR_IDEA_M,SCH_ENR_IDEA_F,SCH_DUAL_IND,SCH_DUALENR_HI_M,SCH_DUALENR_HI_F,...,SCH_APENR_AS_F,SCH_APENR_HP_M,SCH_APENR_HP_F,SCH_APENR_BL_M,SCH_APENR_BL_F,SCH_APENR_WH_M,SCH_APENR_WH_F,SCH_APENR_TR_M,SCH_APENR_TR_F,TOT_APENR_M,TOT_APENR_F,SCH_APENR_LEP_M,SCH_APENR_LEP_F,SCH_APENR_IDEA_M,SCH_APENR_IDEA_F,SCH_IBENR_IND,SCH_IBENR_HI_M,SCH_IBENR_HI_F,SCH_IBENR_AM_M,SCH_IBENR_AM_F,SCH_IBENR_AS_M,SCH_IBENR_AS_F,SCH_IBENR_HP_M,SCH_IBENR_HP_F,SCH_IBENR_BL_M,SCH_IBENR_BL_F,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F,Low_Grade_Above9,High_Grade_12,TITLEI,STABR,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='069902400001',CA,CALIFORNIA,699024,University High School,1,university high,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,Yes,No,-9,No,No,Yes,No,53,68,5,5,77,98,0,0,5,11,77,89,0,0,217,271,0,0,-2,-2,Yes,38,50,...,47,0,0,5,8,44,50,0,0,103,133,0,0,2,0,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,Yes,Yes,Missing,CA,Regular School,1,3,No,9,12,Fresno County,11,36.8097,-119.748
='069902400001',CA,CALIFORNIA,699024,University High School,1,university high,No,No,No,No,No,No,No,No,No,No,No,Yes,Yes,Yes,Yes,No,-9,No,No,Yes,No,53,68,5,5,77,98,0,0,5,11,77,89,0,0,217,271,0,0,-2,-2,Yes,38,50,...,47,0,0,5,8,44,50,0,0,103,133,0,0,2,0,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,Yes,Yes,Yes,CA,Regular School,1,3,No,9,12,Orange County,12,33.6513,-117.823


In [104]:
# Same prefix lookup in the filtered set (case-sensitive); only the first five matches are shown.
filter5_crdc_nces_1516.loc[lambda frame: frame['SCH_NAME'].str.startswith('University High')].head()

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='040852003206',AZ,ARIZONA,408520,Tolleson Union High School District,3206,University High School,No,No,No,No,...,3,No,9,12,Maricopa County,21,33.4484,-112.264,Yes,Yes
='040880001441',AZ,ARIZONA,408800,Tucson Unified District,1441,University High School,No,No,No,No,...,3,No,9,12,Pima County,11,32.2274,-110.89,Yes,Yes
='068450007067',CA,CALIFORNIA,684500,Irvine Unified,7067,University High,No,No,No,No,...,3,No,9,12,Orange County,12,33.6513,-117.823,Yes,Yes
='069902400001',CA,CALIFORNIA,699024,University High School,1,University High,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='090192001381',CT,CONNECTICUT,901920,HARTFORD SCHOOL DISTRICT,1381,University High of Science and Engineering,No,No,No,No,...,3,No,9,12,Hartford County,12,41.7979,-72.7097,Yes,Yes


In [105]:
"""The University High in Irvine was already accounted for; therefore, needs to be removed from the recovered"""
# Drop only the row that is BOTH named 'university high' AND matched to Orange County.
recovered_schools_filter4 = recovered_schools_filter4.loc[
    lambda frame: ~(
        frame['SCH_NAME'].eq('university high')
        & frame['NMCNTY15'].eq('Orange County')
    )
]

In [106]:
'How many final recovered values?'
# Row count of the recovered set once the duplicate matches are removed.
recovered_schools_filter4.shape[0]

453

# <font color = green> VI. Concatenating Recovered Missing Values with the original Filtered Dataset </font>
<div class = 'alert alert-cell alert-info'> Finally, I concatenated the recovered high schools with the original filtered set.<br><br>

I ensured that no duplicate values were added in the process.

Then saved the file to "../filtered_data/04_inital_filter.csv" </div>
<div class = 'alert alert-cell alert-warning'>
Final Total:  <b>15710 High Schools</b>
</div>

In [107]:
"""Remove the missing values"""
# Copy the filtered set and keep only the rows whose LEVEL is not the "Missing" placeholder.
filter6_crdc_nces_1516 = filter5_crdc_nces_1516.copy().loc[
    lambda frame: frame['LEVEL'].ne("Missing")
]

In [108]:
"""How many initial Duplicates?
Interestingly enough, these duplicates appear to be legitimate; the problem seems to be that the schools actually have
different names (e.g. the two "ADAIR CO. HIGH" records are actually supposed to be labeled ADAIR Co. R-I High and ADAIR Co. R-II BRASHEAR)"""
# Count rows per (state, school name, county) and list the five largest groups.
(
    filter6_crdc_nces_1516
    .groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME']
    .count()
    .sort_values(ascending=False)
    .iloc[:5]
)

STABR  SCH_NAME                    NMCNTY15      
MO     ADAIR CO. HIGH              Adair County      2
TX     STERLING H S                Harris County     2
       TAYLOR H S                  Harris County     2
       LEE H S                     Harris County     2
WY     Wyoming Indian High School  Fremont County    1
Name: SCH_NAME, dtype: int64

In [109]:
# Same (state, school name, county) duplicate check on the recovered set.
(
    recovered_schools_filter4
    .groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME']
    .count()
    .sort_values(ascending=False)
    .iloc[:5]
)

STABR  SCH_NAME                           NMCNTY15       
TX     ischool high of hickory creek      Denton County      1
NY     bronx high school of business      Bronx County       1
       baruch college campus high school  New York County    1
       bayside high school                Queens County      1
       beacon high school                 New York County    1
Name: SCH_NAME, dtype: int64

In [110]:
# Stack the recovered schools underneath the filtered set (row index is preserved).
# pd.concat replaces DataFrame.append, which was deprecated in pandas 1.4 and removed in 2.0.
# sort=True orders the columns alphabetically when the two frames list them in a
# different order, matching the column layout the append-based version produced.
filtered_and_recovered = pd.concat(
    [filter6_crdc_nces_1516, recovered_schools_filter4],
    sort=True,
)

In [111]:
"""Do the numbers of columns match?"""
# Column count before the concatenation (printed) and after it (cell output).
print(filter6_crdc_nces_1516.shape[1])
filtered_and_recovered.shape[1]

124


124

In [112]:
"""Because Columns are stored as dictionaries, there is no inherent order to the columns -- Pandas automatically 
uses an alphabetical sort on an append/concatenation.  I reordered the columns to show the SCH Name first"""
# Put SCH_NAME first; all remaining columns keep their current relative order.
schName = ['SCH_NAME']
reorder = schName + list(
    filter(lambda col: col not in schName, filtered_and_recovered.columns)
)
filtered_and_recovered = filtered_and_recovered[reorder]

In [113]:
"""No added duplicate records"""
# Re-run the (state, school name, county) duplicate check on the combined set.
(
    filtered_and_recovered
    .groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME']
    .count()
    .sort_values(ascending=False)
    .iloc[:5]
)

STABR  SCH_NAME                    NMCNTY15      
TX     LEE H S                     Harris County     2
       TAYLOR H S                  Harris County     2
       STERLING H S                Harris County     2
MO     ADAIR CO. HIGH              Adair County      2
WY     Wyoming Indian High School  Fremont County    1
Name: SCH_NAME, dtype: int64

In [114]:
# NOTE(review): the CSV export below is disabled (commented out); presumably left that
# way so re-running the notebook does not overwrite the saved file -- confirm.
# filtered_and_recovered.to_csv('../filtered_data/04_inital_filter.csv')

In [115]:
"How many total high schools in the set?"
# Row count of the combined (filtered + recovered) set.
filtered_and_recovered.shape[0]

15710

In [116]:
# Frequency of each SCH_DUAL_IND value across the combined set.
filtered_and_recovered['SCH_DUAL_IND'].value_counts()

Yes    11558
No      4152
Name: SCH_DUAL_IND, dtype: int64

In [117]:
# Share of schools with SCH_DUAL_IND == 'Yes' out of the Yes + No total.
# Derived from the data rather than from hand-typed counts, so the figure cannot
# drift from the dataset when upstream filters change.
dual_ind_counts = filtered_and_recovered.SCH_DUAL_IND.value_counts()
float(dual_ind_counts['Yes'] / (dual_ind_counts['Yes'] + dual_ind_counts['No']))

0.7357097390197327

In [118]:
# Frequency of each SCH_APENR_IND value across the combined set.
filtered_and_recovered['SCH_APENR_IND'].value_counts()

Yes    11520
No      4190
Name: SCH_APENR_IND, dtype: int64

In [119]:
# Share of schools with SCH_APENR_IND == 'Yes' out of the Yes + No total.
# Derived from the data rather than from hand-typed counts, so the figure cannot
# drift from the dataset when upstream filters change.
ap_ind_counts = filtered_and_recovered.SCH_APENR_IND.value_counts()
float(ap_ind_counts['Yes'] / (ap_ind_counts['Yes'] + ap_ind_counts['No']))

0.7332908975175048

In [120]:
# Frequency of each SCH_IBENR_IND value across the combined set.
filtered_and_recovered['SCH_IBENR_IND'].value_counts()

No     14911
Yes      799
Name: SCH_IBENR_IND, dtype: int64

In [125]:
# Per-school total: TOT_ENR_M plus TOT_ENR_F, stored as a new column.
filtered_and_recovered['total_enrollment'] = filtered_and_recovered['TOT_ENR_M'].add(
    filtered_and_recovered['TOT_ENR_F']
)

In [126]:
# Sum of total_enrollment over every school in the combined set.
filtered_and_recovered['total_enrollment'].sum()

13620341