# <u> NACEP </u>
## 2015-16 CRDC
## High School Filtration

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from my_functions import combokey_converter

%matplotlib inline
# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_style('whitegrid')
# Default axes styling: 14-pt bold titles and bold axis labels.
plt.rc('axes', titlesize = 14, titleweight = 'bold', labelweight = 'bold')

# <font color = green> I. Column Info </font>

#  Column info for crdc_1516 
<div class="alert alert-block alert-info"><b> Contains 111 Fields </b></div>

In [2]:
crdc_cols = pd.read_csv('../filtered_data/00_crdc_1516_initial_layout.csv')

In [3]:
with pd.option_context('display.max_colwidth', 150, 'display.max_rows', 125):
    display(crdc_cols.drop('Module', axis = 1).set_index('Field_Name'))

Unnamed: 0_level_0,Field_Description
Field_Name,Unnamed: 1_level_1
LEA_STATE,District State Abbreviation
LEA_STATE_NAME,District State Name
LEAID,7 Digit LEAID District Identification Code
LEA_NAME,District Name
SCHID,5 Digit School Identification Code
SCH_NAME,School Name
COMBOKEY,7 Digit LEAID District Identification Code+5 Digit School Identification Code
JJ,"Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility"
SCH_GRADE_PS,Grades with Students Enrolled: Preschool
SCH_GRADE_KG,Grades with Students Enrolled: Kindergarten


In [4]:
len(crdc_cols.index)

111

# Column info for nces_1516
<div class="alert alert-block alert-info"><b>17 Fields</b></div>

In [5]:
nces_cols = pd.read_csv('../filtered_data/01_nces_1516_initial_ccd_layout.csv')

In [6]:
"""Replace \n literals with commas for readability"""
# Swap embedded newlines for ", " so each list of categorical values reads on
# one line; non-string entries (e.g. NaN) are passed through unchanged.
nces_cols['Categorical Values'] = nces_cols['Categorical Values'].map(
    lambda value: value.replace('\n', ', ') if isinstance(value, str) else value
)

In [7]:
with pd.option_context('display.max_colwidth', 350, 'display.max_rows', 25):
    display(nces_cols[['Variable Name', 'Description', 'Categorical Values']])

Unnamed: 0,Variable Name,Description,Categorical Values
0,LEAID,NCES Agency Identification Number,
1,LEA_NAME,LEA Name,
2,SCHID,NCES school identifier,
3,STABR,State Abreviation,
4,SCH_NAME,School name,
5,TITLEI,Title I Eligible School. This flag indicates whether a school is eligible for participation in either TAS or SWP program authorized by Title I of Public Law 103-382.,"No, Yes, Missing, Not applicable, -9-Suppressed"
6,SCH_TYPE_TEXT,School type (description),"Alternative Education School, Regular School, Special Education School, Vocational Education School,"
7,SCH_TYPE,School type (code),"1 = Regular school, 2 = Special education school, 3 = Vocational school, 4 = Other/alternative school, 5 = Reportable program (new code starting in 2007–08),"
8,LEVEL,School level,"1 = Primary (low grade = PK through 03; high grade = PK through 08), 2 = Middle (low grade = 04 through 07; high grade = 04 through 09), 3 = High (low grade = 07 through 12; high grade = 12 only), 4 = Other (any other configuration not falling within the above three categories;including ungraded), N = Not applicable, ,"
9,VIRTUAL,Virtual School Status,"Missing, No, Yes"


In [8]:
len(nces_cols.index)

17

# <font color = green> II. Data Cleaning/Joining </font>

# crdc_1516 Data
<div class="alert alert-block alert-info"><b> 96,360 Schools before any filtering <br>
111 Fields (Matches the crdc_cols)</b></div>
<br><br>
Used combokey_converter.convert to create a csv-compatible "COMBOKEY"

In [9]:
# Read LEAID as text (object dtype) instead of letting pandas infer a numeric
# type.  The builtin `object` replaces the `np.object` alias, which NumPy
# deprecated in 1.20 and removed in 1.24 (where it raises AttributeError).
crdc_1516 = pd.read_csv('../filtered_data/00_crdc_1516_initial.csv',
                        dtype = {'LEAID': object})

In [10]:
# Rebuild COMBOKEY from LEAID and SCHID with the project helper.  The resulting
# values look like ='010000201705' (see the head() output in the next cell).
crdc_1516['COMBOKEY'] = combokey_converter.convert(crdc_1516, 'LEAID', 'SCHID')

In [11]:
crdc_1516.head()

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F
0,AL,ALABAMA,100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,='010000201705',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
1,AL,ALABAMA,100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,='010000201706',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
2,AL,ALABAMA,100002,Alabama Youth Services,1876,Alabama Youth Services,='010000201876',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
3,AL,ALABAMA,100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,='010000299995',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
4,AL,ALABAMA,100005,Albertville City,870,Albertville Middle School,='010000500870',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9


In [12]:
len(crdc_1516.index)

96360

In [13]:
len(crdc_1516.columns)

111

# nces_1516 Data
<div class="alert alert-block alert-info"><b> The nces_1516 Data was recorded in separate files (each with different numbers of schools), so I will have to join the separate files to avoid corruption/loss of data. </b><br>
    <u>Files</u><br>
    1. Characteristics <br>
    2. Directory <br>
    3. Geographic <br>
</div><div class = 'alert alert-block alert-info'>
Like the crdc data, the combokey field was generated using my combokey_converter.convert function.<br></div>

<div class="alert alert-block alert-warning">
1. **100232 Initial Schools**<br><br>
2. **After first inner join (Directory and Characteristics) --> 100232 schools**<br>
Note: I ran a check to ensure that all of the matching combokeys have matching school names -- 100% identical.<br><br>
3. **After second inner join (above_combined and Geographic) --> 100096 schools; 100087 after culling**<br> Note:  I ran the same check to ensure that all of the school names matched, and nearly 9000 came back as non-matching.  I then compared the first character (case-insensitive) of each of the two name fields, and only 9 schools came back as non-matching.  After close examination, I decided to cull these 9 schools.<br></div><div class = 'alert alert-block alert-warning'>
**CSV saved to '../filtered_data/01_nces_1516_initial_combined_ccd.csv'**</div>

In [14]:
nces_1516_characteristics = pd.read_csv('../filtered_data/01_nces_1516_initial_school_characteristics.csv')

In [15]:
nces_1516_characteristics['combokey'] = combokey_converter.convert(nces_1516_characteristics, 'LEAID', 'SCHID')

In [16]:
len(nces_1516_characteristics.index)

100232

In [17]:
nces_1516_directory = pd.read_csv('../filtered_data/01_nces_1516_initial_school_directory.csv')

In [18]:
nces_1516_directory['combokey'] = combokey_converter.convert(nces_1516_directory, 'LEAID', 'SCHID')

**First Join:  Directory + Characteristics**

In [19]:
# Inner join of characteristics (left) and directory (right) on combokey.
# NOTE: `lsuffix` is applied to the LEFT frame, so the overlapping columns that
# end in 'dir_' come from the characteristics file, not the directory file,
# despite what the suffix's name suggests.
nces_1516 = nces_1516_characteristics.set_index('combokey').join(nces_1516_directory.set_index('combokey'), how = 'inner', lsuffix = 'dir_')

In [20]:
len(nces_1516.index)

100232

In [21]:
len(nces_1516[nces_1516.SCH_NAME == nces_1516.SCH_NAMEdir_].index)

100232

In [22]:
nces_1516 = nces_1516.drop(['LEAIDdir_', 'SCHIDdir_', 'SCH_NAMEdir_'], axis = 1)

**Second Join: combined + geo**

In [23]:
# Read LOCALE15 as text (object dtype) rather than letting pandas infer a
# numeric type.  The builtin `object` replaces the `np.object` alias, which
# NumPy deprecated in 1.20 and removed in 1.24.
nces_1516_geo = pd.read_csv('../filtered_data/01_nces_1516_initial_geographic.csv',  dtype = {'LOCALE15': object})

In [24]:
nces_1516_geo['combokey'] = combokey_converter.convert(nces_1516_geo, 'LEAID', 'SCHID')

In [25]:
# Inner join of the combined frame (left) with the geographic file (right) on
# combokey.  `rsuffix` is applied to the RIGHT frame, so here the columns ending
# in 'dir_' are the geographic file's copies of the overlapping columns.
nces_1516_test = nces_1516.join(nces_1516_geo.set_index('combokey'), how = 'inner', rsuffix = 'dir_')

In [26]:
len(nces_1516_test.index)

100096

In [27]:
"""How many schools have identical names in the joined NCES frame (SCH_NAME vs. the NAME column that appears after the geographic join)?"""
len(nces_1516_test[nces_1516_test.SCH_NAME == nces_1516_test.NAME].index)

91091

In [28]:
def name_checker(sch1, sch2):
    """Flag whether two school names start with different characters.

    The comparison is case-insensitive and looks at the FIRST CHARACTER only
    (not the first word).

    Parameters
    ----------
    sch1, sch2 : str
        The two school names to compare.

    Returns
    -------
    int
        0 when both names begin with the same character, 1 otherwise.  Two
        empty names count as a match; an empty name against a non-empty one
        does not.
    """
    # Slicing with [:1] instead of indexing with [0] keeps an empty name from
    # raising IndexError.
    first1 = sch1.lower()[:1]
    first2 = sch2.lower()[:1]
    return 0 if first1 == first2 else 1

# Row-wise flag from name_checker for SCH_NAME vs. NAME (the name column that
# appears after the geographic join): 1 marks a mismatch.
nces_1516_test['no_match_name'] = nces_1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['NAME']), axis = 1)
# Show the flagged rows side by side for manual inspection.
nces_1516_test[nces_1516_test.no_match_name == 1][['NAME', 'SCH_NAME']]

Unnamed: 0_level_0,NAME,SCH_NAME
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1
='051266001562',HYLTON JUNIOR HIGH SCHOOL,LAKESIDE JUNIOR HIGH SCHOOL
='090147001810',Stowe - Early Learning Center (S,EPS PK STEAM Academy
='090171001700',Alternative High School Programs,Greenwich Alternative High School
='090192001616',STEM Magnet School at Dwight,Betances STEM Magnet School
='090279000148',Hyde School of Health Science an,Cortlandt V.R. Creed Health and Sport Sciences...
='090279001543',Helene Grant Headstart,Dr. Mayo Early Childhood School
='090279001585',Katherine Brennan/Clarence Roger,Brennan Rogers School
='090351201476',Education Connection Special Edu,GFLC/ACCESS School
='090423001808',Hatton Preschool Program,Southington Public Schools Preschool Program a...


In [29]:
# Keep only the rows whose names passed the check (flag 0), then drop the
# helper flag, the NAME column, and the 'dir_'-suffixed duplicate id columns.
nces_1516_full = nces_1516_test[nces_1516_test.no_match_name == 0].drop(['LEAIDdir_', 'SCHIDdir_', 'no_match_name', 'NAME'], axis = 1)

In [30]:
nces_1516_full.head()

Unnamed: 0_level_0,TITLEI,LEAID,LEA_NAME,STABR,SCHID,SCH_NAME,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
='010000200277',-9,100002,Alabama Youth Services,AL,277,Sequoyah Sch - Chalkville Campus,Alternative Education School,4,3,No,7,12,Jefferson County,21,33.673661,-86.628755
='010000201667',-9,100002,Alabama Youth Services,AL,1667,Camps,Alternative Education School,4,3,No,7,12,Autauga County,41,32.521681,-86.530132
='010000201670',-9,100002,Alabama Youth Services,AL,1670,Det Ctr,Alternative Education School,4,3,No,7,12,Clarke County,41,31.938444,-87.750529
='010000201705',-9,100002,Alabama Youth Services,AL,1705,Wallace Sch - Mt Meigs Campus,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.374812,-86.08236
='010000201706',-9,100002,Alabama Youth Services,AL,1706,McNeel Sch - Vacca Campus,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.583385,-86.710058


In [31]:
len(nces_1516_full.index)

100087

In [32]:
# nces_1516_full.to_csv('../filtered_data/01_nces_1516_initial_combined_ccd.csv')

# NCES (combined) and CRDC join
<div class="alert alert-block alert-warning">Out of the 96360 schools in the crdc1516 dataset, <b>3861</b> schools did not have a matching Combokey. These non-matching schools were kept in the dataset.<br><br>

Using the name checker function from above, another <b>182</b> schools were found to have School Names whose first characters did not match between the NCES and CRDC sets.  Erring on the side of caution, these schools were indiscriminately culled.<br><br>

**Final school count in the combined dataset:  96178**</div>
<div class = 'alert alert-block alert-info'>Dataset saved to '03_crdc_nces_1516_raw_combined.csv'</div>

In [33]:
# Left join keeps every CRDC school; NCES columns whose names clash with CRDC
# columns receive a trailing underscore.
crdc_nces1516_test = (
    crdc_1516
    .set_index('COMBOKEY')
    .join(nces_1516_full, how='left', rsuffix='_')
)

In [34]:
crdc_nces1516_test[crdc_nces1516_test.SCH_NAME_.isnull()].LEAID.count()

3861

In [35]:
def name_checker(sch1, sch2):
    """Compare the first characters of two school names, tolerating a missing
    second name.

    This definition replaces the two-state `name_checker` defined earlier in
    the notebook: it adds a third result for rows where the joined name is
    absent.

    Parameters
    ----------
    sch1 : str
        Name from the left-hand dataset (always expected to be present).
    sch2 : str or missing value
        Name from the joined dataset; anything that is not a string (NaN as a
        Python or NumPy float, None, pd.NA, ...) is treated as missing.

    Returns
    -------
    int
        2 when `sch2` is missing, 0 when both names start with the same
        character (case-insensitive), 1 otherwise.
    """
    # isinstance() instead of `type(sch2) == float`, so that None and
    # np.float64 NaN are also recognised as "no name" instead of crashing on
    # `.lower()`.
    if not isinstance(sch2, str):
        return 2

    # [:1] rather than [0] so an empty name cannot raise IndexError.
    return 0 if sch1.lower()[:1] == sch2.lower()[:1] else 1

# Row-wise flag comparing the CRDC name (SCH_NAME) with the suffixed NCES name
# (SCH_NAME_) via name_checker.
crdc_nces1516_test['no_match_name'] = crdc_nces1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['SCH_NAME_']), axis = 1)

In [36]:
"""How many schools don't have matching Schools Names (from the CRDC and NCES datasets, respectively)"""
len(crdc_nces1516_test[crdc_nces1516_test.no_match_name == 1][['SCH_NAME', 'SCH_NAME_']].index)

182

In [37]:
# Drop only the rows flagged 1 (name mismatch); rows flagged 0 or 2 are kept.
# Then remove the duplicated '_'-suffixed NCES id/name columns and the helper flag.
crdc_nces_1516 = crdc_nces1516_test[crdc_nces1516_test.no_match_name != 1].drop(['LEA_NAME_', 'LEAID_', 'SCHID_', 'SCH_NAME_', 'no_match_name'], axis = 1)

In [38]:
len(crdc_nces_1516.index)

96178

In [39]:
# Replace every remaining NaN with the string sentinel 'Missing'.  Later cells
# test for this sentinel (e.g. LEVEL == 'Missing'); note that a numeric column
# that contained NaN now holds a mix of numbers and the string 'Missing'.
crdc_nces_1516 = crdc_nces_1516.fillna('Missing')

In [40]:
# crdc_nces_1516.to_csv('../filtered_data/03_crdc_nces_1516_raw_combined.csv')

# <font color = green> IV. Filtration </font>

# Select Non-[Juvenile Justice, Special Education, and Alternative Schools]
<div class = 'alert alert-block alert-info'>Schools that answered 'No' to each of those three questions on the CRDC Survey.<br><br> 
I also used a keyword filter to remove any remaining "Juvenile Justice"-esque Institutions.</div>
<div class = 'alert alert-block alert-warning'>**90448** Schools Remain</div>

In [41]:
# A school survives only if all three CRDC status flags are exactly 'No'.
filter1_crdc_nces_1516 = crdc_nces_1516[
    crdc_nces_1516[['JJ', 'SCH_STATUS_ALT', 'SCH_STATUS_SPED']].eq('No').all(axis=1)
]

In [42]:
def jj_keyword_remove(name):
    """Keyword screen for juvenile-justice-style institutions.

    Returns False when the lower-cased, stripped name contains 'behavioral',
    'juvenile' or 'correction' (i.e. the row should be removed), and True
    otherwise (the row should be kept).
    """
    lowered = name.strip().lower()
    return not any(term in lowered for term in ('behavioral', 'juvenile', 'correction'))

# Run the keyword screen on the school name first, then on the district name
# of the rows that remain.
filter1_crdc_nces_1516 = filter1_crdc_nces_1516[
    filter1_crdc_nces_1516['SCH_NAME'].apply(jj_keyword_remove)
]
filter1_crdc_nces_1516 = filter1_crdc_nces_1516[
    filter1_crdc_nces_1516['LEA_NAME'].apply(jj_keyword_remove)
]

In [43]:
len(filter1_crdc_nces_1516.index)

90448

# Select Schools with Lowest Grade (9-12) or Highest Grade (12) or Ungraded HS-students
<div class = 'alert alert-block alert-info'>I made formulas that take in the data from CRDC (because there are no null values for the Grade Enrollment Flags), and determine: (1) if a school's lowest grade offered is above 9, (2) if it contains ungraded HS-aged students, and (3) if a school's highest grade offered is 12th.<br><br>While NCES CCD does have columns for lowest and highest grades, there were many null values, so the filtration may have been too intense.<br><br>Note: the filter applied in the cells below keeps the schools flagged 'Yes' by <code>students_in_11_or_12</code>, which is computed from the CRDC SCH_GRADE_G11 and SCH_GRADE_G12 flags.</div>
<div class = 'alert alert-block alert-warning'>**20618** Schools Remain</div>

In [44]:
"""How many missing values in the grade columns?"""
np.sum(filter1_crdc_nces_1516.SCH_GRADE_G01.isnull())

0

In [46]:
filter2_crdc_nces_1516 = filter1_crdc_nces_1516.copy()

In [47]:
# students_in_11_or_12 is a project helper defined outside this notebook.  It
# receives each school's SCH_GRADE_G11 and SCH_GRADE_G12 values; the cells
# below treat its result as a 'Yes'/'No' flag.
from my_functions.extra_functions import students_in_11_or_12
filter2_crdc_nces_1516['Students_in_11_12'] = filter2_crdc_nces_1516.apply(lambda row: students_in_11_or_12(row['SCH_GRADE_G11'], row['SCH_GRADE_G12']), axis = 1)

In [48]:
"""Breakdown of schools by the Students_in_11_12 flag computed in the previous cell"""
filter2_crdc_nces_1516.Students_in_11_12.value_counts()

No     69830
Yes    20618
Name: Students_in_11_12, dtype: int64

In [49]:
filter2_crdc_nces_1516 = filter2_crdc_nces_1516[(filter2_crdc_nces_1516.Students_in_11_12 == 'Yes')]

In [50]:
len(filter2_crdc_nces_1516.index)

20618

# Remove Virtual Schools
<div class = 'alert alert-block alert-info'>
1. Remove any Schools that reported 'Yes' to the Virtual Schools Question<br>
2. Remove Schools whose names contain certain keywords that likely indicate an online school
</div>
<div class = 'alert alert-block alert-warning'>**20245** Schools Remain</div>

In [51]:
filter2_crdc_nces_1516.VIRTUAL.value_counts()

No         16820
Missing     3490
Yes          308
Name: VIRTUAL, dtype: int64

In [52]:
filter3_crdc_nces_1516 = filter2_crdc_nces_1516[filter2_crdc_nces_1516.VIRTUAL != 'Yes']

In [53]:
len(filter3_crdc_nces_1516.index)

20310

In [54]:
def any_missed_virtuals(name):
    """Keyword screen for schools that look like online/virtual schools.

    Returns False when the lower-cased, stripped name contains one of the
    virtual-school keywords (the row should be removed), and True otherwise
    (the row should be kept).
    """
    lowered = name.strip().lower()
    virtual_terms = ('virtual', 'cyber', 'electronic', 'internet', 'online', 'distance')
    return not any(term in lowered for term in virtual_terms)

# Keep only the schools whose names pass the virtual-school keyword screen.
filter3_crdc_nces_1516 = filter3_crdc_nces_1516[
    filter3_crdc_nces_1516['SCH_NAME'].apply(any_missed_virtuals)
]

In [55]:
len(filter3_crdc_nces_1516.index)

20245

# Remove schools reported as elementary, middle, or 'N'
<div class = 'alert alert-block alert-info'>Even with the Lowest/Highest Grade filter, I wanted to ensure that no non-typical high schools (as reported by the NCES's LEVEL Field) are retained.  The Other category is perhaps the most important to cull here, as many of the very, very large charter-type schools are listed in this category.
<br><br>
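Schools with Missing Values were retained.  Note: as implemented in the cells below, schools in the Other category (LEVEL 4) are also retained; only LEVEL 1 (Primary), 2 (Middle) and 'N' are removed.
</div>
<div class = 'alert alert-block alert-warning'>**20090** Schools Remain</div>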

In [56]:
filter4_crdc_nces_1516 = filter3_crdc_nces_1516.copy()

In [57]:
filter4_crdc_nces_1516.LEVEL.value_counts()

3          16295
4           2791
Missing     1004
1             65
N             65
2             25
Name: LEVEL, dtype: int64

In [58]:
# Retain LEVEL 'Missing', '3' (High) and '4' (Other); every other LEVEL value
# is dropped.
filter4_crdc_nces_1516 = filter4_crdc_nces_1516[
    filter4_crdc_nces_1516['LEVEL'].isin(['Missing', '3', '4'])
]

In [59]:
len(filter4_crdc_nces_1516.index)

20090

In [60]:
filter4_crdc_nces_1516[filter4_crdc_nces_1516.LEVEL == '4']

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Students_in_11_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='010000600872',AL,ALABAMA,100006,Marshall County,872,Asbury Sch,No,No,No,No,...,1,4,No,6,12,Marshall County,42,34.3628,-86.1422,Yes
='010004101852',AL,ALABAMA,100041,"Developing Alabama Youth Foundation, (DAY) Inc.",1852,Developing Alabama Youth Prog,No,No,No,No,...,1,4,No,7,11,Shelby County,21,33.2315,-86.8265,Yes
='010004401855',AL,ALABAMA,100044,The Rushton School,1855,The Rushton School,No,No,No,No,...,4,4,No,6,12,Jefferson County,12,33.5454,-86.758,Yes
='010018802194',AL,ALABAMA,100188,Chickasaw City,2194,Chickasaw City High School,No,No,No,No,...,1,4,No,6,12,Mobile County,21,30.7571,-88.0785,Yes
='010024000045',AL,ALABAMA,100240,Autauga County,45,Billingsley High Sch,No,No,Yes,Yes,...,1,4,No,KG,12,Autauga County,42,32.6622,-86.7018,Yes
='010024001408',AL,ALABAMA,100240,Autauga County,1408,Autaugaville Sch,No,No,Yes,Yes,...,1,4,No,KG,12,Autauga County,42,32.4337,-86.6596,Yes
='010039000141',AL,ALABAMA,100390,Birmingham City,141,Huffman High Sch-Magnet,No,No,No,No,...,1,4,No,PK,12,Jefferson County,12,33.6129,-86.682,Yes
='010042000203',AL,ALABAMA,100420,Blount County,203,Appalachian †Sch,No,No,Yes,Yes,...,1,4,No,KG,12,Blount County,42,33.8879,-86.3971,Yes
='010042000211',AL,ALABAMA,100420,Blount County,211,Southeastern Elem Sch,No,No,Yes,Yes,...,1,4,No,KG,12,Blount County,41,33.8314,-86.584,Yes
='010051000229',AL,ALABAMA,100510,Butler County,229,McKenzie High Sch,No,Yes,Yes,Yes,...,1,4,No,PK,12,Butler County,43,31.546,-86.7146,Yes


# Select Schools reported as Regular or Vocational
<div class = 'alert alert-block alert-info'>Removed Schools with a SCH_TYPE that was not 1 (Regular) or 3 (Vocational).  Culls additional "Special Education", and "Alternative/Other" schools.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**19021** Schools Remain (18995 after the 'adult' mini-filter below)</div>

In [61]:
filter5_crdc_nces_1516 = filter4_crdc_nces_1516.copy()

In [62]:
filter5_crdc_nces_1516.SCH_TYPE.value_counts()

1.0        17675
4.0         1034
Missing     1004
3.0          342
2.0           35
Name: SCH_TYPE, dtype: int64

In [63]:
# Retain SCH_TYPE 'Missing', 1 (Regular) and 3 (Vocational).
filter5_crdc_nces_1516 = filter5_crdc_nces_1516[
    filter5_crdc_nces_1516['SCH_TYPE'].eq('Missing')
    | filter5_crdc_nces_1516['SCH_TYPE'].eq(1)
    | filter5_crdc_nces_1516['SCH_TYPE'].eq(3)
]

In [64]:
len(filter5_crdc_nces_1516.index)

19021

**Mini-Filter:  Remove schools with 'adult' in the Name**

In [65]:
# Drop every school whose name contains 'adult' in any letter case (the `~`
# inverts the match mask).
filter5_crdc_nces_1516 = filter5_crdc_nces_1516[~filter5_crdc_nces_1516.SCH_NAME.str.contains('adult', case=False)]

In [66]:
len(filter5_crdc_nces_1516)

18995

# <font color = green> V. Dealing with Missing Values </font>
<div class = 'alert alert-cell alert-info'> With nearly 1200 schools missing NCES data, including schools from prominent districts like "NEW YORK CITY PUBLIC SCHOOLS" and "Green Dot Public Schools," it is important to try to recover as many of these schools as possible.
<br><br>
The problem that I found was that the CRDC lumped a number of school districts together; therefore, the combokeys of schools in these districts do not match those of the NCES.
</div>

<div class = 'alert alert-cell alert-info'>
**I tried a number of methods to try to properly join these missing schools:**<br>
- Using only the school name:  This had difficulties because there are many schools that share the same name, so when a join is implemented, each of these schools is matched to every other school with that name (i.e. it creates a lot of duplicate rows).
- Using the NCES data from 2013:  This was also problematic, as most of the schools that were missing in this dataset were affected by the same problem in the 2013-2014 dataset.<br>
- Using the District and the name together:  This also suffered from the fact that the CRDC data combines some school districts; therefore, the names of the districts still did not match up.<br>
- **Finally, I used a combination of the name of the school and the state:  Only a handful of the schools with missing values shared the same name-and-state key.**<br><br>
</div>

<div class = 'alert alert-cell alert-warning'>
**821 (out of 1194)** Missing Schools were recovered using this method </div>

<div class = 'alert alert-cell alert-info'>
Next, I recovered the remaining schools in the 'New York City Public Schools District', because it was clear that they were simply missing due to an LEA reporting error in the CRDC data.  This process had two parts:<br>
- First, because it seemed as though most of these remaining New York schools had the incorrect LEAID, I used the school ID and state abbreviation to create a unique identifier.<br>
- Second, I used the NCES database to manually search for the remaining schools and correct their combokeys.
</div>

<div class = 'alert alert-cell alert-warning'>
**36** More High Schools Recovered  </div>

<div class = 'alert alert-cell alert-info'>
I performed the same (nces-provided field)-filtration steps on the recovered data.  Then, I hand-removed duplicate values by checking the original filtered data for matching records. </div>

<div class = 'alert alert-cell alert-warning'>
**468** Recovered High Schools Total  </div>

In [67]:
"""Which districts had the most missing schools?"""
with pd.option_context('display.max_rows', 1200):
    display(filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False))

LEA_NAME
NEW YORK CITY PUBLIC SCHOOLS                                                               477
Green Dot Public Schools                                                                    11
NORMAN                                                                                       7
Dept. of Svs. for Children Youth & Their Families                                            5
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES                                      4
Ombudsman Educational Services Ltd. a subsidiary of Educ 2                                   4
TULSA                                                                                        3
Boston                                                                                       3
Cherokee County                                                                              3
Clayton County                                                                               3
Coweta County                            

In [68]:
# Same per-district count of LEVEL == 'Missing' schools that the previous cell
# displays, stored as a Series sorted from most to fewest.
filter5_missing_leas = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False)

In [69]:
# filter5_missing_leas.to_csv('../filtered_data/04_inital_filter_missing_LEAs.csv')

In [70]:
"""How many missing schools?"""
filter5_missing_schools = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing']
len(filter5_missing_schools.index)

989

In [71]:
# filter5_missing_schools.to_csv('../filtered_data/04_intital_filter_missing_schools.csv')

**Manipulate missing schools and original nces data --> join**

In [72]:
filter5_schname_state = filter5_missing_schools.copy()

In [73]:
filter5_schname_state = filter5_schname_state.reset_index()

In [74]:
# Lower-case the CRDC school name in place, then build the join key by
# appending the state abbreviation directly (no separator), e.g.
# 'performance learning centerGA'.
filter5_schname_state['SCH_NAME'] = filter5_schname_state['SCH_NAME'].map(str.lower)
filter5_schname_state['SCH_NAME_ST_NUM'] = (
    filter5_schname_state['SCH_NAME'] + filter5_schname_state['LEA_STATE']
)

In [75]:
"""How many duplicate schools in the filter5 dataset?"""
filter5_schname_state.groupby('SCH_NAME_ST_NUM')['SCH_NAME_ST_NUM'].count().sort_values(ascending = False).head(10)

SCH_NAME_ST_NUM
community collaborative charterCA                                 2
harlem village academies highNY                                   2
performance learning centerGA                                     2
yuba city charterCA                                               1
emma lazarus high schoolNY                                        1
esperanza prepatory academyNY                                     1
escuela popular/center for training and careers, family lrngCA    1
escuela popular accelerated family learning center (k-8)CA        1
escondido charter highCA                                          1
erie 2-chautauqua-cattaraugus boces @ iroquoisNY                  1
Name: SCH_NAME_ST_NUM, dtype: int64

In [76]:
filter5_schname_state[filter5_schname_state.SCH_NAME_ST_NUM == 'performance learning centerGA']

Unnamed: 0,COMBOKEY,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Students_in_11_12,SCH_NAME_ST_NUM
301,='130129003727',GA,GEORGIA,1301290,Cobb County,3727,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,performance learning centerGA
313,='130270003728',GA,GEORGIA,1302700,Harris County,3728,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,performance learning centerGA


In [77]:
nces_1516_schname_state = nces_1516_full.copy()

In [78]:
nces_1516_schname_state = nces_1516_schname_state.reset_index()

In [79]:
# Lower-case the NCES school name in place, then build the join key by
# appending the state abbreviation (STABR) directly, with no separator.
nces_1516_schname_state['SCH_NAME'] = nces_1516_schname_state['SCH_NAME'].map(str.lower)
nces_1516_schname_state['SCH_NAME_ST_NUM'] = (
    nces_1516_schname_state['SCH_NAME'] + nces_1516_schname_state['STABR']
)

In [80]:
"""Join the NCES and filter5 datasets on the SCH_NAME_ST_NUM column"""
# Left join on the name+state key; NCES columns that clash with existing names
# get a trailing underscore.  The key is not unique on either side, so a school
# can come back as several rows (the matched and unmatched counts in the cells
# below add up to more than the 989 schools that went in).
schname_combined = filter5_schname_state.set_index('SCH_NAME_ST_NUM').join(nces_1516_schname_state.set_index('SCH_NAME_ST_NUM'), how = 'left', rsuffix = '_')

In [81]:
"""How many schools have duplicated values?"""
schname_combined.SCH_NAME_.value_counts().sort_values(ascending = False).head(10)

tarrant co j j a e p                                          6
community collaborative charter                               4
university high                                               2
beacon high school                                            2
accelerated achievement academy                               2
hart el                                                       2
performance learning center                                   2
energy tech high school                                       1
knowledge and power prep academy international high school    1
life academy high school for film and music                   1
Name: SCH_NAME_, dtype: int64

In [82]:
"""How may more schools were matched?"""
len(schname_combined[schname_combined.SCH_NAME_.notnull()].index)

688

In [83]:
"""How many schools still did not have a match?"""
len(schname_combined[schname_combined.SCH_NAME_.isnull()].index)

312

## Recover the NY Schools

In [84]:
# Rows for which the name+state join found no NCES partner.
schname_combined_missing = schname_combined[schname_combined['SCH_NAME_'].isnull()].copy()

# Of those, the rows reported under the New York City district.
schname_combined_missing_ny = schname_combined_missing[
    schname_combined_missing['LEA_NAME'] == 'NEW YORK CITY PUBLIC SCHOOLS'
].copy()

In [85]:
print(len(schname_combined_missing_ny.index))
print(schname_combined_missing_ny.SCHID.nunique())

22
22


In [86]:
# Remove the NCES-side columns left over from the name+state join (empty for
# these unmatched rows) so the frame can be joined against NCES again.
schname_combined_missing_ny = schname_combined_missing_ny.drop(['TITLEI_', 'STABR_', 'SCH_TYPE_TEXT_', 'SCH_TYPE_',
                                                                'LEVEL_', 'VIRTUAL_', 'GSLO_', 'GSHI_', 
                                            'NMCNTY15_', 'LOCALE15_', 'LAT1516_', 'LON1516_', 'combokey',
                                            'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_'], axis = 1)

In [87]:
def schid_state_maker(schid, state):
    """Build a school-id + state lookup key.

    `schid` is converted to text and left-padded with zeros to at least five
    characters (longer values are kept whole); `state` is appended unchanged,
    e.g. (592, 'NY') -> '00592NY'.
    """
    return str(schid).zfill(5) + state

In [88]:
# Key each unmatched NY row by zero-padded SCHID + state abbreviation.
schname_combined_missing_ny['schid_state'] = [
    schid_state_maker(schid, state)
    for schid, state in zip(
        schname_combined_missing_ny['SCHID'],
        schname_combined_missing_ny['LEA_STATE'],
    )
]

In [89]:
# Work on a copy so nces_1516_full does not gain the helper column.
nces_for_missing_ny = nces_1516_full.copy()

# Same SCHID + state key on the NCES side (state comes from STABR here).
nces_for_missing_ny['schid_state'] = [
    schid_state_maker(schid, stabr)
    for schid, stabr in zip(nces_for_missing_ny['SCHID'], nces_for_missing_ny['STABR'])
]

In [90]:
# Left-join the unmatched NY rows to NCES on the schid_state key; NCES columns
# that clash with existing names receive a trailing '_'.
# NOTE(review): the key has no LEAID component -- confirm it cannot pair a
# school with a record from a different district in the same state.
missing_ny_joined = (
    schname_combined_missing_ny
    .set_index('schid_state')
    .join(
        nces_for_missing_ny.reset_index().set_index('schid_state'),
        how='left',
        rsuffix="_",
    )
)

In [91]:
""" Join the missing NY schools with NCES """
# For rows that picked up NCES data (LEVEL_ populated), compare the original
# school name with the joined SCH_NAME_ side by side.
missing_ny_joined.loc[missing_ny_joined['LEVEL_'].notnull(), ['SCH_NAME', 'SCH_NAME_']]

Unnamed: 0_level_0,SCH_NAME,SCH_NAME_
schid_state,Unnamed: 1_level_1,Unnamed: 2_level_1
01409NY,"law, government and community service high school",LAW GOVERNMENT AND COMMUNITY SERVICE HIGH SCHOOL
02961NY,"bronx school for law, government and justice",BRONX SCHOOL FOR LAW GOVERNMENT AND JUSTICE
03091NY,"high school of enterprise, business & technology",HIGH SCHOOL OF ENTERPRISE BUSINESS & TECHNOLOGY
04873NY,"new explorations into science,tech and math hi...",NEW EXPLORATIONS INTO SCIENCETECH AND MATH HIG...
05113NY,"high school for law, advocacy and community ju...",HIGH SCHOOL FOR LAW ADVOCACY AND COMMUNITY JUS...
05516NY,"science, tech & research high school at erasmus",SCIENCE TECH & RESEARCH HIGH SCHOOL AT ERASMUS
05521NY,ms 223 laboratory school of finance and techno...,MS 223 LABORATORY SCHOOL OF FINANCE AND TECHNO...
05536NY,"queens high school of teaching, liberal arts a...",QUEENS HIGH SCHOOL OF TEACHING LIBERAL ARTS AN...
05677NY,"marie curie high sch-nursing, medicine & appli...",MARIE CURIE HIGH SCH-NURSING MEDICINE & APPLIE...
05774NY,"high school for arts, imagination and inquiry",HIGH SCHOOL FOR ARTS IMAGINATION AND INQUIRY


In [92]:
""" Dealing with remaining missing NY Schools """
# Rows that still have no NCES data after the schid_state join.
missing_ny_2 = missing_ny_joined[missing_ny_joined['LEVEL_'].isnull()].copy()

len(missing_ny_2)

5

In [93]:
# List every column label currently on missing_ny_2 as a NumPy array.
missing_ny_2.columns.values

array(['COMBOKEY', 'LEA_STATE', 'LEA_STATE_NAME', 'LEAID', 'LEA_NAME',
       'SCHID', 'SCH_NAME', 'JJ', 'SCH_GRADE_PS', 'SCH_GRADE_KG',
       'SCH_GRADE_G01', 'SCH_GRADE_G02', 'SCH_GRADE_G03', 'SCH_GRADE_G04',
       'SCH_GRADE_G05', 'SCH_GRADE_G06', 'SCH_GRADE_G07', 'SCH_GRADE_G08',
       'SCH_GRADE_G09', 'SCH_GRADE_G10', 'SCH_GRADE_G11', 'SCH_GRADE_G12',
       'SCH_GRADE_UG', 'SCH_UGDETAIL_HS', 'SCH_STATUS_SPED',
       'SCH_STATUS_MAGNET', 'SCH_STATUS_CHARTER', 'SCH_STATUS_ALT',
       'SCH_ENR_HI_M', 'SCH_ENR_HI_F', 'SCH_ENR_AM_M', 'SCH_ENR_AM_F',
       'SCH_ENR_AS_M', 'SCH_ENR_AS_F', 'SCH_ENR_HP_M', 'SCH_ENR_HP_F',
       'SCH_ENR_BL_M', 'SCH_ENR_BL_F', 'SCH_ENR_WH_M', 'SCH_ENR_WH_F',
       'SCH_ENR_TR_M', 'SCH_ENR_TR_F', 'TOT_ENR_M', 'TOT_ENR_F',
       'SCH_ENR_LEP_M', 'SCH_ENR_LEP_F', 'SCH_ENR_IDEA_M',
       'SCH_ENR_IDEA_F', 'SCH_DUAL_IND', 'SCH_DUALENR_HI_M',
       'SCH_DUALENR_HI_F', 'SCH_DUALENR_AM_M', 'SCH_DUALENR_AM_F',
       'SCH_DUALENR_AS_M', 'SCH_DUALENR_AS_F

In [94]:
# Remove the '_'-suffixed columns (plus 'combokey') from the still-unmatched
# rows; they are dropped again before the next join re-creates them.
missing_ny_2 = missing_ny_2.drop(
    columns=[
        'TITLEI_', 'STABR_', 'SCH_TYPE_TEXT_', 'SCH_TYPE_',
        'LEVEL_', 'VIRTUAL_', 'GSLO_', 'GSHI_',
        'NMCNTY15_', 'LOCALE15_', 'LAT1516_', 'LON1516_', 'combokey',
        'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_',
    ]
)

In [95]:
# Manually specified combokeys for the NY schools that the schid_state join
# could not match, keyed by this frame's schid_state index label.
ny_actual_combokeys = {
    "99874NY": "='360007706372'",
    "99933NY": "='360008106380'",
    "99968NY": "='360007606296'",
    "99992NY": "='360009706274'",
    "99995NY": "='360009506273'",
}

# NOTE(review): these assignments were already disabled in the original cell
# and are kept for reference only -- confirm before deleting.
# missing_ny_2.at["99780NY", 'actual_combokey'] = "='360012306528'"
# missing_ny_2.at["99796NY", 'actual_combokey'] = "='360012306535'"
# missing_ny_2.at["99775NY", 'actual_combokey'] = "='360012006484'"
# missing_ny_2.at["99776NY", 'actual_combokey'] = "='360010106508'"
# missing_ny_2.at["99805NY", 'actual_combokey'] = "='360008306490'"

# Build the column straight from the index. This avoids np.object (removed in
# NumPy 1.24). Rows without a mapping get NaN, and mapping labels that are not
# in the index are ignored instead of being appended as new, empty rows.
missing_ny_2['actual_combokey'] = [
    ny_actual_combokeys.get(label, np.nan) for label in missing_ny_2.index
]

In [96]:
""" Join again on the NCES """
# Re-index by the manually assigned combokey and left-join on the index of
# nces_1516_full; clashing NCES column names receive a trailing '_'.
missing_ny_2_joined = (
    missing_ny_2
    .set_index('actual_combokey')
    .join(nces_1516_full, rsuffix='_', how='left')
)

In [97]:
"""How many matched?"""
# Rows that received NCES data (LEVEL_ populated) from the combokey join.
int(missing_ny_2_joined['LEVEL_'].notnull().sum())

5

## Combine recovered schools and perform filters

**Concatenate the two sets of recovered missing NY schools**

In [98]:
# Keep only the NY rows that the schid_state join matched (LEVEL_ populated).
missing_ny_joined_matching = missing_ny_joined.dropna(subset=['LEVEL_'])

In [99]:
# Stack the two sets of recovered NY schools. DataFrame.append was deprecated in
# pandas 1.4 and removed in 2.0, so pd.concat is used instead. sort=True keeps
# the alphabetical column order seen in the recorded outputs when the two
# frames' columns are not already aligned.
all_missing_ny_recovered = pd.concat(
    [missing_ny_2_joined, missing_ny_joined_matching],
    sort=True,
)

**Join the original recovered schools (using schname_st identifier) with the recovered NY schools**

In [100]:
# fillna returns a new frame, so schname_combined itself is left unchanged.
# Every null cell becomes the literal "Missing".
recovered_schools = schname_combined.fillna("Missing")

In [101]:
# Keep only the rows that found a match (SCH_NAME_ is not the "Missing" filler).
recovered_schools = recovered_schools.loc[recovered_schools['SCH_NAME_'].ne("Missing")]

In [102]:
# Combine the name-matched schools with the recovered NY schools.
# DataFrame.append was removed in pandas 2.0; pd.concat is the supported
# equivalent. sort=True keeps the alphabetical column order seen in the
# recorded outputs when the two frames' columns are not already aligned.
recovered_schools_all = pd.concat(
    [recovered_schools, all_missing_ny_recovered],
    sort=True,
)

**Reformat the Columns** -- The recovered schools dataset's columns must match the original filtered dataset's columns (required for concatenating the two sets properly).

In [103]:
"""Drop original nces columns (the ones with missing values)"""    
# Dropped here: the un-suffixed NCES columns, 'combokey', and the suffixed
# identity columns (LEAID_, LEA_NAME_, SCH_NAME_, SCHID_).
recovered_schools_all = recovered_schools_all.drop(
    columns=[
        'TITLEI', 'STABR', 'SCH_TYPE_TEXT', 'SCH_TYPE', 'LEVEL', 'VIRTUAL', 'GSLO', 'GSHI',
        'NMCNTY15', 'LOCALE15', 'LAT1516', 'LON1516', 'combokey',
        'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_',
    ]
)
"""Rename new matching columns to replace the columns above (necessary for a proper concatenation later)"""
# str.strip('_') trims underscores from BOTH ends of every label; underscores
# inside a name (e.g. SCH_TYPE_TEXT) are untouched.
recovered_schools_all = recovered_schools_all.rename(columns=lambda label: label.strip('_'))
# COMBOKEY becomes the index.
recovered_schools_all = recovered_schools_all.set_index('COMBOKEY')
%store recovered_schools_all

Stored 'recovered_schools_all' (DataFrame)


In [104]:
"""Do the columns between the original filtered set and recovered missing values set match"""
# Compares column COUNTS only; identical counts do not by themselves prove the
# column names are identical.
print(recovered_schools_all.shape[1])
print(filter5_crdc_nces_1516.shape[1])

123
123


In [105]:
""" How many schools recovered? """
# Number of rows in the combined recovered set.
recovered_schools_all.shape[0]

710

In [106]:
# Display the combined recovered-schools frame, now indexed by COMBOKEY.
recovered_schools_all

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,...,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='362058004305',12,9,No,40.8183,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,138,71,14,9,710,731,-9,-9,No
='359905499999',12,9,No,35.0862,3599054,ABQ Charter Academy,NM,NEW MEXICO,3,11,...,Yes,-9,-9,17,29,130,132,-9,-9,No
='362058001906',12,9,No,40.5824,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,102,97,63,88,800,1307,-9,-9,No
='069998499999',12,9,No,38.4473,699984,Abraxis Charter,CA,CALIFORNIA,3,12,...,Missing,-9,-9,-9,-9,23,20,-9,-9,No
='362058005916',12,6,No,40.6495,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,4,11,...,Yes,16,27,-9,-9,193,262,-9,-9,No
='06CC43900001',12,KG,No,34.5333,06CC439,Academy for Academic Excellence,CA,CALIFORNIA,4,21,...,Missing,52,40,-9,-9,696,683,-9,-9,No
='362058006070',12,9,No,40.7427,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,No,59,47,19,12,279,266,-9,-9,No
='362058006054',12,9,No,40.6336,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,23,30,-9,-9,115,179,-9,-9,No
='362058005866',12,KG,No,40.6968,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,4,11,...,Yes,15,12,-9,-9,149,160,-9,-9,No
='362058006200',12,8,No,40.6759,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,-9,-9,-9,-9,252,109,-9,-9,No


**Non-Virtual Schools**

In [107]:
# Start the filter chain from a copy so recovered_schools_all is left untouched.
recovered_schools_filter1 = recovered_schools_all.copy()

In [108]:
# Drop schools flagged VIRTUAL == 'Yes'; every other value is kept.
recovered_schools_filter1 = recovered_schools_filter1.loc[
    recovered_schools_filter1['VIRTUAL'].ne('Yes')
]

In [109]:
"""How many schools remain?"""
# Rows left after the virtual-school filter.
recovered_schools_filter1.shape[0]

697

**NCES-Reported High Schools**

In [110]:
# Next filter stage works on its own copy of the previous stage's result.
recovered_schools_filter2 = recovered_schools_filter1.copy()

In [111]:
# Keep LEVEL codes '3' and '4'. The codes are compared as strings here.
recovered_schools_filter2 = recovered_schools_filter2.loc[
    recovered_schools_filter2['LEVEL'].isin(['3', '4'])
]

In [112]:
"""How many schools remain?"""
# Rows left after the LEVEL filter.
recovered_schools_filter2.shape[0]

692

**NCES-Reported Regular**

In [113]:
# Next filter stage works on its own copy of the previous stage's result.
recovered_schools_filter3 = recovered_schools_filter2.copy()

In [114]:
# Keep SCH_TYPE codes 1 and 3. Unlike LEVEL, these are compared as integers.
recovered_schools_filter3 = recovered_schools_filter3.loc[
    recovered_schools_filter3['SCH_TYPE'].isin([1, 3])
]

In [115]:
"""How many schools remain?"""
# Rows left after the SCH_TYPE filter.
recovered_schools_filter3.shape[0]

665

**Remove Schools with 'Adult' in the Name**

In [116]:
# Case-insensitive match: removes any school whose name contains 'adult'.
recovered_schools_filter3 = recovered_schools_filter3.loc[
    ~recovered_schools_filter3['SCH_NAME'].str.contains('Adult', case=False)
]

**Clean Duplicate Values**

In [117]:
# Rows per school name, largest first; names counted more than once need a
# manual look below.
(
    recovered_schools_filter3
    .groupby('SCH_NAME')['SCH_NAME']
    .count()
    .sort_values(ascending=False)
    .head(5)
)

SCH_NAME
beacon high school                 2
university high                    2
community collaborative charter    2
performance learning center        2
empire springs charter             1
Name: SCH_NAME, dtype: int64

In [118]:
# Display the frame after the virtual, LEVEL, SCH_TYPE and 'Adult' filters.
recovered_schools_filter3

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,...,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='362058004305',12,9,No,40.8183,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,138,71,14,9,710,731,-9,-9,No
='359905499999',12,9,No,35.0862,3599054,ABQ Charter Academy,NM,NEW MEXICO,3,11,...,Yes,-9,-9,17,29,130,132,-9,-9,No
='362058001906',12,9,No,40.5824,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,102,97,63,88,800,1307,-9,-9,No
='069998499999',12,9,No,38.4473,699984,Abraxis Charter,CA,CALIFORNIA,3,12,...,Missing,-9,-9,-9,-9,23,20,-9,-9,No
='362058005916',12,6,No,40.6495,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,4,11,...,Yes,16,27,-9,-9,193,262,-9,-9,No
='06CC43900001',12,KG,No,34.5333,06CC439,Academy for Academic Excellence,CA,CALIFORNIA,4,21,...,Missing,52,40,-9,-9,696,683,-9,-9,No
='362058006070',12,9,No,40.7427,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,No,59,47,19,12,279,266,-9,-9,No
='362058006054',12,9,No,40.6336,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,23,30,-9,-9,115,179,-9,-9,No
='362058005866',12,KG,No,40.6968,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,4,11,...,Yes,15,12,-9,-9,149,160,-9,-9,No
='362058006200',12,8,No,40.6759,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,Yes,-9,-9,-9,-9,252,109,-9,-9,No


In [119]:
# Recovered rows whose (lower-cased) school name starts with 'beacon'.
recovered_schools_filter3.loc[recovered_schools_filter3['SCH_NAME'].str.startswith('beacon')]

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,...,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='362058000592',12,9,No,40.7612,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,...,No,125,58,-9,-9,839,466,-9,-9,No
='362058000592',12,9,No,41.5145,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,21,...,No,125,58,-9,-9,839,466,-9,-9,No


In [120]:
"""Dealing with Beacon"""
# Raise the column display limit to 100 for this one output so the county and
# coordinate columns of the 'beacon' rows become visible.
with pd.option_context('display.max_columns', 100):
    display(
        recovered_schools_filter3.loc[
            recovered_schools_filter3['SCH_NAME'].str.startswith('beacon')
        ]
    )

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,NMCNTY15,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,SCH_DUALENR_AM_M,SCH_DUALENR_AS_F,SCH_DUALENR_AS_M,SCH_DUALENR_BL_F,SCH_DUALENR_BL_M,SCH_DUALENR_HI_F,SCH_DUALENR_HI_M,SCH_DUALENR_HP_F,SCH_DUALENR_HP_M,SCH_DUALENR_IDEA_F,SCH_DUALENR_IDEA_M,SCH_DUALENR_LEP_F,SCH_DUALENR_LEP_M,SCH_DUALENR_TR_F,SCH_DUALENR_TR_M,SCH_DUALENR_WH_F,SCH_DUALENR_WH_M,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,Students_in_11_12,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='362058000592',12,9,No,40.7612,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,-73.9952,New York County,592,0,0,17,5,11,5,32,14,0,0,2,2,Yes,0,0,0,2,65,32,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,Yes,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,beacon high school,No,No,No,No,1,Regular School,Yes,NY,Yes,No,125,58,-9,-9,839,466,-9,-9,No
='362058000592',12,9,No,41.5145,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,21,-73.9635,Dutchess County,592,0,0,17,5,11,5,32,14,0,0,2,2,Yes,0,0,0,2,65,32,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,Yes,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,beacon high school,No,No,No,No,1,Regular School,Yes,NY,Yes,No,125,58,-9,-9,839,466,-9,-9,No


In [121]:
# Look up names starting with upper-case 'BEACON' in the original filtered set
# for comparison.
filter5_crdc_nces_1516.loc[filter5_crdc_nces_1516['SCH_NAME'].str.startswith('BEACON')]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Students_in_11_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='360414000177',NY,NEW YORK,3604140,BEACON CITY SCHOOL DISTRICT,177,BEACON HIGH SCHOOL,No,No,No,No,...,1,3,No,9,12,Dutchess County,21,41.5145,-73.9635,Yes
='362058000592',NY,NEW YORK,3620580,NEW YORK CITY PUBLIC SCHOOLS,592,BEACON HIGH SCHOOL,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes
='440000800297',RI,RHODE ISLAND,4400008,Beacon Charter School,297,BEACON Charter School,No,No,No,No,...,1,3,No,9,12,Providence County,21,42.0134,-71.502,Yes


In [122]:
"""Beacon High School in Dutchess County is already in the filter5 dataset -- Remove"""
# Drop only the row that is BOTH named 'beacon high school' AND located in
# Dutchess County; the New York County match for the same name is kept.
recovered_schools_filter4 = recovered_schools_filter3.loc[
    ~(
        recovered_schools_filter3['SCH_NAME'].eq('beacon high school')
        & recovered_schools_filter3['NMCNTY15'].eq('Dutchess County')
    )
].copy()

In [123]:
"""Dealing with Performance Learning Center"""
# Show up to 100 columns of the recovered rows whose name starts with
# 'performance', so the matched county and coordinates can be inspected.
with pd.option_context('display.max_columns', 100):
    display(
        recovered_schools_filter3.loc[
            recovered_schools_filter3['SCH_NAME'].str.startswith('performance')
        ]
    )

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,NMCNTY15,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,SCH_DUALENR_AM_M,SCH_DUALENR_AS_F,SCH_DUALENR_AS_M,SCH_DUALENR_BL_F,SCH_DUALENR_BL_M,SCH_DUALENR_HI_F,SCH_DUALENR_HI_M,SCH_DUALENR_HP_F,SCH_DUALENR_HP_M,SCH_DUALENR_IDEA_F,SCH_DUALENR_IDEA_M,SCH_DUALENR_LEP_F,SCH_DUALENR_LEP_M,SCH_DUALENR_TR_F,SCH_DUALENR_TR_M,SCH_DUALENR_WH_F,SCH_DUALENR_WH_M,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,Students_in_11_12,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='130129003727',12,9,No,33.4739,1301290,Cobb County,GA,GEORGIA,3,12,-81.9974,Richmond County,3727,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,No,No,No,No,No,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,performance learning center,No,No,No,No,1,Regular School,-9,GA,Yes,Yes,-9,-9,0,0,49,65,-9,-9,No
='130270003728',12,9,No,33.4739,1302700,Harris County,GA,GEORGIA,3,12,-81.9974,Richmond County,3728,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,0,0,0,0,2,5,2,0,0,0,0,0,0,0,2,0,11,5,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,performance learning center,No,No,No,No,1,Regular School,-9,GA,Yes,Yes,-9,-9,17,10,44,47,-9,-9,No


In [124]:
# Names starting with 'Performance' in the original filtered set, for comparison.
filter5_crdc_nces_1516.loc[filter5_crdc_nces_1516['SCH_NAME'].str.startswith('Performance')]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Students_in_11_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='130129003727',GA,GEORGIA,1301290,Cobb County,3727,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes
='130270003728',GA,GEORGIA,1302700,Harris County,3728,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes
='130438004221',GA,GEORGIA,1304380,Richmond County,4221,Performance Learning Center,No,No,No,No,...,1,3,No,9,12,Richmond County,12,33.4739,-81.9974,Yes
='370053002975',NC,NORTH CAROLINA,3700530,Cabarrus County Schools,2975,Performance Learning Center,No,No,No,No,...,1,4,No,6,12,Cabarrus County,13,35.3649,-80.594,Yes
='370297002842',NC,NORTH CAROLINA,3702970,Charlotte-Mecklenburg Schools,2842,Performance Learning Center,No,No,No,No,...,1,3,No,9,12,Mecklenburg County,11,35.2951,-80.7957,Yes


In [125]:
"""Both of the performance learning centers here actually matched to a different 'performance learning center' record;
therefore, they should both be removed"""
# Remove every recovered row named exactly 'performance learning center'.
recovered_schools_filter4 = recovered_schools_filter4.loc[
    recovered_schools_filter4['SCH_NAME'].ne('performance learning center')
]

In [126]:
"""Dealing with university high"""
# Show up to 100 columns of the recovered rows whose name starts with
# 'university high', so the two matched counties can be compared.
with pd.option_context('display.max_columns', 100):
    display(
        recovered_schools_filter3.loc[
            recovered_schools_filter3['SCH_NAME'].str.startswith('university high')
        ]
    )

Unnamed: 0_level_0,GSHI,GSLO,JJ,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,NMCNTY15,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,SCH_DUALENR_AM_M,SCH_DUALENR_AS_F,SCH_DUALENR_AS_M,SCH_DUALENR_BL_F,SCH_DUALENR_BL_M,SCH_DUALENR_HI_F,SCH_DUALENR_HI_M,SCH_DUALENR_HP_F,SCH_DUALENR_HP_M,SCH_DUALENR_IDEA_F,SCH_DUALENR_IDEA_M,SCH_DUALENR_LEP_F,SCH_DUALENR_LEP_M,SCH_DUALENR_TR_F,SCH_DUALENR_TR_M,SCH_DUALENR_WH_F,SCH_DUALENR_WH_M,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,Students_in_11_12,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='069902400001',12,9,No,36.8097,699024,University High School,CA,CALIFORNIA,3,11,-119.748,Fresno County,1,2,2,47,29,8,5,26,23,0,0,0,2,Yes,0,0,0,0,50,44,2,5,68,50,8,5,50,38,0,0,0,2,0,0,0,0,68,65,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,university high,No,Yes,No,No,1,Regular School,-9,CA,Yes,Missing,133,103,196,163,271,217,-9,-9,No
='069902400001',12,9,No,33.6513,699024,University High School,CA,CALIFORNIA,3,12,-117.823,Orange County,1,2,2,47,29,8,5,26,23,0,0,0,2,Yes,0,0,0,0,50,44,2,5,68,50,8,5,50,38,0,0,0,2,0,0,0,0,68,65,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,university high,No,Yes,No,No,1,Regular School,-9,CA,Yes,Yes,133,103,196,163,271,217,-9,-9,No


In [127]:
# First five 'University High...' rows of the original filtered set, for comparison.
filter5_crdc_nces_1516.loc[
    filter5_crdc_nces_1516['SCH_NAME'].str.startswith('University High')
].head(5)

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516,Students_in_11_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='040852003206',AZ,ARIZONA,408520,Tolleson Union High School District,3206,University High School,No,No,No,No,...,1,3,No,9,12,Maricopa County,21,33.4484,-112.264,Yes
='040880001441',AZ,ARIZONA,408800,Tucson Unified District,1441,University High School,No,No,No,No,...,1,3,No,9,12,Pima County,11,32.2274,-110.89,Yes
='068450007067',CA,CALIFORNIA,684500,Irvine Unified,7067,University High,No,No,No,No,...,1,3,No,9,12,Orange County,12,33.6513,-117.823,Yes
='069902400001',CA,CALIFORNIA,699024,University High School,1,University High,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes
='090192001381',CT,CONNECTICUT,901920,HARTFORD SCHOOL DISTRICT,1381,University High of Science and Engineering,No,No,No,No,...,1,3,No,9,12,Hartford County,12,41.7979,-72.7097,Yes


In [128]:
"""The University High in Irvine was already accounted for; therefore, needs to be removed from the recovered"""
# Drop only the row that is BOTH named 'university high' AND located in Orange
# County; the Fresno County match for the same name is kept.
recovered_schools_filter4 = recovered_schools_filter4.loc[
    ~(
        recovered_schools_filter4['SCH_NAME'].eq('university high')
        & recovered_schools_filter4['NMCNTY15'].eq('Orange County')
    )
]

In [129]:
'How many final recovered values?'
# Row count of the recovered set after the manual duplicate clean-up.
recovered_schools_filter4.shape[0]

661

# <font color = green> VI. Concatenating Recovered Missing Values with the original Filtered Dataset </font>
<div class = 'alert alert-cell alert-info'> Finally, I concatenated the recovered high schools with the original filtered set.<br><br>

I ensured that no duplicate values were added in the process.

Then saved the file to "../filtered_data/04_filter_final.csv" </div>
<div class = 'alert alert-cell alert-warning'>
Final Total:  <b>15725 High Schools</b></div>

In [130]:
"""Remove the missing values"""
# Keep only rows whose LEVEL is not the "Missing" filler; the result is
# copied so filter5_crdc_nces_1516 is left unchanged.
filter6_crdc_nces_1516 = filter5_crdc_nces_1516.loc[
    filter5_crdc_nces_1516['LEVEL'].ne("Missing")
].copy()

In [131]:
"""How many initial duplicates? Counts rows per (state, school name, county) and shows the five largest groups.
Interestingly enough, these duplicates appear to be legitimate; the problem seems to be that the schools
actually have different names (e.g. the two "ADAIR CO. HIGH" rows are supposed to be labeled
ADAIR Co. R-I High and ADAIR Co. R-II BRASHEAR)."""
filtered_key_counts = filter6_crdc_nces_1516.groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME'].count()
filtered_key_counts.sort_values(ascending=False).head()

STABR  SCH_NAME          NMCNTY15     
TX     TAYLOR H S        Harris County    2
       LEE H S           Harris County    2
MO     ADAIR CO. HIGH    Adair County     2
TX     STERLING H S      Harris County    2
KS     South Haven High  Sumner County    1
Name: SCH_NAME, dtype: int64

In [132]:
"""Any duplications in the recovered schools? Counts rows per (state, school name, county), largest first.
    The two community collaborative charter schools are two different schools."""
recovered_key_counts = recovered_schools_filter4.groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME'].count()
recovered_key_counts.sort_values(ascending=False).head()

STABR  SCH_NAME                                   NMCNTY15         
CA     community collaborative charter            Sacramento County    2
TX     ischool high of hickory creek              Denton County        1
NY     arts and media preparatory academy         Kings County         1
       baccalaureate school for global education  Queens County        1
       aviation career and technical high school  Queens County        1
Name: SCH_NAME, dtype: int64

In [133]:
"""Stack the recovered schools underneath the filtered set.
pd.concat is used because DataFrame.append is deprecated (pandas 1.4) and removed in pandas 2.0."""
filtered_and_recovered = pd.concat([filter6_crdc_nces_1516, recovered_schools_filter4])

In [134]:
"""Do the numbers of columns match?
The printed value is the column count before combining; the cell result is the count after."""
print(filter6_crdc_nces_1516.shape[1])
filtered_and_recovered.shape[1]

123


123

In [135]:
"""Combining the two frames can leave the columns in a different (e.g. alphabetical) order.
Move SCH_NAME to the front and keep every other column in its current relative order."""
schName = ['SCH_NAME']
remaining_cols = [col for col in filtered_and_recovered.columns if col not in schName]
reorder = schName + remaining_cols
filtered_and_recovered = filtered_and_recovered.loc[:, reorder]

In [136]:
"""No added duplicate records: the repeated (state, school name, county) keys in the combined set
are the same ones already seen in the two inputs."""
combined_key_counts = filtered_and_recovered.groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME'].count()
combined_key_counts.sort_values(ascending=False).head(6)

STABR  SCH_NAME                         NMCNTY15         
TX     LEE H S                          Harris County        2
CA     community collaborative charter  Sacramento County    2
TX     TAYLOR H S                       Harris County        2
       STERLING H S                     Harris County        2
MO     ADAIR CO. HIGH                   Adair County         2
WY     Wyoming Indian High School       Fremont County       1
Name: SCH_NAME, dtype: int64

In [137]:
"""How many total high schools are in the combined set?"""
filtered_and_recovered.shape[0]

18667

In [138]:
# filtered_and_recovered.to_csv('../filtered_data/04_filter_final.csv')

# Final Missing Schools
<div class = 'alert alert-cell alert-info'>**348 Schools**<br> Saved to '04_final_missing.csv'</div>

In [139]:
"""Rows of schname_combined with a null SCH_NAME_, excluding the NEW YORK CITY PUBLIC SCHOOLS district."""
name_is_null = schname_combined.SCH_NAME_.isnull()
not_nyc_district = schname_combined.LEA_NAME != 'NEW YORK CITY PUBLIC SCHOOLS'
final_missing = schname_combined[name_is_null & not_nyc_district]

In [140]:
""" How many schools are still missing at the end? """
final_missing.shape[0]

290

In [141]:
# final_missing.to_csv('../filtered_data/04_final_missing.csv')

In [142]:
""" Ten districts (LEA_NAME) with the most remaining unaccounted schools """
missing_per_district = final_missing.groupby('LEA_NAME')['LEAID'].count()
missing_per_district.sort_values(ascending = False).head(10)

LEA_NAME
NORMAN                                                     7
Dept. of Svs. for Children Youth & Their Families          5
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES    4
ERIE 2-CHAUTAUQUA-CATTARAUGUS BOCES                        3
Clayton County                                             3
NASSAU BOCES                                               3
Cherokee County                                            3
WINDSOR SCHOOL DISTRICT                                    3
Boston                                                     3
TULSA                                                      3
Name: LEAID, dtype: int64