# <u> NACEP </u>
## 2015-16 CRDC
## High School Filtration

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from my_functions import combokey_converter

%matplotlib inline
sns.set_style('whitegrid')
plt.rc('axes', titlesize = 14, titleweight = 'bold', labelweight = 'bold')

# <font color = green> I. Column Info </font>

#  Column info for crdc_1516 
<div class="alert alert-block alert-info"><b> Contains 111 Fields </b></div>

In [2]:
crdc_cols = pd.read_csv('../filtered_data/00_crdc_1516_initial_layout.csv')

In [3]:
with pd.option_context('display.max_colwidth', 150, 'display.max_rows', 125):
    display(crdc_cols.drop('Module', axis = 1).set_index('Field_Name'))

Unnamed: 0_level_0,Field_Description
Field_Name,Unnamed: 1_level_1
LEA_STATE,District State Abbreviation
LEA_STATE_NAME,District State Name
LEAID,7 Digit LEAID District Identification Code
LEA_NAME,District Name
SCHID,5 Digit School Identification Code
SCH_NAME,School Name
COMBOKEY,7 Digit LEAID District Identification Code+5 Digit School Identification Code
JJ,"Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility"
SCH_GRADE_PS,Grades with Students Enrolled: Preschool
SCH_GRADE_KG,Grades with Students Enrolled: Kindergarten


In [4]:
len(crdc_cols.index)

111

# Column info for nces_1516
<div class="alert alert-block alert-info"><b>17 Fields</b></div>

In [5]:
nces_cols = pd.read_csv('../filtered_data/01_nces_1516_initial_ccd_layout.csv')

In [6]:
"""Replace \n literals with commas for readability"""
nces_cols['Categorical Values'] = nces_cols['Categorical Values'].apply(lambda x: x.replace('\n', ', ') if type(x) == str else x)

In [7]:
with pd.option_context('display.max_colwidth', 350, 'display.max_rows', 25):
    display(nces_cols[['Variable Name', 'Description', 'Categorical Values']])

Unnamed: 0,Variable Name,Description,Categorical Values
0,LEAID,NCES Agency Identification Number,
1,LEA_NAME,LEA Name,
2,SCHID,NCES school identifier,
3,STABR,State Abreviation,
4,SCH_NAME,School name,
5,TITLEI,Title I Eligible School. This flag indicates whether a school is eligible for participation in either TAS or SWP program authorized by Title I of Public Law 103-382.,"No, Yes, Missing, Not applicable, -9-Suppressed"
6,SCH_TYPE_TEXT,School type (description),"Alternative Education School, Regular School, Special Education School, Vocational Education School,"
7,SCH_TYPE,School type (code),"1 = Regular school, 2 = Special education school, 3 = Vocational school, 4 = Other/alternative school, 5 = Reportable program (new code starting in 2007–08),"
8,LEVEL,School level,"1 = Primary (low grade = PK through 03; high grade = PK through 08), 2 = Middle (low grade = 04 through 07; high grade = 04 through 09), 3 = High (low grade = 07 through 12; high grade = 12 only), 4 = Other (any other configuration not falling within the above three categories;including ungraded), N = Not applicable, ,"
9,VIRTUAL,Virtual School Status,"Missing, No, Yes"


In [8]:
len(nces_cols.index)

17

# <font color = green> II. Data Cleaning/Joining </font>

# crdc_1516 Data
<div class="alert alert-block alert-info"><b> 96,360 Schools before any filtering <br>
111 Fields (Matches the crdc_cols)</b></div>
<br><br>
Used combokey_converter.convert to create a csv-compatible "COMBOKEY"

In [9]:
# Load the raw CRDC 2015-16 extract. LEAID is read with dtype=object so pandas
# keeps the column as-is instead of inferring a numeric dtype for it.
# The builtin `object` is used rather than the `np.object` alias, which NumPy
# deprecated and later removed (it was only ever an alias for `object`).
crdc_1516 = pd.read_csv('../filtered_data/00_crdc_1516_initial.csv',
                        dtype={'LEAID': object})

In [10]:
crdc_1516['COMBOKEY'] = combokey_converter.convert(crdc_1516, 'LEAID', 'SCHID')

In [11]:
crdc_1516.head()

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F
0,AL,ALABAMA,100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,='010000201705',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
1,AL,ALABAMA,100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,='010000201706',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
2,AL,ALABAMA,100002,Alabama Youth Services,1876,Alabama Youth Services,='010000201876',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
3,AL,ALABAMA,100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,='010000299995',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
4,AL,ALABAMA,100005,Albertville City,870,Albertville Middle School,='010000500870',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9


In [12]:
len(crdc_1516.index)

96360

In [13]:
len(crdc_1516.columns)

111

# nces_1516 Data
<div class="alert alert-block alert-info"><b> The nces_1516 Data was recorded in separate files (each with different numbers of schools), so I will have to join the separate files to avoid corruption/loss of data. </b><br>
    <u>Files</u><br>
    1. Characteristics <br>
    2. Directory <br>
    3. Geographic <br>
</div><div class = 'alert alert-block alert-info'>
Like the crdc data, the combokey field was generated using my combokey_converter.convert function.<br></div>

<div class="alert alert-block alert-warning">
1. **100232 Initial Schools**<br><br>
2. **After first inner join (Directory and Characteristics) --> 100232 schools**<br>
Note: I ran a check to ensure that all of the matching combokeys have matching school names -- 100% identical.<br><br>
3. **After second inner join (above_combined and Geographic) --> 100087**<br> Note:  I ran the same check to ensure that all of the schools matched and nearly 9000 came back as non-matching.  I then compared the first character (case-insensitive) of each of the two name fields, and only 9 schools came back as non-matching.  After close examination, I decided to cull these 9 schools.<br></div><div class = 'alert alert-block alert-warning'>
**CSV saved to '../filtered_data/01_nces_1516_initial_ccd.csv'**

In [14]:
nces_1516_characteristics = pd.read_csv('../filtered_data/01_nces_1516_initial_school_characteristics.csv')

In [15]:
nces_1516_characteristics['combokey'] = combokey_converter.convert(nces_1516_characteristics, 'LEAID', 'SCHID')

In [16]:
len(nces_1516_characteristics.index)

100232

In [17]:
nces_1516_directory = pd.read_csv('../filtered_data/01_nces_1516_initial_school_directory.csv')

In [18]:
nces_1516_directory['combokey'] = combokey_converter.convert(nces_1516_directory, 'LEAID', 'SCHID')

**First Join:  Directory + Characteristics**

In [19]:
# Inner join of the characteristics and directory files on their shared combokey index.
# NOTE: lsuffix is applied to the overlapping columns of the LEFT frame (characteristics),
# so the columns ending in 'dir_' come from the characteristics file and the unsuffixed
# LEAID / SCHID / SCH_NAME columns come from the directory file.
nces_1516 = nces_1516_characteristics.set_index('combokey').join(nces_1516_directory.set_index('combokey'), how = 'inner', lsuffix = 'dir_')

In [20]:
len(nces_1516.index)

100232

In [21]:
len(nces_1516[nces_1516.SCH_NAME == nces_1516.SCH_NAMEdir_].index)

100232

In [22]:
nces_1516 = nces_1516.drop(['LEAIDdir_', 'SCHIDdir_', 'SCH_NAMEdir_'], axis = 1)

**Second Join: combined + geo**

In [23]:
# Load the NCES geographic file. LOCALE15 is read with dtype=object so pandas
# keeps the locale codes as-is instead of inferring a numeric dtype.
# The builtin `object` replaces the `np.object` alias, which NumPy deprecated
# and later removed (it was only ever an alias for `object`).
nces_1516_geo = pd.read_csv('../filtered_data/01_nces_1516_initial_geographic.csv', dtype={'LOCALE15': object})

In [24]:
nces_1516_geo['combokey'] = combokey_converter.convert(nces_1516_geo, 'LEAID', 'SCHID')

In [25]:
nces_1516_test = nces_1516.join(nces_1516_geo.set_index('combokey'), how = 'inner', rsuffix = 'dir_')

In [26]:
len(nces_1516_test.index)

100096

In [27]:
"""How many schools have matching School Names between CRDC and NCES?"""
len(nces_1516_test[nces_1516_test.SCH_NAME == nces_1516_test.NAME].index)

91091

In [28]:
def name_checker(sch1, sch2):
    """Flag whether two school names start with different characters.

    The comparison is case-insensitive and looks only at the first
    character of each name (not at the first word).

    Parameters
    ----------
    sch1, sch2 : str
        The two school names to compare.

    Returns
    -------
    int
        0 if both names begin with the same character, 1 otherwise.
    """
    # Slicing with [:1] rather than indexing with [0] keeps an empty name from
    # raising IndexError; an empty name only "matches" another empty name.
    if sch1.lower()[:1] == sch2.lower()[:1]:
        return 0
    return 1

nces_1516_test['no_match_name'] = nces_1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['NAME']), axis = 1)
nces_1516_test[nces_1516_test.no_match_name == 1][['NAME', 'SCH_NAME']]

Unnamed: 0_level_0,NAME,SCH_NAME
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1
='051266001562',HYLTON JUNIOR HIGH SCHOOL,LAKESIDE JUNIOR HIGH SCHOOL
='090147001810',Stowe - Early Learning Center (S,EPS PK STEAM Academy
='090171001700',Alternative High School Programs,Greenwich Alternative High School
='090192001616',STEM Magnet School at Dwight,Betances STEM Magnet School
='090279000148',Hyde School of Health Science an,Cortlandt V.R. Creed Health and Sport Sciences...
='090279001543',Helene Grant Headstart,Dr. Mayo Early Childhood School
='090279001585',Katherine Brennan/Clarence Roger,Brennan Rogers School
='090351201476',Education Connection Special Edu,GFLC/ACCESS School
='090423001808',Hatton Preschool Program,Southington Public Schools Preschool Program a...


In [29]:
nces_1516_full = nces_1516_test[nces_1516_test.no_match_name == 0].drop(['LEAIDdir_', 'SCHIDdir_', 'no_match_name', 'NAME'], axis = 1)

In [30]:
nces_1516_full.head()

Unnamed: 0_level_0,TITLEI,LEAID,LEA_NAME,STABR,SCHID,SCH_NAME,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
='010000200277',-9,100002,Alabama Youth Services,AL,277,Sequoyah Sch - Chalkville Campus,Alternative Education School,4,3,No,7,12,Jefferson County,21,33.673661,-86.628755
='010000201667',-9,100002,Alabama Youth Services,AL,1667,Camps,Alternative Education School,4,3,No,7,12,Autauga County,41,32.521681,-86.530132
='010000201670',-9,100002,Alabama Youth Services,AL,1670,Det Ctr,Alternative Education School,4,3,No,7,12,Clarke County,41,31.938444,-87.750529
='010000201705',-9,100002,Alabama Youth Services,AL,1705,Wallace Sch - Mt Meigs Campus,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.374812,-86.08236
='010000201706',-9,100002,Alabama Youth Services,AL,1706,McNeel Sch - Vacca Campus,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.583385,-86.710058


In [31]:
len(nces_1516_full.index)

100087

** Third Join - Enrollments **

In [32]:
nces_enrollment = pd.read_csv('../filtered_data/01_nces_1516_initial_membership.csv')

In [33]:
nces_enrollment['COMBOKEY'] = combokey_converter.convert(nces_enrollment, 'LEAID', 'SCHID')

In [34]:
def missing_value_mapper(value):
    """Replace a negative number with 0; return every other value unchanged.

    Negative numbers in the enrollment file are codes for missing/null values.

    Parameters
    ----------
    value : any
        A single cell value (the function is applied element-wise).

    Returns
    -------
    0 if `value` is a negative int/float (Python or NumPy scalar), otherwise
    `value` itself. Strings, NaN and non-negative numbers pass through.
    """
    # Accept floats and NumPy scalars as well as plain ints: a column that
    # contains NaN is stored as float, so its missing-value codes arrive as
    # e.g. -9.0 and would otherwise slip through. NaN < 0 is False, so NaN
    # is returned unchanged.
    if isinstance(value, (int, float, np.integer, np.floating)) and value < 0:
        return 0
    return value

In [35]:
nces_enrollment_cleaned = nces_enrollment.applymap(missing_value_mapper)

In [36]:
nces_1516_full = nces_1516_full.join(nces_enrollment_cleaned.set_index('COMBOKEY'), how='left', rsuffix=('_drop'))

In [37]:
"""Drop the schools that did not have matching COMBOKEYS (They were all seemed fairly odd)"""
nces_1516_full = nces_1516_full[nces_1516_full.SCH_NAME_drop.notnull()]

In [38]:
keep_columns = [c for c in nces_1516_full.columns if '_drop' not in c]
nces_1516_full = nces_1516_full[keep_columns]

In [39]:
# nces_1516_full.to_csv('../filtered_data/01_nces_1516_initial_combined_ccd.csv')

# NCES (combined) and CRDC join
<div class="alert alert-block alert-warning">Out of the 96360 schools in the crdc1516 dataset, <b>4244</b> schools did not have a matching Combokey. These non-matching schools were kept in the dataset.<br><br>

Using the name checker function from above, another <b>178</b> schools were found to have School Names whose first characters did not match between the NCES and CRDC sets.  Erring on the side of caution, these schools were indiscriminately culled.<br><br>

**Final school count in the combined dataset:  96182**</div>
<div class = 'alert alert-block alert-info'>Dataset saved to '03_crdc_nces_1516_raw_combined.csv'</div>

In [40]:
crdc_nces1516_test = crdc_1516.set_index('COMBOKEY').join(nces_1516_full, how = 'left', rsuffix=('_'))

In [41]:
crdc_nces1516_test[crdc_nces1516_test.SCH_NAME_.isnull()].LEAID.count()

4244

In [42]:
def name_checker(sch1, sch2):
    """Compare the first character of two school names, tolerating a missing second name.

    This redefinition shadows the two-string `name_checker` defined earlier
    in the notebook and adds a third return code for unmatched rows.

    Parameters
    ----------
    sch1 : str
        School name from the left (CRDC) side of the join.
    sch2 : str or missing
        School name from the right (NCES) side; NaN/None when the left join
        found no matching row.

    Returns
    -------
    int
        2 if `sch2` is missing (not a string),
        0 if both names start with the same character (case-insensitive),
        1 otherwise.
    """
    # Treat anything that is not a string (NaN, None, ...) as "no NCES match"
    # instead of relying on an exact `type(...) == float` comparison.
    if not isinstance(sch2, str):
        return 2

    # [:1] instead of [0] so that an empty name cannot raise IndexError.
    if sch1.lower()[:1] == sch2.lower()[:1]:
        return 0
    return 1

crdc_nces1516_test['no_match_name'] = crdc_nces1516_test.apply(lambda row: name_checker(row['SCH_NAME'], row['SCH_NAME_']), axis = 1)

In [43]:
"""How many schools don't have matching Schools Names (from the CRDC and NCES datasets, respectively)"""
len(crdc_nces1516_test[crdc_nces1516_test.no_match_name == 1][['SCH_NAME', 'SCH_NAME_']].index)

178

In [44]:
crdc_nces_1516 = crdc_nces1516_test[crdc_nces1516_test.no_match_name != 1].drop(['LEA_NAME_', 'LEAID_', 'SCHID_', 'SCH_NAME_', 'no_match_name'], axis = 1)

In [45]:
len(crdc_nces_1516.index)

96182

In [46]:
crdc_nces_1516 = crdc_nces_1516.fillna('Missing')

In [47]:
# crdc_nces_1516.to_csv('../filtered_data/03_crdc_nces_1516_raw_combined.csv')

**Check to see how close the Enrollment Numbers between NCES and CRDC are**

In [48]:
crdc_total_enrollments = crdc_nces_1516.TOT_ENR_M + crdc_nces_1516.TOT_ENR_F

In [49]:
nces_total_enrollments = crdc_nces_1516[['MEMBER', 'SCH_NAME', 'LEA_STATE']]

In [50]:
# Put the CRDC totals next to the NCES columns. The CRDC totals are an unnamed
# Series, so after the concat their column is labelled 0 (renamed to 'crdc' below).
enrollment_compare = pd.concat([crdc_total_enrollments, nces_total_enrollments], axis = 1)
enrollment_compare = enrollment_compare.rename({0:'crdc', 'MEMBER':'nces'}, axis = 1)
# Drop schools whose NCES membership was filled with the 'Missing' placeholder.
enrollment_compare = enrollment_compare[enrollment_compare.nces != 'Missing']
# diff > 0 means CRDC reports more students than NCES; diff < 0 means fewer.
enrollment_compare['diff'] = enrollment_compare['crdc'] - enrollment_compare['nces']
# Absolute difference expressed as a percentage of the CRDC total.
# NOTE(review): a CRDC total of 0 puts 0 in the denominator -- confirm how such rows should be handled.
enrollment_compare['pct_diff'] = abs((enrollment_compare['diff'] / enrollment_compare['crdc'])) * 100

In [51]:
enrollment_compare.sort_values('diff').head()

Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='482730003072',2051,4377,LEWISVILLE H S,TX,-2326,113.408
='200336002040',346,2623,Andover eCademy,KS,-2277,658.092
='060001412375',24,1228,Encore Jr./Sr. High Sch for the Perf and Visua...,CA,-1204,5016.67
='180567001031',1270,2400,Lawrence North High School,IN,-1130,88.9764
='180567001029',1298,2388,Lawrence Central High School,IN,-1090,83.9753


In [52]:
enrollment_compare.sort_values('diff', ascending=False).head()

Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='250402002028',3805,667,James M. Quinn School,MA,3138,82.4704
='069107810186',3271,643,Los Angeles County Special Education,CA,2628,80.3424
='293168002216',2551,0,SOUTH CENTRAL CAREER CTR.,MO,2551,100.0
='180567000725',2210,0,McKenzie Career Center,IN,2210,100.0
='401956000932',2468,498,MCLOUD HS,OK,1970,79.8217


In [53]:
enrollment_compare.sort_values('pct_diff', ascending=False)

Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='181305000807',2,492,Western Wayne Elementary School,IN,-490,24500
='064074010681',2,414,MIT Academy,CA,-412,20600
='063213013060',2,239,Connect Community Charter,CA,-237,11850
='360100506110',4,447,EQUALITY CHARTER SCHOOL,NY,-443,11075
='470014802267',2,223,New Consortium of Law and Business,TN,-221,11050
='341599000709',4,441,GOVERNOR CHARLES C STRATTON,NJ,-437,10925
='040636000556',2,164,Picacho School,AZ,-162,8100
='360405005411',7,567,DANA L LYON MIDDLE SCHOOL,NY,-560,8000
='530267002818',2,153,Special Services,WA,-151,7550
='050228000007',4,218,ALPENA HIGH SCHOOL,AR,-214,5350


In [54]:
print(format(int(enrollment_compare.nces.sum()),',d'))
print(format(enrollment_compare.crdc.sum(), ',d'))

48,540,110
48,935,652


# <font color = green> IV. Filtration </font>

# Select Non-[Juvenile Justice, Special Education, and Alternative Schools]
<div class = 'alert alert-block alert-info'>Schools that answered 'No' to each of those three questions on the CRDC Survey.<br><br> 
I also used a keyword filter to remove any remaining "Juvenile Justice"-esque Institutions.</div>
<div class = 'alert alert-block alert-warning'>**90449** Schools Remain</div>

In [55]:
filter1_crdc_nces_1516 = crdc_nces_1516[(crdc_nces_1516.JJ == 'No') & (crdc_nces_1516.SCH_STATUS_ALT == 'No') & (crdc_nces_1516.SCH_STATUS_SPED == 'No')]

In [56]:
def jj_keyword_remove(name):
    """Boolean keep-mask helper for the juvenile-justice keyword filter.

    Returns False when the stripped, lower-cased `name` contains any of the
    keywords below (the row should be dropped) and True otherwise (keep it).
    """
    normalized = name.strip().lower()
    return not any(kw in normalized for kw in ('behavioral', 'juvenile', 'correction'))

filter1_crdc_nces_1516 = filter1_crdc_nces_1516[filter1_crdc_nces_1516.SCH_NAME.apply(lambda x: jj_keyword_remove(x))]
filter1_crdc_nces_1516 = filter1_crdc_nces_1516[filter1_crdc_nces_1516.LEA_NAME.apply(lambda x: jj_keyword_remove(x))]

In [57]:
len(filter1_crdc_nces_1516.index)

90449

# Select Schools with Lowest Grade (9-12) or Highest Grade (12) or Ungraded HS-students
<div class = 'alert alert-block alert-info'>I made formulas that take in the data from CRDC (because there are no null values for the Grade Enrollment Flags), and determine: (1) if a school's lowest grade offered is 9 or above, (2) if it contains ungraded HS-aged students, and (3) if a school's highest grade offered is 12th.<br><br>While NCES CCD does have columns for lowest and highest grades, there were many null values, so the filtration may have been too intense.</div>
<div class = 'alert alert-block alert-warning'>**21606** Schools Remain</div>

In [58]:
"""How many missing values in the grade columns?"""
np.sum(filter1_crdc_nces_1516.SCH_GRADE_G01.isnull())

0

In [59]:
def lowest_grade_9orAbove(g9, g10, g11, g12, ug, ug_hs):
    """Label a school from its CRDC high-school grade-enrollment flags.

    Every argument is a CRDC 'Yes'/'No' flag: g9..g12 for grades 9-12,
    `ug` for ungraded enrollment and `ug_hs` for the ungraded
    high-school-age detail flag.

    Returns
    -------
    str
        'UG-Yes' or 'UG-No' when `ug` is 'Yes' (chosen by `ug_hs`); this is
        decided before any graded flag is looked at.
        Otherwise 'Yes' if at least one of grades 9-12 is flagged, else 'No'.

    NOTE(review): flags for grades below 9 are not passed in, so 'Yes' means
    "offers some grade in 9-12" rather than strictly "lowest grade is 9 or
    above" -- confirm this is the intended meaning.
    """
    if ug == 'Yes':
        return 'UG-Yes' if ug_hs == 'Yes' else 'UG-No'

    offers_hs_grade = any(flag == 'Yes' for flag in (g9, g10, g11, g12))
    return 'Yes' if offers_hs_grade else 'No'

In [60]:
def highest_grade_12(g12):
    """Report whether a school enrolls 12th graders.

    Takes the single CRDC grade-12 enrollment flag and returns 'Yes' when it
    equals 'Yes', and 'No' for any other value.
    """
    return 'Yes' if g12 == 'Yes' else 'No'

In [61]:
filter2_crdc_nces_1516 = filter1_crdc_nces_1516.copy()
filter2_crdc_nces_1516['Low_Grade_Above9'] = filter2_crdc_nces_1516.apply(lambda row: lowest_grade_9orAbove(row['SCH_GRADE_G09'], row['SCH_GRADE_G10'], row['SCH_GRADE_G11'], row['SCH_GRADE_G12'], row['SCH_GRADE_UG'], row['SCH_UGDETAIL_HS']), axis = 1)

In [62]:
filter2_crdc_nces_1516['High_Grade_12'] = filter2_crdc_nces_1516.apply(lambda row: highest_grade_12(row['SCH_GRADE_G12']), axis = 1)

In [63]:
"""Breakdown of Schools with a Lowest Grade Above 9 (or with Ungrade HS-aged Students)"""
filter2_crdc_nces_1516.Low_Grade_Above9.value_counts()

No        66887
Yes       19771
UG-No      2294
UG-Yes     1497
Name: Low_Grade_Above9, dtype: int64

In [64]:
"""Breakdown of Schools with a Highest Grade of 12"""
filter2_crdc_nces_1516.High_Grade_12.value_counts()

No     70095
Yes    20354
Name: High_Grade_12, dtype: int64

In [65]:
filter2_crdc_nces_1516 = filter2_crdc_nces_1516[(filter2_crdc_nces_1516.Low_Grade_Above9 == 'Yes') | (filter2_crdc_nces_1516.Low_Grade_Above9 == 'UG-Yes') | (filter2_crdc_nces_1516.High_Grade_12 == 'Yes')]

In [66]:
len(filter2_crdc_nces_1516.index)

21606

# Remove Virtual Schools
<div class = 'alert alert-block alert-info'>
1. Remove any Schools that reported 'Yes' to the Virtual Schools Question<br>
2. Remove Schools that have certain keyword that likely indicate an online school
</div>
<div class = 'alert alert-block alert-warning'>**21209** Schools Remain</div>

In [67]:
filter2_crdc_nces_1516.VIRTUAL.value_counts()

No         17338
Missing     3937
Yes          331
Name: VIRTUAL, dtype: int64

In [68]:
filter3_crdc_nces_1516 = filter2_crdc_nces_1516[filter2_crdc_nces_1516.VIRTUAL != 'Yes']

In [69]:
len(filter3_crdc_nces_1516.index)

21275

In [70]:
def any_missed_virtuals(name):
    """Boolean keep-mask helper for the virtual-school keyword filter.

    Returns False when the stripped, lower-cased `name` contains a keyword
    suggesting an online school (the row should be dropped), True otherwise.
    """
    lowered = name.strip().lower()
    hits = [kw for kw in ('virtual', 'cyber', 'electronic', 'internet', 'online', 'distance')
            if kw in lowered]
    return len(hits) == 0

filter3_crdc_nces_1516 = filter3_crdc_nces_1516[filter3_crdc_nces_1516.SCH_NAME.apply(lambda x: any_missed_virtuals(x))]

In [71]:
len(filter3_crdc_nces_1516.index)

21209

# Remove schools reported as elementary, middle, or "Other"
<div class = 'alert alert-block alert-info'>Even with the Lowest/Highest Grade filter, I wanted to ensure that no non-typical high schools (as reported by the NCES's LEVEL Field) are retained.  The Other category is perhaps the most important to cull here, as many of the very, very large charter-type schools are listed in this category.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**17612** Schools Remain</div>

In [72]:
filter4_crdc_nces_1516 = filter3_crdc_nces_1516.copy()

In [73]:
filter4_crdc_nces_1516.LEVEL.value_counts()

3          16348
4           3214
Missing     1264
2            307
1             76
Name: LEVEL, dtype: int64

In [74]:
filter4_crdc_nces_1516 = filter4_crdc_nces_1516[(filter4_crdc_nces_1516.LEVEL == 'Missing') | (filter4_crdc_nces_1516.LEVEL == '3')]

In [75]:
len(filter4_crdc_nces_1516.index)

17612

# Select Schools reported as Regular
<div class = 'alert alert-block alert-info'>Removed Schools with a SCH_TYPE that was not 1 (Regular).  Culls additional "Special Education", "Vocational", and "Alternative/Other" schools.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**16521** Schools Remain</div>

In [76]:
filter5_crdc_nces_1516 = filter4_crdc_nces_1516.copy()

In [77]:
filter5_crdc_nces_1516.SCH_TYPE.value_counts()

1.0        15257
Missing     1264
4.0          744
3.0          332
2.0           15
Name: SCH_TYPE, dtype: int64

In [78]:
filter5_crdc_nces_1516 = filter5_crdc_nces_1516[(filter5_crdc_nces_1516.SCH_TYPE == 'Missing') | (filter5_crdc_nces_1516.SCH_TYPE == 1)]

In [79]:
len(filter5_crdc_nces_1516.index)

16521

# <font color = green> V. Dealing with Missing Values </font>
<div class = 'alert alert-cell alert-info'> With nearly 1200 schools missing NCES data, including schools from prominent districts like "NEW YORK CITY PUBLIC SCHOOLS" and "Green Dot Public Schools," it is important to try to recover as much of these schools as possible.
<br><br>
The problem that I found was that the CRDC lumped a number of school districts together; therefore, the combokeys of schools in these districts do not match those of the NCES.
</div>

<div class = 'alert alert-cell alert-info'>
**I tried a number of methods to try to properly join these missing schools:**<br>
- Using only the school name:  This had difficulties because there are many schools that share the same name, so when a join is implemented, these schools are given all of the values of the other schools (i.e. it creates a lot of duplicate values).
- Using the NCES data from 2013:  This was also problematic, as most of the same schools that were missing in this dataset were also constrained to the same problem in the 2013-2014 dataset.<br>
- Using the District and the name together:  This also suffered from the fact that the CRDC data combines some school districts; therefore, the names of the districts still did not match up.<br>
- **Finally, I used a combination of the name of the school and the state:  There were only a handful of duplicate name/state combinations among the schools with missing values.**<br><br>
</div>

<div class = 'alert alert-cell alert-warning'>
**821 (out of 1194)** Missing Schools were recovered using this method </div>

<div class = 'alert alert-cell alert-info'>
Next, I recovered the remaining schools in the 'New York City Public Schools District', because it was clear that they were simply missing due to a LEA reporting error in the CRDC data.  This process was two-parted:<br>
- First, because it seemed as though most of these remaining New York schools had the incorrect LEAID, I used the school id and state abbreviation to create a unique identifier.<br>
- Second, I used the NCES database to manually search for the remaining schools and correct their combokeys.
</div>

<div class = 'alert alert-cell alert-warning'>
**36** More High Schools Recovered  </div>

<div class = 'alert alert-cell alert-info'>
I performed the same (nces-provided field)-filtration steps on the recovered data.  Then, I hand-removed duplicate values by checking the original filtered data for matching records. </div>

<div class = 'alert alert-cell alert-warning'>
**468** Recovered High Schools Total  </div>

In [80]:
"""Which districts had the most missing schools?"""
with pd.option_context('display.max_rows', 1200):
    display(filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False))

LEA_NAME
NEW YORK CITY PUBLIC SCHOOLS                                                               615
Green Dot Public Schools                                                                    11
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES                                     10
NORMAN                                                                                       9
Peters Township SD                                                                           6
Dept. of Svs. for Children Youth & Their Families                                            5
Ombudsman Educational Services Ltd. a subsidiary of Educ 2                                   4
Boston                                                                                       4
LINCOLN PUBLIC SCHOOLS                                                                       4
BOYS TOWN INTERIM PRG SCHS                                                                   4
WINDSOR SCHOOL DISTRICT                  

In [81]:
filter5_missing_leas = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing'].groupby('LEA_NAME')['LEAID'].count().sort_values(ascending = False)

In [82]:
# filter5_missing_leas.to_csv('../filtered_data/04_inital_filter_missing_LEAs.csv')

In [83]:
"""How many missing schools?"""
filter5_missing_schools = filter5_crdc_nces_1516[filter5_crdc_nces_1516.LEVEL == 'Missing']
len(filter5_missing_schools.index)

1264

In [84]:
# filter5_missing_schools.to_csv('../filtered_data/04_intital_filter_missing_schools.csv')

** Manipulate missing schools and original nces data --> join **

In [85]:
filter5_schname_state = filter5_missing_schools.copy()

In [86]:
filter5_schname_state = filter5_schname_state.reset_index()

In [87]:
filter5_schname_state['SCH_NAME'] = filter5_schname_state['SCH_NAME'].apply(lambda x: x.lower())
filter5_schname_state['SCH_NAME_ST_NUM'] = filter5_schname_state.SCH_NAME + filter5_schname_state.LEA_STATE

In [88]:
"""How many duplicate schools in the filter5 dataset?"""
filter5_schname_state.groupby('SCH_NAME_ST_NUM')['SCH_NAME_ST_NUM'].count().sort_values(ascending = False).head(10)

SCH_NAME_ST_NUM
performance learning centerGA           2
harlem village academies highNY         2
community collaborative charterCA       2
ferris school for boysDE                1
fiorello h laguardia high schoolNY      1
flint river programGA                   1
flushing high schoolNY                  1
flushing international high schoolNY    1
food and finance high schoolNY          1
fordham high school for the artsNY      1
Name: SCH_NAME_ST_NUM, dtype: int64

In [89]:
filter5_schname_state[filter5_schname_state.SCH_NAME_ST_NUM == 'performance learning centerGA']

Unnamed: 0,COMBOKEY,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,G08,G09,G10,G11,G12,UG,MEMBER,Low_Grade_Above9,High_Grade_12,SCH_NAME_ST_NUM
346,='130129003727',GA,GEORGIA,1301290,Cobb County,3727,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes,performance learning centerGA
358,='130270003728',GA,GEORGIA,1302700,Harris County,3728,performance learning center,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes,performance learning centerGA


In [90]:
nces_1516_schname_state = nces_1516_full.copy()

In [91]:
nces_1516_schname_state = nces_1516_schname_state.reset_index()

In [92]:
nces_1516_schname_state['SCH_NAME'] = nces_1516_schname_state['SCH_NAME'].apply(lambda x: x.lower())
nces_1516_schname_state['SCH_NAME_ST_NUM'] = nces_1516_schname_state.SCH_NAME + nces_1516_schname_state.STABR

In [93]:
"""Join the NCES and filter5 datasets on the SCH_NAME_ST_NUM column"""
schname_combined = filter5_schname_state.set_index('SCH_NAME_ST_NUM').join(nces_1516_schname_state.set_index('SCH_NAME_ST_NUM'), how = 'left', rsuffix = '_')

In [94]:
"""How many schools have duplicated values?"""
schname_combined.SCH_NAME_.value_counts().sort_values(ascending = False).head(10)

tarrant co j j a e p                                 6
community collaborative charter                      4
hart el                                              2
performance learning center                          2
beacon high school                                   2
university high                                      2
accelerated achievement academy                      2
american sign language & english secondary school    1
in-tech academy (ms/hs 368)                          1
manhattan academy for arts and languages             1
Name: SCH_NAME_, dtype: int64

In [95]:
"""How may more schools were matched?"""
len(schname_combined[schname_combined.SCH_NAME_.notnull()].index)

819

In [96]:
"""How mnay schools still did not have a match?"""
len(schname_combined[schname_combined.SCH_NAME_.isnull()].index)

456

## Recover the NY Schools

In [114]:
schname_combined_missing = schname_combined.copy()
schname_combined_missing = schname_combined_missing[schname_combined_missing.SCH_NAME_.isnull()]

schname_combined_missing_ny = schname_combined_missing.copy()
schname_combined_missing_ny = schname_combined_missing_ny[schname_combined_missing_ny['LEA_NAME'] == 'NEW YORK CITY PUBLIC SCHOOLS']

In [115]:
print(len(schname_combined_missing_ny.index))
print(schname_combined_missing_ny.SCHID.nunique())

36
36


In [116]:
schname_combined_missing_ny = schname_combined_missing_ny.drop(['TITLEI_', 'STABR_', 'SCH_TYPE_TEXT_', 'SCH_TYPE_',
                                                                'LEVEL_', 'VIRTUAL_', 'GSLO_', 'GSHI_', 
                                            'NMCNTY15_', 'LOCALE15_', 'LAT1516_', 'LON1516_', 'combokey',
                                            'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_','PK_', 'KG_', 'G01_', 'G02_', 'G03_', 'G04_', 'G05_', 'G06_',
       'G07_', 'G08_', 'G09_', 'G10_', 'G11_', 'G12_', 'UG_', 'MEMBER_'], axis = 1)

In [117]:
def schid_state_maker(schid, state):
    """Build a join key from a school id and a state abbreviation.

    The school id is converted to text and left-padded with zeros to at least
    five characters, then the state string is appended (e.g. 1348, 'NY' -> '01348NY').
    """
    return str(schid).zfill(5) + state

In [118]:
# Build the SCHID + state key for every unmatched NYC row (one key per row, in row order).
schname_combined_missing_ny['schid_state'] = [
    schid_state_maker(schid, state)
    for schid, state in zip(schname_combined_missing_ny['SCHID'],
                            schname_combined_missing_ny['LEA_STATE'])
]

In [119]:
# Work on a copy of the full NCES frame and give it the same SCHID + state key,
# using NCES's own state column (STABR).
nces_for_missing_ny = nces_1516_full.copy()
nces_for_missing_ny['schid_state'] = [
    schid_state_maker(schid, state)
    for schid, state in zip(nces_for_missing_ny['SCHID'], nces_for_missing_ny['STABR'])
]

In [120]:
# Index both frames by the SCHID + state key and left-join, so every unmatched NYC
# row is kept; NCES columns whose names collide get a trailing underscore.
crdc_ny_by_key = schname_combined_missing_ny.set_index('schid_state')
nces_by_key = nces_for_missing_ny.reset_index().set_index('schid_state')
missing_ny_joined = crdc_ny_by_key.join(nces_by_key, how='left', rsuffix='_')

In [121]:
""" Join the missing NY schools with NCES """
# Show CRDC vs. NCES names side by side for the rows that found an NCES record
# (a non-null LEVEL_ marks a successful join).
missing_ny_joined.loc[missing_ny_joined['LEVEL_'].notnull(), ['SCH_NAME', 'SCH_NAME_']]

Unnamed: 0_level_0,SCH_NAME,SCH_NAME_
schid_state,Unnamed: 1_level_1,Unnamed: 2_level_1
01348NY,is 187 the christa mcauliffe school,IS 187 CHRISTA MCAULIFFE SCHOOL (THE)
01409NY,"law, government and community service high school",LAW GOVERNMENT AND COMMUNITY SERVICE HIGH SCHOOL
02147NY,ps/ms 31 the william lloyd garrison,PS/MS 31 WILLIAM LLOYD GARRISON (THE)
02199NY,ps 45 horace e greene,PS/IS 45 HORACE E GREENE
02316NY,jhs 80 the mosholu parkway,JHS 80 MOSHOLU PARKWAY (THE)
02731NY,ps 225 the eileen e zaglin,PS 225 EILEEN E ZAGLIN (THE)
02829NY,ps 377 alejandina b de gautier,PS 377 ALEJANDRINA B DE GAUTIER
02961NY,"bronx school for law, government and justice",BRONX SCHOOL FOR LAW GOVERNMENT AND JUSTICE
03091NY,"high school of enterprise, business & technology",HIGH SCHOOL OF ENTERPRISE BUSINESS & TECHNOLOGY
04873NY,"new explorations into science,tech and math hi...",NEW EXPLORATIONS INTO SCIENCETECH AND MATH HIG...


In [122]:
""" Dealing with remaining missing NY Schools """
# Rows that still have no NCES data after the SCHID + state join.
still_unmatched = missing_ny_joined['LEVEL_'].isnull()
missing_ny_2 = missing_ny_joined.loc[still_unmatched].copy()

len(missing_ny_2)

10

In [125]:
# Clear out the suffixed columns (and 'combokey') left by the unsuccessful join
# before attempting one more join on a hand-entered key.
unmatched_join_columns = [
    'TITLEI_', 'STABR_', 'SCH_TYPE_TEXT_', 'SCH_TYPE_',
    'LEVEL_', 'VIRTUAL_', 'GSLO_', 'GSHI_',
    'NMCNTY15_', 'LOCALE15_', 'LAT1516_', 'LON1516_', 'combokey',
    'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_',
    'PK_', 'KG_', 'G01_', 'G02_', 'G03_', 'G04_', 'G05_', 'G06_',
    'G07_', 'G08_', 'G09_', 'G10_', 'G11_', 'G12_', 'UG_', 'MEMBER_',
]
missing_ny_2 = missing_ny_2.drop(columns=unmatched_join_columns)

In [130]:
# Hand-entered NCES combokeys for the NYC rows that the SCHID + state join could not
# match, keyed by the frame's 'schid_state' index label.
manual_combokeys = {
    "99780NY": "='360012306528'",
    "99796NY": "='360012306535'",
    "99775NY": "='360012006484'",
    "99776NY": "='360010106508'",
    "99805NY": "='360008306490'",
    "99874NY": "='360007706372'",
    "99933NY": "='360008106380'",
    "99968NY": "='360007606296'",
    "99992NY": "='360009706274'",
    "99995NY": "='360009506273'",
}

# Fail loudly on a label that is not in the frame: the previous per-label `.at[...]`
# assignments would have silently appended a new, otherwise-empty row instead.
unknown_labels = sorted(set(manual_combokeys) - set(missing_ny_2.index))
if unknown_labels:
    raise KeyError("schid_state labels not found in missing_ny_2: {}".format(unknown_labels))

# Map each index label to its combokey; rows without a manual entry stay NaN.
# This replaces the `np.object` alias (removed in NumPy 1.24) and the placeholder
# Series whose integer index never lined up with the string index anyway.
missing_ny_2['actual_combokey'] = (
    missing_ny_2.index.to_series().map(manual_combokeys).astype(object)
)

In [131]:
""" Join again on the NCES """
# Re-index the remaining rows by the hand-entered combokey and left-join against the
# full NCES frame on its index; colliding NCES column names get a trailing underscore.
missing_ny_2_by_combokey = missing_ny_2.set_index('actual_combokey')
missing_ny_2_joined = missing_ny_2_by_combokey.join(nces_1516_full, how='left', rsuffix='_')

In [132]:
"""How many matched?"""
# A non-null LEVEL_ means the row picked up NCES data in the join.
len(missing_ny_2_joined.loc[missing_ny_2_joined['LEVEL_'].notnull()])

10

## Combine recovered schools and perform filters

**Concatenate the two recovered sets of missing NY schools**

In [174]:
# NYC rows recovered by the first (SCHID + state) join: those with NCES data present.
has_nces_level = missing_ny_joined['LEVEL_'].notnull()
missing_ny_joined_matching = missing_ny_joined.loc[has_nces_level]

In [175]:
# Stack both sets of recovered NYC rows. DataFrame.append was removed in pandas 2.0,
# so pd.concat is used; sort=True keeps the alphabetical column order that the
# original append call produced when the two frames' columns were not aligned.
all_missing_ny_recovered = pd.concat(
    [missing_ny_2_joined, missing_ny_joined_matching],
    sort=True,
)

**Join the original recovered schools (using schname_st identifier) with the recovered NY schools**

In [176]:
# Replace every null with the "Missing" sentinel; fillna returns a new frame, so
# schname_combined itself is left untouched.
recovered_schools = schname_combined.fillna("Missing")

In [177]:
# Keep only rows that actually found an NCES name in the name-based join.
found_nces_name = recovered_schools['SCH_NAME_'].ne("Missing")
recovered_schools = recovered_schools.loc[found_nces_name]

In [178]:
# Combine the name-matched schools with the recovered NYC schools. DataFrame.append
# was removed in pandas 2.0, so pd.concat is used; sort=True keeps the alphabetical
# column order that the original append call produced for non-aligned columns.
recovered_schools_all = pd.concat(
    [recovered_schools, all_missing_ny_recovered],
    sort=True,
)

**Reformat the Columns** -- The recovered schools dataset's columns must match the original filtered dataset's columns (required for concatenating the two sets properly).

In [182]:
"""Drop original nces columns (the ones with missing values)"""    
# Drop the original (unsuffixed) NCES columns, which hold the missing values, together
# with 'combokey' and the suffixed identifier columns that duplicate the CRDC ones.
stale_columns = [
    'TITLEI', 'STABR', 'SCH_TYPE_TEXT', 'SCH_TYPE', 'LEVEL', 'VIRTUAL', 'GSLO', 'GSHI',
    'NMCNTY15', 'LOCALE15', 'LAT1516', 'LON1516', 'combokey',
    'LEAID_', 'LEA_NAME_', 'SCH_NAME_', 'SCHID_', 'PK', 'KG', 'G01',
    'G02', 'G03', 'G04', 'G05', 'G06', 'G07', 'G08', 'G09', 'G10',
    'G11', 'G12', 'UG', 'MEMBER',
]
recovered_schools_all = recovered_schools_all.drop(columns=stale_columns)

# Rename the remaining suffixed columns so they take the place of the columns dropped
# above (necessary for a proper concatenation later). Only the single trailing "_"
# added by the join suffix is removed; str.strip('_') would also remove leading
# underscores and any additional trailing ones.
recovered_schools_all = recovered_schools_all.rename(
    columns=lambda name: name[:-1] if name.endswith('_') else name
)
recovered_schools_all = recovered_schools_all.set_index('COMBOKEY')
# %store recovered_schools_all

In [183]:
"""Do the columns between the original filtered set and recovered missing values set match"""
# Print both column counts, then verify that the column *names* agree as well:
# equal counts alone would not reveal a renamed or misspelled column, which would
# surface later as extra, mostly-empty columns after concatenation.
recovered_column_set = set(recovered_schools_all.columns)
filtered_column_set = set(filter5_crdc_nces_1516.columns)

print(len(recovered_schools_all.columns.values))
print(len(filter5_crdc_nces_1516.columns.values))

assert recovered_column_set == filtered_column_set, (
    "column names differ: {}".format(
        sorted(map(str, recovered_column_set ^ filtered_column_set))
    )
)

140
140


In [184]:
""" How many schools recovered? """
# Number of rows in the combined recovered set.
recovered_schools_all.shape[0]

855

**Non-Virtual Schools**

In [185]:
# Start the filter chain from an independent copy so recovered_schools_all stays intact.
recovered_schools_filter1 = recovered_schools_all.copy(deep=True)

In [186]:
# Keep every row whose VIRTUAL flag is anything other than 'Yes'.
not_virtual = recovered_schools_filter1['VIRTUAL'].ne('Yes')
recovered_schools_filter1 = recovered_schools_filter1.loc[not_virtual]

In [187]:
"""How many schools remain?"""
# Rows left after removing virtual schools.
recovered_schools_filter1.shape[0]

842

**NCES-Reported High Schools**

In [189]:
# Independent copy for the next filter stage.
recovered_schools_filter2 = recovered_schools_filter1.copy(deep=True)

In [190]:
# Keep rows whose LEVEL equals the string '3' (note: compared as text, not as a number).
is_level_3 = recovered_schools_filter2['LEVEL'].eq('3')
recovered_schools_filter2 = recovered_schools_filter2.loc[is_level_3]

In [191]:
"""How many schools remain?"""
# Rows left after the LEVEL filter.
recovered_schools_filter2.shape[0]

508

**NCES-Reported Regular**

In [192]:
# Independent copy for the next filter stage.
recovered_schools_filter3 = recovered_schools_filter2.copy(deep=True)

In [193]:
# Keep rows whose SCH_TYPE equals the integer 1.
is_type_1 = recovered_schools_filter3['SCH_TYPE'].eq(1)
recovered_schools_filter3 = recovered_schools_filter3.loc[is_type_1]

In [194]:
"""How many schools remain?"""
# Rows left after the SCH_TYPE filter.
recovered_schools_filter3.shape[0]

472

**Clean Duplicate Values**

In [195]:
# Rows per school name, most frequent first; anything above 1 is a duplicate to inspect.
rows_per_name = recovered_schools_filter3.groupby('SCH_NAME')['SCH_NAME'].count()
rows_per_name.sort_values(ascending=False).head(4)

SCH_NAME
beacon high school                          2
university high                             2
performance learning center                 2
world academy for total community health    1
Name: SCH_NAME, dtype: int64

In [196]:
"""Dealing with Beacon"""
# Select the recovered rows whose name starts with 'beacon', then show them with up
# to 100 columns visible.
beacon_recovered_rows = recovered_schools_filter3.loc[
    recovered_schools_filter3['SCH_NAME'].str.startswith('beacon')
]
with pd.option_context('display.max_columns', 100):
    display(beacon_recovered_rows)

Unnamed: 0_level_0,G01,G02,G03,G04,G05,G06,G07,G08,G09,G10,G11,G12,GSHI,GSLO,High_Grade_12,JJ,KG,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,Low_Grade_Above9,MEMBER,NMCNTY15,PK,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,UG,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='362058000592',0,0,0,0,0,0,0,0,328,321,291,366,12,9,Yes,No,0,40.7612,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,11,-73.9952,UG-Yes,1307,New York County,0,592,0,0,17,5,11,5,32,14,0,0,2,2,Yes,0,0,0,2,65,32,-9,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,Yes,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,beacon high school,No,No,No,No,1,Regular School,Yes,NY,No,125,58,-9,-9,839,466,-9,-9,1,No
='362058000592',0,0,0,0,0,0,0,0,233,208,198,243,12,9,Yes,No,0,41.5145,3620580,NEW YORK CITY PUBLIC SCHOOLS,NY,NEW YORK,3,21,-73.9635,UG-Yes,893,Dutchess County,0,592,0,0,17,5,11,5,32,14,0,0,2,2,Yes,0,0,0,2,65,32,-9,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,Yes,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,beacon high school,No,No,No,No,1,Regular School,Yes,NY,No,125,58,-9,-9,839,466,-9,-9,11,No


In [197]:
# Look up the 'BEACON…' schools in the original filtered set for comparison.
beacon_in_filter5 = filter5_crdc_nces_1516['SCH_NAME'].str.startswith('BEACON')
filter5_crdc_nces_1516.loc[beacon_in_filter5]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,G07,G08,G09,G10,G11,G12,UG,MEMBER,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='360414000177',NY,NEW YORK,3604140,BEACON CITY SCHOOL DISTRICT,177,BEACON HIGH SCHOOL,No,No,No,No,...,0,0,233,208,198,243,11,893,UG-Yes,Yes
='362058000592',NY,NEW YORK,3620580,NEW YORK CITY PUBLIC SCHOOLS,592,BEACON HIGH SCHOOL,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,UG-Yes,Yes
='440000800297',RI,RHODE ISLAND,4400008,Beacon Charter School,297,BEACON Charter School,No,No,No,No,...,0,0,57,60,57,58,0,232,Yes,Yes


In [198]:
"""Beacon High School in Dutchess County is already in the filter5 dataset -- Remove"""
# Exclude exactly the row that is both named 'beacon high school' and located in
# Dutchess County; every other row is kept.
is_dutchess_beacon = (
    (recovered_schools_filter3['SCH_NAME'] == 'beacon high school')
    & (recovered_schools_filter3['NMCNTY15'] == 'Dutchess County')
)
recovered_schools_filter4 = recovered_schools_filter3.loc[~is_dutchess_beacon].copy()

In [199]:
"""Dealing with Performance Learning Center"""
# Select the recovered rows whose name starts with 'performance', then show them with
# up to 100 columns visible.
performance_recovered_rows = recovered_schools_filter3.loc[
    recovered_schools_filter3['SCH_NAME'].str.startswith('performance')
]
with pd.option_context('display.max_columns', 100):
    display(performance_recovered_rows)

Unnamed: 0_level_0,G01,G02,G03,G04,G05,G06,G07,G08,G09,G10,G11,G12,GSHI,GSLO,High_Grade_12,JJ,KG,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,Low_Grade_Above9,MEMBER,NMCNTY15,PK,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,UG,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='130129003727',0,0,0,0,0,0,0,0,63,49,46,49,12,9,Yes,No,0,33.4739,1301290,Cobb County,GA,GEORGIA,3,12,-81.9974,Yes,207,Richmond County,0,3727,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,0,...,No,No,No,No,No,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,performance learning center,No,No,No,No,1,Regular School,-9,GA,Yes,-9,-9,0,0,49,65,-9,-9,0,No
='130270003728',0,0,0,0,0,0,0,0,63,49,46,49,12,9,Yes,No,0,33.4739,1302700,Harris County,GA,GEORGIA,3,12,-81.9974,Yes,207,Richmond County,0,3728,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,0,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,performance learning center,No,No,No,No,1,Regular School,-9,GA,Yes,-9,-9,17,10,44,47,-9,-9,0,No


In [200]:
# Look up the 'Performance…' schools in the original filtered set for comparison.
performance_in_filter5 = filter5_crdc_nces_1516['SCH_NAME'].str.startswith('Performance')
filter5_crdc_nces_1516.loc[performance_in_filter5]

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,G07,G08,G09,G10,G11,G12,UG,MEMBER,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='130129003727',GA,GEORGIA,1301290,Cobb County,3727,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='130270003728',GA,GEORGIA,1302700,Harris County,3728,Performance Learning Center,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='130438004221',GA,GEORGIA,1304380,Richmond County,4221,Performance Learning Center,No,No,No,No,...,0,0,63,49,46,49,0,207,Yes,Yes
='370297002842',NC,NORTH CAROLINA,3702970,Charlotte-Mecklenburg Schools,2842,Performance Learning Center,No,No,No,No,...,0,0,102,44,37,50,0,233,Yes,Yes


In [201]:
"""Both of the performance learning centers here actually matched to a different 'performance learning center' record;
therefore, they should both be removed"""
# Drop every row named 'performance learning center'.
not_performance_lc = recovered_schools_filter4['SCH_NAME'].ne('performance learning center')
recovered_schools_filter4 = recovered_schools_filter4.loc[not_performance_lc]

In [202]:
"""Dealing with university high"""
# Select the recovered rows whose name starts with 'university high', then show them
# with up to 100 columns visible.
university_recovered_rows = recovered_schools_filter3.loc[
    recovered_schools_filter3['SCH_NAME'].str.startswith('university high')
]
with pd.option_context('display.max_columns', 100):
    display(university_recovered_rows)

Unnamed: 0_level_0,G01,G02,G03,G04,G05,G06,G07,G08,G09,G10,G11,G12,GSHI,GSLO,High_Grade_12,JJ,KG,LAT1516,LEAID,LEA_NAME,LEA_STATE,LEA_STATE_NAME,LEVEL,LOCALE15,LON1516,Low_Grade_Above9,MEMBER,NMCNTY15,PK,SCHID,SCH_APENR_AM_F,SCH_APENR_AM_M,SCH_APENR_AS_F,SCH_APENR_AS_M,SCH_APENR_BL_F,SCH_APENR_BL_M,SCH_APENR_HI_F,SCH_APENR_HI_M,SCH_APENR_HP_F,SCH_APENR_HP_M,SCH_APENR_IDEA_F,SCH_APENR_IDEA_M,SCH_APENR_IND,SCH_APENR_LEP_F,SCH_APENR_LEP_M,SCH_APENR_TR_F,SCH_APENR_TR_M,SCH_APENR_WH_F,SCH_APENR_WH_M,SCH_DUALENR_AM_F,...,SCH_GRADE_G05,SCH_GRADE_G06,SCH_GRADE_G07,SCH_GRADE_G08,SCH_GRADE_G09,SCH_GRADE_G10,SCH_GRADE_G11,SCH_GRADE_G12,SCH_GRADE_KG,SCH_GRADE_PS,SCH_GRADE_UG,SCH_IBENR_AM_F,SCH_IBENR_AM_M,SCH_IBENR_AS_F,SCH_IBENR_AS_M,SCH_IBENR_BL_F,SCH_IBENR_BL_M,SCH_IBENR_HI_F,SCH_IBENR_HI_M,SCH_IBENR_HP_F,SCH_IBENR_HP_M,SCH_IBENR_IDEA_F,SCH_IBENR_IDEA_M,SCH_IBENR_IND,SCH_IBENR_LEP_F,SCH_IBENR_LEP_M,SCH_IBENR_TR_F,SCH_IBENR_TR_M,SCH_IBENR_WH_F,SCH_IBENR_WH_M,SCH_NAME,SCH_STATUS_ALT,SCH_STATUS_CHARTER,SCH_STATUS_MAGNET,SCH_STATUS_SPED,SCH_TYPE,SCH_TYPE_TEXT,SCH_UGDETAIL_HS,STABR,TITLEI,TOT_APENR_F,TOT_APENR_M,TOT_DUALENR_F,TOT_DUALENR_M,TOT_ENR_F,TOT_ENR_M,TOT_IBENR_F,TOT_IBENR_M,UG,VIRTUAL
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1,Unnamed: 82_level_1,Unnamed: 83_level_1,Unnamed: 84_level_1,Unnamed: 85_level_1,Unnamed: 86_level_1,Unnamed: 87_level_1,Unnamed: 88_level_1,Unnamed: 89_level_1,Unnamed: 90_level_1,Unnamed: 91_level_1,Unnamed: 92_level_1,Unnamed: 93_level_1,Unnamed: 94_level_1,Unnamed: 95_level_1,Unnamed: 96_level_1,Unnamed: 97_level_1,Unnamed: 98_level_1,Unnamed: 99_level_1,Unnamed: 
100_level_1,Unnamed: 101_level_1
='069902400001',0,0,0,0,0,0,0,0,130,122,110,120,12,9,Yes,No,0,36.8097,699024,University High School,CA,CALIFORNIA,3,11,-119.748,Yes,482,Fresno County,0,1,2,2,47,29,8,5,26,23,0,0,0,2,Yes,0,0,0,0,50,44,2,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,university high,No,Yes,No,No,1,Regular School,-9,CA,Missing,133,103,196,163,271,217,-9,-9,0,No
='069902400001',0,0,0,0,0,0,0,0,674,704,612,627,12,9,Yes,No,0,33.6513,699024,University High School,CA,CALIFORNIA,3,12,-117.823,Yes,2617,Orange County,0,1,2,2,47,29,8,5,26,23,0,0,0,2,Yes,0,0,0,0,50,44,2,...,No,No,No,No,Yes,Yes,Yes,Yes,No,No,No,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9,No,-9,-9,-9,-9,-9,-9,university high,No,Yes,No,No,1,Regular School,-9,CA,Yes,133,103,196,163,271,217,-9,-9,0,No


In [203]:
# First few 'University High…' schools in the original filtered set, for comparison.
university_in_filter5 = filter5_crdc_nces_1516['SCH_NAME'].str.startswith('University High')
filter5_crdc_nces_1516.loc[university_in_filter5].head()

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,G07,G08,G09,G10,G11,G12,UG,MEMBER,Low_Grade_Above9,High_Grade_12
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='040852003206',AZ,ARIZONA,408520,Tolleson Union High School District,3206,University High School,No,No,No,No,...,0,0,163,131,107,76,0,477,Yes,Yes
='040880001441',AZ,ARIZONA,408800,Tucson Unified District,1441,University High School,No,No,No,No,...,0,0,302,287,256,211,0,1056,Yes,Yes
='068450007067',CA,CALIFORNIA,684500,Irvine Unified,7067,University High,No,No,No,No,...,0,0,674,704,612,627,0,2617,Yes,Yes
='069902400001',CA,CALIFORNIA,699024,University High School,1,University High,No,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Yes,Yes
='090192001381',CT,CONNECTICUT,901920,HARTFORD SCHOOL DISTRICT,1381,University High of Science and Engineering,No,No,No,No,...,0,0,137,111,95,88,0,431,Yes,Yes


In [204]:
"""The University High in Irvine was already accounted for; therefore, needs to be removed from the recovered"""
# Exclude exactly the row that is both named 'university high' and located in
# Orange County; every other row is kept.
is_orange_county_university = (
    (recovered_schools_filter4['SCH_NAME'] == 'university high')
    & (recovered_schools_filter4['NMCNTY15'] == 'Orange County')
)
recovered_schools_filter4 = recovered_schools_filter4.loc[~is_orange_county_university]

In [205]:
# How many recovered schools remain after the manual duplicate clean-up?
recovered_schools_filter4.shape[0]

468

# <font color = green> VI. Concatenating Recovered Missing Values with the original Filtered Dataset </font>
<div class = 'alert alert-cell alert-info'> Finally, I concatenated the recovered high schools with the original filtered set.<br><br>

I ensured that no duplicate values were added in the process.

Then saved the file to "../filtered_data/04_filter_final.csv" </div>
<div class = 'alert alert-cell alert-warning'>
Final Total:  **15725 High Schools**

In [206]:
"""Remove the missing values"""
# Keep only the rows of the filtered set whose LEVEL is not the "Missing" sentinel.
has_nces_level = filter5_crdc_nces_1516['LEVEL'].ne("Missing")
filter6_crdc_nces_1516 = filter5_crdc_nces_1516.loc[has_nces_level].copy()

In [207]:
"""How many initial duplicates?
Interestingly enough, these duplicates appear to be legitimate; the problem seems to be that the schools actually have
different names (e.g. the two "ADAIR CO. HIGH" records are actually supposed to be labeled ADAIR Co. R-I High and ADAIR Co. R-II BRASHEAR)"""
duplicate_check_keys = ['STABR', 'SCH_NAME', 'NMCNTY15']
rows_per_school = filter6_crdc_nces_1516.groupby(duplicate_check_keys)['SCH_NAME'].count()
rows_per_school.sort_values(ascending=False).head()

STABR  SCH_NAME                    NMCNTY15      
MO     ADAIR CO. HIGH              Adair County      2
TX     STERLING H S                Harris County     2
       TAYLOR H S                  Harris County     2
       LEE H S                     Harris County     2
WY     Wyoming Indian High School  Fremont County    1
Name: SCH_NAME, dtype: int64

In [208]:
# Same state / name / county duplicate check, applied to the recovered schools.
recovered_rows_per_school = (
    recovered_schools_filter4
    .groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME']
    .count()
)
recovered_rows_per_school.sort_values(ascending=False).head()

STABR  SCH_NAME                        NMCNTY15     
TX     ischool high of hickory creek   Denton County    1
NY     bronx compass high school       Bronx County     1
       benjamin banneker academy       Kings County     1
       benjamin n cardozo high school  Queens County    1
       boys and girls high school      Kings County     1
Name: SCH_NAME, dtype: int64

In [209]:
# Stack the NCES-matched rows of the filtered set on top of the recovered schools.
# DataFrame.append was removed in pandas 2.0, so pd.concat is used; sort=True keeps
# the alphabetical column order that the original append call produced when the two
# frames' columns were not aligned.
filtered_and_recovered = pd.concat(
    [filter6_crdc_nces_1516, recovered_schools_filter4],
    sort=True,
)

In [210]:
"""Do the numbers of columns match?"""
# Column count before vs. after concatenation; a larger second number would mean the
# two inputs did not share the same column names.
print(filter6_crdc_nces_1516.shape[1])
filtered_and_recovered.shape[1]

140


140

In [211]:
"""DataFrame columns do have a defined order, but in the pandas version used here the
append/concatenation above returned them sorted alphabetically because the two inputs'
columns were not in the same order.  I reordered the columns to show the SCH Name first"""
schName = ['SCH_NAME']
reorder = schName + filtered_and_recovered.columns.drop(schName).tolist()
filtered_and_recovered = filtered_and_recovered.loc[:, reorder]

In [212]:
"""No added duplicate records"""
# Re-run the state / name / county duplicate check on the combined set.
combined_rows_per_school = (
    filtered_and_recovered
    .groupby(['STABR', 'SCH_NAME', 'NMCNTY15'])['SCH_NAME']
    .count()
)
combined_rows_per_school.sort_values(ascending=False).head()

STABR  SCH_NAME                    NMCNTY15      
TX     STERLING H S                Harris County     2
MO     ADAIR CO. HIGH              Adair County      2
TX     TAYLOR H S                  Harris County     2
       LEE H S                     Harris County     2
WY     Wyoming Indian High School  Fremont County    1
Name: SCH_NAME, dtype: int64

In [213]:
"How many total high schools in the set?"
# Total number of rows (schools) in the combined set.
filtered_and_recovered.shape[0]

15725

In [214]:
# filtered_and_recovered.to_csv('../filtered_data/04_filter_final.csv')

## Check to see how close the Enrollment Numbers between NCES and CRDC are

In [223]:
# CRDC side: sum of the two TOT_ENR columns.  NCES side: the MEMBER column, kept
# together with the school name and state for readability.
final_crdc_tot_enroll = filtered_and_recovered['TOT_ENR_M'] + filtered_and_recovered['TOT_ENR_F']
final_nces_tot_enroll = filtered_and_recovered[['MEMBER', 'SCH_NAME', 'LEA_STATE']]

# Line the two sources up side by side on the shared index.
final_enroll_compare = pd.concat(
    [
        final_crdc_tot_enroll.rename('crdc'),
        final_nces_tot_enroll.rename(columns={'MEMBER': 'nces'}),
    ],
    axis=1,
)

# Signed difference (CRDC minus NCES) and its absolute size as a percentage of CRDC.
final_enroll_compare['diff'] = final_enroll_compare['crdc'].sub(final_enroll_compare['nces'])
final_enroll_compare['pct_diff'] = (
    final_enroll_compare['diff'].div(final_enroll_compare['crdc']).abs().mul(100)
)

# Five most negative differences (NCES larger), then five most positive (CRDC larger).
display(final_enroll_compare.sort_values('diff').head())

display(final_enroll_compare.sort_values('diff', ascending=False).head())

# Grand totals for each source, with thousands separators.
print('{:,d}'.format(int(final_enroll_compare['nces'].sum())), 'nces total')
print('{:,d}'.format(final_enroll_compare['crdc'].sum()), 'crdc total')

Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='482730003072',2051,4377,LEWISVILLE H S,TX,-2326,113.408
='060001412375',24,1228,Encore Jr./Sr. High Sch for the Perf and Visua...,CA,-1204,5016.67
='180567001031',1270,2400,Lawrence North High School,IN,-1130,88.9764
='368064099992',31,1132,north tonawanda high school,NY,-1101,3551.61
='180567001029',1298,2388,Lawrence Central High School,IN,-1090,83.9753


Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='401956000932',2468,498,MCLOUD HS,OK,1970,79.8217
='490036000218',4819,3080,GRANGER HIGH,UT,1739,36.0863
='490036000898',3712,2626,HUNTER HIGH,UT,1086,29.2565
='483558004036',1428,436,POTTSBORO H S,TX,992,69.4678
='490036000234',3301,2325,KEARNS HIGH,UT,976,29.5668


13,603,767 nces total
13,627,606 crdc total


In [224]:
# Show only the ten largest percentage gaps instead of dumping the whole frame
# (one row per school) into the notebook output.
display(final_enroll_compare.sort_values('pct_diff', ascending=False).head(10))

Unnamed: 0_level_0,crdc,nces,SCH_NAME,LEA_STATE,diff,pct_diff
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
='050228000007',4,218,ALPENA HIGH SCHOOL,AR,-214,5350
='060001412375',24,1228,Encore Jr./Sr. High Sch for the Perf and Visua...,CA,-1204,5016.67
='368064099992',31,1132,north tonawanda high school,NY,-1101,3551.61
='381647000570',2,43,SAWYER HIGH SCHOOL,ND,-41,2050
='060001411899',24,244,Mirus Secondary,CA,-220,916.667
='380001600615',9,69,STRASBURG HIGH SCHOOL,ND,-60,666.667
='060001411118',24,178,Summit Leadership Academy-High Desert,CA,-154,641.667
='040027802168',4,19,Deer Valley Academy,AZ,-15,375
='292166001191',19,87,NORTHEAST NODAWAY HIGH,MO,-68,357.895
='483534003994',37,150,POOLVILLE H S,TX,-113,305.405


In [242]:
"""How many Adult schools in filter 6"""
# SCH_NAME mixes casings in this frame (e.g. 'BEACON HIGH SCHOOL', 'Performance
# Learning Center', and the lower-cased recovered names), so match "adult"
# case-insensitively; a case-sensitive 'Adult' search misses 'ADULT' and 'adult'.
# na=False treats a missing name as "no match" rather than propagating a null.
adult_name_mask = filtered_and_recovered['SCH_NAME'].str.contains('adult', case=False, na=False)
int(adult_name_mask.sum())

12

In [244]:
# Inspect the ten schools with the smallest NCES MEMBER values next to their CRDC
# enrollment columns, rather than displaying every row.  Only seven columns are
# selected, so no display-width override is needed.
enrollment_columns = ['MEMBER', 'TOT_ENR_M', 'TOT_ENR_F', 'SCH_NAME', 'LEAID', 'SCHID', 'LEA_STATE']
display(filtered_and_recovered[enrollment_columns].sort_values('MEMBER').head(10))

Unnamed: 0_level_0,MEMBER,TOT_ENR_M,TOT_ENR_F,SCH_NAME,LEAID,SCHID,LEA_STATE
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
='040382000298',0,337,319,Holbrook High School,403820,298,AZ
='270579004894',0,74,61,Bloomington Career and College Acad,2705790,4894,MN
='270008403592',0,101,143,NORTHEAST SERVICE COOPERATIVE OLL,2700084,3592,MN
='341134003283',0,431,277,Barringer Academy of S.T.E.A.M.,3411340,3283,NJ
='341269003370',0,136,30,GARRETT MORGAN ACADEMY,3412690,3370,NJ
='040308000242',0,47,56,Fredonia High School,403080,242,AZ
='411052001763',0,88,16,ACE Academy,4110520,1763,OR
='040946000975',0,337,367,Winslow High School,409460,975,AZ
='450390701579',0,133,139,Governor's School for Science and Mathematics,4503907,1579,SC
='460696001182',0,25,17,Contract - 02,4606960,1182,SD


# Final Missing Schools
<div class = 'alert alert-cell alert-info'>**420 Schools**<br> Saved to '04_final_missing.csv'</div>

In [215]:
# Schools that never found an NCES match, excluding the New York City district rows
# (those are handled separately in the NY recovery section).
no_nces_name = schname_combined['SCH_NAME_'].isnull()
outside_nyc_district = schname_combined['LEA_NAME'] != 'NEW YORK CITY PUBLIC SCHOOLS'
final_missing = schname_combined.loc[no_nces_name & outside_nyc_district]

In [216]:
""" How many final missing schools? """
# Number of schools that remain unmatched.
final_missing.shape[0]

420

In [217]:
# Persist the still-unmatched schools (index included) for later review.
final_missing_path = '../filtered_data/04_final_missing.csv'
final_missing.to_csv(final_missing_path)

In [218]:
""" Top remaining unaccounted districts """
# Unmatched schools per district name, largest ten first.
missing_per_district = final_missing.groupby('LEA_NAME')['LEAID'].count()
missing_per_district.sort_values(ascending=False).head(10)

LEA_NAME
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES    10
NORMAN                                                      9
Peters Township SD                                          6
Dept. of Svs. for Children Youth & Their Families           5
BOYS TOWN INTERIM PRG SCHS                                  4
Boston                                                      4
LINCOLN PUBLIC SCHOOLS                                      4
Geauga County Educational Service Center                    3
Learning Tree Inc                                           3
TULSA                                                       3
Name: LEAID, dtype: int64