In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from my_functions import combokey_converter

%matplotlib inline
# Use seaborn's white-grid theme for every figure drawn in this notebook.
sns.set_style('whitegrid')
# Default axes styling: 14-pt bold titles and bold axis labels.
plt.rc('axes', titlesize = 14, titleweight = 'bold', labelweight = 'bold')

# <font color = green> I. Column Info </font>

#  Column info for crdc_1516 
<div class="alert alert-block alert-info"><b>Contains 111 Fields</b></div>

In [2]:
# Load the field layout that describes the columns of the CRDC 2015-16 extract.
crdc_cols = pd.read_csv(
    filepath_or_buffer = '../filtered_data/00_crdc_1516_initial_layout.csv',
)

In [3]:
# Widen the column/row display limits so every field description is readable;
# 'Module' is dropped from the rendered view only (crdc_cols itself is unchanged).
with pd.option_context('display.max_colwidth', 150, 'display.max_rows', 125):
    display(
        crdc_cols
        .drop('Module', axis = 'columns')
        .set_index('Field_Name')
    )

Unnamed: 0_level_0,Field_Description
Field_Name,Unnamed: 1_level_1
LEA_STATE,District State Abbreviation
LEA_STATE_NAME,District State Name
LEAID,7 Digit LEAID District Identification Code
LEA_NAME,District Name
SCHID,5 Digit School Identification Code
SCH_NAME,School Name
COMBOKEY,7 Digit LEAID District Identification Code+5 Digit School Identification Code
JJ,"Juvenile Justice Facility: ""Yes"" indicates a long-term secure facility; ""No"" indicates not a JJ facility"
SCH_GRADE_PS,Grades with Students Enrolled: Preschool
SCH_GRADE_KG,Grades with Students Enrolled: Kindergarten


In [4]:
# Number of fields described in the CRDC layout.
len(crdc_cols)

111

# Column info for nces_1516
<div class="alert alert-block alert-info"><b>15 Fields</b></div>

In [5]:
# Load the field layout that describes the columns of the NCES CCD 2015-16 extract.
nces_cols = pd.read_csv(
    filepath_or_buffer = '../filtered_data/01_nces_1516_initial_ccd_layout.csv',
)

In [6]:
"""Replace \n literals with commas for readability"""
# Only str cells are rewritten; anything else (e.g. missing values) passes through untouched.
nces_cols['Categorical Values'] = nces_cols['Categorical Values'].map(
    lambda cell: cell.replace('\n', ', ') if type(cell) is str else cell
)

In [7]:
# Show the three descriptive layout columns with a wide column limit so long descriptions are not cut off.
with pd.option_context('display.max_colwidth', 350, 'display.max_rows', 25):
    display(nces_cols.loc[:, ['Variable Name', 'Description', 'Categorical Values']])

Unnamed: 0,Variable Name,Description,Categorical Values
0,LEAID,NCES Agency Identification Number,
1,SCHID,NCES school identifier,
2,SCH_NAME,School name,
3,TITLEI,Title I Eligible School. This flag indicates whether a school is eligible for participation in either TAS or SWP program authorized by Title I of Public Law 103-382.,"No, Yes, Missing, Not applicable, -9-Suppressed"
4,SCH_TYPE_TEXT,School type (description),"Alternative Education School, Regular School, Special Education School, Vocational Education School,"
5,SCH_TYPE,School type (code),"1 = Regular school, 2 = Special education school, 3 = Vocational school, 4 = Other/alternative school, 5 = Reportable program (new code starting in 2007–08),"
6,LEVEL,School level,"1 = Primary (low grade = PK through 03; high grade = PK through 08), 2 = Middle (low grade = 04 through 07; high grade = 04 through 09), 3 = High (low grade = 07 through 12; high grade = 12 only), 4 = Other (any other configuration not falling within the above three categories;including ungraded), N = Not applicable, ,"
7,VIRTUAL,Virtual School Status,"Missing, No, Yes"
8,GSLO,Lowest Grade Offered,
9,GSHI,Highest Grade Offered,


In [8]:
# Number of fields described in the NCES layout.
len(nces_cols)

15

# <font color = green> II. Data Cleaning/Joining </font>

# crdc_1516 Data
<div class="alert alert-block alert-info"><b> 96,360 Schools before any filtering <br>
111 Fields (Matches the crdc_cols)</b></div>
<br><br>
Used combokey_converter.convert to create a csv-compatible "COMBOKEY".

In [9]:
# Load the raw CRDC 2015-16 school-level extract.
# LEAID is read as an object column instead of being parsed as a number.
# `np.object` was only an alias for the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
crdc_1516 = pd.read_csv('../filtered_data/00_crdc_1516_initial.csv',
                        dtype = {'LEAID': object})

In [10]:
# Overwrite COMBOKEY with the value built from LEAID + SCHID by the project helper.
# NOTE(review): the displayed keys look like ='<7-digit LEAID><5-digit SCHID>' — presumably
# the helper zero-pads both parts; confirm in my_functions.combokey_converter.
crdc_1516['COMBOKEY'] = combokey_converter.convert(crdc_1516, 'LEAID', 'SCHID')

In [11]:
# Peek at the first five CRDC rows.
crdc_1516.head(n = 5)

Unnamed: 0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,COMBOKEY,JJ,SCH_GRADE_PS,SCH_GRADE_KG,...,SCH_IBENR_WH_M,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F
0,AL,ALABAMA,100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,='010000201705',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
1,AL,ALABAMA,100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,='010000201706',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
2,AL,ALABAMA,100002,Alabama Youth Services,1876,Alabama Youth Services,='010000201876',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
3,AL,ALABAMA,100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,='010000299995',Yes,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9
4,AL,ALABAMA,100005,Albertville City,870,Albertville Middle School,='010000500870',No,No,No,...,-9,-9,-9,-9,-9,-9,-9,-9,-9,-9


In [12]:
# Number of schools in the raw CRDC extract.
len(crdc_1516)

96360

In [13]:
# Number of columns in the raw CRDC extract.
crdc_1516.shape[1]

111

# nces_1516 Data
<div class="alert alert-block alert-info"><b> The nces_1516 Data was recorded in separate files (each with different numbers of schools), so I will have to join the separate files to avoid corruption/loss of data. </b><br>
    <u>Files</u><br>
    1. Characteristics <br>
    2. Directory <br>
    3. Geographic <br>
</div><div class = 'alert alert-block alert-info'>
Like the crdc data, the combokey field was generated using my combokey_converter.convert function.<br></div>

<div class="alert alert-block alert-warning">
1. **100232 Initial Schools**<br><br>
2. **After first inner join (Directory and Characteristics) --> 100232 schools**<br>
Note: I ran a check to ensure that all of the matching combokeys have matching school names -- 100% identical.<br><br>
3. **After second inner join (above_combined and Geographic) --> 100096 schools (100087 after the name check described here)**<br> Note:  I ran the same check to ensure that all of the schools matched, and about 9,000 came back as non-matching.  I then compared the first letter (case-insensitive) of each of the two name fields, and only 9 schools came back as non-matching.  After close examination, I decided to cull these 9 schools.<br></div><div class = 'alert alert-block alert-warning'>
<b>CSV saved to '../filtered_data/01_nces_1516_initial_ccd.csv'</b></div>

In [14]:
# Load the NCES 2015-16 school characteristics file.
nces_1516_characteristics = pd.read_csv(
    filepath_or_buffer = '../filtered_data/01_nces_1516_initial_school_characteristics.csv',
)

In [15]:
# Add the join key built from LEAID + SCHID by the project helper.
# NOTE(review): key format is defined in my_functions.combokey_converter — confirm there.
nces_1516_characteristics['combokey'] = combokey_converter.convert(nces_1516_characteristics, 'LEAID', 'SCHID')

In [16]:
# Number of schools in the characteristics file.
len(nces_1516_characteristics)

100232

In [17]:
# Load the NCES 2015-16 school directory file.
nces_1516_directory = pd.read_csv(
    filepath_or_buffer = '../filtered_data/01_nces_1516_initial_school_directory.csv',
)

In [18]:
# Add the join key built from LEAID + SCHID by the project helper.
# NOTE(review): key format is defined in my_functions.combokey_converter — confirm there.
nces_1516_directory['combokey'] = combokey_converter.convert(nces_1516_directory, 'LEAID', 'SCHID')

**First Join:  Directory + Characteristics**

In [19]:
# Inner join on combokey. Overlapping column names coming from the LEFT frame
# (characteristics) receive the 'dir_' suffix; the directory copies keep their plain names.
nces_1516 = nces_1516_characteristics.set_index('combokey').join(
    other = nces_1516_directory.set_index('combokey'),
    lsuffix = 'dir_',
    how = 'inner',
)

In [20]:
# Number of schools that survived the first inner join.
len(nces_1516)

100232

In [21]:
# Count the joined rows whose two school-name columns are exactly equal.
len(nces_1516[nces_1516['SCH_NAME'] == nces_1516['SCH_NAMEdir_']])

100232

In [22]:
# Discard the suffixed duplicates of the ID and name columns left over from the join.
nces_1516 = nces_1516.drop(
    ['LEAIDdir_', 'SCHIDdir_', 'SCH_NAMEdir_'],
    axis = 'columns',
)

**Second Join: combined + geo**

In [23]:
# Load the NCES 2015-16 geographic file; LOCALE15 is read as an object column
# instead of being parsed as a number.
# `np.object` was only an alias for the builtin `object`; it was deprecated in
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
nces_1516_geo = pd.read_csv('../filtered_data/01_nces_1516_initial_geographic.csv',  dtype = {'LOCALE15': object})

In [24]:
# Add the join key built from LEAID + SCHID by the project helper.
# NOTE(review): key format is defined in my_functions.combokey_converter — confirm there.
nces_1516_geo['combokey'] = combokey_converter.convert(nces_1516_geo, 'LEAID', 'SCHID')

In [25]:
# Inner join of the combined frame with the geographic file on combokey;
# overlapping column names from the geographic (right) frame receive the 'dir_' suffix.
nces_1516_test = nces_1516.join(
    other = nces_1516_geo.set_index('combokey'),
    rsuffix = 'dir_',
    how = 'inner',
)

In [26]:
# Number of schools that survived the second inner join.
len(nces_1516_test)

100096

In [27]:
# Count the joined rows whose SCH_NAME and geographic NAME are exactly equal.
len(nces_1516_test[nces_1516_test['SCH_NAME'] == nces_1516_test['NAME']])

91091

In [28]:
def name_checker(sch1, sch2):
    """Flag whether two school names start with different letters.

    The comparison is case-insensitive and looks only at the first
    character of each name (not at the whole first word).

    Parameters
    ----------
    sch1, sch2 : str
        The two school names to compare.

    Returns
    -------
    int
        0 when both names begin with the same character, 1 otherwise.
        A name that is empty or not a string can never be confirmed as a
        match, so it is reported as 1 instead of raising.
    """
    # Non-string input (e.g. a missing value) cannot be lower-cased or indexed.
    if not (isinstance(sch1, str) and isinstance(sch2, str)):
        return 1
    # An empty name has no first character to compare.
    if not sch1 or not sch2:
        return 1

    return 0 if sch1.lower()[0] == sch2.lower()[0] else 1

# Flag each row (1 = first letters differ) and list the flagged name pairs for manual review.
nces_1516_test['no_match_name'] = nces_1516_test.apply(
    lambda school: name_checker(school['SCH_NAME'], school['NAME']),
    axis = 'columns',
)
nces_1516_test.loc[nces_1516_test['no_match_name'] == 1, ['NAME', 'SCH_NAME']]

Unnamed: 0_level_0,NAME,SCH_NAME
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1
='051266001562',HYLTON JUNIOR HIGH SCHOOL,LAKESIDE JUNIOR HIGH SCHOOL
='090147001810',Stowe - Early Learning Center (S,EPS PK STEAM Academy
='090171001700',Alternative High School Programs,Greenwich Alternative High School
='090192001616',STEM Magnet School at Dwight,Betances STEM Magnet School
='090279000148',Hyde School of Health Science an,Cortlandt V.R. Creed Health and Sport Sciences...
='090279001543',Helene Grant Headstart,Dr. Mayo Early Childhood School
='090279001585',Katherine Brennan/Clarence Roger,Brennan Rogers School
='090351201476',Education Connection Special Edu,GFLC/ACCESS School
='090423001808',Hatton Preschool Program,Southington Public Schools Preschool Program a...


In [29]:
# Keep only the rows whose names passed the first-letter check, then drop the helper/duplicate columns.
nces_1516_full = (
    nces_1516_test[nces_1516_test['no_match_name'] == 0]
    .drop(['LEAIDdir_', 'SCHIDdir_', 'no_match_name', 'NAME'], axis = 'columns')
)

In [30]:
# Peek at the first five rows of the combined NCES frame.
nces_1516_full.head(n = 5)

Unnamed: 0_level_0,TITLEI,LEAID,SCHID,SCH_NAME,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
='010000200277',-9,100002,277,Sequoyah Sch - Chalkville Campus,Alternative Education School,4,3,No,7,12,Jefferson County,21,33.673661,-86.628755
='010000201667',-9,100002,1667,Camps,Alternative Education School,4,3,No,7,12,Autauga County,41,32.521681,-86.530132
='010000201670',-9,100002,1670,Det Ctr,Alternative Education School,4,3,No,7,12,Clarke County,41,31.938444,-87.750529
='010000201705',-9,100002,1705,Wallace Sch - Mt Meigs Campus,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.374812,-86.08236
='010000201706',-9,100002,1706,McNeel Sch - Vacca Campus,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.583385,-86.710058


In [31]:
# Number of schools in the combined NCES frame after culling.
len(nces_1516_full)

100087

In [32]:
# nces_1516_full.to_csv('../filtered_data/01_nces_1516_initial_combined_ccd.csv')

# NCES (combined) and CRDC join
<div class="alert alert-block alert-warning">Out of the 96360 schools in the crdc1516 dataset, <b>3861</b> schools did not have a matching Combokey. These non-matching schools were kept in the dataset.<br><br>

Using a missing-value-aware version of the name checker function from above, another <b>182</b> schools were found to have School Names whose first letters did not match between the NCES and CRDC sets.  Erring on the side of caution, these schools were indiscriminately culled.<br><br>

**Final school count in the combined dataset:  96178**</div>
<div class = 'alert alert-block alert-info'>Dataset saved to '03_crdc_nces_1516_raw_combined.csv'</div>

In [33]:
# Re-display the first five rows of the combined NCES frame before joining it to CRDC.
nces_1516_full.head(n = 5)

Unnamed: 0_level_0,TITLEI,LEAID,SCHID,SCH_NAME,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
combokey,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
='010000200277',-9,100002,277,Sequoyah Sch - Chalkville Campus,Alternative Education School,4,3,No,7,12,Jefferson County,21,33.673661,-86.628755
='010000201667',-9,100002,1667,Camps,Alternative Education School,4,3,No,7,12,Autauga County,41,32.521681,-86.530132
='010000201670',-9,100002,1670,Det Ctr,Alternative Education School,4,3,No,7,12,Clarke County,41,31.938444,-87.750529
='010000201705',-9,100002,1705,Wallace Sch - Mt Meigs Campus,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.374812,-86.08236
='010000201706',-9,100002,1706,McNeel Sch - Vacca Campus,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.583385,-86.710058


In [34]:
# Left join: every CRDC school is kept; overlapping NCES column names receive a trailing '_'.
crdc_nces1516_test = crdc_1516.set_index('COMBOKEY').join(
    other = nces_1516_full,
    rsuffix = '_',
    how = 'left',
)

In [35]:
# CRDC schools with no NCES counterpart: the NCES name column is null after the left join.
crdc_nces1516_test.loc[crdc_nces1516_test['SCH_NAME_'].isnull(), 'LEAID'].count()

3861

In [36]:
def name_checker(sch1, sch2):
    """Compare the first letters of two school names, tolerating a missing second name.

    This redefinition replaces the earlier `name_checker`; it adds a third
    return code for rows where the second name is missing (e.g. NaN produced
    by a left join).  The comparison is case-insensitive and looks only at
    the first character of each name.

    Parameters
    ----------
    sch1 : str
        School name from the left-hand dataset.
    sch2 : str, float or None
        School name from the right-hand dataset; a float (NaN) or None means
        no counterpart was found.

    Returns
    -------
    int
        2 when `sch2` is missing, 0 when both names begin with the same
        character, 1 otherwise.  Empty or otherwise non-string names are
        reported as 1 instead of raising.
    """
    # isinstance also catches float subclasses such as numpy.float64,
    # which `type(sch2) == float` would miss.
    if sch2 is None or isinstance(sch2, float):
        return 2
    # Anything else that is not a string cannot be lower-cased or indexed.
    if not (isinstance(sch1, str) and isinstance(sch2, str)):
        return 1
    # An empty name has no first character to compare.
    if not sch1 or not sch2:
        return 1

    return 0 if sch1.lower()[0] == sch2.lower()[0] else 1

# Per-row flag: 0 = first letters agree, 1 = they differ, 2 = no NCES name to compare against.
crdc_nces1516_test['no_match_name'] = crdc_nces1516_test.apply(
    lambda school: name_checker(school['SCH_NAME'], school['SCH_NAME_']),
    axis = 'columns',
)

In [37]:
# Sample of the schools whose CRDC and NCES names start with different letters.
crdc_nces1516_test.loc[crdc_nces1516_test['no_match_name'] == 1, ['SCH_NAME', 'SCH_NAME_']].head()

Unnamed: 0_level_0,SCH_NAME,SCH_NAME_
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1
='010000600880',Brindlee Mountain Elementary School,Grassy Elem Sch
='010000600887',Brindlee Mountain Primary School,Union Grove Elem Sch
='010019402395',Pelham Ridge Elementary,Valley Intermediate School
='010019402396',Pelham Oaks Elementary,Valley Elementary School
='010033000086',Bessemer City Middle Sch,James A Davis Middle Sch


In [38]:
# Cull the name mismatches (flag 1) but keep matched (0) and unmatched-in-NCES (2) schools,
# then drop the duplicated NCES id/name columns and the helper flag.
crdc_nces_1516 = (
    crdc_nces1516_test[crdc_nces1516_test['no_match_name'] != 1]
    .drop(['LEAID_', 'SCHID_', 'SCH_NAME_', 'no_match_name'], axis = 'columns')
)

In [39]:
# Number of schools in the combined CRDC + NCES frame.
len(crdc_nces_1516)

96178

In [40]:
# Replace every null with the literal string 'Missing'.
# Columns that had nulls therefore hold a mix of their original values and this string.
crdc_nces_1516 = crdc_nces_1516.fillna(value = 'Missing')

In [41]:
# Persist the combined (unfiltered) CRDC + NCES dataset.
crdc_nces_1516.to_csv(
    path_or_buf = '../filtered_data/03_crdc_nces_1516_raw_combined.csv',
)

# <font color = green>III. Preliminary Exploration</font>

# Answered the AP & DE & IB Flag
<div class = 'alert alert-block alert-info'> First filter removes schools if they did not answer 'Yes' or 'No' to the AP Flag, the DE Flag, and the IB Flag.  <br><br>

Every school that answered the IB Flag answered the AP Flag (completely identical sets); however, there were 580 schools that answered the DE flag, but did not answer either the AP or IB Flag.
<br><br>
Breakdown for 'Yes' or 'No':<br>
- DE: 26,361 Schools
- AP: 25,781 Schools
- IB: 25,781 Schools
</div>

In [42]:
"""Answered DE Flag: Yes or No"""
# Schools whose dual-enrollment flag is either 'Yes' or 'No'.
len(crdc_nces_1516[crdc_nces_1516['SCH_DUAL_IND'].isin(['Yes', 'No'])])

26361

In [43]:
"""Answered AP Flag: Yes or No"""
# Schools whose AP flag is either 'Yes' or 'No'.
len(crdc_nces_1516[crdc_nces_1516['SCH_APENR_IND'].isin(['Yes', 'No'])])

25781

In [44]:
"""Answered IB Flag: Yes or No"""
# Schools whose IB flag is either 'Yes' or 'No'.
len(crdc_nces_1516[crdc_nces_1516['SCH_IBENR_IND'].isin(['Yes', 'No'])])

25781

In [45]:
"""Checking to see if the schools that answer the ap flag match those that answered the ib flag -- 100% identical set"""
# Element-wise comparison of school names between the AP-answered and IB-answered subsets;
# the sum is the number of positions where the names agree.
# NOTE(review): `==` on two Series is expected to raise when their labels differ — confirm for the pandas version in use.
np.sum(
    crdc_nces_1516.loc[crdc_nces_1516['SCH_APENR_IND'].isin(['Yes', 'No']), 'SCH_NAME']
    == crdc_nces_1516.loc[crdc_nces_1516['SCH_IBENR_IND'].isin(['Yes', 'No']), 'SCH_NAME']
)

25781

In [46]:
"""Number of Schools that answered DE Flag (Yes or No), but not AP/IB Flag"""
# Step 1: schools that answered the DE flag with 'Yes' or 'No'.
de_but_no_ap_answered = crdc_nces_1516[crdc_nces_1516['SCH_DUAL_IND'].isin(['Yes', 'No'])]
# Step 2: of those, the ones whose AP flag holds one of the codes '-5' / '-9' instead of an answer.
de_but_no_ap_answered = de_but_no_ap_answered[de_but_no_ap_answered['SCH_APENR_IND'].isin(['-5', '-9'])]
len(de_but_no_ap_answered)

580

In [47]:
"""DE/No-AP Subset Data"""
# Bare last expression: Jupyter renders the DE-answered / AP-unanswered subset.
de_but_no_ap_answered

Unnamed: 0_level_0,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,SCH_NAME,JJ,SCH_GRADE_PS,SCH_GRADE_KG,SCH_GRADE_G01,...,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
COMBOKEY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
='010000201705',AL,ALABAMA,100002,Alabama Youth Services,1705,Wallace Sch - Mt Meigs Campus,Yes,No,No,No,...,Alternative Education School,4,3,No,7,12,Montgomery County,41,32.3748,-86.0824
='010000201706',AL,ALABAMA,100002,Alabama Youth Services,1706,McNeel Sch - Vacca Campus,Yes,No,No,No,...,Alternative Education School,4,3,No,7,12,Jefferson County,12,33.5834,-86.7101
='010000299995',AL,ALABAMA,100002,Alabama Youth Services,99995,AUTAUGA CAMPUS,Yes,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing
='010009001246',AL,ALABAMA,100090,Anniston City,1246,Anniston City Boot Camp Sch,Yes,No,No,No,...,Alternative Education School,4,3,No,7,12,Calhoun County,13,33.7071,-85.8216
='010018302114',AL,ALABAMA,100183,Alabama Clinical School,2114,Alabama Clinical School,Yes,No,Yes,Yes,...,Alternative Education School,4,N,No,N,N,Jefferson County,12,33.5849,-86.646
='010039001761',AL,ALABAMA,100390,Birmingham City,1761,Family Court High Sch,Yes,No,No,No,...,Alternative Education School,4,3,No,9,12,Jefferson County,12,33.5038,-86.8326
='020000100620',AK,ALASKA,200001,Lower Kuskokwim School District,620,Bethel Youth Facility,Yes,No,No,No,...,Alternative Education School,4,4,No,5,12,Bethel Census Area,33,60.7962,-161.762
='020015000449',AK,ALASKA,200150,Ketchikan Gateway Borough School District,449,Ketchikan Regional Youth Facility,Yes,No,No,No,...,Alternative Education School,4,4,No,5,12,Ketchikan Gateway Borough,33,55.354,-131.684
='020018000432',AK,ALASKA,200180,Anchorage School District,432,McLaughlin Secondary School,Yes,No,No,No,...,Alternative Education School,4,4,No,6,12,Anchorage Municipality,11,61.187,-149.834
='020018099999',AK,ALASKA,200180,Anchorage School District,99999,ACE/ACT Program,Yes,No,No,No,...,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing,Missing


In [48]:
# Despite the variable name, this keeps the schools that answered BOTH flags:
# first the DE flag ('Yes'/'No'), then the AP flag ('Yes'/'No').
crdc_nces_1516_deNoAp = crdc_nces_1516[crdc_nces_1516['SCH_DUAL_IND'].isin(['Yes', 'No'])]
crdc_nces_1516_deNoAp = crdc_nces_1516_deNoAp[crdc_nces_1516_deNoAp['SCH_APENR_IND'].isin(['Yes', 'No'])]

In [49]:
"""Do all Schools match after removing schools that answered DE, but not AP?"""
# Count the positions where the DE+AP-answered subset and the AP-answered subset carry the same school name.
np.sum(
    crdc_nces_1516_deNoAp['SCH_NAME']
    == crdc_nces_1516.loc[crdc_nces_1516['SCH_APENR_IND'].isin(['Yes', 'No']), 'SCH_NAME']
)

25781

# <font color = green> IV. Filtration </font>

# Select Non-[Juvenile Justice, Special Education, and Alternative Schools]
<div class = 'alert alert-block alert-info'>Schools that answered 'No' to each of those three questions on the CRDC Survey.<br><br> 
I also used a keyword filter to remove any remaining "Juvenile Justice"-esque institutions.</div>
<div class = 'alert alert-block alert-warning'>**90448** Schools Remain</div>

In [50]:
# Keep schools that answered 'No' to all three status flags:
# juvenile-justice facility, alternative school and special-education school.
filter1_crdc_nces_1516 = crdc_nces_1516[
    crdc_nces_1516[['JJ', 'SCH_STATUS_ALT', 'SCH_STATUS_SPED']]
    .eq('No')
    .all(axis = 'columns')
]

In [51]:
def jj_keyword_remove(name):
    """Tell whether a school/district name is free of juvenile-justice style keywords.

    The name is stripped and lower-cased, then searched for the substrings
    'behavioral', 'juvenile' and 'correction'.

    Returns False as soon as one keyword is found (row should be removed),
    True when none is present (row should be kept).
    """
    cleaned = name.strip().lower()
    return not any(kw in cleaned for kw in ('behavioral', 'juvenile', 'correction'))

# Apply the keyword screen twice: first to the school name, then to the district (LEA) name.
filter1_crdc_nces_1516 = filter1_crdc_nces_1516[filter1_crdc_nces_1516['SCH_NAME'].apply(jj_keyword_remove)]
filter1_crdc_nces_1516 = filter1_crdc_nces_1516[filter1_crdc_nces_1516['LEA_NAME'].apply(jj_keyword_remove)]

In [52]:
# Schools remaining after the first filter.
len(filter1_crdc_nces_1516)

90448

# Select Schools with Lowest Grade (9-12) or Highest Grade (12) or Ungraded HS-students
<div class = 'alert alert-block alert-info'>I wrote functions that take in the grade-enrollment flags from CRDC (because there are no null values for the Grade Enrollment Flags) and determine: (1) whether a school enrolls students in any of grades 9-12 (the check does not look at the grades below 9), (2) whether it contains ungraded HS-aged students, and (3) whether a school's highest grade offered is 12th.<br><br>While NCES CCD does have columns for lowest and highest grades, there were many null values, so filtering on them may have been too aggressive.</div>
<div class = 'alert alert-block alert-warning'>**21606** Schools Remain</div>

In [53]:
"""How many missing values in the grade columns?"""
# Number of null entries in the grade-1 enrollment flag.
filter1_crdc_nces_1516['SCH_GRADE_G01'].isnull().sum()

0

In [54]:
def lowest_grade_9orAbove(g9, g10, g11, g12, ug, ug_hs):
    """Label a school by its grade 9-12 / ungraded enrollment flags.

    Parameters are the CRDC 'Yes'/'No' enrollment flags for grades 9-12
    (`g9`..`g12`), the ungraded flag (`ug`) and the ungraded-high-school
    detail flag (`ug_hs`).

    Returns
    -------
    str
        'UG-Yes' / 'UG-No' when the school is ungraded (`ug == 'Yes'`),
        depending on whether `ug_hs` is 'Yes'; the grade flags are ignored
        in that case.  Otherwise 'Yes' when at least one of grades 9-12 is
        flagged 'Yes', else 'No'.  Grades below 9 are not inspected.
    """
    # The ungraded flag takes precedence over the individual grade flags.
    if ug == 'Yes':
        return 'UG-Yes' if ug_hs == 'Yes' else 'UG-No'

    has_hs_grade = any(flag == 'Yes' for flag in (g9, g10, g11, g12))
    return 'Yes' if has_hs_grade else 'No'

In [55]:
def highest_grade_12(g12):
    """Label a school by its grade-12 enrollment flag.

    Takes only the CRDC grade-12 flag and returns 'Yes' when it equals
    'Yes'; every other value yields 'No'.
    """
    return 'Yes' if g12 == 'Yes' else 'No'

In [56]:
# Work on a copy so the first-filter frame is left untouched, then label every school row by row.
filter2_crdc_nces_1516 = filter1_crdc_nces_1516.copy()
filter2_crdc_nces_1516['Low_Grade_Above9'] = filter2_crdc_nces_1516.apply(
    lambda school: lowest_grade_9orAbove(
        school['SCH_GRADE_G09'],
        school['SCH_GRADE_G10'],
        school['SCH_GRADE_G11'],
        school['SCH_GRADE_G12'],
        school['SCH_GRADE_UG'],
        school['SCH_UGDETAIL_HS'],
    ),
    axis = 'columns',
)

In [57]:
# Only the grade-12 flag is needed, so label directly from that column.
filter2_crdc_nces_1516['High_Grade_12'] = filter2_crdc_nces_1516['SCH_GRADE_G12'].apply(highest_grade_12)

In [58]:
"""Breakdown of Schools with a Lowest Grade Above 9 (or with Ungrade HS-aged Students)"""
# Frequency of each Low_Grade_Above9 label.
filter2_crdc_nces_1516['Low_Grade_Above9'].value_counts()

No        66886
Yes       19771
UG-No      2294
UG-Yes     1497
Name: Low_Grade_Above9, dtype: int64

In [59]:
"""Breakdown of Schools with a Highest Grade of 12"""
# Frequency of each High_Grade_12 label.
filter2_crdc_nces_1516['High_Grade_12'].value_counts()

No     70094
Yes    20354
Name: High_Grade_12, dtype: int64

In [60]:
# Keep a school when it has a grade 9-12 label of 'Yes', is ungraded with HS-aged students,
# or enrolls 12th graders.
filter2_crdc_nces_1516 = filter2_crdc_nces_1516[
    filter2_crdc_nces_1516['Low_Grade_Above9'].isin(['Yes', 'UG-Yes'])
    | (filter2_crdc_nces_1516['High_Grade_12'] == 'Yes')
]

In [61]:
# Schools remaining after the grade filter.
len(filter2_crdc_nces_1516)

21606

# Remove Virtual Schools
<div class = 'alert alert-block alert-info'>
1. Remove any Schools that reported 'Yes' to the Virtual Schools Question<br>
2. Remove Schools whose names contain certain keywords that likely indicate an online school
</div>
<div class = 'alert alert-block alert-warning'>**21209** Schools Remain</div>

In [62]:
# Frequency of each value of the NCES VIRTUAL flag.
filter2_crdc_nces_1516['VIRTUAL'].value_counts()

No         17408
Missing     3867
Yes          331
Name: VIRTUAL, dtype: int64

In [63]:
# Drop schools flagged as virtual; 'No' and 'Missing' are both retained.
filter3_crdc_nces_1516 = filter2_crdc_nces_1516[filter2_crdc_nces_1516['VIRTUAL'].ne('Yes')]

In [64]:
# Schools remaining after removing flagged virtual schools.
len(filter3_crdc_nces_1516)

21275

In [65]:
def any_missed_virtuals(name):
    """Tell whether a school name is free of keywords that suggest an online school.

    The name is stripped and lower-cased, then searched for the substrings
    'virtual', 'cyber', 'electronic', 'internet', 'online' and 'distance'.

    Returns False as soon as one keyword is found (row should be removed),
    True when none is present (row should be kept).
    """
    cleaned = name.strip().lower()
    online_hints = ('virtual', 'cyber', 'electronic', 'internet', 'online', 'distance')
    return not any(hint in cleaned for hint in online_hints)

# Keyword screen on the school name to catch online schools the VIRTUAL flag missed.
filter3_crdc_nces_1516 = filter3_crdc_nces_1516[filter3_crdc_nces_1516['SCH_NAME'].apply(any_missed_virtuals)]

In [66]:
# Schools remaining after the virtual-school keyword screen.
len(filter3_crdc_nces_1516)

21209

# Remove schools reported as elementary, middle, or "Other"
<div class = 'alert alert-block alert-info'>Even with the Lowest/Highest Grade filter, I wanted to ensure that no non-typical high schools (as reported by the NCES's LEVEL Field) are retained.  The Other category is perhaps the most important to cull here, as many of the very, very large charter-type schools are listed in this category.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**17542** Schools Remain</div>

In [67]:
# Start the fourth filter stage from an independent copy of the third.
filter4_crdc_nces_1516 = filter3_crdc_nces_1516.copy(deep = True)

In [68]:
# Frequency of each NCES LEVEL code.
filter4_crdc_nces_1516['LEVEL'].value_counts()

3          16348
4           3214
Missing     1194
2            307
1             76
N             70
Name: LEVEL, dtype: int64

In [69]:
# Keep LEVEL '3' schools and those with no NCES LEVEL value ('Missing').
filter4_crdc_nces_1516 = filter4_crdc_nces_1516[filter4_crdc_nces_1516['LEVEL'].isin(['Missing', '3'])]

In [70]:
# Schools remaining after the LEVEL filter.
len(filter4_crdc_nces_1516)

17542

# Select Schools reported as Regular
<div class = 'alert alert-block alert-info'>Removed Schools with a SCH_TYPE that was not 1 (Regular).  Culls additional "Special Education", "Vocational", and "Alternative/Other" schools.
<br><br>
Schools with Missing Values were retained.
</div>
<div class = 'alert alert-block alert-warning'>**16451** Schools Remain</div>

In [71]:
# Start the fifth filter stage from an independent copy of the fourth.
filter5_crdc_nces_1516 = filter4_crdc_nces_1516.copy(deep = True)

In [72]:
# Frequency of each NCES SCH_TYPE code.
filter5_crdc_nces_1516['SCH_TYPE'].value_counts()

1.0        15257
Missing     1194
4.0          744
3.0          332
2.0           15
Name: SCH_TYPE, dtype: int64

In [73]:
# Keep SCH_TYPE 1 schools and those with no NCES SCH_TYPE value ('Missing').
# SCH_TYPE holds numbers alongside the 'Missing' string, hence the numeric comparison with 1.
filter5_crdc_nces_1516 = filter5_crdc_nces_1516[
    filter5_crdc_nces_1516['SCH_TYPE'].eq('Missing')
    | filter5_crdc_nces_1516['SCH_TYPE'].eq(1)
]

In [74]:
# Schools remaining after the SCH_TYPE filter.
len(filter5_crdc_nces_1516)

16451

In [101]:
# Persist the fully filtered dataset.
filter5_crdc_nces_1516.to_csv(
    path_or_buf = '../filtered_data/04_inital_filter.csv',
)

# EXPLORATION

In [85]:
"""How many Values missing from final set?"""
# Per-column tally of cells that hold the 'Missing' placeholder.
with pd.option_context('display.max_columns', 150):
    display((filter5_crdc_nces_1516 == 'Missing').sum())

LEA_STATE                0
LEA_STATE_NAME           0
LEAID                    0
LEA_NAME                 0
SCHID                    0
SCH_NAME                 0
JJ                       0
SCH_GRADE_PS             0
SCH_GRADE_KG             0
SCH_GRADE_G01            0
SCH_GRADE_G02            0
SCH_GRADE_G03            0
SCH_GRADE_G04            0
SCH_GRADE_G05            0
SCH_GRADE_G06            0
SCH_GRADE_G07            0
SCH_GRADE_G08            0
SCH_GRADE_G09            0
SCH_GRADE_G10            0
SCH_GRADE_G11            0
SCH_GRADE_G12            0
SCH_GRADE_UG             0
SCH_UGDETAIL_HS          0
SCH_STATUS_SPED          0
SCH_STATUS_MAGNET        0
SCH_STATUS_CHARTER       0
SCH_STATUS_ALT           0
SCH_ENR_HI_M             0
SCH_ENR_HI_F             0
SCH_ENR_AM_M             0
                      ... 
SCH_IBENR_AM_F           0
SCH_IBENR_AS_M           0
SCH_IBENR_AS_F           0
SCH_IBENR_HP_M           0
SCH_IBENR_HP_F           0
SCH_IBENR_BL_M           0
S

In [97]:
# For schools with no NCES LEVEL value, count schools per district, largest first.
with pd.option_context('display.max_rows', 1200):
    display(
        filter5_crdc_nces_1516[filter5_crdc_nces_1516['LEVEL'] == 'Missing']
        .groupby('LEA_NAME')['LEAID']
        .count()
        .sort_values(ascending = False)
    )

LEA_NAME
NEW YORK CITY PUBLIC SCHOOLS                                                               615
Green Dot Public Schools                                                                    11
NORMAN                                                                                       9
Peters Township SD                                                                           6
Dept. of Svs. for Children Youth & Their Families                                            5
Boston                                                                                       4
Ombudsman Educational Services Ltd. a subsidiary of Educ 2                                   4
OFFICE OF EDUCATION DEPARTMENT OF CHILDREN AND FAMILIES                                      4
Cherokee County                                                                              3
NASSAU BOCES                                                                                 3
WINDSOR SCHOOL DISTRICT                  

In [99]:
# Per-district count of schools with no NCES LEVEL value, largest first.
filter5_missing_leas = (
    filter5_crdc_nces_1516[filter5_crdc_nces_1516['LEVEL'] == 'Missing']
    .groupby('LEA_NAME')['LEAID']
    .count()
    .sort_values(ascending = False)
)

In [100]:
# Persist the per-district counts of schools lacking NCES data.
filter5_missing_leas.to_csv(
    path_or_buf = '../filtered_data/04_inital_filter_missing_LEAs.csv',
)

In [102]:
# School-level rows of the final set that have no NCES LEVEL value.
filter5_missing_schools = filter5_crdc_nces_1516[filter5_crdc_nces_1516['LEVEL'].eq('Missing')]

In [104]:
# Persist the schools lacking NCES data.
filter5_missing_schools.to_csv(
    path_or_buf = '../filtered_data/04_intital_filter_missing_schools.csv',
)