In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from my_functions import combokey_converter

%matplotlib inline
sns.set_style('whitegrid')
plt.rc('axes', titlesize = 14, titleweight = 'bold', labelweight = 'bold')

In [2]:
raw = pd.read_csv('../filtered_data/00_crdc_1516_initial.csv', 
                        dtype = {'LEAID':np.object})

In [3]:
filtered = pd.read_csv('../filtered_data/04_filter_final.csv', dtype = {'LEAID':np.object})

In [4]:
raw['COMBOKEY'] = combokey_converter.convert(raw, 'LEAID', 'SCHID')

In [5]:
filtered_combo = filtered[['COMBOKEY','LAT1516']]

In [6]:
filtered_raw_joined = pd.merge(raw, filtered_combo, on = 'COMBOKEY', how = 'left')
filtered_from_raw = filtered_raw_joined.copy()
filtered_from_raw = filtered_from_raw[filtered_from_raw.LAT1516.isnull()]

In [7]:
len(filtered_from_raw)

77693

In [8]:
filtered_from_raw.columns.values

array(['LEA_STATE', 'LEA_STATE_NAME', 'LEAID', 'LEA_NAME', 'SCHID',
       'SCH_NAME', 'COMBOKEY', 'JJ', 'SCH_GRADE_PS', 'SCH_GRADE_KG',
       'SCH_GRADE_G01', 'SCH_GRADE_G02', 'SCH_GRADE_G03', 'SCH_GRADE_G04',
       'SCH_GRADE_G05', 'SCH_GRADE_G06', 'SCH_GRADE_G07', 'SCH_GRADE_G08',
       'SCH_GRADE_G09', 'SCH_GRADE_G10', 'SCH_GRADE_G11', 'SCH_GRADE_G12',
       'SCH_GRADE_UG', 'SCH_UGDETAIL_HS', 'SCH_STATUS_SPED',
       'SCH_STATUS_MAGNET', 'SCH_STATUS_CHARTER', 'SCH_STATUS_ALT',
       'SCH_ENR_HI_M', 'SCH_ENR_HI_F', 'SCH_ENR_AM_M', 'SCH_ENR_AM_F',
       'SCH_ENR_AS_M', 'SCH_ENR_AS_F', 'SCH_ENR_HP_M', 'SCH_ENR_HP_F',
       'SCH_ENR_BL_M', 'SCH_ENR_BL_F', 'SCH_ENR_WH_M', 'SCH_ENR_WH_F',
       'SCH_ENR_TR_M', 'SCH_ENR_TR_F', 'TOT_ENR_M', 'TOT_ENR_F',
       'SCH_ENR_LEP_M', 'SCH_ENR_LEP_F', 'SCH_ENR_IDEA_M',
       'SCH_ENR_IDEA_F', 'SCH_DUAL_IND', 'SCH_DUALENR_HI_M',
       'SCH_DUALENR_HI_F', 'SCH_DUALENR_AM_M', 'SCH_DUALENR_AM_F',
       'SCH_DUALENR_AS_M', 'SCH_DUALENR_AS_F

# Analysis

In [9]:
def missing_value_mapper(value):
    """Converts any negative number into 0, as these negative numbers represent missing/null values"""
    if isinstance(value, int):
        if value < 0:
            return 0
    return value

filtered_from_raw = filtered_from_raw.applymap(missing_value_mapper)

In [10]:
filtered_from_raw['de_total_enrollment'] = filtered_from_raw['TOT_DUALENR_M'] + filtered_from_raw['TOT_DUALENR_F']

In [11]:
"""How many filtered schools?"""
len(filtered_from_raw)

77693

### DE

In [12]:
schools_with_de_students = filtered_from_raw.copy()
schools_with_de_students = schools_with_de_students[schools_with_de_students.de_total_enrollment > 0]

In [13]:
"""How many filtered schools have DE students?"""
len(schools_with_de_students)

959

** Fiiltered-Schools with DE students **

In [14]:
first_columns = ['SCH_NAME', 'de_total_enrollment']
reorder = first_columns + [c for c in schools_with_de_students.columns if c not in first_columns]
schools_with_de_students = schools_with_de_students[reorder].sort_values('de_total_enrollment', ascending = False)
schools_with_de_students

Unnamed: 0,SCH_NAME,de_total_enrollment,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,COMBOKEY,JJ,SCH_GRADE_PS,...,SCH_IBENR_WH_F,SCH_IBENR_TR_M,SCH_IBENR_TR_F,TOT_IBENR_M,TOT_IBENR_F,SCH_IBENR_LEP_M,SCH_IBENR_LEP_F,SCH_IBENR_IDEA_M,SCH_IBENR_IDEA_F,LAT1516
7124,John F. Kennedy High,540,CA,CALIFORNIA,609850,Corona-Norco Unified,11712,='060985011712',No,No,...,0,0,0,0,0,0,0,0,0,
31796,Lowell Senior High School,520,IN,INDIANA,1811460,Tri-Creek School Corporation,1852,='181146001852',No,No,...,0,0,0,0,0,0,0,0,0,
31956,Ben Davis Ninth Grade Center,496,IN,INDIANA,1812810,M S D Wayne Township,2122,='181281002122',No,No,...,0,0,0,0,0,0,0,0,0,
95215,Oak Creek High,443,WI,WISCONSIN,5510830,Oak Creek-Franklin Joint School District,1411,='551083001411',No,No,...,0,0,0,0,0,0,0,0,0,
91731,Tesla STEM High School,395,WA,WASHINGTON,5304230,Lake Washington School District,3432,='530423003432',No,No,...,0,0,0,0,0,0,0,0,0,
3023,Lake Havasu High School,376,AZ,ARIZONA,404280,Lake Havasu Unified School District #1,433,='040428000433',No,No,...,0,0,0,0,0,0,0,0,0,
56749,Arrowhead Park Early College High School,350,NM,NEW MEXICO,3501500,LAS CRUCES PUBLIC SCHOOLS,99999,='350150099999',No,No,...,0,0,0,0,0,0,0,0,0,
88812,MAGGIE L. WALKER GOV. SCH.,328,VA,VIRGINIA,5100061,MAGGIE L. WALKER GOV SCH,2561,='510006102561',No,No,...,0,0,0,0,0,0,0,0,0,
9809,Sherman Oaks Center for Enriched Studies,321,CA,CALIFORNIA,622710,Los Angeles Unified,9151,='062271009151',No,No,...,0,0,0,0,0,0,0,0,0,
92249,Stahl Junior High,298,WA,WASHINGTON,5306960,Puyallup School District,1178,='530696001178',No,No,...,0,0,0,0,0,0,0,0,0,


**Total DE Students**

In [42]:
"""Total DE Students"""
total_filtered_out_de_students = schools_with_de_students.de_total_enrollment.sum()
print(str(total_filtered_out_de_students), 'Total DE Students Filtered Out')

28384 Total DE Students Filtered Out


**Juvenile Justice**

In [16]:
"""Juvenile Justice Schools - How many schools and DE students?"""
jj_de_students = schools_with_de_students[schools_with_de_students['JJ'] == 'Yes'].de_total_enrollment.sum()
print(schools_with_de_students.JJ.value_counts())
print()
print(str(jj_de_students), "DE students.")

No     944
Yes     15
Name: JJ, dtype: int64

388 DE students.


**Alternative**

In [44]:
"""Alternative Schools - How many schools and DE students?"""
alt_de_students_crdc = schools_with_de_students[schools_with_de_students.SCH_STATUS_ALT == 'Yes'].de_total_enrollment.sum()
print(schools_with_de_students.SCH_STATUS_ALT.value_counts())
print()
print(str(alt_de_students_crdc), "DE students.")

No     606
Yes    353
Name: SCH_STATUS_ALT, dtype: int64

7555 DE students.


**Special Education**

In [46]:
"""Special Education - How many schools and DE students?"""
sped_de_students_crdc = schools_with_de_students[schools_with_de_students.SCH_STATUS_SPED == 'Yes'].de_total_enrollment.sum()
print(schools_with_de_students.SCH_STATUS_SPED.value_counts())
print()
print(str(sped_de_students_crdc), "DE students.")

No     876
Yes     83
Name: SCH_STATUS_SPED, dtype: int64

2903 DE students.


**Grade Ranges**

In [19]:
def school_grade_range(df):
    """Calculates the number of schools in a given DataFrame (df) of a particular distribution of grades:
        Example:  If a school (in df) offers grades 9,10,11,12 exclusively, they are placed in the '9-12' 
                    category. """
    df_grade_range = []
    for index, row in df.iterrows():
        grade_range = []

        if row.SCH_GRADE_G12 == 'Yes':
            grade_range.append(12)
        if row.SCH_GRADE_G11 == 'Yes':
            grade_range.append(11)
        if row.SCH_GRADE_G10 == 'Yes':
            grade_range.append(10)
        if row.SCH_GRADE_G09 == 'Yes':
            grade_range.append(9)
        if row.SCH_GRADE_G08 == 'Yes':
            grade_range.append(8)
        if row.SCH_GRADE_G07 == 'Yes':
            grade_range.append(7)
        if row.SCH_GRADE_G06 == 'Yes':
            grade_range.append(6)
        if row.SCH_GRADE_G05 == 'Yes':
            grade_range.append(5)
        if row.SCH_GRADE_G04 == 'Yes':
            grade_range.append(4)
        if row.SCH_GRADE_G03 == 'Yes':
            grade_range.append(3)
        if row.SCH_GRADE_G02 == 'Yes':
            grade_range.append(2)
        if row.SCH_GRADE_G01 == 'Yes':
            grade_range.append(1)
        if row.SCH_GRADE_KG == 'Yes':
            grade_range.append('kg')
        if row.SCH_GRADE_PS == 'Yes':
            grade_range.append('pk')
        
        if grade_range == [12,11,10,9,8,7,6,5,4,3,2,1,'kg','pk']:
            df_grade_range.append('pk-12')
        elif grade_range == [12,11,10,9,8,7,6,5,4,3,2,1,'kg']:
            df_grade_range.append('kg-12')
        elif grade_range == [12,11,10,9,8,7,6,5]:
            df_grade_range.append('05-12')
        elif grade_range == [12,11,10,9,8,7,6]:
            df_grade_range.append('06-12')
        elif grade_range == [12,11,10,9,8,7]:
            df_grade_range.append('07-12')
        elif grade_range == [12,11,10,9,8]:
            df_grade_range.append('08-12')
        elif grade_range == [12,11,10,9]:
            df_grade_range.append('09-12')
        elif grade_range == [11,10,9]:
            df_grade_range.append('09-11')
        elif grade_range == [10,9]:
            df_grade_range.append('09-10')
        elif grade_range == [12,11,10]:
            df_grade_range.append('10-12')
        elif grade_range == [12,11]:
            df_grade_range.append('11-12')
        elif grade_range == [9]:
            df_grade_range.append('9-only')
        elif grade_range == [10]:
            df_grade_range.append('10-only')
        elif grade_range == [11]:
            df_grade_range.append('11-only')
        elif grade_range == [12]:
            df_grade_range.append('12-only')
        else:
            df_grade_range.append('other')
    return pd.DataFrame(df_grade_range, columns=['grade_range'])

In [20]:
de_stud_school_grade_range_list = school_grade_range(schools_with_de_students).join(schools_with_de_students['de_total_enrollment'].reset_index(drop = True), how = 'outer')

de_stud_school_grade_range_dist = de_stud_school_grade_range_list['grade_range'].value_counts().reset_index().sort_values('index').set_index('index')
de_stud_school_grade_range_dist['pct_of_schools'] = round(de_stud_school_grade_range_dist['grade_range'] / len(de_stud_school_grade_range_list), 3)

de_stud_school_grade_range_enrollments = pd.DataFrame(de_stud_school_grade_range_list.groupby('grade_range')['de_total_enrollment'].sum())

de_stud_school_grade_range_dist.join(de_stud_school_grade_range_enrollments).rename({'grade_range': '# schools'}, axis = 1)

Unnamed: 0_level_0,# schools,pct_of_schools,de_total_enrollment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
05-12,7,0.007,262
06-12,83,0.087,1140
07-12,60,0.063,1590
08-12,25,0.026,1062
09-10,19,0.02,1419
09-11,1,0.001,44
09-12,393,0.41,13322
10-12,52,0.054,1249
11-12,22,0.023,869
12-only,7,0.007,43


## Get NCES information and join with schools_with_de_students

In [21]:
schools_with_de_students = schools_with_de_students.drop(['LAT1516'], axis = 1)

In [22]:
nces_1516_full = pd.read_csv('../filtered_data/01_nces_1516_initial_combined_ccd.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [23]:
schools_with_de_students_nces = pd.merge(schools_with_de_students, nces_1516_full, left_on = 'COMBOKEY', right_on = 'combokey', how = 'left', suffixes = ('', '_nces'))

In [24]:
non_matching_schools_with_de = schools_with_de_students_nces[schools_with_de_students_nces.LEVEL.isnull()]
match_schools_with_de = schools_with_de_students_nces[schools_with_de_students_nces.LEVEL.notnull()]

**Recover some of the non-nces-matching schools**

In [25]:
"""Passing in my recovered_schools from 00_initial_filter"""
%store -r recovered_schools_all

In [26]:
recovered_schools_all = recovered_schools_all.reset_index()
nces_columns = ['COMBOKEY', 'SCH_TYPE', 'LEVEL', 'VIRTUAL', 'GSLO', 'GSHI',
       'NMCNTY15', 'LOCALE15', 'LAT1516', 'LON1516']
recovered_schools_all_nces = recovered_schools_all[nces_columns]

In [27]:
non_matching_schools_with_de = non_matching_schools_with_de.drop(['SCH_TYPE', 'LEVEL', 'VIRTUAL', 'GSLO', 'GSHI',
       'NMCNTY15', 'LOCALE15', 'LAT1516', 'LON1516'],axis = 1)

In [28]:
recovered_non_matchings = pd.merge(non_matching_schools_with_de, recovered_schools_all_nces, on='COMBOKEY')

In [29]:
print(recovered_non_matchings.de_total_enrollment.sum(), "DE students Recovered.")

121 DE students Recovered.


In [30]:
match_schools_with_de = match_schools_with_de.append(recovered_non_matchings)

In [31]:
de_students_in_NCES_matching_schools =  match_schools_with_de.de_total_enrollment.sum()
de_students_in_NCES_non_matching_schools = non_matching_schools_with_de.de_total_enrollment.sum() -\
                                           recovered_non_matchings.de_total_enrollment.sum()
print(de_students_in_NCES_matching_schools)
print(de_students_in_NCES_non_matching_schools)

25931
2453


In [32]:
print(format(match_schools_with_de.de_total_enrollment.sum(), ",d"), "out of", format(schools_with_de_students.de_total_enrollment.sum(),',d'), 
     "DE Students Accounted for.")

25,931 out of 28,384 DE Students Accounted for.


**Filtered Schools that match with NCES**

**LEVEL**<br>
1 = Primary (low grade = PK through 03; high grade = PK through 08), 2 = Middle (low grade = 04 through 07; high grade = 04 through 09), 3 = High (low grade = 07 through 12; high grade = 12 only), 4 = Other (any other configuration not falling within the above three categories;including ungraded), N = Not applicable


In [33]:
"""How many schools and de students in each LEVEL of school"""
print(match_schools_with_de.LEVEL.value_counts().reset_index().sort_values('index').set_index('index'))
print()
print(str(match_schools_with_de.groupby('LEVEL').de_total_enrollment.sum()), 'DE students.')

       LEVEL
index       
1         22
2         22
3        466
4        338
N         25

LEVEL
1      291
2     1601
3    15214
4     7796
N     1029
Name: de_total_enrollment, dtype: int64 DE students.


In [34]:
"""Looking just at the level_4 schools, where are most of the students?  Notice, the grade_range (index) comes
    from CRDC grades -- Several inconsistencies"""
"""Basically these are the schools there are filtered out due to being LEVEL=4, regardless of their CRDC grade range"""
level_4_schools = match_schools_with_de[match_schools_with_de.LEVEL == '4'].reset_index(drop=True)

de_stud_school_grade_range_list = school_grade_range(level_4_schools).join(level_4_schools['de_total_enrollment'].reset_index(drop = True), how = 'outer')

de_stud_school_grade_range_dist = de_stud_school_grade_range_list['grade_range'].value_counts().reset_index().sort_values('index').set_index('index')
de_stud_school_grade_range_dist['pct_of_schools'] = round(de_stud_school_grade_range_dist['grade_range'] / len(de_stud_school_grade_range_list), 3)

de_stud_school_grade_range_enrollments = pd.DataFrame(de_stud_school_grade_range_list.groupby('grade_range')['de_total_enrollment'].sum())

de_stud_school_grade_range_dist.join(de_stud_school_grade_range_enrollments).rename({'grade_range': '# schools'}, axis = 1)

Unnamed: 0_level_0,# schools,pct_of_schools,de_total_enrollment
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
05-12,6,0.018,260
06-12,67,0.198,704
07-12,13,0.038,172
08-12,6,0.018,216
09-10,12,0.036,931
09-11,1,0.003,44
09-12,18,0.053,299
10-12,2,0.006,8
9-only,14,0.041,1076
kg-12,107,0.317,2289


In [35]:
"""Misreported 9-12 schools"""
level_4_school_with_grade_range = pd.concat([level_4_schools, de_stud_school_grade_range_list], axis=1,)
level_4_school_with_grade_range[level_4_school_with_grade_range.grade_range == '09-12']
with pd.option_context('display.max_columns',150):
    display(level_4_school_with_grade_range[level_4_school_with_grade_range.grade_range == 'other'].groupby('GSHI').GSHI.value_counts())

GSHI  GSHI
10    10      10
11    11       1
12    12      58
9     9        8
AE    AE       1
Name: GSHI, dtype: int64

**Virtual Schools**

In [48]:
virtual_de_students = match_schools_with_de[match_schools_with_de['VIRTUAL'] == 'Yes'].de_total_enrollment.sum()
print(match_schools_with_de.VIRTUAL.value_counts())
print()
print(match_schools_with_de.groupby('VIRTUAL').de_total_enrollment.sum())

No         544
Missing    181
Yes        148
Name: VIRTUAL, dtype: int64

VIRTUAL
Missing     7744
No         15386
Yes         2801
Name: de_total_enrollment, dtype: int64


**School Type**<br>
1 = Regular school, 2 = Special education school, 3 = Vocational school, 4 = Other/alternative school


In [51]:
sped_de_students_nces = match_schools_with_de[match_schools_with_de['SCH_TYPE'] == 2].de_total_enrollment.sum()
alt_de_students_nces = match_schools_with_de[match_schools_with_de['SCH_TYPE'] == 4].de_total_enrollment.sum()
print(match_schools_with_de.SCH_TYPE.value_counts())
print()
print(match_schools_with_de.groupby('SCH_TYPE').de_total_enrollment.sum())

4.0    464
1.0    350
2.0     49
3.0     10
Name: SCH_TYPE, dtype: int64

SCH_TYPE
1.0    13040
2.0     1167
3.0      498
4.0    11226
Name: de_total_enrollment, dtype: int64


**Students in Schools without offering 11th or 12th grade**

In [53]:
from my_functions.extra_functions import students_in_11_or_12
match_schools_with_de['Students_in_11_12'] = match_schools_with_de.apply(lambda row: students_in_11_or_12(row['SCH_GRADE_G11'], row['SCH_GRADE_G12']), axis = 1)

In [56]:
de_students_schools_without_11_12 = match_schools_with_de[match_schools_with_de.Students_in_11_12 == 'No']\
                                        .de_total_enrollment.sum()
match_schools_with_de.Students_in_11_12.value_counts()
print(str(de_students_schools_without_11_12), 'de students in schools that do not offer grades 11 or 12.')

4553 de students in schools that do not offer grades 11 or 12.


**Filtered schools with the most DE-students**

In [38]:
with pd.option_context('display.max_rows', 125):
    display(match_schools_with_de.sort_values('de_total_enrollment', ascending=False).head(125))

Unnamed: 0,SCH_NAME,de_total_enrollment,LEA_STATE,LEA_STATE_NAME,LEAID,LEA_NAME,SCHID,COMBOKEY,JJ,SCH_GRADE_PS,...,SCH_TYPE_TEXT,SCH_TYPE,LEVEL,VIRTUAL,GSLO,GSHI,NMCNTY15,LOCALE15,LAT1516,LON1516
0,John F. Kennedy High,540,CA,CALIFORNIA,609850,Corona-Norco Unified,11712,='060985011712',No,No,...,Alternative Education School,4,3,No,10,12,Riverside County,21,33.9173,-117.564
1,Lowell Senior High School,520,IN,INDIANA,1811460,Tri-Creek School Corporation,1852,='181146001852',No,No,...,Regular School,1,3,No,8,12,Lake County,31,41.2914,-87.3932
2,Ben Davis Ninth Grade Center,496,IN,INDIANA,1812810,M S D Wayne Township,2122,='181281002122',No,No,...,Regular School,1,4,No,9,9,Marion County,11,39.783,-86.2917
3,Oak Creek High,443,WI,WISCONSIN,5510830,Oak Creek-Franklin Joint School District,1411,='551083001411',No,No,...,Regular School,1,3,No,9,12,Milwaukee County,21,42.8867,-87.9085
4,Tesla STEM High School,395,WA,WASHINGTON,5304230,Lake Washington School District,3432,='530423003432',No,No,...,Alternative Education School,4,3,Missing,9,12,King County,21,47.6487,-122.038
5,Lake Havasu High School,376,AZ,ARIZONA,404280,Lake Havasu Unified School District #1,433,='040428000433',No,No,...,Regular School,1,3,No,9,12,Mohave County,13,34.4944,-114.318
7,MAGGIE L. WALKER GOV. SCH.,328,VA,VIRGINIA,5100061,MAGGIE L. WALKER GOV SCH,2561,='510006102561',No,No,...,Regular School,1,N,No,N,N,Richmond city,12,37.5579,-77.4536
8,Sherman Oaks Center for Enriched Studies,321,CA,CALIFORNIA,622710,Los Angeles Unified,9151,='062271009151',No,No,...,Alternative Education School,4,4,No,4,12,Los Angeles County,11,34.1849,-118.538
9,Stahl Junior High,298,WA,WASHINGTON,5306960,Puyallup School District,1178,='530696001178',No,No,...,Regular School,1,2,Missing,5,9,Pierce County,21,47.1022,-122.3
10,MERCEDES EARLY COLLEGE ACADEMY,293,TX,TEXAS,4830250,MERCEDES ISD,12441,='483025012441',No,No,...,Alternative Education School,4,3,Missing,9,12,Hidalgo County,21,26.1436,-97.9121


In [39]:
# match_schools_with_de.to_csv('../filtered_data/06_filtered_schools_with_de_students.csv')

In [60]:
pd.DataFrame({'DE Students': [total_filtered_out_de_students, de_students_in_NCES_matching_schools, de_students_in_NCES_non_matching_schools,
                              jj_de_students, alt_de_students_crdc, sped_de_students_crdc, de_students_schools_without_11_12,
                              virtual_de_students, sped_de_students_nces, alt_de_students_nces,]},
            index = ['Total Filtered Out DE Students', 'NCES-Matching', 'NCES-Non-Matching', 'Juvenile Justice (CRDC)', 'Alternative Education (CRDC)',
                     'Special Education (CRDC)', 'Schools w/o 11th/12th grade (CRDC)', 'Virtual (NCES)', 
                     'Special Education (NCES)', 'Alternative/Other (NCES)',])

Unnamed: 0,DE Students
Total Filtered Out DE Students,28384
NCES-Matching,25931
NCES-Non-Matching,2453
Juvenile Justice (CRDC),388
Alternative Education (CRDC),7555
Special Education (CRDC),2903
Schools w/o 11th/12th grade (CRDC),4553
Virtual (NCES),2801
Special Education (NCES),1167
Alternative/Other (NCES),11226
