### Download and Store Data Files

In [1]:
import os
import ssl
import sys
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline
pd.set_option('display.max_columns', None)

#### Downloading the Files

In [2]:
gradurls = {'https://infohub.nyced.org/docs/default-source/default-document-library/2020-graduation_rates_public_school.xlsx': 'grad_rates.xlsx',
            'https://infohub.nyced.org/docs/default-source/default-document-library/2020-graduation_rates_public_charters.xlsx': 'charter_grad_rates.xlsx'}
sqrurls = {'https://infohub.nyced.org/docs/default-source/doe-employees-only/202021-hs-sqr-results.xlsx': 'sqr2021.xlsx',
           'https://infohub.nyced.org/docs/default-source/default-document-library/201920_hs_sqr_results.xlsx': 'sqr2020.xlsx',
           'https://infohub.nyced.org/docs/default-source/default-document-library/201819_hs_sqr_results.xlsx': 'sqr2019.xlsx',
           'https://infohub.nyced.org/docs/default-source/default-document-library/201718_hs_sqr_results.xlsx': 'sqr2018.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2016-17_hs_sqr.xlsx': 'sqr2017.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2015_2016_hs_sqr_results_2017_01_05.xlsx': 'sqr2016.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2014_2015_hs_sqr_results_2016_04_08.xlsx': 'sqr2015.xlsx'}
demourls = {'https://infohub.nyced.org/docs/default-source/default-document-library/demographic-snapshot-2015-16-to-2019-20-(public).xlsx': 'demo.xlsx',
            'https://infohub.nyced.org/docs/default-source/default-document-library/demographic-snapshot-2016-17-to-2020-21---public.xlsx' : 'demo2.xlsx'}
urls = [gradurls, sqrurls, demourls]

In [3]:
#import ssl
#ssl._create_default_https_context = ssl._create_unverified_context
#for url in urls:
#    try:
#        [urllib.request.urlretrieve(x, 'data/'+url[x]) for x in url]
#    except:
#        print("Trying the next url")

#### Graduation Rate Files

In [4]:
gradrates = pd.read_excel('data/grad_rates.xlsx', sheet_name = "All")
charterrates = pd.read_excel('data/charter_grad_rates.xlsx', sheet_name = "All")

gradrates_eth = pd.read_excel('data/grad_rates.xlsx', sheet_name = "Ethnicity")

In [5]:
gradrates[gradrates['Cohort']=='4 year June'].groupby(['Cohort Year']).count()['DBN']

Cohort Year
2001    262
2002    318
2003    346
2004    368
2005    389
2006    406
2007    419
2008    436
2009    449
2010    473
2011    467
2012    474
2013    477
2014    479
2015    480
2016    475
Name: DBN, dtype: int64

In [6]:
charterrates[charterrates['Cohort']=='4 year June'].groupby(['Cohort Year']).count()['DBN']

Cohort Year
2007     9
2008    18
2009    29
2010    35
2011    38
2012    47
2013    50
2014    57
2015    61
2016    65
Name: DBN, dtype: int64

In [7]:
all_grad_rates = pd.concat([gradrates[['DBN', 'School Name', 'Cohort Year', 'Cohort', '# Total Cohort', '# Grads', '% Grads']],
                            charterrates[['DBN', 'School Name', 'Cohort Year', 'Cohort', '# Total Cohort', '# Grads', '% Grads']]], 
                           ignore_index=True)
all_grad_rates[['# Total Cohort', '# Grads', '% Grads']] = all_grad_rates[['# Total Cohort', '# Grads', '% Grads']].apply(pd.to_numeric, errors='coerce')

In [8]:
all_grad_rates.rename(columns={'DBN':'dbn',
                               'School Name':'schoolName',
                               'Cohort Year':'cohortYear', 
                               'Cohort':'cohort', 
                               '# Total Cohort':'numTotalCohort',
                               '# Grads':'numGrads', '% Grads':'pctGrad', '# Still Enrolled':'stillEnrolledNum'}, inplace=True)

In [9]:
all_grad_rates['pctGrad'] = all_grad_rates['pctGrad']/100 

In [10]:
all_grad_rates.to_csv('data/clean/all_grad_rates.csv',index=False)

In [11]:
all_grad_rates.head()

Unnamed: 0,dbn,schoolName,cohortYear,cohort,numTotalCohort,numGrads,pctGrad
0,01M292,ORCHARD COLLEGIATE ACADEMY,2016,4 year August,31,29.0,0.935484
1,01M292,ORCHARD COLLEGIATE ACADEMY,2015,4 year August,28,25.0,0.892857
2,01M292,ORCHARD COLLEGIATE ACADEMY,2014,4 year August,29,28.0,0.965517
3,01M292,ORCHARD COLLEGIATE ACADEMY,2013,4 year August,36,25.0,0.694444
4,01M292,ORCHARD COLLEGIATE ACADEMY,2012,4 year August,44,24.0,0.545455


In [12]:
all_grad_rates[(all_grad_rates['dbn']=='12X267') & (all_grad_rates['cohort']=='4 year August')]

Unnamed: 0,dbn,schoolName,cohortYear,cohort,numTotalCohort,numGrads,pctGrad
14167,12X267,BRONX LATIN,2016,4 year August,83,78.0,0.939759
14168,12X267,BRONX LATIN,2015,4 year August,91,82.0,0.901099
14169,12X267,BRONX LATIN,2014,4 year August,81,76.0,0.938272
14170,12X267,BRONX LATIN,2013,4 year August,71,68.0,0.957747
14171,12X267,BRONX LATIN,2012,4 year August,62,56.0,0.903226
14172,12X267,BRONX LATIN,2011,4 year August,58,49.0,0.844828
14173,12X267,BRONX LATIN,2010,4 year August,56,50.0,0.892857
14174,12X267,BRONX LATIN,2009,4 year August,40,29.0,0.725
14175,12X267,BRONX LATIN,2008,4 year August,50,38.0,0.76
14176,12X267,BRONX LATIN,2007,4 year August,40,31.0,0.775


#### SQR Files

In [13]:
tabs = ['Summary', 'Student Achievement', 'Closing the Achievement Gap', 'Framework', 'Additional Info']
sqr_files = ['sqr2021.xlsx', 'sqr2020.xlsx', 'sqr2019.xlsx', 'sqr2018.xlsx', 'sqr2017.xlsx', 'sqr2016.xlsx', 'sqr2015.xlsx']

In [14]:
summary = {}
stud_achieve = {}
closing_gap = {}
framework = {}
add_info = {}

for f in sqr_files:
    summary[f] = pd.read_excel('data/'+f, sheet_name='Summary')
    stud_achieve[f] = pd.read_excel('data/'+f, sheet_name='Student Achievement')
    closing_gap[f] = pd.read_excel('data/'+f, sheet_name='Closing the Achievement Gap')
    framework[f] = pd.read_excel('data/'+f, sheet_name='Framework')
    add_info[f] = pd.read_excel('data/'+f, sheet_name='Additional Info')

In [15]:
summary['sqr2021.xlsx'].head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26,Unnamed: 27,Unnamed: 28,Unnamed: 29,Unnamed: 30,Unnamed: 31,Unnamed: 32,Unnamed: 33,Unnamed: 34,Unnamed: 35,Unnamed: 36,Unnamed: 37,Unnamed: 38,Unnamed: 39,Unnamed: 40,Unnamed: 41,Unnamed: 42,Unnamed: 43,Unnamed: 44,Unnamed: 45,Unnamed: 46,Unnamed: 47,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57,Unnamed: 58,Unnamed: 59,Unnamed: 60,Unnamed: 61,Unnamed: 62,Unnamed: 63,Unnamed: 64,Unnamed: 65,Unnamed: 66,Unnamed: 67,Unnamed: 68,Unnamed: 69,Unnamed: 70,Unnamed: 71,Unnamed: 72,Unnamed: 73,Unnamed: 74,Unnamed: 75,Unnamed: 76,Unnamed: 77,Unnamed: 78,Unnamed: 79,Unnamed: 80,Unnamed: 81,Unnamed: 82
0,,Summary,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,"Due to COVID-19, Framework ratings and scores ...",,DBN,School Name,School Type,Enrollment,Rigorous Instruction - Percent Positive,Collaborative Teachers - Percent Positive,Supportive Environment - Percent Positive,Effective School Leadership - Percent Positive,Strong Family-Community Ties - Percent Positive,Trust - Percent Positive,Quality Review - How interesting and challengi...,Quality Review - How effective is the teaching...,Quality Review - How well does the school asse...,Quality Review - How clearly are high expectat...,Quality Review - How well do teachers work wit...,Quality Review - How safe and inclusive is the...,Quality Review - How well does the school allo...,Quality Review - How well does the school iden...,Quality Review - How thoughtful is the school'...,Quality Review - How well are school decisions...,Quality Review - Dates of Review,Percent Female,Percent Male,Percent English Language Learners,Percent Students with Disabilities,Percentage of students recommended for general...,Percentage of students recommended for integra...,Percentage of students recommended for special...,Economic Need Index,Percent Overage / Undercredited,Percent in Temp Housing,Percent HRA Eligible,Student Percent - Asian,Student Percent - Black,Student Percent - Hispanic,Student Percent - Native American,Student Percent - Native Hawaiian or Pacific I...,Student Percent - White,Teacher Percent - Asian,Teacher Percent - Black,Teacher Percent - Hispanic,Teacher Percent - Native American,Teacher Percent - Native Hawaiian or Pacific I...,Teacher Percent - White,Nearby Student Distance (mi),Nearby Student Percent - Asian,Nearby Student Percent - Black,Nearby Student Percent - Hispanic,Nearby Student Percent - Native American,Nearby Student Percent - Native Hawaiian or Pa...,Nearby Student Percent - White,Borough Percent - Asian,Borough Percent - Black,Borough Percent - Hispanic,Borough Percent - Native American,Borough Percent - Native Hawaiian or Pacific I...,Borough Percent - White,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in Advanced Co...,Percentage of Students Enrolled in AP or IB Co...,Percentage of Students Enrolled in AP or IB Co...,Percentage of Students Enrolled in AP or IB Co...,Percentage of Students Enrolled in AP or IB Co...,Percentage of Students Enrolled in AP or IB Co...,Percentage of Students Enrolled in AP or IB Co...,Years of principal experience at this school,Percent of teachers with 3 or more years of ex...,Average Student Attendance,Percent of Students Chronically Absent,Teacher Attendance Rate,Percentage of students with IEPs receiving all...,Percentage of students with IEPs receiving som...,Percentage of students with IEPs receiving no ...,Percentage of students with IEPs receiving all...,Percentage of students with IEPs receiving som...,Percentage of students with IEPs receiving no ...
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,01M292,Orchard Collegiate Academy,High School,296,0.76,0.84,0.83,0.89,0.91,0.91,Proficient,Proficient,Proficient,Proficient,Proficient,Well Developed,Proficient,Proficient,Well Developed,Proficient,April 2017,0.483,0.517,0.074,0.267,0,0.213,0.003,0.822,0.037,0.142,0.726,0.091,0.253,0.588,0.02,0.01,0.014,0.238,N<5,N<5,0,0,0.429,1.1,0.371,0.163,0.325,0.005,0.006,0.102,0.133,0.225,0.46,0.011,0.004,0.136,0.155,0.155,0.638,0.052,0,0,0.139,0.194,0.611,0.056,0,0,6.1,0.714,0.807,0.566,0.982,0.79,,,0.38,0,0.62


In [16]:
summary['sqr2021.xlsx'].columns = summary['sqr2021.xlsx'].iloc[2]
summary['sqr2020.xlsx'].columns = summary['sqr2020.xlsx'].iloc[2]
summary['sqr2019.xlsx'].columns = summary['sqr2019.xlsx'].iloc[2]
summary['sqr2018.xlsx'].columns = summary['sqr2018.xlsx'].iloc[0]
summary['sqr2017.xlsx'].columns = summary['sqr2017.xlsx'].iloc[0]
summary['sqr2016.xlsx'].columns = summary['sqr2016.xlsx'].iloc[0]
summary['sqr2015.xlsx'].columns = summary['sqr2015.xlsx'].iloc[0]

In [17]:
summary_cols = ['DBN','Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']
summary['sqr2019.xlsx'] = summary['sqr2019.xlsx'][summary_cols]
summary['sqr2018.xlsx'] = summary['sqr2018.xlsx'][summary_cols]
summary['sqr2017.xlsx'] = summary['sqr2017.xlsx'][summary_cols]
summary['sqr2016.xlsx'] = summary['sqr2016.xlsx'][summary_cols]
summary['sqr2015.xlsx'] = summary['sqr2015.xlsx'][summary_cols]

In [18]:
summary['sqr2021.xlsx'] = summary['sqr2021.xlsx'][['DBN','Enrollment',
                         'Student Percent - Asian','Student Percent - Black','Student Percent - Hispanic',
                         'Student Percent - White', 'Percent English Language Learners','Percent Students with Disabilities',
                         'Economic Need Index',
                         'Average Student Attendance',
                         'Percent of Students Chronically Absent']]

In [19]:
summary['sqr2020.xlsx'] = summary['sqr2020.xlsx'][['DBN','Enrollment',
                         'Student Percent - Asian','Student Percent - Black','Student Percent - Hispanic',
                         'Student Percent - White', 'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                         'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                         'Average Student Attendance (through Feb-2020 only)',
                         'Percent of Students Chronically Absent (through Feb-2020 only)']]

In [20]:
summary['sqr2020.xlsx'].rename(columns={'Student Percent - Asian':'Percent Asian',
                                        'Student Percent - Black':'Percent Black',
                                        'Student Percent - Hispanic': 'Percent Hispanic',
                                        'Student Percent - White': 'Percent White', 
                                        'Average Student Attendance (through Feb-2020 only)' : 'Student Attendance Rate',
                                        'Percent of Students Chronically Absent (through Feb-2020 only)':'Percent of Students Chronically Absent'}, inplace=True)

In [21]:
summary['sqr2021.xlsx']['SQR Year'] = '2021'
summary['sqr2020.xlsx']['SQR Year'] = '2020'
summary['sqr2019.xlsx']['SQR Year'] = '2019'
summary['sqr2018.xlsx']['SQR Year'] = '2018'
summary['sqr2017.xlsx']['SQR Year'] = '2017'
summary['sqr2016.xlsx']['SQR Year'] = '2016'
summary['sqr2015.xlsx']['SQR Year'] = '2015'

In [22]:
summary_df = pd.concat(summary).reset_index().drop(['level_0', 'level_1'], axis = 1)
summary_df = summary_df[summary_df.DBN != 'DBN']

In [23]:
summary_df[['Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']] = summary_df[['Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']].apply(pd.to_numeric, errors='coerce')

In [24]:
summary_df.drop(columns=['Percent Asian', 'Percent Black',
       'Percent Hispanic', 'Percent White',
       'Percent English Language Learners',
       'Percent Students with Disabilities', 'Percent Self-Contained'], inplace=True)

summary_df.rename(columns={"DBN":'dbn', 'Enrollment':'sqrEnrollment',
                       "Average Grade 8 English Proficiency": 'grd8Ela',
                       "Average Grade 8 Math Proficiency": 'grd8Math',
                       "Student Attendance Rate": 'attdRate',
                       "Percent of Students Chronically Absent" : 'pctChronAbs',
                           "Economic Need Index": 'econNeedInd',
                       "SQR Year": 'sqrYear'}, inplace=True)

In [25]:
stud_achieve['sqr2021.xlsx'].columns = stud_achieve['sqr2021.xlsx'].iloc[2]
stud_achieve['sqr2020.xlsx'].columns = stud_achieve['sqr2020.xlsx'].iloc[2]
stud_achieve['sqr2019.xlsx'].columns = stud_achieve['sqr2019.xlsx'].iloc[2]
stud_achieve['sqr2018.xlsx'].columns = stud_achieve['sqr2018.xlsx'].iloc[0]
stud_achieve['sqr2017.xlsx'].columns = stud_achieve['sqr2017.xlsx'].iloc[0]
stud_achieve['sqr2016.xlsx'].columns = stud_achieve['sqr2016.xlsx'].iloc[0]
stud_achieve['sqr2015.xlsx'].columns = stud_achieve['sqr2015.xlsx'].iloc[0]

In [26]:
stud_achieve['sqr2021.xlsx'].head(2)

2,NaN,"Due to COVID-19, Framework ratings and scores are not available for any schools for the 2020-21 school year.",NaN.1,DBN,School Name,School Type,N count - 10+ Credits in 1st Year - All Students,Metric Value - 10+ Credits in 1st Year - All Students,N count - 10+ Credits in 2nd Year - All Students,Metric Value - 10+ Credits in 2nd Year - All Students,N count - 10+ Credits in 2nd Year - School's Lowest Third,Metric Value - 10+ Credits in 2nd Year - School's Lowest Third,N count - 10+ Credits in 3rd Year - All Students,Metric Value - 10+ Credits in 3rd Year - All Students,N count - 10+ Credits in 3rd Year - School's Lowest Third,Metric Value - 10+ Credits in 3rd Year - School's Lowest Third,N count - 4-Year Graduation Rate - All Students,Metric Value - 4-Year Graduation Rate - All Students,N count - 6-Year Graduation Rate - All Students,Metric Value - 6-Year Graduation Rate - All Students,N count - 4-Year High School Persistence Rate,Metric Value - 4-Year High School Persistence Rate,N count - 6-Year High School Persistence Rate,Metric Value - 6-Year High School Persistence Rate,N count - Postsecondary Enrollment Rate - 18 Months,Metric Value - Postsecondary Enrollment Rate - 18 Months,N count - Postsecondary Enrollment Rate - 6 Months,Metric Value - Postsecondary Enrollment Rate - 6 Months,N count - College and Career Preparatory Course Index,Metric Value - College and Career Preparatory Course Index,N count - Percentage of Students with 90%+ Attendance,Metric Value - Percentage of Students with 90%+ Attendance,N count - Movement of Students with Disabilities to Less Restrictive Environments,Metric Value - Movement of Students with Disabilities to Less Restrictive Environments
0,,Student Achievement,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [27]:
sqrcols = ["DBN", "N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
           "N count - 4-Year Graduation Rate - All Students",
           "Metric Value - 4-Year Graduation Rate - All Students"]
stud_achieve['sqr2021.xlsx'] = stud_achieve['sqr2021.xlsx'][sqrcols]

sqrcols = ["DBN", "N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 1st Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
           "N count - 4-Year Graduation Rate - All Students",
           "Metric Value - 4-Year Graduation Rate - All Students"]

stud_achieve['sqr2020.xlsx'] = stud_achieve['sqr2020.xlsx'][sqrcols]
stud_achieve['sqr2019.xlsx'] = stud_achieve['sqr2019.xlsx'][sqrcols]

sqrcols = ["DBN", "N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 1st Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
           "N count - 4-Year Graduation Rate",
           "Metric Value - 4-Year Graduation Rate"]

stud_achieve['sqr2018.xlsx'] = stud_achieve['sqr2018.xlsx'][sqrcols]
stud_achieve['sqr2017.xlsx'] = stud_achieve['sqr2017.xlsx'][sqrcols]

sqrcols = ["DBN", "Metric Value - Percentage Earning 10+ Credits in First Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in First Year",
           "Metric Value - Percentage Earning 10+ Credits in Second Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Second Year",
           "Metric Value - Percentage Earning 10+ Credits in Third Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Third Year",
           "Metric Value - Graduation Rate, 4 year"]
stud_achieve['sqr2015.xlsx'] = stud_achieve['sqr2015.xlsx'][sqrcols]
stud_achieve['sqr2016.xlsx'] = stud_achieve['sqr2016.xlsx'][sqrcols]

In [28]:
sqrcols = {"N count - 4-Year Graduation Rate":"N count - 4-Year Graduation Rate - All Students",
           "Metric Value - 4-Year Graduation Rate":"Metric Value - 4-Year Graduation Rate - All Students"}
stud_achieve['sqr2018.xlsx'].rename(columns = sqrcols, inplace=True)
stud_achieve['sqr2017.xlsx'].rename(columns = sqrcols, inplace=True)

In [29]:
sqrcols = {"Metric Value - Percentage Earning 10+ Credits in First Year":"Metric Value - 10+ Credits in 1st Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in First Year": "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
           "Metric Value - Percentage Earning 10+ Credits in Second Year" : "Metric Value - 10+ Credits in 2nd Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Second Year": "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
           "Metric Value - Percentage Earning 10+ Credits in Third Year": "Metric Value - 10+ Credits in 3rd Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Third Year": "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
          "Metric Value - Graduation Rate, 4 year":"Metric Value - 4-Year Graduation Rate - All Students"}
stud_achieve['sqr2015.xlsx'].rename(columns = sqrcols, inplace=True)
stud_achieve['sqr2016.xlsx'].rename(columns = sqrcols, inplace=True)

In [30]:
stud_achieve['sqr2015.xlsx'].columns

Index(['DBN', 'Metric Value - 10+ Credits in 1st Year - All Students',
       'Metric Value - 10+ Credits in 1st Year - School's Lowest Third',
       'Metric Value - 10+ Credits in 2nd Year - All Students',
       'Metric Value - 10+ Credits in 2nd Year - School's Lowest Third',
       'Metric Value - 10+ Credits in 3rd Year - All Students',
       'Metric Value - 10+ Credits in 3rd Year - School's Lowest Third',
       'Metric Value - 4-Year Graduation Rate - All Students'],
      dtype='object', name=0)

In [31]:
stud_achieve['sqr2021.xlsx']['SQR Year'] = '2021'
stud_achieve['sqr2020.xlsx']['SQR Year'] = '2020'
stud_achieve['sqr2019.xlsx']['SQR Year'] = '2019'
stud_achieve['sqr2018.xlsx']['SQR Year'] = '2018'
stud_achieve['sqr2017.xlsx']['SQR Year'] = '2017'
stud_achieve['sqr2016.xlsx']['SQR Year'] = '2016'
stud_achieve['sqr2015.xlsx']['SQR Year'] = '2015'

In [32]:
stud_achieve_df = pd.concat(stud_achieve).reset_index().drop(['level_0', 'level_1'], axis=1)
stud_achieve_df = stud_achieve_df[stud_achieve_df.DBN != 'DBN']

In [33]:
stud_achieve_df.head(2)

Unnamed: 0,DBN,N count - 10+ Credits in 1st Year - All Students,Metric Value - 10+ Credits in 1st Year - All Students,N count - 10+ Credits in 2nd Year - All Students,Metric Value - 10+ Credits in 2nd Year - All Students,N count - 10+ Credits in 2nd Year - School's Lowest Third,Metric Value - 10+ Credits in 2nd Year - School's Lowest Third,N count - 10+ Credits in 3rd Year - All Students,Metric Value - 10+ Credits in 3rd Year - All Students,N count - 10+ Credits in 3rd Year - School's Lowest Third,Metric Value - 10+ Credits in 3rd Year - School's Lowest Third,N count - 4-Year Graduation Rate - All Students,Metric Value - 4-Year Graduation Rate - All Students,SQR Year,N count - 10+ Credits in 1st Year - School's Lowest Third,Metric Value - 10+ Credits in 1st Year - School's Lowest Third
0,,,,,,,,,,,,,,2021,,
1,,,,,,,,,,,,,,2021,,


In [34]:
stud_achieve_df[["N count - 10+ Credits in 1st Year - All Students", "Metric Value - 10+ Credits in 1st Year - All Students",
                 "N count - 10+ Credits in 1st Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                 "N count - 10+ Credits in 2nd Year - All Students",
                 "Metric Value - 10+ Credits in 2nd Year - All Students",
                 "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                 "N count - 10+ Credits in 3rd Year - All Students",
                 "Metric Value - 10+ Credits in 3rd Year - All Students",
                 "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
                 "N count - 4-Year Graduation Rate - All Students", 
                 "Metric Value - 4-Year Graduation Rate - All Students"]] = stud_achieve_df[["N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 1st Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third",
                              "N count - 4-Year Graduation Rate - All Students", 
                              "Metric Value - 4-Year Graduation Rate - All Students"]].apply(pd.to_numeric, errors='coerce')

In [35]:
stud_achieve_df.rename(columns={"DBN":'dbn', 'Enrollment':'sqrEnrollment',
                       "Average Grade 8 English Proficiency": 'grd8Ela',
                       "Average Grade 8 Math Proficiency": 'grd8Math',
                       "Student Attendance Rate": 'attdRate',
                       "Percent of Students Chronically Absent" : 'pctChronAbs',
                       "SQR Year": 'sqrYear',
                       "Metric Value - 10+ Credits in 1st Year - All Students" : 'pct10PlusYear1',
                       "Metric Value - 10+ Credits in 1st Year - School's Lowest Third" : 'pct10PlusYear1Lowest3rd',
                       "Metric Value - 10+ Credits in 2nd Year - All Students" : 'pct10PlusYear2',
                       "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third" : 'pct10PlusYear2Lowest3rd',
                       "Metric Value - 10+ Credits in 3rd Year - All Students" : 'pct10PlusYear3',
                       "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third" : 'pct10PlusYear3Lowest3rd',
                       "N count - 10+ Credits in 1st Year - All Students" : 'numYear1',
                       "N count - 10+ Credits in 1st Year - School's Lowest Third" : 'numYear1Lowest3rd',
                       "N count - 10+ Credits in 2nd Year - All Students" : 'numYear2',
                       "N count - 10+ Credits in 2nd Year - School's Lowest Third" : 'numYear2Lowest3rd',
                       "N count - 10+ Credits in 3rd Year - All Students" : 'numYear3',
                       "N count - 10+ Credits in 3rd Year - School's Lowest Third": 'numYear3Lowest3rd'}, inplace=True)

In [36]:
closing_gap['sqr2021.xlsx'].head(2)

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13,Unnamed: 14,Unnamed: 15,Unnamed: 16,Unnamed: 17,Unnamed: 18,Unnamed: 19,Unnamed: 20,Unnamed: 21,Unnamed: 22,Unnamed: 23,Unnamed: 24,Unnamed: 25,Unnamed: 26
0,,Closing the Achievement Gap,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [37]:
closing_gap['sqr2021.xlsx'].columns = closing_gap['sqr2021.xlsx'].iloc[2]
closing_gap['sqr2020.xlsx'].columns = closing_gap['sqr2020.xlsx'].iloc[2]
closing_gap['sqr2019.xlsx'].columns = closing_gap['sqr2019.xlsx'].iloc[2]
closing_gap['sqr2018.xlsx'].columns = closing_gap['sqr2018.xlsx'].iloc[0]
closing_gap['sqr2017.xlsx'].columns = closing_gap['sqr2017.xlsx'].iloc[0]
closing_gap['sqr2016.xlsx'].columns = closing_gap['sqr2016.xlsx'].iloc[0]
closing_gap['sqr2015.xlsx'].columns = closing_gap['sqr2015.xlsx'].iloc[0]

closing_gap['sqr2021.xlsx']['sqrYear'] = '2021'
closing_gap['sqr2020.xlsx']['sqrYear'] = '2020'
closing_gap['sqr2019.xlsx']['sqrYear'] = '2019'
closing_gap['sqr2018.xlsx']['sqrYear'] = '2018'
closing_gap['sqr2017.xlsx']['sqrYear'] = '2017'
closing_gap['sqr2016.xlsx']['sqrYear'] = '2016'
closing_gap['sqr2015.xlsx']['sqrYear'] = '2015'

In [38]:
closing_gap['sqr2021.xlsx'].columns

Index([                                                                                                           nan,
       'Due to COVID-19, Framework ratings and scores are not available for any schools for the 2020-21 school year.',
                                                                                                                  nan,
                                                                                                                'DBN',
                                                                                                        'School Name',
                                                                                                        'School Type',
                                                                           'N count - 4-Year Graduation Rate - Black',
                                                                      'Metric Value - 4-Year Graduation Rate - Black',
                                                

In [39]:
closing_gap['sqr2021.xlsx'] = closing_gap['sqr2021.xlsx'][['DBN', 'N count - 4-Year Graduation Rate - Lowest Third Citywide',
                             'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide',
                             'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                             'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                             'N count - College and Career Preparatory Course Index, Lowest Third Citywide',
                             'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide', 'sqrYear']]
closing_gap['sqr2021.xlsx'].rename(columns={'DBN':'dbn',
                                            'N count - 4-Year Graduation Rate - Lowest Third Citywide': 'numGradLowest3rd',
                             'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide': 'pctGradLowest3rd',
                             'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'numPERLowest3rd' ,
                             'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'pctPERLowest3rd',
                             'N count - College and Career Preparatory Course Index, Lowest Third Citywide': 'numCCPCILowest3rd',
                             'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide': 'pctCCPCILowest3rd'}, inplace=True)

In [40]:
closing_gap['sqr2020.xlsx'] = closing_gap['sqr2020.xlsx'][['DBN', 'N count - 4-Year Graduation Rate - Lowest Third Citywide',
                             'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide',
                             'N count - 4-Year College Readiness Index, Lowest Third Citywide (without CAT)',
                             'Metric Value - 4-Year College Readiness Index, Lowest Third Citywide (without CAT)',
                             'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                             'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                             'N count - College and Career Preparatory Course Index, Lowest Third Citywide',
                             'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide', 'sqrYear']]
closing_gap['sqr2020.xlsx'].rename(columns={'DBN':'dbn',
                                            'N count - 4-Year Graduation Rate - Lowest Third Citywide': 'numGradLowest3rd',
                             'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide': 'pctGradLowest3rd',
                             'N count - 4-Year College Readiness Index, Lowest Third Citywide (without CAT)': 'numCollegeReadyLowest3rd',
                             'Metric Value - 4-Year College Readiness Index, Lowest Third Citywide (without CAT)': 'pctCollegeReadyLowest3rd',
                             'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'numPERLowest3rd' ,
                             'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'pctPERLowest3rd',
                             'N count - College and Career Preparatory Course Index, Lowest Third Citywide': 'numCCPCILowest3rd',
                             'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide': 'pctCCPCILowest3rd'}, inplace=True)

In [41]:
closing_gap_cols = ['DBN', 'N count - 4-Year Graduation Rate - Lowest Third Citywide',
                    'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide',
                    'N count - 4-Year College Readiness Index, Lowest Third Citywide',
                    'Metric Value - 4-Year College Readiness Index, Lowest Third Citywide',
                    'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                    'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide',
                    'N count - College and Career Preparatory Course Index, Lowest Third Citywide',
                    'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide', 'sqrYear']
closing_gap['sqr2019.xlsx'] = closing_gap['sqr2019.xlsx'][closing_gap_cols]
closing_gap['sqr2018.xlsx'] = closing_gap['sqr2018.xlsx'][closing_gap_cols]
closing_gap['sqr2017.xlsx'] = closing_gap['sqr2017.xlsx'][closing_gap_cols]

In [42]:
temp = pd.concat([closing_gap['sqr2019.xlsx'], closing_gap['sqr2018.xlsx'], closing_gap['sqr2017.xlsx']], axis=0)
temp.rename(columns = {'DBN': 'dbn',
                       'N count - 4-Year Graduation Rate - Lowest Third Citywide': 'numGradLowest3rd',
                             'Metric Value - 4-Year Graduation Rate - Lowest Third Citywide': 'pctGradLowest3rd',
                             'N count - 4-Year College Readiness Index, Lowest Third Citywide': 'numCollegeReadyLowest3rd',
                             'Metric Value - 4-Year College Readiness Index, Lowest Third Citywide': 'pctCollegeReadyLowest3rd',
                             'N count - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'numPERLowest3rd' ,
                             'Metric Value - Postsecondary Enrollment Rate - 6 Months, Lowest Third Citywide': 'pctPERLowest3rd',
                             'N count - College and Career Preparatory Course Index, Lowest Third Citywide': 'numCCPCILowest3rd',
                             'Metric Value - College and Career Preparatory Course Index, Lowest Third Citywide': 'pctCCPCILowest3rd'}, inplace=True)

In [43]:
closing_gap_cols = ['DBN', 'Metric Value - Graduation Rate, 4-year, lowest third city',
                    'Metric Value - Percentage in Lowest Third City, 4-year College Readiness Index',
                    'Metric Value - Percentage in Lowest Third City, PER, 6 Months After High School',
                    'Metric Value - Percentage in Lowest Third City, College and Career Prep Course Index', 'sqrYear']

closing_gap['sqr2016.xlsx'] = closing_gap['sqr2016.xlsx'][closing_gap_cols]
closing_gap['sqr2016.xlsx'].rename(columns = {'DBN':'dbn',
                                              'Metric Value - Graduation Rate, 4-year, lowest third city':'pctGradLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, 4-year College Readiness Index':'pctCollegeReadyLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, PER, 6 Months After High School':'pctPERLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, College and Career Prep Course Index':'pctCCPCILowest3rd'}, inplace=True)

In [44]:
closing_gap_cols = ['DBN', 'Metric Value - Weighted Diploma Rate, 4 Year - Lowest Third City',
                    'Metric Value - Percentage in Lowest Third City, 4-year Non-Remediation Index',
                    'Metric Value - Percentage in Lowest Third City, PER, 6 Months After High School',
                    'Metric Value - Percentage in Lowest Third City, College and Career Prep Course Index', 'sqrYear']

closing_gap['sqr2015.xlsx'] = closing_gap['sqr2015.xlsx'][closing_gap_cols]
closing_gap['sqr2015.xlsx'].rename(columns = {'DBN':'dbn',
                                              'Metric Value - Weighted Diploma Rate, 4 Year - Lowest Third City':'pctGradLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, 4-year Non-Remediation Index':'pctCollegeReadyLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, PER, 6 Months After High School':'pctPERLowest3rd',
                    'Metric Value - Percentage in Lowest Third City, College and Career Prep Course Index':'pctCCPCILowest3rd'}, inplace=True)

In [45]:
closing_gap_df = pd.concat([closing_gap['sqr2021.xlsx'],closing_gap['sqr2020.xlsx'], temp, closing_gap['sqr2016.xlsx'],closing_gap['sqr2015.xlsx']], axis=0)

In [46]:
closing_gap_df.columns

Index(['dbn', 'numGradLowest3rd', 'pctGradLowest3rd', 'numPERLowest3rd',
       'pctPERLowest3rd', 'numCCPCILowest3rd', 'pctCCPCILowest3rd', 'sqrYear',
       'numCollegeReadyLowest3rd', 'pctCollegeReadyLowest3rd'],
      dtype='object')

In [47]:
closing_gap_df = closing_gap_df[((closing_gap_df.dbn != 'DBN') & (closing_gap_df.dbn.notnull()))]
closing_gap_df[['numCCPCILowest3rd', 'numCollegeReadyLowest3rd',
       'numGradLowest3rd', 'numPERLowest3rd', 'pctCCPCILowest3rd',
       'pctCollegeReadyLowest3rd', 'pctGradLowest3rd', 'pctPERLowest3rd']] = closing_gap_df[['numCCPCILowest3rd', 'numCollegeReadyLowest3rd',
       'numGradLowest3rd', 'numPERLowest3rd', 'pctCCPCILowest3rd',
       'pctCollegeReadyLowest3rd', 'pctGradLowest3rd', 'pctPERLowest3rd']].apply(pd.to_numeric, errors='coerce')

In [48]:
closing_gap_df.drop(columns=['numCCPCILowest3rd', 'numCollegeReadyLowest3rd'], inplace=True)

In [49]:
closing_gap_df.head()

Unnamed: 0,dbn,numGradLowest3rd,pctGradLowest3rd,numPERLowest3rd,pctPERLowest3rd,pctCCPCILowest3rd,sqrYear,pctCollegeReadyLowest3rd
4,01M292,30.0,0.9,6.0,0.333,0.167,2021,
5,01M448,37.0,0.946,31.0,0.71,0.838,2021,
6,01M450,19.0,0.895,23.0,0.652,0.842,2021,
7,01M539,8.0,1.0,13.0,0.615,0.75,2021,
8,01M696,5.0,0.6,2.0,,0.8,2021,


In [50]:
add_info['sqr2021.xlsx'].columns = add_info['sqr2021.xlsx'].iloc[2]
add_info['sqr2020.xlsx'].columns = add_info['sqr2020.xlsx'].iloc[2]
add_info['sqr2019.xlsx'].columns = add_info['sqr2019.xlsx'].iloc[2]
add_info['sqr2018.xlsx'].columns = add_info['sqr2018.xlsx'].iloc[0]
add_info['sqr2017.xlsx'].columns = add_info['sqr2017.xlsx'].iloc[0]
add_info['sqr2016.xlsx'].columns = add_info['sqr2016.xlsx'].iloc[0]
add_info['sqr2015.xlsx'].columns = add_info['sqr2015.xlsx'].iloc[0]

In [51]:
add_info['sqr2021.xlsx'].head(2)

2,NaN,NaN.1,NaN.2,DBN,School Name,School Type,N Count - Average Student Attendance,Metric Value - Average Student Attendance,N Count - Average Student Attendance - In-person days,Metric Value - Average Student Attendance - In-person days,N Count - Average Student Attendance - Remote days,Metric Value - Average Student Attendance - Remote days,N Count - Percentage of Students with 90%+ Attendance - Asian,Metric Value - Percentage of Students with 90%+ Attendance - Asian,N Count - Percentage of Students with 90%+ Attendance - Black,Metric Value - Percentage of Students with 90%+ Attendance - Black,N Count - Percentage of Students with 90%+ Attendance - Hispanic,Metric Value - Percentage of Students with 90%+ Attendance - Hispanic,N Count - Percentage of Students with 90%+ Attendance - Native American,Metric Value - Percentage of Students with 90%+ Attendance - Native American,N Count - Percentage of Students with 90%+ Attendance - Multiracial,Metric Value - Percentage of Students with 90%+ Attendance - Multiracial,N Count - Percentage of Students with 90%+ Attendance - Native Hawaiian or Pacific Islander,Metric Value - Percentage of Students with 90%+ Attendance - Native Hawaiian or Pacific Islander,N Count - Percentage of Students with 90%+ Attendance - White,Metric Value - Percentage of Students with 90%+ Attendance - White,N Count - Percentage of Students with 90%+ Attendance - Female,Metric Value - Percentage of Students with 90%+ Attendance - Female,N Count - Percentage of Students with 90%+ Attendance - Male,Metric Value - Percentage of Students with 90%+ Attendance - Male,N Count - % of cohort who took the ACT English exam,Metric Value - % of cohort who took the ACT English exam,N Count - % of cohort who took the ACT Math exam,Metric Value - % of cohort who took the ACT Math exam,N Count - % of cohort who took the ACT Reading exam,Metric Value - % of cohort who took the ACT Reading exam,N Count - % of cohort who took the ACT Science exam,Metric Value - % of cohort who took the ACT Science exam,N Count - 10+ Credits in 1st Year - Asian,Metric Value - 10+ Credits in 1st Year - Asian,N Count - 10+ Credits in 1st Year - Black,Metric Value - 10+ Credits in 1st Year - Black,N Count - 10+ Credits in 1st Year - Hispanic or Latinx,Metric Value - 10+ Credits in 1st Year - Hispanic or Latinx,N Count - 10+ Credits in 1st Year - Native American,Metric Value - 10+ Credits in 1st Year - Native American,N Count - 10+ Credits in 1st Year -Multiracial,Metric Value - 10+ Credits in 1st Year -Multiracial,N Count - 10+ Credits in 1st Year - Native Hawaiian or Pacific Islander,N Count - 10+ Credits in 1st Year - White,Metric Value - 10+ Credits in 1st Year - White,N Count - 10+ Credits in 1st Year - Female,Metric Value - 10+ Credits in 1st Year - Female,N Count - 10+ Credits in 1st Year - Male,Metric Value - 10+ Credits in 1st Year - Male,N Count - 10+ Credits in 2nd Year - Asian,Metric Value - 10+ Credits in 2nd Year - Asian,N Count - 10+ Credits in 2nd Year - Black,Metric Value - 10+ Credits in 2nd Year - Black,N Count - 10+ Credits in 2nd Year - Hispanic or Latinx,Metric Value - 10+ Credits in 2nd Year - Hispanic or Latinx,N Count - 10+ Credits in 2nd Year - Native American,Metric Value - 10+ Credits in 2nd Year - Native American,N Count - 10+ Credits in 2nd Year - Multiracial,Metric Value - 10+ Credits in 2nd Year - Multiracial,N Count - 10+ Credits in 2nd Year - Native Hawaiian or Pacific Islander,N Count - 10+ Credits in 2nd Year - White,Metric Value - 10+ Credits in 2nd Year - White,N Count - 10+ Credits in 2nd Year - Female,Metric Value - 10+ Credits in 2nd Year - Female,N Count - 10+ Credits in 2nd Year - Male,Metric Value - 10+ Credits in 2nd Year - Male,N Count - 10+ Credits in 3rd Year - Asian,Metric Value - 10+ Credits in 3rd Year - Asian,N Count - 10+ Credits in 3rd Year - Black,Metric Value - 10+ Credits in 3rd Year - Black,N Count - 10+ Credits in 3rd Year - Hispanic,Metric Value - 10+ Credits in 3rd Year - Hispanic,N Count - 10+ Credits in 3rd Year - Native American,Metric Value - 10+ Credits in 3rd Year - Native American,N Count - 10+ Credits in 3rd Year - Multiracial,Metric Value - 10+ Credits in 3rd Year - Multiracial,N Count - 10+ Credits in 3rd Year - Native Hawaiian or Pacific Islander,Metric Value - 10+ Credits in 3rd Year - Native Hawaiian or Pacific Islander,N Count - 10+ Credits in 3rd Year - White,Metric Value - 10+ Credits in 3rd Year - White,N Count - 10+ Credits in 3rd Year - Female,Metric Value - 10+ Credits in 3rd Year - Female,N Count - 10+ Credits in 3rd Year - Male,Metric Value - 10+ Credits in 3rd Year - Male,N Count - 4-Year Graduation Rate - Asian,Metric Value - 4-Year Graduation Rate - Asian,N Count - 4-Year Graduation Rate - Multiracial,Metric Value - 4-Year Graduation Rate - Multiracial,N Count - 4-Year Graduation Rate - White,Metric Value - 4-Year Graduation Rate - White,N Count - 4-Year Graduation Rate - Female,Metric Value - 4-Year Graduation Rate - Female,N Count - 4-Year Graduation Rate - Male,Metric Value - 4-Year Graduation Rate - Male,N Count - 4-Year Graduation Rate - Excluding NYSAA,Metric Value - 4-Year Graduation Rate - Excluding NYSAA,N Count - 6-Year Graduation Rate - Asian,Metric Value - 6-Year Graduation Rate - Asian,N Count - 6-Year Graduation Rate - Black,Metric Value - 6-Year Graduation Rate - Black,N Count - 6-Year Graduation Rate - Hispanic or Latinx,Metric Value - 6-Year Graduation Rate - Hispanic or Latinx,N Count - 6-Year Graduation Rate - Native American,Metric Value - 6-Year Graduation Rate - Native American,N Count - 6-Year Graduation Rate - Multiracial,Metric Value - 6-Year Graduation Rate - Multiracial,N Count - 6-Year Graduation Rate - Native Hawaiian or Pacific Islander,Metric Value - 6-Year Graduation Rate - Native Hawaiian or Pacific Islander,N Count - 6-Year Graduation Rate - White,Metric Value - 6-Year Graduation Rate - White,N Count - 6-Year Graduation Rate - Female,Metric Value - 6-Year Graduation Rate - Female,N Count - 6-Year Graduation Rate - Male,Metric Value - 6-Year Graduation Rate - Male,N Count - 6-Year Graduation Rate - Excluding NYSAA,Metric Value - 6-Year Graduation Rate - Excluding NYSAA,N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade),Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade),N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade),Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade),N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade),Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade),N Count - Average score of students in the current cohort who took the ACT English exam,Metric Value - Average score of students in the current cohort who took the ACT English exam,N Count - Average score of students in the current cohort who took the ACT Math exam,Metric Value - Average score of students in the current cohort who took the ACT Math exam,N Count - Average score of students in the current cohort who took the ACT Reading exam,Metric Value - Average score of students in the current cohort who took the ACT Reading exam,N Count - Average score of students in the current cohort who took the ACT Science exam,Metric Value - Average score of students in the current cohort who took the ACT Science exam,N Count - Average score of students in the current cohort who took the SAT Math exam,Metric Value - Average score of students in the current cohort who took the SAT Math exam,N Count - Average score of students in the current cohort who took the SAT Reading and Writing exam,Metric Value - Average score of students in the current cohort who took the SAT Reading and Writing exam,N Count - 4-Year High School Persistence Rate - Asian,Metric Value - 4-Year High School Persistence Rate - Asian,N Count - 4-Year High School Persistence Rate - Black,Metric Value - 4-Year High School Persistence Rate - Black,N Count - 4-Year High School Persistence Rate - Hispanic,Metric Value - 4-Year High School Persistence Rate - Hispanic,N Count - 4-Year High School Persistence Rate - Native American,Metric Value - 4-Year High School Persistence Rate - Native American,N Count - 4-Year High School Persistence Rate - Multiracial,Metric Value - 4-Year High School Persistence Rate - Multiracial,N Count - 4-Year High School Persistence Rate - Native Hawaiian or Pacific Islander,N Count - 4-Year High School Persistence Rate - White,Metric Value - 4-Year High School Persistence Rate - White,N Count - 4-Year High School Persistence Rate - Female,Metric Value - 4-Year High School Persistence Rate - Female,N Count - 4-Year High School Persistence Rate - Male,Metric Value - 4-Year High School Persistence Rate - Male,N Count - 6-Year High School Persistence Rate - Asian,Metric Value - 6-Year High School Persistence Rate - Asian,N Count - 6-Year High School Persistence Rate - Black,Metric Value - 6-Year High School Persistence Rate - Black,N Count - 6-Year High School Persistence Rate - Hispanic,Metric Value - 6-Year High School Persistence Rate - Hispanic,N Count - 6-Year High School Persistence Rate - Native American,Metric Value - 6-Year High School Persistence Rate - Native American,N Count - 6-Year High School Persistence Rate - Multiracial,Metric Value - 6-Year High School Persistence Rate - Multiracial,N Count - 6-Year High School Persistence Rate - Native Hawaiian or Pacific Islander,Metric Value - 6-Year High School Persistence Rate - Native Hawaiian or Pacific Islander,N Count - 6-Year High School Persistence Rate - White,Metric Value - 6-Year High School Persistence Rate - White,N Count - 6-Year High School Persistence Rate - Female,Metric Value - 6-Year High School Persistence Rate - Female,N Count - 6-Year High School Persistence Rate - Male,Metric Value - 6-Year High School Persistence Rate - Male,N Count - CUNY 2-Year (18 months),Metric Value - CUNY 2-Year (18 months),N Count - CUNY 4-Year (18 months),Metric Value - CUNY 4-Year (18 months),N Count - Out of State (18 months),Metric Value - Out of State (18 months),N Count - NYS Private (18 months),Metric Value - NYS Private (18 months),N Count - NYS Public (18 months),Metric Value - NYS Public (18 months),N Count - Public Service (18 months),Metric Value - Public Service (18 months),N Count - Vocational Program (18 months),Metric Value - Vocational Program (18 months),N Count - Postsecondary Enrollment Rate - 6 Months - Asian,Metric Value - Postsecondary Enrollment Rate - 6 Months - Asian,N Count - Postsecondary Enrollment Rate - 6 Months - Black,Metric Value - Postsecondary Enrollment Rate - 6 Months - Black,N Count - Postsecondary Enrollment Rate - 6 Months - Hispanic or Latinx,Metric Value - Postsecondary Enrollment Rate - 6 Months - Hispanic or Latinx,N Count - Postsecondary Enrollment Rate - 6 Months - Native American,Metric Value - Postsecondary Enrollment Rate - 6 Months - Native American,N Count - Postsecondary Enrollment Rate - 6 Months - Multiracial,Metric Value - Postsecondary Enrollment Rate - 6 Months - Multiracial,N Count - Postsecondary Enrollment Rate - 6 Months - Native Hawaiian or Pacific Islander,Metric Value - Postsecondary Enrollment Rate - 6 Months - Native Hawaiian or Pacific Islander,N Count - Postsecondary Enrollment Rate - 6 Months - White,Metric Value - Postsecondary Enrollment Rate - 6 Months - White,N Count - Postsecondary Enrollment Rate - 6 Months - Female,Metric Value - Postsecondary Enrollment Rate - 6 Months - Female,N Count - Postsecondary Enrollment Rate - 6 Months - Male,Metric Value - Postsecondary Enrollment Rate - 6 Months - Male,N Count - Postsecondary Enrollment in CUNY 2-Year (6 months),Metric Value - Postsecondary Enrollment in CUNY 2-Year (6 months),N Count - Postsecondary Enrollment in CUNY 4-Year (6 months),Metric Value - Postsecondary Enrollment in CUNY 4-Year (6 months),N Count - Postsecondary Enrollment in Out of State (6 months),Metric Value - Postsecondary Enrollment in Out of State (6 months),N Count - Postsecondary Enrollment in NYS Private (6 months),Metric Value - Postsecondary Enrollment in NYS Private (6 months),N Count - Postsecondary Enrollment in NYS Public (6 months),Metric Value - Postsecondary Enrollment in NYS Public (6 months),N Count - Postsecondary Enrollment in Other (6 months),Metric Value - Postsecondary Enrollment in Other (6 months),N Count - Public Service (6 months),Metric Value - Public Service (6 months),N Count - Vocational Program (6 months),Metric Value - Vocational Program (6 months),N Count - % of students in the current cohort who took the ACT English exam who scored 20+,Metric Value - % of students in the current cohort who took the ACT English exam who scored 20+,N Count - % of students in the current cohort who took the ACT Math exam who scored 21+,Metric Value - % of students in the current cohort who took the ACT Math exam who scored 21+,N Count - % of students in the current cohort who took the SAT Math exam who passed the college ready threshold,Metric Value - % of students in the current cohort who took the SAT Math exam who passed the college ready threshold,N Count - % of students in the current cohort who took the SAT Reading and Writing exam and scored 480+,Metric Value - % of students in the current cohort who took the SAT Reading and Writing exam and scored 480+,N Count - College and Career Preparatory Course Index - Asian,Metric Value - College and Career Preparatory Course Index - Asian,N Count - College and Career Preparatory Course Index - Black,Metric Value - College and Career Preparatory Course Index - Black,N Count - College and Career Preparatory Course Index -Hispanic or Latinx,Metric Value - College and Career Preparatory Course Index -Hispanic or Latinx,N Count - College and Career Preparatory Course Index - Native American,Metric Value - College and Career Preparatory Course Index - Native American,N Count - College and Career Preparatory Course Index - Multiracial,Metric Value - College and Career Preparatory Course Index - Multiracial,N Count - College and Career Preparatory Course Index - Native Hawaiian or Pacific Islander,N Count - College and Career Preparatory Course Index - White,Metric Value - College and Career Preparatory Course Index - White,N Count - College and Career Preparatory Course Index - Female,Metric Value - College and Career Preparatory Course Index - Female,N Count - College and Career Preparatory Course Index - Male,Metric Value - College and Career Preparatory Course Index - Male,N Count - % Scoring 3+ on any AP Exam,Metric Value - % Scoring 3+ on any AP Exam,N Count - % Passing a DOE-certified CPCC Course,Metric Value - % Passing a DOE-certified CPCC Course,N Count - % Passing an Industry-Recognized Technical Assessment,Metric Value - % Passing an Industry-Recognized Technical Assessment,N Count - % Scoring 4+ on any IB Exam,Metric Value - % Scoring 4+ on any IB Exam,N Count - % Earning a Grade of 'C' or Higher for College Credit,Metric Value - % Earning a Grade of 'C' or Higher for College Credit,N Count - % Earning a Diploma with an Arts Endorsement,Metric Value - % Earning a Diploma with an Arts Endorsement,N Count - % Earning a Diploma with a CTE Endorsement,Metric Value - % Earning a Diploma with a CTE Endorsement,N Count - % Attaining Regents Diploma (4 year),Metric Value - % Attaining Regents Diploma (4 year),N Count - % Attaining Regents Diploma (6 year),Metric Value - % Attaining Regents Diploma (6 year)
0,,Additional Info,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [52]:
add_info_cols = ["DBN", "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)", 
                          "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)",
                          "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)",
                          "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)",
                          "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)",
                          "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)"]
add_info['sqr2021.xlsx'] = add_info['sqr2021.xlsx'][add_info_cols]
add_info['sqr2020.xlsx'] = add_info['sqr2020.xlsx'][add_info_cols]

In [53]:
add_info_cols = ["DBN", "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)",
                 "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)",
                 "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)",
                 "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)", 
                 "Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)", 
                 "N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)"]

add_info['sqr2019.xlsx'] = add_info['sqr2019.xlsx'][add_info_cols]
add_info['sqr2018.xlsx'] = add_info['sqr2018.xlsx'][add_info_cols]

In [54]:
add_info['sqr2020.xlsx']['sqrYear'] = '2020'
add_info['sqr2019.xlsx']['sqrYear'] = '2019'
add_info['sqr2018.xlsx']['sqrYear'] = '2018'

In [55]:
add_info_df = pd.concat([add_info['sqr2020.xlsx'], add_info['sqr2019.xlsx'], add_info['sqr2018.xlsx']], axis=0)

In [56]:
add_info_df.rename(columns={'DBN':'dbn',
       'Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)':'pctGradLevel1',
       'Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)':'pctGradLevel2',
       'Metric Value - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)':'pctGradLevel34',
       'N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 1 in 8th Grade)':'numGradLevel1',
       'N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 2 in 8th Grade)':'numGradLevel2',
       'N Count - Percentage of Students Who Graduated in 4 Years (Out of Students at Level 3 or 4 in 8th Grade)':'numGradLevel34'}, inplace=True)

add_info_df = add_info_df[((add_info_df.dbn != 'DBN') & (add_info_df.dbn.notnull()))]

In [57]:
sqr_df = summary_df.merge(stud_achieve_df, how='inner', left_on = ['dbn', 'sqrYear'], right_on = ['dbn', 'sqrYear'])

In [58]:
sqr_df = sqr_df.merge(closing_gap_df, how='inner', left_on = ['dbn', 'sqrYear'], right_on = ['dbn', 'sqrYear'])

In [59]:
sqr_df = sqr_df.merge(add_info_df, how='left', left_on = ['dbn', 'sqrYear'], right_on = ['dbn', 'sqrYear'])

In [60]:
sqr_df.head()

Unnamed: 0,dbn,sqrEnrollment,Student Percent - Asian,Student Percent - Black,Student Percent - Hispanic,Student Percent - White,econNeedInd,Average Student Attendance,pctChronAbs,sqrYear,grd8Ela,grd8Math,attdRate,numYear1,pct10PlusYear1,numYear2,pct10PlusYear2,numYear2Lowest3rd,pct10PlusYear2Lowest3rd,numYear3,pct10PlusYear3,numYear3Lowest3rd,pct10PlusYear3Lowest3rd,N count - 4-Year Graduation Rate - All Students,Metric Value - 4-Year Graduation Rate - All Students,numYear1Lowest3rd,pct10PlusYear1Lowest3rd,numGradLowest3rd,pctGradLowest3rd,numPERLowest3rd,pctPERLowest3rd,pctCCPCILowest3rd,pctCollegeReadyLowest3rd,numGradLevel1,pctGradLevel1,numGradLevel2,pctGradLevel2,numGradLevel34,pctGradLevel34
0,01M292,296.0,0.091,0.253,0.588,0.014,0.822,0.807,0.566,2021,,,,106.0,0.736,57.0,0.789,18.0,0.889,65.0,0.892,21.0,0.762,65.0,0.938,,,30.0,0.9,6.0,0.333,0.167,,,,,,,
1,01M448,570.0,0.318,0.181,0.44,0.046,0.811,0.86,0.382,2021,,,,151.0,0.993,135.0,0.993,43.0,1.0,141.0,0.957,42.0,0.952,134.0,0.985,,,37.0,0.946,31.0,0.71,0.838,,,,,,,
2,01M450,377.0,0.077,0.154,0.568,0.17,0.628,0.975,0.048,2021,,,,97.0,0.938,100.0,0.93,29.0,0.862,92.0,0.924,28.0,0.964,79.0,0.924,,,19.0,0.895,23.0,0.652,0.842,,,,,,,
3,01M539,664.0,0.271,0.093,0.181,0.413,0.381,0.947,0.123,2021,,,,171.0,0.977,199.0,0.99,64.0,0.984,150.0,0.967,44.0,0.932,133.0,0.985,,,8.0,1.0,13.0,0.615,0.75,,,,,,,
4,01M696,636.0,0.236,0.132,0.195,0.393,0.403,0.942,0.198,2021,,,,149.0,0.973,157.0,0.955,50.0,0.92,164.0,0.963,50.0,0.94,161.0,0.95,,,5.0,0.6,2.0,,0.8,,,,,,,


In [61]:
sqr_df.to_csv('data/clean/sqr.csv',index=False)

#### Location Data

In [62]:
lcgms = pd.read_excel('data/lcgms.xlsx', sheet_name='lcgms')

In [63]:
# lcgms['DBN'] = lcgms['ATS System Code'].str.strip()

In [64]:
lcgms.columns

Index(['ATS System Code', 'Location Code', 'Location Name', 'BEDS Number',
       'Managed By Name', 'Location Type Description',
       'Location Category Description', 'Grades', 'Grades Final', 'Open Date',
       'Status Description', 'Building Code', 'Primary Address', 'City',
       'State Code', 'Zip', 'Borough Block Lot', 'Census Tract',
       'Community District', 'Council District', 'NTA', 'NTA_Name',
       'Principal Name', 'Principal Title', 'Principal Phone Number',
       'Fax Number', 'Geographical District Code',
       'Administrative District Code', 'Administrative District Location Code',
       'Administrative District Name', 'Superintendent',
       'Superintendent Title', 'Superintendent Location Code',
       'Community School Sup Name', 'HighSchool Network Location Code',
       'HighSchool Network Name', 'HighSchool Network Superintendent',
       'Executive Superintendent Location code',
       'Executive Superintendent Name', 'Executive Superintendent Title'

In [65]:
lcgms.head()

Unnamed: 0,ATS System Code,Location Code,Location Name,BEDS Number,Managed By Name,Location Type Description,Location Category Description,Grades,Grades Final,Open Date,Status Description,Building Code,Primary Address,City,State Code,Zip,Borough Block Lot,Census Tract,Community District,Council District,NTA,NTA_Name,Principal Name,Principal Title,Principal Phone Number,Fax Number,Geographical District Code,Administrative District Code,Administrative District Location Code,Administrative District Name,Superintendent,Superintendent Title,Superintendent Location Code,Community School Sup Name,HighSchool Network Location Code,HighSchool Network Name,HighSchool Network Superintendent,Executive Superintendent Location code,Executive Superintendent Name,Executive Superintendent Title,BCO Location Code,BCO Location Name,BCO Executive Director,BCO Executive Director Title
0,01M015,M015,P.S. 015 Roberto Clemente,310100000000.0,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",Jul 1 1904,Open,M015,333 EAST 4 STREET,MANHATTAN,NY,10009,1003740000.0,2601.0,103.0,2.0,MN28,Lower East Side,IRENE SANCHEZ,Principal,212-228-8730,212-477-0931,1,1,M801,COMMUNITY SCHOOL DISTRICT 01,"CHAN, CARRY",CSD Superintendent,M801,"CHAN, CARRY",,,,SS04,"ROSALES, MARISOL",Executive Superintendent,MFSC,Manhattan NYCDOE Borough Office,Steven Aragona,FSC Executive Director
1,01M019,M019,P.S. 019 Asher Levy,310100000000.0,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",Jul 1 1956,Open,M019,185 1 AVENUE,MANHATTAN,NY,10003,1004530000.0,4000.0,103.0,2.0,MN22,East Village,JACQUELINE FLANAGAN,Principal,212-533-5340,212-673-1477,1,1,M801,COMMUNITY SCHOOL DISTRICT 01,"CHAN, CARRY",CSD Superintendent,M801,"CHAN, CARRY",,,,SS04,"ROSALES, MARISOL",Executive Superintendent,MFSC,Manhattan NYCDOE Borough Office,Steven Aragona,FSC Executive Director
2,01M020,M020,P.S. 020 Anna Silver,310100000000.0,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",Jul 1 1963,Open,M020,166 ESSEX STREET,MANHATTAN,NY,10002,1003550000.0,3001.0,103.0,1.0,MN27,Chinatown,SARAH PINTO VIAGRAN,Principal,212-254-9577,212-254-3526,1,1,M801,COMMUNITY SCHOOL DISTRICT 01,"CHAN, CARRY",CSD Superintendent,M801,"CHAN, CARRY",,,,SS04,"ROSALES, MARISOL",Executive Superintendent,MFSC,Manhattan NYCDOE Borough Office,Steven Aragona,FSC Executive Director
3,01M034,M034,P.S. 034 Franklin D. Roosevelt,310100000000.0,DOE,General Academic,K-8,"PK,0K,01,02,03,04,05,06,07,08,SE","PK,0K,01,02,03,04,05,06,07,08",Jul 1 1955,Open,M034,730 East 12 Street,Manhattan,NY,10009,1003810000.0,2800.0,103.0,2.0,MN28,Lower East Side,Bryan Glover,Principal,212-228-4433,212-353-1973,1,1,M801,COMMUNITY SCHOOL DISTRICT 01,"CHAN, CARRY",CSD Superintendent,M801,"CHAN, CARRY",,,,SS04,"ROSALES, MARISOL",Executive Superintendent,MFSC,Manhattan NYCDOE Borough Office,Steven Aragona,FSC Executive Director
4,01M063,M063,The STAR Academy - P.S.63,310100000000.0,DOE,General Academic,Elementary,"PK,0K,01,02,03,04,05,SE","PK,0K,01,02,03,04,05",Jul 1 1905,Open,M063,121 EAST 3 STREET,MANHATTAN,NY,10009,1004310000.0,3200.0,103.0,2.0,MN22,East Village,DARLENE CAMERON,Principal,212-674-3180,212-420-9018,1,1,M801,COMMUNITY SCHOOL DISTRICT 01,"CHAN, CARRY",CSD Superintendent,M801,"CHAN, CARRY",,,,SS04,"ROSALES, MARISOL",Executive Superintendent,MFSC,Manhattan NYCDOE Borough Office,Steven Aragona,FSC Executive Director


In [66]:
lcgms.groupby(['Administrative District Location Code']).count()

Unnamed: 0_level_0,ATS System Code,Location Code,Location Name,BEDS Number,Managed By Name,Location Type Description,Location Category Description,Grades,Grades Final,Open Date,Status Description,Building Code,Primary Address,City,State Code,Zip,Borough Block Lot,Census Tract,Community District,Council District,NTA,NTA_Name,Principal Name,Principal Title,Principal Phone Number,Fax Number,Geographical District Code,Administrative District Code,Administrative District Name,Superintendent,Superintendent Title,Superintendent Location Code,Community School Sup Name,HighSchool Network Location Code,HighSchool Network Name,HighSchool Network Superintendent,Executive Superintendent Location code,Executive Superintendent Name,Executive Superintendent Title,BCO Location Code,BCO Location Name,BCO Executive Director,BCO Executive Director Title
Administrative District Location Code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1
DA75,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,61,60,61,61,60,60,61,61,61,61,61,61,61,61,61,61,61,0,0,0,0,0,0,61,61,61,61
DA79,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,1,0,1,1,1,1,1,1,1,1,1,1
HS01,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,25,0,25,25,25,0,25,25,25,25,25,25,25,25,25,25
HS02,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,39,0,39,39,39,0,39,39,39,39,39,39,39,39,39,39
HS03,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,41,0,41,41,41,0,41,41,41,41,41,41,41,41,41,41
HS04,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,0,47,47,47,0,47,47,47,47,47,47,47,47,47,47
HS05,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,47,46,46,47,47,0,47,47,47,0,47,47,47,47,47,47,47,47,47,47
HS06,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,26,0,26,26,26,0,26,26,26,26,26,26,26,26,26,26
HS07,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,60,58,60,60,0,60,60,60,0,60,60,60,60,60,60,60,60,60,60
HS08,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,29,0,29,29,29,0,29,29,29,29,29,29,29,29,29,29


In [67]:
lcgms.drop(columns = ['ATS System Code','Location Code', 'Location Name', 'BEDS Number',
       'Managed By Name', 'Grades', 'Grades Final', 'Open Date',
       'Status Description', 'Building Code', 'Primary Address', 'City',
       'State Code', 
       'Community District', 'Council District', 'NTA', 'NTA_Name',
       'Principal Name', 'Principal Title', 'Principal Phone Number',
       'Fax Number', 
       'Administrative District Code', 'Administrative District Location Code',
       'Administrative District Name', 'Superintendent',
       'Superintendent Title', 'Superintendent Location Code',
       'Community School Sup Name', 'HighSchool Network Location Code',
       'HighSchool Network Name', 'HighSchool Network Superintendent',
       'Executive Superintendent Location code',
       'Executive Superintendent Name', 'Executive Superintendent Title',
       'BCO Location Code', 'BCO Location Name', 'BCO Executive Director',
       'BCO Executive Director Title'], inplace=True)

In [68]:
lcgms.rename(columns={'Location Type Description':'type', 'Location Category Description':'desc',
                      'Zip':'zip', 'Borough Block Lot':'boroughBlockLot', 'Census Tract':'censusTract', 'DBN':'dbn',
                     'Geographical District Code':'geoDis'}, inplace=True)

In [69]:
lcgms.to_csv('data/clean/lcgms.csv',index=False)

#### Demographics Data

In [70]:
demo = pd.read_excel('data/demo.xlsx', sheet_name = 'School')
demo[['sy','sqrYear']] = demo.Year.str.split("-", expand=True)
demo['sqrYear'] = '20'+demo['sqrYear']

demo2 = pd.read_excel('data/demo2.xlsx', sheet_name = 'School')
demo2[['sy','sqrYear']] = demo2.Year.str.split("-", expand=True)
demo2['sqrYear'] = '20'+demo2['sqrYear']

In [71]:
demo.columns.intersection(demo2.columns)

Index(['DBN', 'School Name', 'Year', 'Total Enrollment',
       'Grade 3K+PK (Half Day & Full Day)', 'Grade K', 'Grade 1', 'Grade 2',
       'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8',
       'Grade 9', 'Grade 10', 'Grade 11', 'Grade 12', '# Female', '% Female',
       '# Male', '% Male', '# Asian', '% Asian', '# Black', '% Black',
       '# Hispanic', '% Hispanic', '# White', '% White',
       '# Students with Disabilities', '% Students with Disabilities',
       '# English Language Learners', '% English Language Learners',
       '# Poverty', '% Poverty', 'Economic Need Index', 'sy', 'sqrYear'],
      dtype='object')

In [72]:
demo.columns.difference(demo2.columns)

Index(['# Multiple Race Categories Not Represented', '% Multiple Race Categories Not Represented'], dtype='object')

In [73]:
demo2.columns.difference(demo.columns)

Index(['# Missing Race/Ethnicity Data', '# Multi-Racial', '# Native American',
       '% Missing Race/Ethnicity Data', '% Multi-Racial', '% Native American'],
      dtype='object')

In [74]:
demo.drop(columns = ['School Name', 'Year', 
       'Grade 3K+PK (Half Day & Full Day)', 'Grade K', 'Grade 1', 'Grade 2',
       'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8',
       '# Multiple Race Categories Not Represented',
       '% Multiple Race Categories Not Represented','sy'], inplace=True)

In [75]:
demo2.drop(columns = ['School Name', 'Year', 
       'Grade 3K+PK (Half Day & Full Day)', 'Grade K', 'Grade 1', 'Grade 2',
       'Grade 3', 'Grade 4', 'Grade 5', 'Grade 6', 'Grade 7', 'Grade 8',
        '# Missing Race/Ethnicity Data', '# Multi-Racial', '# Native American',
       '% Missing Race/Ethnicity Data', '% Multi-Racial', '% Native American',
       'sy'], inplace=True)

In [76]:
demoCols = {'DBN':'dbn', 'Total Enrollment':'enrollment',  'Grade 9':'gr9', 'Grade 10':'gr10', 'Grade 11':'gr11',
                     'Grade 12':'gr12', '# Female':'female', '% Female': 'pctFemale', 
                     '# Male':'male', '% Male':'pctMale', '# Asian' : 'asian', '% Asian' : 'pctAsian',
                     '# Black': 'black', '% Black':'pctBlack', '# Hispanic':'hispanic', '% Hispanic':'pctHispanic',
                     '# White':'white', '% White':'pctWhite', '# Students with Disabilities':'swd',
                     '% Students with Disabilities':'pctSwd', '# English Language Learners':'ell', 
                     '% English Language Learners':'pctEll', '# Poverty':'poverty', '% Poverty':'pctPoverty', 
                    'Economic Need Index': 'econInd'}
demo.rename(columns=demoCols, inplace=True)
demo2.rename(columns=demoCols, inplace=True)

In [77]:
demo.loc[demo.econInd == 'Below 5%', 'econInd'] = 0.05
demo.loc[demo.econInd == 'Above 95%', 'econInd'] = 0.95
demo2.loc[demo2.econInd == 'Below 5%', 'econInd'] = 0.05
demo2.loc[demo2.econInd == 'Above 95%', 'econInd'] = 0.95

In [78]:
demo.iloc[:, 1:-1] = demo.iloc[:, 1:-1].apply(pd.to_numeric, errors='coerce')
demo2.iloc[:, 1:-1] = demo2.iloc[:, 1:-1].apply(pd.to_numeric, errors='coerce')

In [79]:
demo_df = pd.concat([demo[demo.sqrYear == '2016'], demo2])

In [80]:
demo_df.to_csv('data/clean/demo.csv',index=False)