### Download and Store Data Files

In [1]:
import os
import ssl
import sys
import urllib.request
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

%matplotlib inline

#### Downloading the Files

In [2]:
gradurls = {'https://infohub.nyced.org/docs/default-source/default-document-library/2020-graduation_rates_public_school.xlsx': 'grad_rates.xlsx',
            'https://infohub.nyced.org/docs/default-source/default-document-library/2020-graduation_rates_public_charters.xlsx': 'charter_grad_rates.xlsx'}
sqrurls = {'https://infohub.nyced.org/docs/default-source/default-document-library/201920_hs_sqr_results.xlsx': 'sqr2020.xlsx',
           'https://infohub.nyced.org/docs/default-source/default-document-library/201819_hs_sqr_results.xlsx': 'sqr2019.xlsx',
           'https://infohub.nyced.org/docs/default-source/default-document-library/201718_hs_sqr_results.xlsx': 'sqr2018.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2016-17_hs_sqr.xlsx': 'sqr2017.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2015_2016_hs_sqr_results_2017_01_05.xlsx': 'sqr2016.xlsx',
           'http://infohub.nyced.org/docs/default-source/default-document-library/2014_2015_hs_sqr_results_2016_04_08.xlsx': 'sqr2015.xlsx'}
demourls = {'https://infohub.nyced.org/docs/default-source/default-document-library/demographic-snapshot-2015-16-to-2019-20-(public).xlsx': 'demo.xlsx'}
urls = [gradurls, sqrurls, demourls]

In [3]:
for url in urls:
    try:
        [urllib.request.urlretrieve(x, 'data/'+url[x]) for x in url]
    except:
        print("Trying the next url")

Trying the next url
Trying the next url
Trying the next url


#### Graduation Rate Files

In [4]:
gradrates = pd.read_excel('data/grad_rates.xlsx', sheet_name = "All")
charterrates = pd.read_excel('data/charter_grad_rates.xlsx', sheet_name = "All")

In [5]:
gradrates[['DBN', 'School Name', 'Cohort Year', 'Cohort', '# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']].head()

Unnamed: 0,DBN,School Name,Cohort Year,Cohort,# Total Cohort,# Grads,# Dropout,# Still Enrolled
0,01M292,ORCHARD COLLEGIATE ACADEMY,2016,4 year August,31,29,1,1
1,01M292,ORCHARD COLLEGIATE ACADEMY,2015,4 year August,28,25,3,0
2,01M292,ORCHARD COLLEGIATE ACADEMY,2014,4 year August,29,28,1,0
3,01M292,ORCHARD COLLEGIATE ACADEMY,2013,4 year August,36,25,7,3
4,01M292,ORCHARD COLLEGIATE ACADEMY,2012,4 year August,44,24,10,10


In [6]:
charterrates[['DBN', 'School Name', 'Cohort Year', '# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']].head()

Unnamed: 0,DBN,School Name,Cohort Year,# Total Cohort,# Grads,# Dropout,# Still Enrolled
0,84K355,WILLIAMSBURG COLLEGIATE CHARTER SCHOOL,2012,37,34,0,3
1,84K355,WILLIAMSBURG COLLEGIATE CHARTER SCHOOL,2011,37,28,1,8
2,84K355,WILLIAMSBURG COLLEGIATE CHARTER SCHOOL,2010,31,26,0,5
3,84K355,WILLIAMSBURG COLLEGIATE CHARTER SCHOOL,2009,31,20,0,3
4,84K355,WILLIAMSBURG COLLEGIATE CHARTER SCHOOL,2012,37,34,0,3


In [7]:
all_grad_rates = pd.concat([gradrates[['DBN', 'School Name', 'Cohort Year', 'Cohort', '# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']],
                            charterrates[['DBN', 'School Name', 'Cohort Year', 'Cohort', '# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']]], 
                           ignore_index=True)
all_grad_rates[['# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']] = all_grad_rates[['# Total Cohort', '# Grads', '# Dropout', '# Still Enrolled']].apply(pd.to_numeric, errors='coerce')

In [8]:
all_grad_rates.to_csv('data/clean/all_grad_rates.csv')

In [9]:
all_grad_rates.head()

Unnamed: 0,DBN,School Name,Cohort Year,Cohort,# Total Cohort,# Grads,# Dropout,# Still Enrolled
0,01M292,ORCHARD COLLEGIATE ACADEMY,2016,4 year August,31,29.0,1.0,1.0
1,01M292,ORCHARD COLLEGIATE ACADEMY,2015,4 year August,28,25.0,3.0,0.0
2,01M292,ORCHARD COLLEGIATE ACADEMY,2014,4 year August,29,28.0,1.0,0.0
3,01M292,ORCHARD COLLEGIATE ACADEMY,2013,4 year August,36,25.0,7.0,3.0
4,01M292,ORCHARD COLLEGIATE ACADEMY,2012,4 year August,44,24.0,10.0,10.0


#### SQR Files

In [10]:
tabs = ['Summary', 'Student Achievement', 'Closing the Achievement Gap', 'Framework', 'Additional Info']
sqr_files = ['sqr2020.xlsx', 'sqr2019.xlsx', 'sqr2018.xlsx', 'sqr2017.xlsx', 'sqr2016.xlsx', 'sqr2015.xlsx']

In [11]:
summary = {}
stud_achieve = {}
closing_gap = {}
framework = {}
add_info = {}

for f in sqr_files:
    summary[f] = pd.read_excel('data/'+f, sheet_name='Summary')
    stud_achieve[f] = pd.read_excel('data/'+f, sheet_name='Student Achievement')
    closing_gap[f] = pd.read_excel('data/'+f, sheet_name='Closing the Achievement Gap')
    framework[f] = pd.read_excel('data/'+f, sheet_name='Framework')
    add_info[f] = pd.read_excel('data/'+f, sheet_name='Additional Info')

In [12]:
summary['sqr2020.xlsx'].columns = summary['sqr2020.xlsx'].iloc[2]
summary['sqr2019.xlsx'].columns = summary['sqr2019.xlsx'].iloc[2]
summary['sqr2018.xlsx'].columns = summary['sqr2018.xlsx'].iloc[0]
summary['sqr2017.xlsx'].columns = summary['sqr2017.xlsx'].iloc[0]
summary['sqr2016.xlsx'].columns = summary['sqr2016.xlsx'].iloc[0]
summary['sqr2015.xlsx'].columns = summary['sqr2015.xlsx'].iloc[0]

In [13]:
summary_cols = ['DBN','Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']
summary['sqr2019.xlsx'] = summary['sqr2019.xlsx'][summary_cols]
summary['sqr2018.xlsx'] = summary['sqr2018.xlsx'][summary_cols]
summary['sqr2017.xlsx'] = summary['sqr2017.xlsx'][summary_cols]
summary['sqr2016.xlsx'] = summary['sqr2016.xlsx'][summary_cols]
summary['sqr2015.xlsx'] = summary['sqr2015.xlsx'][summary_cols]

In [14]:
summary['sqr2020.xlsx'] = summary['sqr2020.xlsx'][['DBN','Enrollment',
                         'Student Percent - Asian','Student Percent - Black','Student Percent - Hispanic',
                         'Student Percent - White', 'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                         'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                         'Average Student Attendance (through Feb-2020 only)',
                         'Percent of Students Chronically Absent (through Feb-2020 only)']]

In [15]:
summary['sqr2020.xlsx'].rename(columns={'Student Percent - Asian':'Percent Asian',
                                        'Student Percent - Black':'Percent Black',
                                        'Student Percent - Hispanic': 'Percent Hispanic',
                                        'Student Percent - White': 'Percent White', 
                                        'Average Student Attendance (through Feb-2020 only)' : 'Student Attendance Rate',
                                        'Percent of Students Chronically Absent (through Feb-2020 only)':'Percent of Students Chronically Absent'}, inplace=True)

In [16]:
summary['sqr2020.xlsx']['SQR Year'] = '2020'
summary['sqr2019.xlsx']['SQR Year'] = '2019'
summary['sqr2018.xlsx']['SQR Year'] = '2018'
summary['sqr2017.xlsx']['SQR Year'] = '2017'
summary['sqr2016.xlsx']['SQR Year'] = '2016'
summary['sqr2015.xlsx']['SQR Year'] = '2015'

In [17]:
summary_df = pd.concat(summary).reset_index().drop(['level_0', 'level_1'], axis = 1)
summary_df = summary_df[summary_df.DBN != 'DBN']

In [18]:
summary_df[['Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']] = summary_df[['Enrollment', 'Percent Asian', 'Percent Black','Percent Hispanic', 'Percent White',
                'Percent English Language Learners','Percent Students with Disabilities','Percent Self-Contained',
                'Economic Need Index','Average Grade 8 English Proficiency','Average Grade 8 Math Proficiency',
                'Student Attendance Rate', 'Percent of Students Chronically Absent']].apply(pd.to_numeric, errors='coerce')

In [19]:
stud_achieve['sqr2020.xlsx'].columns = stud_achieve['sqr2020.xlsx'].iloc[2]
stud_achieve['sqr2019.xlsx'].columns = stud_achieve['sqr2019.xlsx'].iloc[2]
stud_achieve['sqr2018.xlsx'].columns = stud_achieve['sqr2018.xlsx'].iloc[0]
stud_achieve['sqr2017.xlsx'].columns = stud_achieve['sqr2017.xlsx'].iloc[0]
stud_achieve['sqr2016.xlsx'].columns = stud_achieve['sqr2016.xlsx'].iloc[0]
stud_achieve['sqr2015.xlsx'].columns = stud_achieve['sqr2015.xlsx'].iloc[0]

In [20]:
sqrcols = ["DBN", "N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 1st Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third"]
stud_achieve['sqr2020.xlsx'] = stud_achieve['sqr2020.xlsx'][sqrcols]
stud_achieve['sqr2019.xlsx'] = stud_achieve['sqr2019.xlsx'][sqrcols]
stud_achieve['sqr2018.xlsx'] = stud_achieve['sqr2018.xlsx'][sqrcols]
stud_achieve['sqr2017.xlsx'] = stud_achieve['sqr2017.xlsx'][sqrcols]

sqrcols = ["DBN", "Metric Value - Percentage Earning 10+ Credits in First Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in First Year",
           "Metric Value - Percentage Earning 10+ Credits in Second Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Second Year",
           "Metric Value - Percentage Earning 10+ Credits in Third Year",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Third Year"]
stud_achieve['sqr2015.xlsx'] = stud_achieve['sqr2015.xlsx'][sqrcols]
stud_achieve['sqr2016.xlsx'] = stud_achieve['sqr2016.xlsx'][sqrcols]

In [21]:
sqrcols = {"Metric Value - Percentage Earning 10+ Credits in First Year":"Metric Value - 10+ Credits in 1st Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in First Year": "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
           "Metric Value - Percentage Earning 10+ Credits in Second Year" : "Metric Value - 10+ Credits in 2nd Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Second Year": "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
           "Metric Value - Percentage Earning 10+ Credits in Third Year": "Metric Value - 10+ Credits in 3rd Year - All Students",
           "Metric Value - Lowest Third School, Percentage Earning 10+ Credits in Third Year": "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third"}
stud_achieve['sqr2015.xlsx'].rename(columns = sqrcols, inplace=True)
stud_achieve['sqr2016.xlsx'].rename(columns = sqrcols, inplace=True)

In [22]:
stud_achieve['sqr2020.xlsx']['SQR Year'] = '2020'
stud_achieve['sqr2019.xlsx']['SQR Year'] = '2019'
stud_achieve['sqr2018.xlsx']['SQR Year'] = '2018'
stud_achieve['sqr2017.xlsx']['SQR Year'] = '2017'
stud_achieve['sqr2016.xlsx']['SQR Year'] = '2016'
stud_achieve['sqr2015.xlsx']['SQR Year'] = '2015'

In [23]:
stud_achieve_df = pd.concat(stud_achieve).reset_index().drop(['level_0', 'level_1'], axis=1)
stud_achieve_df = stud_achieve_df[stud_achieve_df.DBN != 'DBN']

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


In [24]:
stud_achieve_df[["N count - 10+ Credits in 1st Year - All Students", "Metric Value - 10+ Credits in 1st Year - All Students",
                 "N count - 10+ Credits in 1st Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                 "N count - 10+ Credits in 2nd Year - All Students",
                 "Metric Value - 10+ Credits in 2nd Year - All Students",
                 "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                 "N count - 10+ Credits in 3rd Year - All Students",
                 "Metric Value - 10+ Credits in 3rd Year - All Students",
                 "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                 "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third"]] = stud_achieve_df[["N count - 10+ Credits in 1st Year - All Students",
                              "Metric Value - 10+ Credits in 1st Year - All Students",
                              "N count - 10+ Credits in 1st Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 1st Year - School's Lowest Third",
                              "N count - 10+ Credits in 2nd Year - All Students",
                              "Metric Value - 10+ Credits in 2nd Year - All Students",
                              "N count - 10+ Credits in 2nd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 2nd Year - School's Lowest Third",
                              "N count - 10+ Credits in 3rd Year - All Students",
                              "Metric Value - 10+ Credits in 3rd Year - All Students",
                              "N count - 10+ Credits in 3rd Year - School's Lowest Third",
                              "Metric Value - 10+ Credits in 3rd Year - School's Lowest Third"]].apply(pd.to_numeric, errors='coerce')

In [30]:
summary_df.merge(stud_achieve_df, how='inner', left_on = ['DBN', 'SQR Year'], right_on = ['DBN', 'SQR Year'])

Unnamed: 0,DBN,Enrollment,Percent Asian,Percent Black,Percent Hispanic,Percent White,Percent English Language Learners,Percent Students with Disabilities,Percent Self-Contained,Economic Need Index,...,Metric Value - 10+ Credits in 2nd Year - All Students,Metric Value - 10+ Credits in 2nd Year - School's Lowest Third,Metric Value - 10+ Credits in 3rd Year - All Students,Metric Value - 10+ Credits in 3rd Year - School's Lowest Third,N count - 10+ Credits in 1st Year - All Students,N count - 10+ Credits in 1st Year - School's Lowest Third,N count - 10+ Credits in 2nd Year - All Students,N count - 10+ Credits in 2nd Year - School's Lowest Third,N count - 10+ Credits in 3rd Year - All Students,N count - 10+ Credits in 3rd Year - School's Lowest Third
0,01M292,255.0,0.132,0.244,0.566,0.039,0.127,0.298,0.015,0.832,...,0.568,0.438,0.700,0.333,,,,,,
1,01M448,304.0,0.299,0.250,0.411,0.033,0.194,0.220,0.003,0.812,...,0.872,0.720,0.742,0.533,,,,,,
2,01M450,666.0,0.094,0.221,0.579,0.083,0.018,0.234,0.000,0.610,...,0.870,0.818,0.824,0.700,,,,,,
3,01M509,363.0,0.058,0.399,0.512,0.025,0.091,0.284,0.146,0.767,...,0.627,0.542,0.583,0.407,,,,,,
4,01M539,1735.0,0.385,0.133,0.180,0.286,0.002,0.043,0.000,0.257,...,0.984,0.945,0.984,0.962,,,,,,
5,01M696,545.0,0.171,0.172,0.187,0.453,0.000,0.011,0.000,0.314,...,0.952,0.882,0.963,0.974,,,,,,
6,02M047,169.0,0.012,0.284,0.645,0.053,0.047,0.254,0.083,0.733,...,0.875,0.800,0.881,0.947,,,,,,
7,02M135,193.0,0.057,0.285,0.622,0.026,0.073,0.197,0.010,0.713,...,0.663,0.438,,,,,,,,
8,02M139,193.0,0.036,0.249,0.611,0.088,0.078,0.295,0.016,0.714,...,0.880,0.815,,,,,,,,
9,02M280,97.0,0.072,0.320,0.433,0.155,0.010,0.227,0.000,0.558,...,,,,,,,,,,


In [32]:
stud_achieve_df

Unnamed: 0,DBN,Metric Value - 10+ Credits in 1st Year - All Students,Metric Value - 10+ Credits in 1st Year - School's Lowest Third,Metric Value - 10+ Credits in 2nd Year - All Students,Metric Value - 10+ Credits in 2nd Year - School's Lowest Third,Metric Value - 10+ Credits in 3rd Year - All Students,Metric Value - 10+ Credits in 3rd Year - School's Lowest Third,N count - 10+ Credits in 1st Year - All Students,N count - 10+ Credits in 1st Year - School's Lowest Third,N count - 10+ Credits in 2nd Year - All Students,N count - 10+ Credits in 2nd Year - School's Lowest Third,N count - 10+ Credits in 3rd Year - All Students,N count - 10+ Credits in 3rd Year - School's Lowest Third,SQR Year
1,01M292,0.857,0.867,0.568,0.438,0.700,0.333,,,,,,,2015
2,01M448,0.889,0.800,0.872,0.720,0.742,0.533,,,,,,,2015
3,01M450,0.946,0.828,0.870,0.818,0.824,0.700,,,,,,,2015
4,01M509,0.652,0.619,0.627,0.542,0.583,0.407,,,,,,,2015
5,01M539,0.975,0.912,0.984,0.945,0.984,0.962,,,,,,,2015
6,01M696,0.984,0.973,0.952,0.882,0.963,0.974,,,,,,,2015
7,02M047,1.000,1.000,0.875,0.800,0.881,0.947,,,,,,,2015
8,02M135,0.716,0.552,0.663,0.438,,,,,,,,,2015
9,02M139,0.868,0.730,0.880,0.815,,,,,,,,,2015
10,02M280,0.928,0.844,,,,,,,,,,,2015
