## California School Enrollment

This pipeline downloads historical school enrollment data from the California Department of Education (https://dq.cde.ca.gov/dataquest), saves each academic year's data to its own text file, and then combines the yearly files into a single CSV.
Data dictionary: https://www.cde.ca.gov/ds/sd/sd/fsenr.asp

In [1]:
import requests

import os

# Download one school-level enrollment file per academic year.
# Each entry in `years` is the calendar year in which the academic year
# starts: 2009 requests the "2009-10" file, up to 2018 for "2018-19".
years = list(range(2009, 2019))

# Create the output directory up front so the first write cannot fail just
# because 'data' does not exist yet.
os.makedirs('data', exist_ok=True)

for year in years:
    # cYear uses the "YYYY-YY" form, e.g. 2009 -> "2009-10".
    url = ('http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear={}-{}&cCat=Enrollment&cPage=filesenr.asp'
            .format(year, str(year+1)[2:]))
    outfilename = 'data/{}filesenr.asp.txt'.format(year)
    print('url, outfilename: ', url, outfilename)
    # Without a timeout a stalled connection would hang the loop forever.
    r = requests.get(url, allow_redirects=True, timeout=60)
    # Fail loudly on an HTTP error instead of saving an error page as data
    # that the later parsing step would try to read.
    r.raise_for_status()
    # The context manager guarantees the file is flushed and closed.
    with open(outfilename, 'wb') as outfile:
        outfile.write(r.content)

url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2009-10&cCat=Enrollment&cPage=filesenr.asp data/2009filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2010-11&cCat=Enrollment&cPage=filesenr.asp data/2010filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2011-12&cCat=Enrollment&cPage=filesenr.asp data/2011filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2012-13&cCat=Enrollment&cPage=filesenr.asp data/2012filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2013-14&cCat=Enrollment&cPage=filesenr.asp data/2013filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dlfile/dlfile.aspx?cLevel=School&cYear=2014-15&cCat=Enrollment&cPage=filesenr.asp data/2014filesenr.asp.txt
url, outfilename:  http://dq.cde.ca.gov/dataquest/dl

In [2]:
# Presumably the columns that identify one enrollment record
# (school code, ethnicity code, gender) -- confirm against the data dictionary.
key_columns = [
    'CDS_CODE',
    'ETHNIC',
    'GENDER',
]

# Per-grade enrollment count columns: kindergarten, grades 1 through 12,
# then the two ungraded columns.
grade_columns = (
    ['KDGN']
    + ['GR_{}'.format(grade) for grade in range(1, 13)]
    + ['UNGR_ELM', 'UNGR_SEC']
)

# Ethnicity descriptions indexed by the numeric ETHNIC code: the list position
# is the code, so the order of the entries must not change.
ethnic_lkup_list = [
    'Not reported',                      # 0
    'American Indian or Alaska Native',  # 1
    'Asian',                             # 2
    'Pacific Islander',                  # 3
    'Filipino',                          # 4
    'Hispanic or Latino',                # 5
    'African American',                  # 6
    'White',                             # 7
    'N/A',                               # 8 (placeholder; presumably unused)
    'Two or More Races',                 # 9
]

In [5]:
import os
import re
import pandas as pd

# Load every downloaded yearly enrollment file from `input_dir`, tag each row
# with the year taken from the file name, and stack the files into one frame.
input_dir = 'data'

# Raw string: '\d' inside a normal string literal is an invalid escape
# sequence and triggers a warning on current Python versions.
enrollment_filenames = sorted(
    filename for filename in os.listdir(input_dir)
    if re.match(r'\d+filesenr', filename)
)
if not enrollment_filenames:
    # pd.concat would also raise ValueError on an empty list, but with a
    # message that does not say what is actually missing.
    raise ValueError(
        'No enrollment files (<year>filesenr...) found in {!r}'.format(input_dir))

enrollment_dfs = []
for filename in enrollment_filenames:
    # Read CDS_CODE as text: it is an identifier, and numeric parsing would
    # drop any leading zeros from it.
    df = pd.read_csv(os.path.join(input_dir, filename), delimiter="\t",
                     dtype={'CDS_CODE': str})
    df['YEAR'] = int(filename[:4]) # add the year column (first 4 chars of the name)
    enrollment_dfs.append(df)

# ignore_index gives the combined frame unique row labels; otherwise every
# yearly file contributes its own 0..n-1 labels and they repeat across years.
combined_df = pd.concat(enrollment_dfs, axis=0, ignore_index=True)
print(combined_df.shape)

(1282384, 24)


In [6]:
# Replace each numeric ETHNIC code with its description; the code is used as
# the position into ethnic_lkup_list.
ethnic_codes = combined_df['ETHNIC']
combined_df['ETHNIC'] = ethnic_codes.map(lambda code: ethnic_lkup_list[code])

In [7]:
# Write the combined multi-year enrollment data to a single CSV file that sits
# next to the downloaded yearly files.
output_path = '{}/{}'.format(input_dir, 'school_enrollment_demographics.csv')
combined_df.to_csv(output_path)