Skip to content

Commit

Permalink
Script for digesting TEA Excel data
Browse files Browse the repository at this point in the history
  • Loading branch information
johntyree committed Nov 15, 2015
1 parent 2030d50 commit 0a0fcba
Showing 1 changed file with 136 additions and 0 deletions.
136 changes: 136 additions & 0 deletions work.py
@@ -0,0 +1,136 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from __future__ import division, print_function

import os
from multiprocessing.pool import Pool

import pandas as pd

race_columns = [
u'Asian', u'Black_or_African_American', u'Hispanic_Latino',
u'Two_or_more_races', u'White', u'American_Indian_or_Alaska_Nat',
u'Native_Hawaiian_Other_Pacific'
]

categorical_columns = {'DISTNAME', 'CATEGORY', 'ACTION_GROUP'}


def as_categories(df, cols):
df = df.copy()
for col in categorical_columns.intersection(cols).intersection(df.columns):
df[col] = df[col].astype('category')
return df


def read_enrollment(fn):
df = pd.read_excel(fn, index_col='DISTRICT', na_values=-99)
# df = as_categories(df, categorical_columns)
df.rename(columns={"ECONOMIC": "TOTAL"}, inplace=True)
return df


def read_discipline(fn):
df = pd.read_excel(fn, na_values=-99)

# The values of discipline.CATEGORY are all columns in enrollment
# Remove special chars so they match exactly
df.CATEGORY.replace(
regex=True,
to_replace=r'[^0-9a-zA-Z]+',
value='_',
inplace=True
)

# df = as_categories(df, categorical_columns)

return df


def unique_actions(df):
return df.query('CATEGORY == "MALE" or CATEGORY == "FEMALE"')


def actions_by_district(df, fillna=1):
df = unique_actions(df.fillna(fillna))
actions = df.groupby(['DISTRICT', 'ACTION_GROUP']).ACTIONS.sum()
actions = actions.reset_index(name='ACTIONS')
actions = actions.pivot(
index='DISTRICT', columns='ACTION_GROUP', values='ACTIONS'
)
return actions.fillna(0)


def fraction_of_pop(enrollment, discipline, fillna=1):
actions = actions_by_district(discipline, fillna=fillna)
totals = enrollment['TOTAL']
frac = (actions.T / totals).T
frac = frac.fillna(0)
frac['TOTAL'] = enrollment['TOTAL']
return frac


def create_hdf5(fn):
pool = Pool(3)
files = [
'2012 District Enrollment data.xlsx',
'2013 District Enrollment Data.xlsx',
'2014 District Enrollment Data.xlsx',
]
result = pool.map(read_enrollment, files)
e = {12: result[0], 13: result[1], 14: result[2]}
files = [
'2012 District Discipline Data.xlsx',
'2013 District Discipline Data.xlsx',
'2014 District Discipline data.xlsx',
]
result = pool.map(read_discipline, files)
d = {12: result[0], 13: result[1], 14: result[2]}

pool.close()
pool.join()

for year in 12, 13, 14:
key = '20{}/district/'.format(year)
e[year].to_hdf(fn, key + 'enrollment')
d[year].to_hdf(fn, key + 'discipline')


def load_data(fn):
store = pd.HDFStore(fn)
e = {}
d = {}
for year in 12, 13, 14:
key = '20{}/district/'.format(year)
e[year] = store[key + 'enrollment']
d[year] = store[key + 'discipline']
return e, d


def output_reports(e, d):
for year in (12, 13, 14):
frac = fraction_of_pop(e[year], d[year])
name = "20{}_actions_per_capita.csv".format(year)
frac.to_csv(name)

d_all = pd.concat(d.values())

def drop(e):
return e.drop(['YEAR', 'DISTNAME'], axis=1).fillna(1)
e_all = drop(e[12]) + drop(e[13]) + drop(e[14])

frac = fraction_of_pop(e_all, d_all)
name = "all_actions_per_capita.csv"
frac.to_csv(name)


def main():
datafile = 'discipline.h5'
if not os.path.isfile(datafile):
create_hdf5(datafile)
e, d = load_data(datafile)
output_reports(e, d)


if __name__ == '__main__':
main()

0 comments on commit 0a0fcba

Please sign in to comment.