In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Presumably meant as a shared random seed.
# NOTE(review): nothing in the visible notebook reads this value -- leaid_split seeds
# train_test_split with the year instead. Confirm whether it is still needed.
random_state = 42

In [2]:
# Prefix prepended to every input and output file path used in this notebook.
# The empty string makes all of those paths relative to the current working directory.
path = ''

# Data Loading

In [3]:
# COI table. geoid is read as text because it is later renamed to TRACT and used as a
# string merge key.
coi = pd.read_csv(
    path + 'data_raw/COI_raw.csv',
    dtype={'geoid': 'str'},
)

In [4]:
# Census population table. skiprows=[1] drops the second line of the file (presumably a
# descriptive label row -- confirm against the raw file). P001001 is read as text because
# preprocess_coi strips "(r...)" revision markers from it before casting to int.
pop = pd.read_csv(
    path + 'data_raw/DECENNIALPL2010.P1-Data.csv',
    skiprows=[1],
    dtype={'P001001': 'str', 'P001001ERR': 'str'},
)

In [5]:
# LEAID <-> TRACT cross reference. Both identifiers are read as text because they are used
# as string keys for splitting and merging further down.
cross_ref = pd.read_excel(
    path + 'data_raw/grf15_lea_tract.xlsx',
    dtype={'LEAID': 'str', 'TRACT': 'str'},
)

# Functions

In [6]:
# Separate the COI rows for 2010 from the COI rows for 2015
def coi_split_year(coi):
    """Return the 2010 rows and the 2015 rows of ``coi`` as two frames.

    Parameters
    ----------
    coi : pandas.DataFrame
        Frame with a ``year`` column.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(rows where year == 2010, rows where year == 2015)``. Rows with any
        other year appear in neither frame. The original index is kept.
    """
    year_values = coi['year']

    # One boolean-mask selection per requested year, in (2010, 2015) order
    per_year = tuple(coi.loc[year_values == wanted] for wanted in (2010, 2015))

    return per_year

In [7]:
# Split the LEAID/TRACT cross reference into train and test parts by school district, and
# save the chosen LEAIDs so the same split can be applied to the SEDA data
def leaid_split(cross_ref, year):
    """Split ``cross_ref`` rows into train/test subsets keyed on LEAID.

    The unique LEAIDs are divided 80/20 with ``train_test_split``. ``year`` is
    passed as the random state, so each year gets its own repeatable split.
    As a side effect the train and test LEAIDs are always written to
    ``data_inprocess/leaids_train_<year>.csv`` and
    ``data_inprocess/leaids_test_<year>.csv`` under the notebook-level ``path``.

    Parameters
    ----------
    cross_ref : pandas.DataFrame
        Cross reference with an ``LEAID`` column.
    year : int
        Used as the random state and in the output file names.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train rows, test rows)`` of ``cross_ref``.
    """
    district_ids = cross_ref['LEAID'].unique()

    # Seeding with the year gives a different split per year
    ids_train, ids_test = train_test_split(district_ids, test_size=0.2, random_state=year)

    # Persist both ID lists (train first, then test) for use with the SEDA data
    for file_stem, ids in (('data_inprocess/leaids_train_', ids_train),
                           ('data_inprocess/leaids_test_', ids_test)):
        out_file = path + file_stem + str(year) + '.csv'
        pd.Series(ids).to_csv(out_file)

    # Keep the cross_ref rows whose district landed in each subset
    in_train = cross_ref['LEAID'].isin(ids_train)
    in_test = cross_ref['LEAID'].isin(ids_test)

    return cross_ref[in_train], cross_ref[in_test]

In [8]:
# Preprocess cross_ref to calculate percentage of tract land area per school district
def preprocess_cross_ref(cross_ref):
    """Add each row's share of its tract's total land area.

    The input frame is not modified; a new frame is returned. (Previously the
    two columns were written into the caller's frame, which silently changed
    the notebook-level ``cross_ref``.)

    Parameters
    ----------
    cross_ref : pandas.DataFrame
        Frame with ``TRACT`` and ``LANDAREA`` columns.

    Returns
    -------
    pandas.DataFrame
        Copy of ``cross_ref`` with two extra columns:
        ``LANDAREA_TOT`` (sum of ``LANDAREA`` over all rows sharing the row's
        ``TRACT``) and ``LANDAREA_PERC`` (``LANDAREA / LANDAREA_TOT``, a
        fraction rather than a 0-100 percentage).
    """
    # Total land area of the tract, broadcast back onto every row of that tract
    tract_total = cross_ref.groupby('TRACT')['LANDAREA'].transform('sum')

    # assign() returns a new frame, so the caller's frame stays untouched
    return cross_ref.assign(
        LANDAREA_TOT=tract_total,
        LANDAREA_PERC=cross_ref['LANDAREA'] / tract_total,
    )

In [9]:
# Process and join COI data with census population data to get total population
# to use in weighting COI indicators
def preprocess_coi(coi, pop):
    """Attach the census total population of each tract to the COI rows.

    Neither input frame is modified. (Previously ``TRACT`` and ``pop_total``
    were written into the caller's ``pop`` frame.)

    Parameters
    ----------
    coi : pandas.DataFrame
        COI rows with ``geoid`` and ``pop`` columns.
    pop : pandas.DataFrame
        Census rows with string columns ``GEO_ID`` and ``P001001``.

    Returns
    -------
    pandas.DataFrame
        ``coi`` with ``pop`` renamed to ``pop_child`` and ``geoid`` renamed to
        ``TRACT``, inner-merged on ``TRACT`` with an int64 ``pop_total``
        column. Tracts missing from either table are dropped by the merge.
    """
    # Strip the '1400000US' prefix from GEO_ID (literal match, not a regex)
    tract_ids = pop['GEO_ID'].str.replace('1400000US', '', regex=False)

    # Remove the "(r<digits>)" revision marker, then cast to int. The cast is applied to the
    # Series before it becomes a column: assigning through .loc[:, col] sets values in place
    # on pandas >= 2.0 and would leave the column as object dtype.
    totals = pop['P001001'].str.replace(r'\(r[0-9]+\)', '', regex=True).astype('int64')

    # New two-column frame; assign() works on a copy so `pop` itself is left alone
    pop_tract = pop.assign(TRACT=tract_ids, pop_total=totals)[['TRACT', 'pop_total']]

    # Rename COI columns to pop_child (as opposed to pop_total from the census) and
    # geoid to TRACT for joining
    coi = coi.rename(columns={'pop': 'pop_child', 'geoid': 'TRACT'})

    # Merge COI and census population
    return coi.merge(pop_tract, on='TRACT')

In [10]:
# Build train and test COI frames by splitting on school district
def train_test_split_coi(cross_ref, coi, pop, year):
    """Produce tract-level COI rows for the train districts and the test districts.

    Steps, in order: add land-area shares to ``cross_ref``
    (``preprocess_cross_ref``), split it by LEAID (``leaid_split``, which also
    writes the LEAID lists to disk), attach census population to ``coi``
    (``preprocess_coi``), then inner-merge each cross_ref part with the COI
    rows on ``TRACT``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train frame, test frame)``.
    """
    # Land-area share of every tract/district pairing
    xref_with_share = preprocess_cross_ref(cross_ref)

    # (train part, test part) of the cross reference, split by LEAID
    xref_parts = leaid_split(xref_with_share, year)

    # COI rows carrying pop_child / pop_total and keyed by TRACT
    coi_with_pop = preprocess_coi(coi, pop)

    # Inner-join each cross_ref part with the COI rows
    train_part, test_part = (part.merge(coi_with_pop, on='TRACT') for part in xref_parts)

    return train_part, test_part

In [21]:
# Process without splitting for various clustering methods
def process_coi_all(cross_ref, coi, pop, write=False):
    """Impute, scale, weight and group all COI rows by district, with no train/test split.

    Parameters
    ----------
    cross_ref : pandas.DataFrame
        LEAID/TRACT cross reference (passed to ``preprocess_cross_ref``).
    coi : pandas.DataFrame
        COI rows (passed to ``preprocess_coi``).
    pop : pandas.DataFrame
        Census population rows (passed to ``preprocess_coi``).
    write : bool, default False
        When true, the result is also written to
        ``data_cleaned/coi_district_grouped.csv`` under the notebook-level ``path``.

    Returns
    -------
    pandas.DataFrame
        Output of ``group_coi``: one row per (LEAID, NAME_LEA15, year).
    """
    # Preprocess the cross_ref data to get percent land area of each tract in a
    # given school district
    cross_ref_proc = preprocess_cross_ref(cross_ref)

    # Preprocess the COI data
    coi_proc = preprocess_coi(coi, pop)

    # Merge COI data with cross_ref
    coi_dist_all = cross_ref_proc.merge(coi_proc, on='TRACT')

    # Identify the numeric indicator columns
    ind_column_names = ['ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD', 'ED_MATH',
                        'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE', 'ED_PRXHQECE',
                        'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS', 'HE_OZONE', 'HE_PM25',
                        'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI', 'SE_POVRATE', 'SE_PUBLIC',
                        'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT', 'SE_JOBPROX', 'SE_SINGLE']

    # Non-indicator columns
    non_ind_column_names = ['LEAID', 'NAME_LEA15', 'TRACT', 'COUNT', 'LANDAREA_PERC', 'year', 'pop_child', 'pop_total']

    # Make a pipeline for processing
    # Use median for imputer strategy because some of the variable distributions are highly skewed
    pipe = Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())])

    # Fit/transform just the numeric indicator columns (fit on every row: there is no
    # held-out set in this variant)
    X_transformed = pipe.fit_transform(coi_dist_all[ind_column_names])

    # Reconstitute the dataframe with transformed data. Reusing the source index keeps the
    # rows aligned explicitly, instead of relying on the merge above having produced a
    # default 0..n-1 index.
    X_trans_df = pd.DataFrame(X_transformed, columns=ind_column_names, index=coi_dist_all.index)

    # Put the non-indicator columns and the transformed indicators side by side
    coi_dist_prep = pd.concat([coi_dist_all[non_ind_column_names], X_trans_df], axis=1)

    # Weight indicators by total population and land area
    coi_dist_prep = weight_coi(coi_dist_prep, ind_column_names)

    # Group COI data by school district
    coi_grp_dist = group_coi(coi_dist_prep, ind_column_names)

    # Write the data out to csv
    if write:
        filename = path + 'data_cleaned/coi_district_grouped.csv'
        coi_grp_dist.to_csv(filename)

    return coi_grp_dist

In [12]:
# Create population-weighted averages for COI indicators, scaled by the percent of tract land area
# in the school district
def weight_coi(coi_indicators, ind_column_names):
    """Multiply each indicator by the tract population that falls inside the district.

    The input frame is not modified. (Previously ``pop_scaled`` was added to
    the caller's frame as a side effect.)

    Parameters
    ----------
    coi_indicators : pandas.DataFrame
        Must contain ``LEAID``, ``NAME_LEA15``, ``TRACT``, ``year``,
        ``pop_child``, ``pop_total``, ``LANDAREA_PERC`` and every column named
        in ``ind_column_names``.
    ind_column_names : list of str
        Indicator columns to weight.

    Returns
    -------
    pandas.DataFrame
        Columns ``LEAID``, ``NAME_LEA15``, ``TRACT``, ``year``, ``pop_child``,
        ``pop_total``, ``pop_scaled`` followed by the indicator columns, where
        ``pop_scaled = pop_total * LANDAREA_PERC`` and each indicator has been
        multiplied by ``pop_scaled``. Other input columns are not carried over.
    """
    # Non-indicator columns carried through to the result (pop_scaled is appended below)
    carried_cols = ['LEAID', 'NAME_LEA15', 'TRACT', 'year', 'pop_child', 'pop_total']

    # Scale the total population by the fraction of tract land area that is in a given
    # school district
    pop_scaled = coi_indicators['pop_total'] * coi_indicators['LANDAREA_PERC']

    # Weight the indicators row by row with the scaled population
    weighted = coi_indicators[ind_column_names].multiply(pop_scaled, axis='index')

    # assign() builds a new frame, so the caller's frame does not gain a pop_scaled column
    carried = coi_indicators.loc[:, carried_cols].assign(pop_scaled=pop_scaled)

    # Both pieces share the input's index, so a column-wise concat lines them up row for row
    return pd.concat([carried, weighted], axis=1)

In [13]:
# Group weighted indicators into school districts, then divide by total population to get a
# weighted average
def group_coi(coi_district, ind_column_names):
    """Collapse tract-level weighted indicators into one row per district and year.

    Parameters
    ----------
    coi_district : pandas.DataFrame
        Output of ``weight_coi``: must contain ``LEAID``, ``NAME_LEA15``,
        ``year``, ``pop_scaled`` and the columns in ``ind_column_names``.
    ind_column_names : list of str
        Indicator columns, already multiplied by ``pop_scaled``.

    Returns
    -------
    pandas.DataFrame
        One row per (``LEAID``, ``NAME_LEA15``, ``year``). Population columns
        hold group sums; each indicator holds its group sum divided by the
        group's summed ``pop_scaled``. The ``TRACT`` column is not included.
    """
    district_keys = ['LEAID', 'NAME_LEA15', 'year']

    # TRACT is a string identifier, not a quantity. With pandas >= 2.0 (numeric_only=False by
    # default) summing it would concatenate every tract ID in the district into one string,
    # so drop it before aggregating.
    summable = coi_district.drop(columns=['TRACT'], errors='ignore')

    # Group by school district ID/name and year
    coi_grouped = summable.groupby(district_keys).sum()

    # Divide the summed weighted indicators by the district's summed scaled population
    coi_grouped[ind_column_names] = coi_grouped[ind_column_names].divide(
        coi_grouped['pop_scaled'], axis='index'
    )

    # Turn the group keys back into ordinary columns
    return coi_grouped.reset_index()

In [14]:
# Shared tail of the train and test processing in process_coi
def _finish_coi_subset(coi_dist, X_transformed, ind_column_names, non_ind_column_names):
    """Attach transformed indicators to the identifier columns, then weight and group them."""
    # Reusing the source index keeps rows aligned explicitly, instead of relying on the
    # upstream merge having produced a default 0..n-1 index
    X_trans_df = pd.DataFrame(X_transformed, columns=ind_column_names, index=coi_dist.index)

    # Non-indicator columns and transformed indicators side by side
    coi_dist_prep = pd.concat([coi_dist[non_ind_column_names], X_trans_df], axis=1)

    # Weight indicators by total population and land area
    coi_dist_prep = weight_coi(coi_dist_prep, ind_column_names)

    # Group COI data by school district
    return group_coi(coi_dist_prep, ind_column_names)


# Complete processing of train/test data
def process_coi(coi, pop, cross_ref, year, write=False):
    """Build district-level, scaled COI indicators for a train and a test split.

    The impute/scale pipeline is fitted on the training rows only and then
    applied unchanged to the test rows.

    Parameters
    ----------
    coi : pandas.DataFrame
        COI rows for one year.
    pop : pandas.DataFrame
        Census population rows.
    cross_ref : pandas.DataFrame
        LEAID/TRACT cross reference.
    year : int
        Forwarded to ``train_test_split_coi`` and used in output file names.
    write : bool, default False
        When true, the two results are also written to
        ``data_cleaned/coi_district_grouped_train_<year>.csv`` and
        ``data_cleaned/coi_district_grouped_test_<year>.csv`` under the
        notebook-level ``path``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train districts, test districts)``, each the output of ``group_coi``.
    """
    # Preprocess cross-ref, split cross-ref LEAIDs, preprocess COI, split COI/cross-ref into train/test
    coi_dist_train, coi_dist_test = train_test_split_coi(cross_ref, coi, pop, year)

    # Identify the numeric indicator columns
    ind_column_names = ['ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD', 'ED_MATH',
                        'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE', 'ED_PRXHQECE',
                        'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS', 'HE_OZONE', 'HE_PM25',
                        'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI', 'SE_POVRATE', 'SE_PUBLIC',
                        'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT', 'SE_JOBPROX', 'SE_SINGLE']

    # Non-indicator columns
    non_ind_column_names = ['LEAID', 'NAME_LEA15', 'TRACT', 'COUNT', 'LANDAREA_PERC', 'year', 'pop_child', 'pop_total']

    # Make a pipeline for processing
    # Use median for imputer strategy because some of the variable distributions are highly skewed
    pipe = Pipeline([('impute', SimpleImputer(strategy='median')), ('scale', StandardScaler())])

    # Process training data: fit the pipeline here, and only here
    X_transformed = pipe.fit_transform(coi_dist_train[ind_column_names])
    coi_grp_dist_train = _finish_coi_subset(
        coi_dist_train, X_transformed, ind_column_names, non_ind_column_names
    )

    # Write the training data out to csv
    if write:
        filename = path + 'data_cleaned/coi_district_grouped_train_' + str(year) + '.csv'
        coi_grp_dist_train.to_csv(filename)

    # Process test data: transform only, using the pipeline fitted on the training rows
    X_transformed = pipe.transform(coi_dist_test[ind_column_names])
    coi_grp_dist_test = _finish_coi_subset(
        coi_dist_test, X_transformed, ind_column_names, non_ind_column_names
    )

    # Write the test data out to csv
    if write:
        filename = path + 'data_cleaned/coi_district_grouped_test_' + str(year) + '.csv'
        coi_grp_dist_test.to_csv(filename)

    return coi_grp_dist_train, coi_grp_dist_test

# Processing

In [15]:
# Split COI by year (2010, 2015)
coi_2010, coi_2015 = coi_split_year(coi)

# The 2015 frame used to be bound only to the misspelled name `ooi_2015`. Keep that name as
# an alias of the same frame so cells that still reference it continue to run.
ooi_2015 = coi_2015

In [16]:
# Build and save the district-level train/test frames, one process_coi call per year
coi_grp_dist_train_2010, coi_grp_dist_test_2010 = process_coi(
    coi_2010, pop, cross_ref, year=2010, write=True
)
coi_grp_dist_train_2015, coi_grp_dist_test_2015 = process_coi(
    ooi_2015, pop, cross_ref, year=2015, write=True
)

In [22]:
# District-level COI with no train/test split, for DBSCAN clustering (requires all data).
# The unsplit `coi` frame is passed, so rows for every year are processed together.
coi_all = process_coi_all(
    cross_ref,
    coi,
    pop,
    write=True,
)