In [1]:
import numpy as np
import pandas as pd
import pickle

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, HistGradientBoostingRegressor
from sklearn.model_selection import GridSearchCV, KFold
from sklearn.inspection import permutation_importance

import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

pio.renderers.default='iframe'

random_state = 42
n_jobs = 8

In [2]:
#sklearn.__version__

# Data Loading

In [3]:
path = ''

In [4]:
coi_train_2010 = pd.read_csv(path+'data_cleaned/coi_district_grouped_train_2010.csv', index_col=0)
coi_train_2015 = pd.read_csv(path+'data_cleaned/coi_district_grouped_train_2015.csv', index_col=0)
coi_test_2010 = pd.read_csv(path+'data_cleaned/coi_district_grouped_test_2010.csv', index_col=0)
coi_test_2015 = pd.read_csv(path+'data_cleaned/coi_district_grouped_test_2015.csv', index_col=0)

In [5]:
seda_train_2010 = pd.read_csv(path+'data_cleaned/seda_train_2010.csv', index_col=0)
seda_train_2015 = pd.read_csv(path+'data_cleaned/seda_train_2015.csv', index_col=0)
seda_test_2010 = pd.read_csv(path+'data_cleaned/seda_test_2010.csv', index_col=0)
seda_test_2015 = pd.read_csv(path+'data_cleaned/seda_test_2015.csv', index_col=0)

In [10]:
with open(path+'data_cleaned/school_data_for_map.pkl', 'rb') as f:
    seda_map = pickle.load(f)
    
seda_map['latitude'] = seda_map['latitude'].astype(float)
seda_map['longitude'] = seda_map['longitude'].astype(float)

In [None]:
coi_train_2010.columns

Index(['LEAID', 'NAME_LEA15', 'year', 'pop_child', 'pop_total', 'pop_scaled',
       'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD',
       'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE',
       'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
       'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
       'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
       'SE_JOBPROX', 'SE_SINGLE'],
      dtype='object')

In [None]:
seda_train_2010.columns

Index(['fips', 'stateabb', 'sedalea', 'sedaleaname', 'subject', 'grade',
       'year', 'cs_mn_all', 'cs_mnse_all', 'totgyb_all', 'cs_mn_asn',
       'cs_mnse_asn', 'totgyb_asn', 'cs_mn_blk', 'cs_mnse_blk', 'totgyb_blk',
       'cs_mn_ecd', 'cs_mnse_ecd', 'totgyb_ecd', 'cs_mn_fem', 'cs_mnse_fem',
       'totgyb_fem', 'cs_mn_hsp', 'cs_mnse_hsp', 'totgyb_hsp', 'cs_mn_mal',
       'cs_mnse_mal', 'totgyb_mal', 'cs_mn_mfg', 'cs_mnse_mfg', 'totgyb_mfg',
       'cs_mn_mtr', 'cs_mnse_mtr', 'totgyb_mtr', 'cs_mn_nam', 'cs_mnse_nam',
       'totgyb_nam', 'cs_mn_nec', 'cs_mnse_nec', 'totgyb_nec', 'cs_mn_neg',
       'cs_mnse_neg', 'totgyb_neg', 'cs_mn_wag', 'cs_mnse_wag', 'totgyb_wag',
       'cs_mn_wbg', 'cs_mnse_wbg', 'totgyb_wbg', 'cs_mn_whg', 'cs_mnse_whg',
       'totgyb_whg', 'cs_mn_wht', 'cs_mnse_wht', 'totgyb_wht', 'cs_mn_wmg',
       'cs_mnse_wmg', 'totgyb_wmg', 'cs_mn_wng', 'cs_mnse_wng', 'totgyb_wng'],
      dtype='object')

# Functions

In [12]:
def make_clusters(coi, n_clusters, drop_outliers=False):

    if drop_outliers:
        coi = coi[coi['pop_child'] < 550000]
    
    km_cols = ['pop_child', 'pop_total', 'pop_scaled', 
               'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD',
               'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE', 
               'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
               'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
               'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
               'SE_JOBPROX', 'SE_SINGLE']
    
    km_df = coi.loc[:, km_cols]
    
    scaler = StandardScaler().fit(km_df.iloc[:, :3])
    km_df.iloc[:, :3] = scaler.transform(km_df.iloc[:, :3])

    pca = PCA(n_components=11, random_state=random_state).fit(km_df)
    pca_arr = pca.transform(km_df)
        
    pca_df = pd.concat([coi.reset_index(drop = True), pd.DataFrame(pca_arr)],axis = 1)
    pca_df.columns.values[-11:] = ['Component 1','Component 2','Component 3','Component 4','Component 5','Component 6',
                                   'Component 7','Component 8','Component 9','Component 10','Component 11']

    kmeans = KMeans(n_clusters=n_clusters, n_init=20, algorithm='lloyd', max_iter=1000, random_state=random_state).fit(pca_arr)
    
    clusters = kmeans.predict(pca_arr)
    
    df = pd.concat([pca_df, pd.Series(clusters, name='cluster')], axis=1)
    
    return scaler, pca, kmeans, df


In [13]:
def merge_data(coi, seda):
    df = coi.merge(seda, left_on='LEAID', right_on='sedalea')
    df = df.rename(columns={'year_x': 'coi_year', 'year_y': 'seda_year'})
    df = df.replace({'rla': 0, 'mth': 1})
    
    return df

In [14]:
def select_cols_X_y(df, cluster=False, col='cs_mn_all'):
    description_cols = ['LEAID', 'NAME_LEA15', 'fips', 'stateabb', 'sedalea', 'sedaleaname']
                        
    X_cols = ['coi_year', 'pop_child', 'pop_total', 'pop_scaled',
              'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD', 
              'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE',
              'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
              'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
              'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
              'SE_JOBPROX', 'SE_SINGLE', 'subject', 'grade', 'seda_year']

    if cluster:
        X_cols.append('cluster')
        
    y_col = col
    
    train = df[df[col].notna()]
    
    X = train[X_cols]
    y = train[y_col]
    desc_df = train[description_cols]
    
    return X, y, desc_df, train

In [15]:
def fit_hgbr(df, cluster_num):
    # If cluster selected, filter dataframe.  Else, use all clusters
    if cluster_num is None:
        train_clust = df.copy()
    else:
        train_clust = df[df['cluster'] == cluster_num]

    X_df, y, desc, total_df = select_cols_X_y(train_clust, cluster=True, col='cs_mn_all')

    X = X_df.to_numpy()

    hgbr = HistGradientBoostingRegressor(learning_rate=0.1, max_depth=4, random_state=random_state)
    hgbr.fit(X, y)
    
    return hgbr, hgbr.score(X, y)

In [16]:
def perm_importance(model, df_test, cluster_num=None, n_repeats=20, top_n=None):
    
    # If cluster selected, filter dataframe.  Else, use all clusters
    if cluster_num is None:
        test_clust = df_test.copy()
        cluster_name = 'All Clusters'
    else:
        test_clust = df_test[df_test['cluster'] == cluster_num]
        cluster_name = 'Cluster ' + str(cluster_num + 1)

    # Split into X and y
    X_test_df, y_test, desc, total_df = select_cols_X_y(test_clust, cluster=True, col='cs_mn_all')

    X_test = X_test_df.to_numpy()
    
    # Score the trained model on X_test and y_test
    score = model.score(X_test, y_test)
    
    # Use the trained model and X_test and y_test to calculate feature importance
    perm_imp = permutation_importance(model, X_test, y_test, n_repeats=n_repeats, random_state=random_state, n_jobs=n_jobs)
    sorted_idx = perm_imp.importances_mean.argsort()[::-1]
    
    # If selected, limit to top n features
    if top_n is not None:
        sorted_idx = sorted_idx[:top_n]
    
    # Human-readable feature labels
    labels = ['COI Year', 'Pop - Child', 'Pop - Total', 'Pop - Scaled to District',
              'AP Enroll', 'Adult Ed Attainment', 'College Enroll', 'Early Child Ed Enroll', 'High School Grad Rate', 
              '3rd Grade Math Proficiency', '3rd Grade Reading Proficiency', 'School Poverty', 'Teacher Experience', 'Early Child Ed Centers',
              'High-Quality Early Child Ed', 'Healthy Food Access', 'Green Space Access', 'Extreme Heat Exposure', 'Health Insurance Coverage', 
              'Ozone Concentration', 'Airborne Microparticles', 'Housing Vacancy Rate', 'Walkablity', 'Hazardous Waste', 'Industrial Pollutants',
              'Poverty Rate', 'Public Assistance Rate', 'Homeownership Rate', 'High-Skill Employment', 'Median Household Income', 'Employment Rate',
              'Commute Duration', 'Single-Headed Households', 'Subject (Read/Math)', 'Grade', 'School Year', 'Cluster ID']
    
    # Long-form dataframe of feature importances for later plotting
    df = pd.DataFrame(perm_imp.importances[sorted_idx].T, columns=np.array(labels)[sorted_idx])
    df_melt = df.melt(var_name='Variable', value_name='Importance')
    df_melt['Cluster Name'] = cluster_name

    return score, perm_imp, sorted_idx, df_melt

In [17]:
def table_feat_imp(perm_imp, sorted_idx, top=5):
    # Human-readable feature labels
    labels = ['COI Year', 'Pop - Child', 'Pop - Total', 'Pop - Scaled to District',
              'AP Enroll', 'Adult Ed Attainment', 'College Enroll', 'Early Child Ed Enroll', 'High School Grad Rate', 
              '3rd Grade Math Proficiency', '3rd Grade Reading Proficiency', 'School Poverty', 'Teacher Experience', 'Early Child Ed Centers',
              'High-Quality Early Child Ed', 'Healthy Food Access', 'Green Space Access', 'Extreme Heat Exposure', 'Health Insurance Coverage', 
              'Ozone Concentration', 'Airborne Microparticles', 'Housing Vacancy Rate', 'Walkablity', 'Hazardous Waste', 'Industrial Pollutants',
              'Poverty Rate', 'Public Assistance Rate', 'Homeownership Rate', 'High-Skill Employment', 'Median Household Income', 'Employment Rate',
              'Commute Duration', 'Single-Headed Households', 'Subject (Read/Math)', 'Grade', 'School Year', 'Cluster ID']
    
    # Limit to top <selected> features
    sorted_idx_top = sorted_idx[:top]
    
    # Table of top <selected> features
    imp_labels = np.array(labels)[sorted_idx_top]
    imp_means = [f'{x:.3f}' for x in perm_imp.importances_mean[sorted_idx_top]]
    imp_std = [f'+/- {x:.3f}' for x in perm_imp.importances_std[sorted_idx_top]]

    fig = go.Figure(data=[go.Table(columnwidth = [300, 100, 100],
                                   header=dict(values=['Variable', 'Importance - Mean', 'Importance - STD'], align='left'), 
                                   cells=dict(values=[imp_labels, imp_means, imp_std], align='left'))])
    fig.update_layout(
        height=300,
        width=500,
        showlegend=False,
        title_text='Feature Importance for All-Cluster Model',
    )
    
    return fig

# Clustered 2015 Models

In [18]:
scaler, pca, kmeans, coi_train_clust_2015 = make_clusters(coi_train_2015, n_clusters=4, drop_outliers=False)
coi_train_clust_2015.groupby('cluster')['LEAID'].count()

cluster
0    4710
1    2539
2       4
3    3594
Name: LEAID, dtype: int64

In [19]:
coi_train_clust_2015[coi_train_clust_2015['cluster'] == 2].iloc[:, :6]

Unnamed: 0,LEAID,NAME_LEA15,year,pop_child,pop_total,pop_scaled
936,622710,Los Angeles Unified School District,2015,1085808,4793985,4541622.0
1590,1200390,Dade County School District,2015,553299,2496435,2496435.0
2038,1709930,Chicago Public School District 299,2015,595956,2735079,2697662.0
6756,3620580,New York City Department Of Education,2015,1794644,8175133,8175133.0


In [20]:
train_2015 = merge_data(coi_train_clust_2015, seda_train_2015)

X_df, y, desc, train_2015_df = select_cols_X_y(train_2015, cluster=True, col='cs_mn_all')

X = X_df.to_numpy()

In [21]:
lm = LinearRegression()
lm.fit(X, y)
lm.score(X, y)

0.5516249041654944

In [22]:
lasso = Lasso(alpha=0.5, random_state=random_state)
lasso.fit(X, y)
lasso.score(X, y)

0.0025269158801806135

In [23]:
ridge = Ridge(alpha=1, random_state=random_state)
ridge.fit(X, y)
ridge.score(X, y)

0.5516249041351623

In [24]:
rfr7 = RandomForestRegressor(n_estimators=100, max_depth=7, n_jobs=n_jobs, random_state=random_state)
rfr7.fit(X, y)
rfr7.score(X, y)

0.6022567411502213

In [25]:
gbr2_1 = GradientBoostingRegressor(learning_rate=1, n_estimators=100, max_depth=2, random_state=random_state)
gbr2_1.fit(X, y)
gbr2_1.score(X, y)

0.6212151719113779

In [26]:
hgrb1_4 = HistGradientBoostingRegressor(learning_rate=0.1, max_depth=4, random_state=random_state)
hgrb1_4.fit(X, y)
hgrb1_4.score(X, y)

0.628602871091348

In [None]:
p_grid = {'learning_rate': [0.1, 0.5, 1, 1.5, 2], 'max_depth': [1, 2, 3, 4, 5]}
hgrb = HistGradientBoostingRegressor()

cv_best = {}

# Loop for each trial
for i in range(5):
    clf = GridSearchCV(estimator=hgrb, param_grid=p_grid, scoring='r2', cv=5, n_jobs=n_jobs)
    clf.fit(X, y)
    cv_best[i] = (clf.best_score_, clf.best_params_)

cv_best

{0: (0.5211301793049657, {'learning_rate': 0.1, 'max_depth': 4}),
 1: (0.5214743455551012, {'learning_rate': 0.1, 'max_depth': 4}),
 2: (0.5205631515621361, {'learning_rate': 0.1, 'max_depth': 3}),
 3: (0.520965856990057, {'learning_rate': 0.1, 'max_depth': 5}),
 4: (0.5212105123252881, {'learning_rate': 0.1, 'max_depth': 3})}

In [27]:
hgbr_0, hgbr_0_train_score = fit_hgbr(train_2015, 0)
hgbr_0_train_score

0.36637956022262863

In [28]:
hgbr_1, hgbr_1_train_score = fit_hgbr(train_2015, 1)
hgbr_1_train_score

0.7637803048526116

In [29]:
hgbr_2, hgbr_2_train_score = fit_hgbr(train_2015, 2)
hgbr_2_train_score

0.900445518260216

In [30]:
hgbr_3, hgbr_3_train_score = fit_hgbr(train_2015, 3)
hgbr_3_train_score

0.5170779029092183

In [31]:
hgbr_all, hgbr_all_train_score = fit_hgbr(train_2015, None)
hgbr_all_train_score

0.628602871091348

In [32]:
train_2015_df.loc[:, 'predictions'] = hgbr_all.predict(X)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [None]:
list(train_2015_df.columns)

['LEAID',
 'NAME_LEA15',
 'coi_year',
 'pop_child',
 'pop_total',
 'pop_scaled',
 'ED_APENR',
 'ED_ATTAIN',
 'ED_COLLEGE',
 'ED_ECENROL',
 'ED_HSGRAD',
 'ED_MATH',
 'ED_READING',
 'ED_SCHPOV',
 'ED_TEACHXP',
 'ED_PRXECE',
 'ED_PRXHQECE',
 'HE_FOOD',
 'HE_GREEN',
 'HE_HEAT',
 'HE_HLTHINS',
 'HE_OZONE',
 'HE_PM25',
 'HE_VACANCY',
 'HE_WALK',
 'HE_SUPRFND',
 'HE_RSEI',
 'SE_POVRATE',
 'SE_PUBLIC',
 'SE_HOME',
 'SE_OCC',
 'SE_MHE',
 'SE_EMPRAT',
 'SE_JOBPROX',
 'SE_SINGLE',
 'Component 1',
 'Component 2',
 'Component 3',
 'Component 4',
 'Component 4',
 'Component 6',
 'Component 7',
 'Component 8',
 'Component 9',
 'Component 10',
 'Component 11',
 'cluster',
 'fips',
 'stateabb',
 'sedalea',
 'sedaleaname',
 'subject',
 'grade',
 'seda_year',
 'cs_mn_all',
 'cs_mnse_all',
 'totgyb_all',
 'cs_mn_asn',
 'cs_mnse_asn',
 'totgyb_asn',
 'cs_mn_blk',
 'cs_mnse_blk',
 'totgyb_blk',
 'cs_mn_ecd',
 'cs_mnse_ecd',
 'totgyb_ecd',
 'cs_mn_fem',
 'cs_mnse_fem',
 'totgyb_fem',
 'cs_mn_hsp',
 'cs_mnse_

In [33]:
exclude_cols = ['Component 3', 'Component 4', 'Component 5', 'Component 6', 'Component 7',
                'Component 8', 'Component 9', 'Component 10', 'Component 11', 
                'cs_mnse_all', 'cs_mnse_asn', 'cs_mnse_blk', 'cs_mnse_ecd', 'cs_mnse_fem',
                'cs_mnse_hsp', 'cs_mnse_mal', 'cs_mnse_mfg', 'cs_mnse_mtr', 'cs_mnse_nam',
                'cs_mnse_nec', 'cs_mnse_neg', 'cs_mnse_wag', 'cs_mnse_wbg', 'cs_mnse_whg',
                'cs_mnse_wht', 'cs_mnse_wmg', 'cs_mnse_wng', 'totgyb_wng',
                'ED_APENR', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD', 'ED_TEACHXP', 'ED_PRXECE', 
                'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
                'HE_OZONE', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'SE_POVRATE', 'SE_HOME', 
                'SE_OCC', 'SE_MHE', 'SE_EMPRAT', 'SE_JOBPROX']

display_df_train = train_2015_df[train_2015_df['cs_mn_all'].notna()]
display_df_train = display_df_train[display_df_train.columns[~display_df_train.columns.isin(exclude_cols)]]

# Test Set

In [34]:
km_cols = ['pop_child', 'pop_total', 'pop_scaled', 
           'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD',
           'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE', 
           'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
           'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
           'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
           'SE_JOBPROX', 'SE_SINGLE']

km_df = coi_test_2015.loc[:, km_cols]

km_df.iloc[:, :3] = scaler.transform(km_df.iloc[:, :3])

pca_arr = pca.transform(km_df)

pca_df = pd.concat([coi_test_2015.reset_index(drop = True), pd.DataFrame(pca_arr)],axis = 1)
pca_df.columns.values[-11:] = ['Component 1','Component 2','Component 3','Component 4','Component 5','Component 6',
                               'Component 7','Component 8','Component 9','Component 10','Component 11']

clusters = kmeans.predict(pca_arr)

coi_test_clust_2015 = pd.concat([pca_df, pd.Series(clusters, name='cluster')], axis=1)

test_2015 = merge_data(coi_test_clust_2015, seda_test_2015)

X_test_df, y_test, desc_test, test_2015_df = select_cols_X_y(test_2015, cluster=True, col='cs_mn_all')

X_test = X_test_df.to_numpy()

In [39]:
plot_clust_df = pd.concat([coi_train_clust_2015, coi_test_clust_2015], axis=0)
plot_clust_df['Cluster Name'] = ['Cluster ' + str(x+1) for x in plot_clust_df['cluster']]

In [40]:
plot_clust_df.columns

Index(['LEAID', 'NAME_LEA15', 'year', 'pop_child', 'pop_total', 'pop_scaled',
       'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD',
       'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE',
       'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
       'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
       'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
       'SE_JOBPROX', 'SE_SINGLE', 'Component 1', 'Component 2', 'Component 3',
       'Component 4', 'Component 5', 'Component 6', 'Component 7',
       'Component 8', 'Component 9', 'Component 10', 'Component 11', 'cluster',
       'Cluster Name'],
      dtype='object')

In [41]:
test_2015_df.loc[:, 'predictions'] = hgbr_all.predict(X_test)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [42]:
# Add test rows to display dataframe
display_df_test = test_2015_df[test_2015_df['cs_mn_all'].notna()]
display_df_test = display_df_test[display_df_test.columns[~display_df_test.columns.isin(exclude_cols)]]

display_df_all = pd.concat([display_df_train, display_df_test], axis=0)

display_df_all['subject'] = display_df_all['subject'].replace({0: 'Reading', 1: 'Math'})
display_df_all['Cluster Name'] = ['Cluster ' + str(x+1) for x in display_df_all['cluster']]
display_df_all.shape

(219872, 61)

# Feature Importance

In [43]:
all_cols = ['coi_year', 'pop_child', 'pop_total', 'pop_scaled',
            'ED_APENR', 'ED_ATTAIN', 'ED_COLLEGE', 'ED_ECENROL', 'ED_HSGRAD',
            'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'ED_TEACHXP', 'ED_PRXECE',
            'ED_PRXHQECE', 'HE_FOOD', 'HE_GREEN', 'HE_HEAT', 'HE_HLTHINS',
            'HE_OZONE', 'HE_PM25', 'HE_VACANCY', 'HE_WALK', 'HE_SUPRFND', 'HE_RSEI',
            'SE_POVRATE', 'SE_PUBLIC', 'SE_HOME', 'SE_OCC', 'SE_MHE', 'SE_EMPRAT',
            'SE_JOBPROX', 'SE_SINGLE', 'subject', 'grade', 'seda_year', 'cluster']

labels = ['COI Year', 'Pop - Child', 'Pop - Total', 'Pop - Scaled to District',
          'AP Enroll', 'Adult Ed Attainment', 'College Enroll', 'Early Child Ed Enroll', 'High School Grad Rate', 
          '3rd Grade Math Proficiency', '3rd Grade Reading Proficiency', 'School Poverty', 'Teacher Experience', 'Early Child Ed Centers',
          'High-Quality Early Child Ed', 'Healthy Food Access', 'Green Space Access', 'Extreme Heat Exposure', 'Health Insurance Coverage', 
          'Ozone Concentration', 'Airborne Microparticles', 'Housing Vacancy Rate', 'Walkablity', 'Hazardous Waste', 'Industrial Pollutants',
          'Poverty Rate', 'Public Assistance Rate', 'Homeownership Rate', 'High-Skill Employment', 'Median Household Income', 'Employment Rate',
          'Commute Duration', 'Single-Headed Households', 'Subject (Read/Math)', 'Grade', 'School Year', 'Cluster ID']

In [77]:
# Feature importance, all clusters together
score_all, perm_imp_all, sorted_idx_all, df_melt_all = perm_importance(hgbr_all, test_2015, top_n=5)
score_all

0.5442006131285942

In [78]:
# Feature importance, clusters 0
score_0, perm_imp_0, sorted_idx_0, df_melt_0 = perm_importance(hgbr_0, test_2015, cluster_num=0, top_n=5)
score_0

0.1666369848474597

In [79]:
# Feature importance, clusters 1
score_1, perm_imp_1, sorted_idx_1, df_melt_1 = perm_importance(hgbr_1, test_2015, cluster_num=1, top_n=5)
score_1

0.5989009805862228

In [80]:
# No test samples in cluster 2, so use training feature importance
score_2, perm_imp_2, sorted_idx_2, df_melt_2 = perm_importance(hgbr_2, train_2015, cluster_num=2, top_n=5)

In [81]:
# Feature importance, clusters 3
score_3, perm_imp_3, sorted_idx_3, df_melt_3 = perm_importance(hgbr_3, test_2015, cluster_num=3, top_n=5)
score_3

0.362814415562249

In [82]:
# Concatenate feature importance dataframes from each cluster
feature_imp_df = pd.concat([df_melt_all, df_melt_0, df_melt_1, df_melt_2, df_melt_3])
feature_imp_df.columns

Index(['Variable', 'Importance', 'Cluster Name'], dtype='object')

# Model Scores on the Test Set

In [66]:
lm.score(X_test, y_test)

0.5279170599727858

In [67]:
lasso.score(X_test, y_test)

-0.004679532512761986

In [68]:
ridge.score(X_test, y_test)

0.5279174034397779

In [69]:
rfr7.score(X_test, y_test)

0.5117465353293285

In [70]:
gbr2_1.score(X_test, y_test)

0.5125269869243877

In [71]:
hgrb1_4.score(X_test, y_test)

0.5442006131285942

# Save data for display in Streamlit

In [72]:
with open(path+'data_display/feature_imp.pkl', 'wb') as f:
    pickle.dump(feature_imp_df, f)

In [73]:
# Save clusters to pickle for display later
with open(path+'data_display/clusters.pkl', 'wb') as f:
    pickle.dump(plot_clust_df, f)

In [76]:
# Choose display columns for viz
display_seda_cols = ['LEAID', 'NAME_LEA15', 'fips', 'stateabb', 'sedalea', 'sedaleaname', 'subject', 'grade', 
                     'seda_year', 'cs_mn_all', 'predictions', 'Cluster Name']

# Save just 4th grade data for display
disp_seda_limited_df = display_df_all[display_df_all['grade']==4][display_seda_cols]

# Join with map data
disp_seda_limited_df = disp_seda_limited_df.merge(seda_map, left_on='sedalea', right_on='sedalea')

with open(path+'data_display/seda_display.pkl', 'wb') as f:
    pickle.dump(disp_seda_limited_df, f)

In [75]:
# Choose display columns for viz
display_coi_cols = ['LEAID', 'NAME_LEA15', 'ED_ATTAIN', 'ED_MATH', 'ED_READING', 'ED_SCHPOV', 'HE_PM25', 
                    'HE_RSEI', 'SE_PUBLIC', 'SE_SINGLE', 'Component 1', 'Component 2', 'Cluster Name']

disp_coi = display_df_all[display_coi_cols]
disp_coi = disp_coi.groupby('LEAID').first().reset_index()

with open(path+'data_display/coi_display.pkl', 'wb') as f:
    pickle.dump(disp_coi, f)