In [1]:
import numpy as np
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.io as pio
import plotly.figure_factory as ff

import pickle

# Make 'iframe' the default Plotly renderer used by the fig.show() calls in this notebook.
pio.renderers.default='iframe'

In [2]:
# Prefix that is string-concatenated in front of every 'data_display/...' file
# location loaded below; the empty string leaves those locations as relative paths.
path = ''

# Data Imports

In [29]:
def _load_pickle(file_path):
    """Return the object stored in the pickle file at ``file_path``.

    The file is opened in binary mode and closed again before returning.

    SECURITY NOTE: ``pickle.load`` can run arbitrary code while
    deserialising. Only use this on files produced by this project, never
    on files from an untrusted source.
    """
    with open(file_path, 'rb') as handle:
        return pickle.load(handle)


# Pickled inputs for the display cells (presumably pandas DataFrames, going by
# how they are used further down -- confirm against the export step).
seda_df = _load_pickle(path+'data_display/seda_display.pkl')
coi_df = _load_pickle(path+'data_display/coi_display.pkl')
feature_imp_df = _load_pickle(path+'data_display/feature_imp.pkl')
cluster_df = _load_pickle(path+'data_display/clusters.pkl')

# Tabular model outputs stored as CSV; rendered as tables at the end of the notebook.
cross_val_results_df = pd.read_csv(path+'data_display/cross_val_results.csv')
model_results_df = pd.read_csv(path+'data_display/model_results.csv')

# Data Preparation

In [27]:
# # Create df for cluster plot
# cluster_df = coi_df.iloc[:, :]

# Rename the district-name column; the map cell refers to it as 'sedalea_name'.
seda_df = seda_df.rename(columns={"NAME_LEA15": "sedalea_name"})

# Parse the year column into datetimes (values are read with the '%Y' format).
# The column is assigned directly rather than through `.loc[:, 'seda_year'] = ...`:
# on pandas >= 2.0 the `.loc[:, col]` form writes into the existing column in
# place, which can leave the old dtype behind instead of producing datetime64.
seda_df['seda_year'] = pd.to_datetime(seda_df['seda_year'], format='%Y')


# Build the frame used by the map: a Positive/Negative flag for the score and
# its magnitude (used as the marker size).
# `.copy()` makes this an independent frame, so adding columns below neither
# touches seda_df nor triggers SettingWithCopyWarning (the previous
# `.iloc[:, :]` slice was not a real copy).
seda_disp_df = seda_df.copy()
# Note: a missing (NaN) score fails the `>= 0` test and is therefore flagged 'Negative'.
seda_disp_df['sign'] = np.where(seda_disp_df['cs_mn_all'] >= 0, 'Positive', 'Negative')
seda_disp_df['cs_mn_all_abs'] = seda_disp_df['cs_mn_all'].abs()


# Histogram inputs: one float array of 'cs_mn_all' values per cluster label.
#
# For Streamlit, the selection is meant to become dynamic by extending the row
# mask below with the chosen year and subject, i.e.
#   (seda_df['Cluster Name']==label)&(seda_df['seda_year']==v_year_choice)&(seda_df['subject']==v_subject)
group_labels = ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']

# Same order as group_labels, so hist_data[i] belongs to group_labels[i].
hist_data = [
    np.array(seda_df.loc[seda_df['Cluster Name'] == label, 'cs_mn_all'], dtype='float')
    for label in group_labels
]

# Keep the individual per-cluster names available as well.
x1, x2, x3, x4 = hist_data


# Prep for COI histograms: column code -> display name, in plotting order.
# The first four entries feed one histogram figure and the last four the other.
hist_coi_labels = {
    'ED_SCHPOV': 'School Poverty',
    'ED_ATTAIN': 'Adult Ed Attainment',
    'ED_MATH': '3rd Grade Math Proficiency',
    'ED_READING': '3rd Grade Reading Proficiency',
    'SE_SINGLE': 'Single-Headed Households',
    'HE_HLTHINS': 'Health Insurance Coverage',
    'HE_PM25': 'Airborne Microparticles',
    'HE_RSEI': 'Industrial Pollutants',
}
# Parallel lists derived from the mapping (dicts preserve insertion order).
hist_coi_cols = list(hist_coi_labels.keys())
hist_coi_names = list(hist_coi_labels.values())

# Reshape the selected COI columns to long form: one row per
# (district, COI variable) pair, with the measurement in 'Value'.
# reset_index() without drop=True also keeps the row number as an 'index' column.
hist_coi = (
    coi_df
    .melt(
        id_vars=['LEAID', 'NAME_LEA15', 'Cluster Name'],
        value_vars=hist_coi_cols,
        var_name='COI Variable',
        value_name='Value',
    )
    .reset_index()
)

# Swap the raw column codes for their display names (used as facet titles).
hist_coi['COI Variable'] = hist_coi['COI Variable'].replace(hist_coi_labels)

# Split into two frames of four variables each, one per histogram figure.
coi_hist_1 = hist_coi.loc[hist_coi['COI Variable'].isin(hist_coi_names[:4])]
coi_hist_2 = hist_coi.loc[hist_coi['COI Variable'].isin(hist_coi_names[4:])]


# Residuals: observed score minus the model prediction, row by row.
seda_df['residuals'] = seda_df['cs_mn_all'].sub(seda_df['predictions'])

# Figures

In [5]:
# Mapbox bubble map: one marker per row of seda_disp_df, coloured by the
# sign of the score and sized by its absolute value.
fig_map = px.scatter_mapbox(
    data_frame=seda_disp_df,
    lat='latitude',
    lon='longitude',
    color='sign',
    size='cs_mn_all_abs',
    text='sedalea_name',
    hover_name='sedalea_name',
    hover_data=['sedalea_name', 'stateabb'],
    # Both a palette and an explicit map are passed; the map names both
    # 'sign' values, so it is what decides the two marker colours.
    color_discrete_sequence=px.colors.qualitative.G10,
    color_discrete_map={'Negative': '#AB63FA', 'Positive': '#FECB52'},
    zoom=2,
)

# No margins around the map, OpenStreetMap tiles, size to the container.
fig_map.update_layout(
    margin=dict(r=0, t=0, l=0, b=0),
    mapbox_style="open-street-map",
    autosize=True,
)
fig_map.show()

In [6]:
# Distribution plot of the per-cluster score arrays. No bin_size is passed,
# so create_distplot uses its default bin size.
fig_dist = ff.create_distplot(hist_data=hist_data, group_labels=group_labels)

# 'normal' trace order lists the legend entries in the order the traces were added.
fig_dist.update_layout(
    autosize=True,
    legend=dict(traceorder='normal'),
)
fig_dist.show()

In [7]:
# Scatter of the two cluster components, one colour per cluster.
# NOTE(review): both axes are logarithmic, which cannot show zero or negative
# values -- confirm 'Component 1' / 'Component 2' are strictly positive.
fig_sp_clusters = px.scatter(cluster_df, 
                             x='Component 1', 
                             y='Component 2', 
                             color='Cluster Name', 
                             # Fixed: the last label used to be 'Cluster  4' (two spaces), which did
                             # not match the 'Cluster 4' label used elsewhere in this notebook.
                             category_orders={'Cluster Name': ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']}, 
                             hover_name='NAME_LEA15',
                             log_x=True,
                             log_y=True,
                             width=800, 
                             height=600,
                             title='School District Clusters from COI Indicators'
                             )
# Hide grid lines on both axes.
fig_sp_clusters.update_xaxes(showgrid=False)
fig_sp_clusters.update_yaxes(showgrid=False)

fig_sp_clusters.show()

In [56]:
# Histograms of the first four COI variables (one facet column per variable),
# coloured by cluster, with a violin plot in the margin.
sp_coi_hist_1 = px.histogram(coi_hist_1, 
                             x='Value', 
                             color='Cluster Name', 
                             # Fixed: the last label used to be 'Cluster  4' (two spaces), which did
                             # not match the 'Cluster 4' label used elsewhere in this notebook.
                             category_orders={'Cluster Name': ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']}, 
                             facet_col='COI Variable', 
                             marginal='violin',
                             nbins=100,
                             width=1200,
                             height=500,
                             title='Important COI Feature Distributions')

sp_coi_hist_1.show()

In [55]:
# Histograms of the remaining four COI variables (one facet column per
# variable), coloured by cluster, with a violin plot in the margin.
sp_coi_hist_2 = px.histogram(coi_hist_2, 
                             x='Value', 
                             color='Cluster Name', 
                             # Fixed: the last label used to be 'Cluster  4' (two spaces), which did
                             # not match the 'Cluster 4' label used elsewhere in this notebook.
                             category_orders={'Cluster Name': ['Cluster 1', 'Cluster 2', 'Cluster 3', 'Cluster 4']}, 
                             facet_col='COI Variable', 
                             marginal='violin',
                             nbins=100,
                             width=1200,
                             height=500)

sp_coi_hist_2.show()

In [10]:
# feature_imp_all_df = feature_imp_df.groupby(['Cluster Name', 'Variable']).mean().reset_index()

# fig = px.bar(feature_imp_all_df, x='Variable', y='Importance', color='Cluster Name', barmode='group', log_y=True, height=800, width=1200)
# fig.show()

In [11]:
# Residuals-vs-predictions scatter; markers are semi-transparent so that
# overlapping points remain distinguishable.
fig_resid = px.scatter(
    seda_df,
    x='predictions',
    y='residuals',
    opacity=0.25,
    labels={'predictions': 'Predicted Values', 'residuals': 'Residuals'},
    title='Residuals for All-Cluster Model',
    width=800,
    height=800,
)
fig_resid.show()

In [12]:
# Box plot of 'Importance' for each 'Variable', with one box colour per cluster.
fig_bp_feat_imp = px.box(
    feature_imp_df,
    x='Variable',
    y='Importance',
    color='Cluster Name',
    width=1200,
    height=600,
)
fig_bp_feat_imp.show()

In [53]:
# Columns shown in the table body, left to right.
model_table_cols = ['Model', 'Hyperparameters', 'Cluster', 'Training Set Score', 'Test Set Score']

# NOTE(review): the header lists every column of the CSV while the body shows
# the five columns above -- assumes the CSV holds exactly these columns in
# this order; confirm.
model_table = go.Table(
    columnwidth=[300, 300, 100, 100, 100],
    header={
        'values': list(model_results_df.columns),
        'fill_color': 'black',
        'font': {'color': 'white', 'size': 16},
    },
    cells={
        'values': [model_results_df[col] for col in model_table_cols],
        'align': ['left', 'left', 'left', 'right', 'right'],
        'fill_color': 'grey',
        'line_color': 'white',
        'font': {'color': 'white', 'size': 14},
        # Only the two score columns get a number format (four decimals).
        'format': [None, None, None, '.4f', '.4f'],
    },
)

fig_model_results = go.Figure(data=[model_table])
fig_model_results.update_layout(
    title_text='Predictive Model Results',
    showlegend=False,
    width=1000,
    height=900,
)

fig_model_results.show()

In [54]:
# Columns shown in the table body, left to right.
cross_val_table_cols = ['Cluster', 'Cross-Val Iteration', 'Best Parameters', 'Best Score']

# NOTE(review): the header lists every column of the CSV while the body shows
# the four columns above -- assumes the CSV holds exactly these columns in
# this order; confirm.
cross_val_table = go.Table(
    columnwidth=[100, 100, 200, 100],
    header={
        'values': list(cross_val_results_df.columns),
        'fill_color': 'black',
        'font': {'color': 'white', 'size': 16},
    },
    cells={
        'values': [cross_val_results_df[col] for col in cross_val_table_cols],
        'align': ['left', 'left', 'left', 'right'],
        'fill_color': 'grey',
        'line_color': 'white',
        'font': {'color': 'white', 'size': 14},
        # Only the score column gets a number format (four decimals).
        'format': [None, None, None, '.4f'],
    },
)

fig_cross_val = go.Figure(data=[cross_val_table])
fig_cross_val.update_layout(
    title_text='Cross-Validation Grid Search Results',
    showlegend=False,
    width=900,
    height=900,
)

fig_cross_val.show()