<h1>Diabetes Model Trainer Notebook</h1>
<h3>Training the model based on data scraped from the US Census Bureau</h3>
<h2><b>IMPORTANT:</b><br><span style="color:red;">This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau.</span></h2>

<ol>
    <li>Split the data into training, test, and validation sets with random seed <b>123</b></li>
    <ul>
        <li>4-fold cross-validation on 80% of the data (training = 60%, validation = 20%)</li>
        <li>Test = 20%</li>
    </ul>
    <li>Train the model based on broadband internet only, normalize y value, minimize MSE</li>
    <li>Use LASSO regularization to eliminate unnecessary features from the theta vector</li>
    <li>Retrain the model with demographic features added in</li>
    <li>Retrain the model with economic features added in</li>
    <li>Retrain the model with social features added in</li>
</ol>

In [8]:
#Imports
#Data manipulation/training
import pandas as pd
import numpy as np
from sklearn.linear_model import Lasso
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

#Visualization
import plotly.express as px
import plotly.graph_objects as go
import seaborn as sns
sns.set_style("whitegrid")
import matplotlib.pyplot as plt

<h2>Defining Helper Functions and Classes</h2>

In [35]:
class Transformation:
    """Bundles a one-variable transformation with the function that undoes it.

    Both directions are stored together with a human-readable name, so code
    that receives a Transformation can apply it, reverse it, and describe it.
    """

    def __init__(self, transformation, transformation_name, inverse, inverse_name):
        """Store the forward and inverse callables along with their names.

        Parameters
        ----------
        transformation : def (x: arraylike<float>) -> arraylike<float>
            function applied to a float array to transform it

        transformation_name : string
            human-readable name of the transformation

        inverse : def (x: arraylike<float>) -> arraylike<float>
            function that undoes ``transformation``

        inverse_name : string
            human-readable name of the inverse transformation
        """
        # Forward direction: the callable and its name
        self.transformation, self.transformation_name = transformation, transformation_name
        # Reverse direction: the callable and its name
        self.inverse, self.inverse_name = inverse, inverse_name

"""Utility Transformation objects"""
# Ready-made forward/inverse pairs. Each name string is written so it can be
# prefixed to a variable name (e.g. "The Square of <name>").
#
# Cube roots use np.cbrt instead of x**(1/3): a fractional power of a negative
# float evaluates to NaN in NumPy, whereas np.cbrt returns the real cube root
# for every real input, so cube/cube_root round-trip correctly on negatives.
#
# NOTE: the even-root inverses (square, fourth) use x**0.5 / x**0.25 and so
# return only the non-negative root; log and the root transformations are only
# defined for inputs in their usual real domains.
no_transformation = Transformation(lambda x: x, "", lambda x: x, "")
square = Transformation(lambda x: x**2, "The Square of", lambda x: x**0.5, "The Square Root of")
cube = Transformation(lambda x: x**3, "The Cube of", lambda x: np.cbrt(x), "The Cube Root of")
fourth = Transformation(lambda x: x**4, "The Fourth Power of", lambda x: x**0.25, "The Fourth Root of")
square_root = Transformation(lambda x: x**0.5, "The Square Root of", lambda x: x**2, "The Square of")
cube_root = Transformation(lambda x: np.cbrt(x), "The Cube Root of", lambda x: x**3, "The Cube of")
fourth_root = Transformation(lambda x: x**0.25, "The Fourth Root of", lambda x: x**4, "The Fourth Power of")
log = Transformation(lambda x: np.log(x), "The Natural Log of", lambda x: np.exp(x), "e to the Power of")
e_power = Transformation(lambda x: np.exp(x), "e to the Power of", lambda x: np.log(x), "The Natural Log of")

def wrap_label(axis_name, words_per_line=9):
    """Wraps a label that is too long by inserting newline characters

    The label is split on whitespace and re-joined with single spaces, starting
    a new line after every ``words_per_line`` words. No line carries leading or
    trailing whitespace, so the wrapped text is not padded on either side.

    Parameters
    ----------
    axis_name : string
        The label text to wrap

    words_per_line : int
        Maximum number of words placed on each line. Defaults to 9, the value
        that was previously hard-coded.

    Return Value
    ------------
    string
        The wrapped label, or an empty string if axis_name contains no words

    Raises
    ------
    ValueError
        If words_per_line is less than 1
    """
    if words_per_line < 1:
        raise ValueError("words_per_line must be at least 1")
    words = axis_name.split()
    # Slice the word list into consecutive chunks of at most words_per_line words
    lines = [
        " ".join(words[start:start + words_per_line])
        for start in range(0, len(words), words_per_line)
    ]
    return "\n".join(lines)

def plot_relationship(X_feature, Y, X_transformation=no_transformation, Y_transformation=no_transformation):
    """Draws a seaborn lmplot of an (optionally) transformed X feature against an
    (optionally) transformed Y outcome

    When a transformation argument is omitted, the identity transformation is used.
    Each axis is named "<transformation name> <variable name>"; only the x-axis
    label is passed through wrap_label before being drawn.

    Parameters
    ----------
    X_feature : arraylike<float>
        The predictor variable; its ``name`` attribute is used in the x-axis label

    Y : arraylike<float>
        The outcome variable; its ``name`` attribute is used in the y-axis label

    X_transformation : Transformation
        The Transformation applied to X_feature before plotting

    Y_transformation : Transformation
        The Transformation applied to Y before plotting

    Return Value
    ------------
    None
    """
    # Transform both variables first, then derive the axis names from them
    x_values = X_transformation.transformation(X_feature)
    y_values = Y_transformation.transformation(Y)
    x_label = f"{X_transformation.transformation_name} {X_feature.name}"
    y_label = f"{Y_transformation.transformation_name} {Y.name}"

    # The axis names double as the DataFrame column names handed to seaborn
    plot_frame = pd.DataFrame({x_label: x_values, y_label: y_values})
    marker_style = dict(s=0.7, alpha=0.4)  # small, translucent scatter points
    grid = sns.lmplot(data=plot_frame, x=x_label, y=y_label, height=10, scatter_kws=marker_style)

    grid.set_xlabels(wrap_label(x_label), fontsize=18)
    grid.set_ylabels(y_label, fontsize=18)

def plot_residuals(X_feature, Y, X_transformation=no_transformation, Y_transformation=no_transformation, axis="X"):
    """Plots the residuals of a model trained on an (optionally) transformed X feature and Y outcome

    A single-feature LinearRegression is fit on the transformed X and transformed Y.
    Its predictions are mapped back through the inverse of the Y transformation, and
    the residuals are computed against the untransformed Y.

    If no transformation object is passed in, no transformation is assumed

    Parameters
    ----------
    X_feature : arraylike<float>
        The predictor variable; its ``name`` attribute is used for labelling

    Y : arraylike<float>
        The outcome variable; its ``name`` attribute is used for labelling

    X_transformation : Transformation
        The Transformation to apply to the X_feature

    Y_transformation : Transformation
        The Transformation to apply to the Y outcome

    axis : string
        Which variable goes on the horizontal axis. "X" (the default) plots the
        residuals against the transformed X feature; any other value plots them
        against the untransformed Y outcome.

    Return Value
    ------------
    None
    """
    X_transformed = X_transformation.transformation(X_feature)
    Y_transformed = Y_transformation.transformation(Y)
    x_axis_name = f"{X_transformation.transformation_name} {X_feature.name}"

    # Fit on the transformed scale, using the transformed X as the only feature
    design_matrix = pd.DataFrame({x_axis_name: X_transformed})
    model = LinearRegression()
    model.fit(design_matrix, Y_transformed)
    predictions = model.predict(design_matrix)

    # Undo the Y transformation so the residuals are on the scale of the original Y
    residuals = Y - Y_transformation.inverse(predictions)

    if axis == "X":
        horizontal_name, horizontal_values = x_axis_name, X_transformed
    else:
        # The untransformed Y is what gets plotted here, so the axis is labelled
        # with Y's own name rather than the transformed-Y name
        horizontal_name, horizontal_values = Y.name, Y

    fig_data = pd.DataFrame({horizontal_name: horizontal_values, "Residual": residuals})
    fig = sns.lmplot(data=fig_data,
                     x=horizontal_name,
                     y="Residual",
                     height=10,
                     scatter_kws={
                         's': 0.7,
                         'alpha': 0.4
                     },
                    )
    # wrap_label is the label-wrapping helper defined in this notebook
    fig.set_xlabels(wrap_label(horizontal_name), fontsize=18)
    fig.set_ylabels("Residual", fontsize=18)

In [65]:
# Feature table, indexed by the ZCTA column
# (presumably the combined train/validation portion, going by the file name)
X_train = pd.read_csv("./X_train_val.csv", index_col="ZCTA")

# Outcome table from the matching file; the double brackets keep the single
# outcome column as a one-column DataFrame rather than a Series
Y_train = pd.read_csv("./Y_train_val.csv", index_col="ZCTA")
Y_train = Y_train[["Individuals Age 18+ with Diagnosed Diabetes (%)"]]

# 4-fold splitter; with default arguments KFold does not shuffle, so the folds
# are consecutive slices in row order
cross_validation = KFold(n_splits=4)