In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import tabpy_client

In [None]:
def drop_data2(data):
    """Strip identifier columns and auxiliary measure columns from a dataset.

    Two passes are applied:
      1. the fixed identifier/bookkeeping columns are removed (all of them
         must be present, otherwise pandas raises ``KeyError``);
      2. every remaining column whose name contains ``numerator``,
         ``denominator``, ``cihigh``, ``cilow`` or ``other`` is removed.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing the identifier columns listed below.

    Returns
    -------
    pandas.DataFrame
        A new frame; the input frame is not modified.
    """
    identifier_cols = ['statecode', 'countycode', 'fipscode', 'state', 'county', 'year', 'county_ranked']
    trimmed = data.drop(columns=identifier_cols)

    # Regex search over column names picks up the auxiliary variants of each measure.
    auxiliary_cols = trimmed.filter(regex=r'(numerator|denominator|cihigh|cilow|other)').columns

    return trimmed.drop(columns=auxiliary_cols)

In [None]:
def drop_related_outcome_cols(data):
    """Remove every column whose name contains one of the listed measure codes.

    The codes are matched anywhere in the column name, so all variants of a
    measure (whatever their suffix) are dropped together.

    Parameters
    ----------
    data : pandas.DataFrame

    Returns
    -------
    pandas.DataFrame
        A new frame without the matching columns; the input is not modified.
    """
    related_cols = data.filter(
        regex=r'(v127|v002|v036|v037|v042|v001|v128|v129|v144|v145|v060|v061|v147)'
    ).columns
    return data.drop(columns=related_cols)

In [None]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

def std_norm(data, column_lst, weights=None):
    """Clean a dataset and append a min-max scaled, weighted outcome column.

    Steps:
      1. ``drop_data2`` removes identifier and auxiliary columns.
      2. Each column named in ``column_lst`` is standardised on its own with
         ``StandardScaler``.
      3. The standardised columns are multiplied by their weights, summed,
         and the sign of the sum is flipped.
      4. The result is rescaled with ``MinMaxScaler`` and stored in the new
         column ``"Weighted_Normalize_Outcome"``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw dataset; must satisfy the requirements of ``drop_data2`` and
        contain every column in ``column_lst``.
    column_lst : sequence of str
        Outcome columns that make up the weighted score.  Previously this
        argument was accepted but ignored; it now drives the computation.
    weights : mapping of str -> number, optional
        Weight per outcome column.  When omitted, the weighting that used to
        be hard-coded in this function is applied.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with ``"Weighted_Normalize_Outcome"`` added.  The
        outcome columns themselves are kept.

    Raises
    ------
    ValueError
        If ``column_lst`` is empty or names a column that has no weight.
    KeyError
        If a column in ``column_lst`` is absent from the cleaned frame.
    """
    if weights is None:
        # Weighting that was previously hard-coded in the function body.
        weights = {
            'v127_rawvalue': 5,
            'v002_rawvalue': 1,
            'v036_rawvalue': 1,
            'v037_rawvalue': 2,
            'v042_rawvalue': 1,
        }

    if len(column_lst) == 0:
        raise ValueError("column_lst must name at least one outcome column")

    unweighted = [col for col in column_lst if col not in weights]
    if unweighted:
        raise ValueError(f"No weight supplied for outcome column(s): {unweighted}")

    data = drop_data2(data)

    # Standardise each outcome independently, then accumulate the weighted sum
    # in the order given by column_lst.
    weighted_sum = sum(
        StandardScaler().fit_transform(data[col].to_numpy().reshape(-1, 1)) * weights[col]
        for col in column_lst
    )

    # Flip the sign of the combined score before rescaling it.
    data["Weighted_Normalize_Outcome"] = MinMaxScaler().fit_transform(-weighted_sum).ravel()

    return data

In [None]:
# Load the 2019 and 2023 datasets from CSV files in the working directory.
data19, data23 = map(pd.read_csv, ("final_dataset19.csv", "final_dataset23.csv"))

In [None]:
# Clean each dataset and build the target column, then remove every column
# derived from the outcome measures so they cannot be used as features.
outcome_list = ['v127_rawvalue','v002_rawvalue','v036_rawvalue','v037_rawvalue','v042_rawvalue']

# std_norm adds "Weighted_Normalize_Outcome"; the raw outcome columns are
# dropped next, followed by the columns matching the related measure codes.
data19 = drop_related_outcome_cols(std_norm(data19, outcome_list).drop(columns=outcome_list))
data23 = drop_related_outcome_cols(std_norm(data23, outcome_list).drop(columns=outcome_list))

#data19.columns
#data23.columns

# 2019 RF

In [None]:
# Separate the 2019 target column from the predictor columns.
y = data19['Weighted_Normalize_Outcome']
X = data19.drop(columns='Weighted_Normalize_Outcome')

In [None]:
# Hold out 30% of the 2019 rows for evaluation (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest regressor tuned through GridSearchCV. The grid currently
# contains a single candidate; add values to the lists to widen the search.
param_grid = {
    'n_estimators': [100],  # Number of trees in the forest
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "consider every feature", which 1.0 states explicitly
    # and which works on older releases as well.
    'max_features': [1.0],  # Fraction of features to consider at every split
    'max_depth': [None],  # Maximum number of levels in tree
    'min_samples_split': [5],  # Minimum number of samples required to split a node
    'min_samples_leaf': [2],  # Minimum number of samples required at each leaf node
}

grid_search = GridSearchCV(estimator=RandomForestRegressor(random_state=1),
                           param_grid=param_grid,
                           cv=3,
                           verbose=1,
                           n_jobs=1) #Set at 1 because it Worked for some of our group and not for others. Ideally -1.

# Fitting the grid search to the training data
grid_search.fit(X_train, y_train)

# Best parameters found by grid search
best_params = grid_search.best_params_

# GridSearchCV refits the best configuration on the full training set by
# default (refit=True), so the fitted estimator is reused instead of training
# an identical forest a second time.
best_model19 = grid_search.best_estimator_

# R^2 of the selected model on the held-out test set
best_score = best_model19.score(X_test, y_test)
best_score, best_params

Fitting 3 folds for each of 1 candidates, totalling 3 fits


(0.8850023831211948,
 {'max_depth': None,
  'max_features': 'auto',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 100})

In [None]:
# Mean squared error of the 2019 model on its held-out test set.
y_pred = best_model19.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
mse

0.002456517481278696

Variable importance

In [None]:
# Feature importances reported by the fitted 2019 forest, labelled with the
# predictor column names.
feature_importances = best_model19.feature_importances_
importances_series = pd.Series(feature_importances, index=X.columns)

# Most important features first
sorted_importances = importances_series.sort_values(ascending=False)

# Keep the 15 highest-ranked features (the count matches the printed heading)
top_features = sorted_importances.head(15)

print("Top 15 important features:\n", top_features)

Top 15 mportant features:
 v024_rawvalue      0.464979
v009_rawvalue      0.230212
v063_rawvalue      0.032012
v049_rawvalue      0.026888
v014_race_white    0.022312
v014_rawvalue      0.019180
v135_rawvalue      0.017311
v139_rawvalue      0.016812
v143_rawvalue      0.015050
v015_rawvalue      0.010926
v065_rawvalue      0.008742
v082_rawvalue      0.008178
v138_rawvalue      0.006467
v148_rawvalue      0.005114
v070_rawvalue      0.004696
dtype: float64


# 2023 RF

In [None]:
# Separate the 2023 target column from the predictor columns.
y1 = data23['Weighted_Normalize_Outcome']
X1 = data23.drop(columns='Weighted_Normalize_Outcome')

In [None]:
# Hold out 30% of the 2023 rows for evaluation (fixed seed for reproducibility).
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.3, random_state=42)

# Random Forest regressor tuned through GridSearchCV. The grid currently
# contains a single candidate; add values to the lists to widen the search.
param_grid = {
    'n_estimators': [100],  # Number of trees in the forest
    # 'auto' was deprecated in scikit-learn 1.1 and removed in 1.3. For a
    # regressor it meant "consider every feature", which 1.0 states explicitly
    # and which works on older releases as well.
    'max_features': [1.0],  # Fraction of features to consider at every split
    'max_depth': [None],  # Maximum number of levels in tree
    'min_samples_split': [5],  # Minimum number of samples required to split a node
    'min_samples_leaf': [2],  # Minimum number of samples required at each leaf node
}

grid_search1 = GridSearchCV(estimator=RandomForestRegressor(random_state=1),
                           param_grid=param_grid,
                           cv=3,
                           verbose=1,
                           n_jobs=1) #Set at 1 because it Worked for some of our group and not for others. Ideally -1.

# Fitting the grid search to the 2023 training data
grid_search1.fit(X_train1, y_train1)

# Best parameters found by the 2023 grid search
best_params1 = grid_search1.best_params_

# Use the estimator selected by the 2023 search. The previous code rebuilt the
# model from `best_params` (the 2019 search result), so the 2023 tuning was
# silently ignored. GridSearchCV has already refit this estimator on the full
# 2023 training set (refit=True is the default).
best_model23 = grid_search1.best_estimator_

# R^2 of the selected model on the held-out test set
best_score1 = best_model23.score(X_test1, y_test1)
best_score1, best_params1

Fitting 3 folds for each of 1 candidates, totalling 3 fits


(0.8879558723731995,
 {'max_depth': None,
  'max_features': 'auto',
  'min_samples_leaf': 2,
  'min_samples_split': 5,
  'n_estimators': 100})

In [None]:
# Mean squared error of the 2023 model on its held-out test set.
y_pred1 = best_model23.predict(X_test1)
mse23 = mean_squared_error(y_true=y_test1, y_pred=y_pred1)
mse23

0.002462806038869529

In [None]:
# Feature importances reported by the fitted 2023 forest, labelled with the
# predictor column names.
feature_importances1 = best_model23.feature_importances_
importances_series1 = pd.Series(feature_importances1, index=X1.columns)

# Most important features first
sorted_importances1 = importances_series1.sort_values(ascending=False)

# Keep the 15 highest-ranked features (the count matches the printed heading)
top_features1 = sorted_importances1.head(15)

print("Top 15 important features:\n", top_features1)

Top 15 mportant features:
 v070_rawvalue      0.510854
v024_rawvalue      0.107101
v063_rawvalue      0.084959
v139_rawvalue      0.052816
v143_rawvalue      0.038430
v135_rawvalue      0.020376
v015_race_white    0.012299
v014_race_white    0.011722
v014_rawvalue      0.010289
v148_rawvalue      0.007748
v009_rawvalue      0.006974
v011_rawvalue      0.005685
v135_race_white    0.005109
v045_rawvalue      0.003836
v082_rawvalue      0.003566
dtype: float64


# TabPy Connection & Function

In [None]:
# Open a client connection to the TabPy server at the local endpoint below.
tabpy_endpoint = 'http://localhost:9004/'
connection = tabpy_client.Client(tabpy_endpoint)

In [None]:
# Frames used by the deployed prediction function.
# The CSVs are read a second time because single-row prediction needs a frame
# that still has the state/county columns for lookup (tabpy limitation with
# pandas), which the training frames no longer carry.

# Outcome measures, and the regex for auxiliary columns, mirror the training
# pipeline so the remaining columns line up with what the models were fit on.
outcome_list = ['v127_rawvalue','v002_rawvalue','v036_rawvalue','v037_rawvalue','v042_rawvalue']
pattern = r'(numerator|denominator|cihigh|cilow|other)'

# Same identifier columns as the earlier cleaning helper, except that state
# and county are kept here for the lookup.
unused_cols = ['statecode', 'countycode', 'fipscode', 'year','county_ranked']

# Read, drop the outcome columns, drop the related columns, drop identifiers.
d19 = drop_related_outcome_cols(
    pd.read_csv("final_dataset19.csv").drop(columns=outcome_list)
).drop(columns=unused_cols)
d23 = drop_related_outcome_cols(
    pd.read_csv("final_dataset23.csv").drop(columns=outcome_list)
).drop(columns=unused_cols)

# Find and drop the auxiliary columns matched by the regex.
columns_to_drop = d19.filter(regex=pattern).columns
columns_to_drop1 = d23.filter(regex=pattern).columns
d19 = d19.drop(columns=columns_to_drop)
d23 = d23.drop(columns=columns_to_drop1)

# Feature-only versions (no state/county) whose rows are fed to the models.
d19ind = d19.drop(columns=['state', 'county'])
d23ind = d23.drop(columns=['state', 'county'])
#print(d19ind)


In [None]:
# Prediction function used for RF inputs and outputs.
def Prediction(P1, P2, P3, P4, P5, P6, P7, P8, state, county, model, year):
    """Predict the outcome score for one county after overriding eight inputs.

    Every argument is read through ``[0]``, so each must be a sequence whose
    first element carries the value.

    Parameters
    ----------
    P1..P8 : sequence
        Replacement values for ``v014``, ``v024``, ``v009``, ``v135``,
        ``v139``, ``v063``, ``v070`` and ``v049`` ``_rawvalue`` respectively.
    state, county : sequence
        Compared against the first and second column of the lookup frame
        (``d19`` / ``d23``) to locate the row to predict for.
    model : sequence
        Only ``'Random Forest'`` is supported.
    year : sequence
        ``'2019'`` or ``'2023'``; selects the lookup frame and fitted model.

    Returns
    -------
    The first element of the model's prediction, or ``-1`` when the
    model/year combination is unsupported or no state/county row matches.
    """
    # Unsupported models (including 'Neural Network', which currently cannot
    # run because of a tabpy limitation with numpy) report the error value.
    if model[0] != 'Random Forest':
        return -1

    if year[0] == '2019':
        lookup, features, forest = d19, d19ind, best_model19
    elif year[0] == '2023':
        lookup, features, forest = d23, d23ind, best_model23
    else:
        return -1

    # Vectorised search for the matching state/county row; replaces the
    # per-row Python loop.
    is_match = (lookup.iloc[:, 0] == state[0]) & (lookup.iloc[:, 1] == county[0])
    hits = is_match.to_numpy().nonzero()[0]
    if len(hits) == 0:
        return -1
    position = int(hits[-1])  # the last matching row wins, as in the former scan

    # The match is positional, so select positionally. Copy the row so the
    # overrides below can never write through to the shared feature frame and
    # leak into later calls.
    row = features.iloc[position].copy()

    overrides = (
        ('v014_rawvalue', P1),
        ('v024_rawvalue', P2),
        ('v009_rawvalue', P3),
        ('v135_rawvalue', P4),
        ('v139_rawvalue', P5),
        ('v063_rawvalue', P6),
        ('v070_rawvalue', P7),
        ('v049_rawvalue', P8),
    )
    for column, value in overrides:
        row.at[column] = value[0]

    # Feed the single adjusted row to the selected model.
    Z = forest.predict([row.to_numpy()])
    return Z[0]

In [None]:
# Deploy Prediction to the TabPy server under the name 'Prediction' so Tableau
# can call it; override=True is passed so the call can be repeated.
connection.deploy('Prediction', Prediction, override=True)