In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.losses import MeanSquaredError

2023-11-30 22:09:52.994577: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
#Dopping useless string columns that cannot feed into the neural net
def drop_data2(data):
    data = data.drop(['statecode', 'countycode', 'fipscode', 'state', 'county', 'year','county_ranked'], axis=1)

    # Pattern to match columns to drop
    pattern = r'(numerator|denominator|cihigh|cilow|other)'

    # Use DataFrame's filter method with regex to find matching columns
    columns_to_drop = data.filter(regex=pattern).columns

    # print(columns_to_drop)

    # Drop these columns
    data = data.drop(columns=columns_to_drop, axis=1)

    return data

In [3]:

# dropping columns that are identical or mostly similar to outcome columns
def drop_related_outcome_cols(data):
    pattern = r'(v127|v002|v036|v037|v042|v001|v128|v129|v144|v145|v060|v061|v147)'
    columns_to_drop = data.filter(regex=pattern).columns
    data = data.drop(columns=columns_to_drop, axis=1)
    return data



In [4]:
def std_norm(data, column_lst):
    data = drop_data2(data) # Assuming drop_data2 is a predefined function

    # Standardize specified columns
    for i in column_lst:
        data[i] = StandardScaler().fit_transform(np.array(data[i]).reshape(-1, 1))

    # Calculate the weighted sum
    data["Weighted_Normalize_Outcome"] = - (data['v127_rawvalue'] * 5 +
                                            data['v002_rawvalue'] * 1 +
                                            data['v036_rawvalue'] * 1 +
                                            data['v037_rawvalue'] * 2 +
                                            data['v042_rawvalue'] * 1)

    # Apply Min-Max scaling to the Weighted_Normalize_Outcome column
    data["Weighted_Normalize_Outcome"] = MinMaxScaler().fit_transform(
        data["Weighted_Normalize_Outcome"].values.reshape(-1, 1))

    return data

In [5]:
#Read data csv
data19 = pd.read_csv("final_dataset19.csv")
data23 = pd.read_csv("final_dataset23.csv")


In [6]:
#Normalize the output features 
outcome_list = ['v127_rawvalue','v002_rawvalue','v036_rawvalue','v037_rawvalue','v042_rawvalue']

data19 = std_norm(data19,outcome_list)

data23 = std_norm(data23,outcome_list)


In [7]:
#Refine more data
data19 = data19.drop(columns=outcome_list)
data23 = data23.drop(columns=outcome_list)

data19 = drop_related_outcome_cols(data19)
data23 = drop_related_outcome_cols(data23)



# 2019 NN

In [8]:
#refine and standardize data in 2019
X = data19.drop('Weighted_Normalize_Outcome', axis=1)
y = data19['Weighted_Normalize_Outcome']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [9]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

#Create neural network 133x64x32x1
model = Sequential()
model.add(Dense(64, input_dim=113, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss=MeanSquaredError())

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
mse = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on Test Data: {mse}')

2023-11-30 22:09:54.787772: I tensorflow/core/common_runtime/process_util.cc:146] Creating new thread pool with default inter op setting: 2. Tune using inter_op_parallelism_threads for best performance.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on Test Data: 0.0202962476760149


# 2023 NN

In [10]:
#refine and standardize data in 2019
X = data23.drop('Weighted_Normalize_Outcome', axis=1)
y = data23['Weighted_Normalize_Outcome']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [11]:
# Split the data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

#Create neural network 144x64x32x1
model = Sequential()
model.add(Dense(64, input_dim=144, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='linear'))

# Compile the model
model.compile(optimizer='adam', loss=MeanSquaredError())

# Train the model
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)

# Evaluate the model on the test set
mse = model.evaluate(X_test, y_test)
print(f'Mean Squared Error on Test Data: {mse}')

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
Mean Squared Error on Test Data: 0.025214705616235733


In [13]:
#Demo of picking out a single sample to show how .predict() works
single_sample = np.array(X_test[1],ndmin=2)

# Use the trained model to make a prediction on the single sample
prediction = model.predict(single_sample)

print(f"Prediction for the single sample: {prediction[0][0]}")


Prediction for the single sample: 0.5949423909187317
