<a href="https://colab.research.google.com/github/maxstclair/project_voting/blob/main/Project_Voting_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! git clone https://github.com/maxstclair/project_voting

Cloning into 'project_voting'...
remote: Enumerating objects: 85, done.[K
remote: Counting objects: 100% (21/21), done.[K
remote: Compressing objects: 100% (16/16), done.[K
remote: Total 85 (delta 11), reused 10 (delta 5), pack-reused 64[K
Receiving objects: 100% (85/85), 31.41 MiB | 12.84 MiB/s, done.
Resolving deltas: 100% (35/35), done.
Updating files: 100% (44/44), done.


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
from sklearn import linear_model

# Read the training table shipped with the cloned repository
df = pd.read_csv('./project_voting/data/Vot_Census_Data_new.xlsx - Sheet1.csv')

# County fixed effects: one indicator column per distinct COUNTY value
dummies = pd.get_dummies(df['COUNTY'])

# Columns removed before modelling (identifiers and raw vote counts)
columns_to_drop = ['YEAR', 'STATE', 'COUNTY', 'Total Votes', 'Democrat Votes', 'Republican Votes']

# Append the indicators, replace the target with its inverse hyperbolic sine
# (unlike a log, arcsinh is defined for zero and negative values), then drop
# the columns listed above. 'Vote Skew' keeps its original column position.
df2 = (
    pd.concat([df, dummies], axis=1)
    .assign(**{'Vote Skew': lambda frame: np.arcsinh(frame['Vote Skew'])})
    .drop(columns=columns_to_drop)
)
df2.head()

Unnamed: 0,Total Population,Income to Poverty Level Ratio,Per Capita Income vs Pop,Median House Value,GED Ratio,Unemployment,Inflation,Interest Rates,Oil Prices,Vote Skew,...,virginia beach city,warren county,washington county,waynesboro city,westmoreland county,williamsburg city,winchester city,wise county,wythe county,york county
0,34066,0.98773,0.668291,149800,0.26977,0.05,0.0206,0.035,3.109,-6.113687,...,False,False,False,False,False,False,False,False,False,False
1,96633,0.951083,0.379632,349800,0.123446,0.05,0.0206,0.035,3.109,9.821844,...,False,False,False,False,False,False,False,False,False,False
2,16406,0.973912,1.341765,104000,0.274168,0.05,0.0206,0.035,3.109,-5.780753,...,False,False,False,False,False,False,False,False,False,False
3,12517,0.99257,1.933131,189800,0.275066,0.05,0.0206,0.035,3.109,-7.994295,...,False,False,False,False,False,False,False,False,False,False
4,32315,0.94272,0.652855,142200,0.237444,0.05,0.0206,0.035,3.109,-8.466321,...,False,False,False,False,False,False,False,False,False,False


In [12]:
# Train model

from sklearn.preprocessing import StandardScaler

# Target is the arcsinh-transformed 'Vote Skew'; every other column of df2 is a predictor
y_train = df2['Vote Skew']
X_train = df2.drop(columns='Vote Skew')

# Standardise the predictors; the fitted scaler is kept so the same
# transformation can be applied to other data later
scaler = StandardScaler()
X_train_sc = scaler.fit_transform(X_train)

# L1-penalised linear regression on the standardised predictors
reg = linear_model.Lasso(alpha=1, warm_start=True, max_iter=2000, fit_intercept=True)
reg.fit(X_train_sc, y_train)

# One row per predictor alongside its fitted coefficient
sdf = pd.DataFrame({'variable': X_train.columns, 'coefficients': reg.coef_})
sdf

Unnamed: 0,variable,coefficients
0,Total Population,0.141322
1,Income to Poverty Level Ratio,-0.796470
2,Per Capita Income vs Pop,0.000000
3,Median House Value,0.000000
4,GED Ratio,-2.341917
...,...,...
138,williamsburg city,0.000000
139,winchester city,0.000000
140,wise county,-0.000000
141,wythe county,-0.000000


In [9]:
# Load the 2024 data and build county indicator columns

# Columns to keep, in this order
new_order = [
    'County',
    'Total Population',
    'Income to Poverty Level Ratio',
    'Per Capita Income vs Pop',
    'Median House Value',
    'GED Ratio',
    'Unemployment',
    'Inflation',
    'Interest Rates',
    'Oil Prices',
]

# The file names the column 'Interest Rate'; rename it to 'Interest Rates' before selecting
df_test = (
    pd.read_csv('./project_voting/data/2024_training_data.csv')
    .rename(columns={'Interest Rate': 'Interest Rates'})
)[new_order]

# One indicator column per distinct County value, appended to the numeric columns;
# the original County label column is then dropped
dummies = pd.get_dummies(df_test['County'])
columns_to_drop2 = ['County']
df_test2 = pd.concat([df_test, dummies], axis=1).drop(columns=columns_to_drop2)
df_test2

Unnamed: 0,Total Population,Income to Poverty Level Ratio,Per Capita Income vs Pop,Median House Value,GED Ratio,Unemployment,Inflation,Interest Rates,Oil Prices,accomack county,...,virginia beach city,warren county,washington county,waynesboro city,westmoreland county,williamsburg city,winchester city,wise county,wythe county,york county
0,31338.33333,0.989725,0.817282,178566.6667,0.279730,0.029,0.035,0.06875,3.48,True,...,False,False,False,False,False,False,False,False,False,False
1,116206.66670,0.904140,0.405274,329250.0000,0.097905,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,False,False,False,False,False
2,14261.66667,0.982802,2.167152,129450.0000,0.296163,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,False,False,False,False,False
3,13217.16667,0.992636,2.564247,213066.6667,0.249439,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,False,False,False,False,False
4,31495.16667,1.009275,0.875330,164816.6667,0.252671,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
129,95651.00000,0.994426,0.391467,237933.3333,0.175062,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,False,False,False,False,False
130,465126.50000,0.974273,0.089396,266983.3333,0.123107,0.029,0.035,0.06875,3.48,False,...,True,False,False,False,False,False,False,False,False,False
131,22890.83333,0.977557,1.168793,163050.0000,0.186382,0.029,0.035,0.06875,3.48,False,...,False,False,False,True,False,False,False,False,False,False
132,16253.83333,0.800741,2.182271,249583.3333,0.083526,0.029,0.035,0.06875,3.48,False,...,False,False,False,False,False,True,False,False,False,False


In [13]:
# Predict values

# The scaler and the model were fitted on X_train's columns. The test frame gets its
# indicator columns from a separate get_dummies call, so check that both frames carry
# the same column names and fail with a readable message if they do not.
missing_cols = X_train.columns.difference(df_test2.columns)
unseen_cols = df_test2.columns.difference(X_train.columns)
if len(missing_cols) > 0 or len(unseen_cols) > 0:
    raise ValueError(
        "Test features do not match the training features; "
        f"missing from test: {list(missing_cols)}, "
        f"not seen in training: {list(unseen_cols)}"
    )

# Select by name so the columns follow the training order exactly
X_test = df_test2[X_train.columns]

# Apply the scaling learned on the training data, then predict on the arcsinh scale
X_test_sc = scaler.transform(X_test)
predictions = reg.predict(X_test_sc)

In [14]:
# Invert the arcsinh applied to the target so predictions are back on the
# original 'Vote Skew' scale
actual_vote_skew = np.sinh(predictions)

# Net total across all predicted rows
sum_of_values = actual_vote_skew.sum()
sum_of_values

-32014.41886995041

The model predicts that total Republican votes will exceed total Democratic votes by roughly 32,000 (the predicted vote skew summed over all rows is about -32,014).