In [4]:
# Create a new venv with the following packages:
# pandas jupyter seaborn scikit-learn keras tensorflow

In [5]:
import pandas as pd
import jupyter 
import seaborn
import keras
import tensorflow
import numpy as np
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor


In [6]:
# Training data: load the 2018 CSV and preview its first rows.
df = pd.read_csv('./data_2018/cd_2018.csv')
df.head()

# Target variable: the 'Voting rate' column (selected as y further down).

Unnamed: 0,CD116,Voting rate,18-44,45-64,65 and older,Women,In Poverty,Did not finish high school,Bachelors or more,White,Black,Asian,Hispanic,urbanization_pct
0,Indiana 01,45.6,43.2,35.1,21.7,51.7,11.3,10.2,21.7,72.4,17.7,1.0,12.7,26.2
1,Indiana 02,43.8,43.8,33.6,22.6,51.4,11.1,13.2,21.8,88.9,6.5,0.8,5.7,14.7
2,Indiana 03,45.5,44.5,34.2,21.3,51.3,9.1,11.9,22.3,90.0,5.8,1.5,3.5,14.3
3,Indiana 04,43.1,46.0,33.4,20.7,50.4,10.5,9.8,24.7,91.7,4.0,1.6,3.4,10.9
4,Indiana 05,55.3,45.1,34.8,20.1,52.3,8.2,6.9,43.4,86.5,8.7,2.4,2.9,22.7


In [7]:
# Predictors: age-group, sex, poverty, education, race/ethnicity and
# urbanization columns of the training frame.
X = df.loc[:, [
    '18-44',
    '45-64',
    '65 and older',
    'Women',
    'In Poverty',
    'Did not finish high school',
    'Bachelors or more',
    'White',
    'Black',
    'Asian',
    'Hispanic',
    'urbanization_pct',
]]

# Response: the value the models are trained to predict.
y = df.loc[:, "Voting rate"]

In [None]:
# Coerce features and target to numeric (entries that cannot be parsed become
# NaN), then keep only the rows that are complete in every column.
X = X.apply(pd.to_numeric, errors='coerce')
y = pd.to_numeric(y, errors='coerce')
n_rows_before = len(X)
data = pd.concat([X, y], axis=1).dropna()

# Report how many rows were discarded so data loss is visible in the output
# instead of happening silently.
n_rows_dropped = n_rows_before - len(data)
if n_rows_dropped:
    print(f"Dropped {n_rows_dropped} of {n_rows_before} rows with missing or non-numeric values")

# Re-select by the names X and y already carry rather than repeating the
# column list, so this cell cannot drift out of sync with the selection cell.
X = data[list(X.columns)]
y = data[y.name]

# Split into train/test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

### Multiple Linear Regression model

In [9]:
# Ordinary least squares, fit on the training split.
# NOTE(review): in the previewed training rows the three age-share columns add
# up to 100, which would make individual coefficients hard to interpret —
# confirm on the full data.
model = LinearRegression().fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Fitted parameters, keyed by feature name
print("Coefficients:", {name: weight for name, weight in zip(X.columns, model.coef_)})
print("Intercept:", model.intercept_)

# Test-set metrics
print(f"Mean Squared Error: {mse:.3f}")
print(f"R-squared: {r2:.3f}")

Coefficients: {'18-44': -0.22798066424579885, '45-64': 0.7643025277184403, '65 and older': -0.4507116605957943, 'Women': -1.1739746091738823, 'In Poverty': -0.40198196051308865, 'Did not finish high school': -1.1948106230351794, 'Bachelors or more': 0.24984143348231547, 'White': -1.1483201805573815, 'Black': -0.967532699837785, 'Asian': -0.769269527873696, 'Hispanic': -0.4947970172327683}
Intercept: 225.00323691409304
Mean Squared Error: 9.956
R-squared: 0.622


### Ridge Regression model

In [10]:
# L2-regularized linear regression; alpha sets the penalty strength and can be tuned.
model = Ridge(alpha=1.0).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Fitted parameters, keyed by feature name
print("Coefficients:", {name: weight for name, weight in zip(X.columns, model.coef_)})
print("Intercept:", model.intercept_)

# Test-set metrics
print(f"Mean Squared Error: {mse:.3f}")
print(f"R-squared: {r2:.3f}")

Coefficients: {'18-44': -0.24663932028440957, '45-64': 0.7392351342049804, '65 and older': -0.4759321802203251, 'Women': -1.1081584539443952, 'In Poverty': -0.3924473940532858, 'Did not finish high school': -1.188231705095321, 'Bachelors or more': 0.24280675771553353, 'White': -1.0401301333801907, 'Black': -0.8655978075600975, 'Asian': -0.6142766159897772, 'Hispanic': -0.45104443078210316}
Intercept: 213.2098674762958
Mean Squared Error: 10.025
R-squared: 0.619


### Lasso Regression

In [11]:
# L1-regularized linear regression; alpha sets the penalty strength and can be tuned.
model = Lasso(alpha=0.1).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Fitted parameters, keyed by feature name
print("Coefficients:", {name: weight for name, weight in zip(X.columns, model.coef_)})
print("Intercept:", model.intercept_)

# Test-set metrics
print(f"Mean Squared Error: {mse:.3f}")
print(f"R-squared: {r2:.3f}")

Coefficients: {'18-44': 0.0, '45-64': 0.9359634946611879, '65 and older': -0.23127246217731198, 'Women': -0.8657510819309111, 'In Poverty': -0.3454971831774815, 'Did not finish high school': -1.1759677442850882, 'Bachelors or more': 0.21692441614471453, 'White': -0.5870020404232199, 'Black': -0.43548053250291724, 'Asian': -0.0, 'Hispanic': -0.25322605701773865}
Intercept: 133.20400402232934
Mean Squared Error: 10.557
R-squared: 0.599


### Random Forest Regression

In [12]:
# Random forest regressor fit on the existing training split
# (the split itself is created in an earlier cell).
model = RandomForestRegressor(
    n_estimators=100,       # number of trees
    max_depth=None,         # no depth limit; set one to curb overfitting
    min_samples_split=2,
    random_state=42,        # fixed seed for repeatable results
).fit(X_train, y_train)

# Score the held-out split
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Per-feature importances, keyed by feature name
print("Feature importances:", {name: score for name, score in zip(X.columns, model.feature_importances_)})

# Test-set metrics
print(f"Mean Squared Error: {mse:.3f}")
print(f"R-squared: {r2:.3f}")

Feature importances: {'18-44': 0.012454239224083593, '45-64': 0.04575790053960252, '65 and older': 0.0481422627026789, 'Women': 0.03164063373569537, 'In Poverty': 0.19915524593203884, 'Did not finish high school': 0.44909637081111436, 'Bachelors or more': 0.11271396349176997, 'White': 0.012788817693964718, 'Black': 0.0188849004462216, 'Asian': 0.043650644920275684, 'Hispanic': 0.025715020502554323}
Mean Squared Error: 10.857
R-squared: 0.588


In [13]:
# Data the fitted model is applied to: load the 2023 CSV and preview its first rows.
new_df = pd.read_csv('./data_2023/cd_2023.csv')
new_df.head()

Unnamed: 0,CD119,18-44,45-64,65 and older,Women,In Poverty,Did not finish high school,Bachelors or more,White,Black,Asian,Hispanic,urbanization_pct
0,Ohio 01,49.2,30.5,20.4,50.7,12.2,6.2,48.7,68.5,16.9,4.2,5.0,46.6
1,Ohio 02,41.8,33.6,24.6,50.1,14.6,9.3,21.4,92.4,1.6,0.5,1.5,10.2
2,Ohio 03,54.8,27.9,17.3,50.5,16.6,8.2,46.0,54.1,28.0,5.8,7.3,86.0
3,Ohio 04,43.3,33.0,23.7,49.2,10.8,7.2,28.9,85.7,3.9,3.4,2.8,13.8
4,Ohio 05,42.6,32.0,25.3,50.2,10.9,6.4,26.3,85.8,3.4,1.0,7.6,13.4


In [14]:
# Apply the fitted model to the new data.
# `model` is rebound by every modelling cell, so it refers to whichever
# estimator was fit most recently; in a top-to-bottom run that is the Random
# Forest. Print the class so the estimator behind these predictions is
# explicit in the output rather than hidden kernel state.
print(f"Predicting with: {type(model).__name__}")

new_X = new_df[X.columns]  # same features, in the same order as used for training

new_predictions = model.predict(new_X)

# Attach the predictions to the loaded frame (adds a 'Voter Turnout' column)
new_df["Voter Turnout"] = new_predictions

# Compact result table: identifier column plus the prediction rounded to 2 decimals
predict_df = new_df[["CD119", "Voter Turnout"]].copy()
predict_df["Voter Turnout"] = predict_df["Voter Turnout"].round(2)

# Bare last expression -> rich DataFrame display instead of plain-text print
predict_df.head()



     CD119  Voter Turnout
0  Ohio 01          53.95
1  Ohio 02          48.65
2  Ohio 03          54.24
3  Ohio 04          53.66
4  Ohio 05          50.26


In [15]:
# Export
# predict_df.to_csv('./data_2023/predict_turnout_ohio_2026.csv', index=False)