In [1]:
import pandas as pd
import numpy as np

In [2]:
election_data = pd.read_csv("countypres_2000-2020.csv")

# Cleaning Election Data
# Keep only the identifying and vote-count columns, drop incomplete rows, and
# switch to the column names used by the rest of the notebook.
election_data_cleaned = (
    election_data[
        ['year', 'state', 'state_po', 'county_name', 'county_fips', 'candidate',
         'party', 'candidatevotes', 'totalvotes']
    ]
    .dropna()
    .rename(columns={
        'state_po': 'state_abbreviation',
        'county_fips': 'fips_code',
        'candidatevotes': 'votes_candidate',
        'totalvotes': 'votes_total'
    })
)

# Total each candidate's votes within a county-year BEFORE picking the winner.
# Taking idxmax over the raw rows only finds the largest single row; if the file
# holds several rows per candidate for one county-year (e.g. counts reported
# separately per voting mode -- confirm against the dataset codebook), that row
# need not belong to the candidate with the most votes overall. When there is
# exactly one row per candidate the sum is a no-op, so the result is unchanged.
party_case_fix = {'republican': 'REPUBLICAN', 'democrat': 'DEMOCRAT'}
candidate_totals = (
    election_data_cleaned
    .assign(party=lambda frame: frame['party'].replace(party_case_fix))
    # sort=False keeps first-appearance order so ties resolve as in file order.
    .groupby(['year', 'fips_code', 'candidate', 'party'], as_index=False, sort=False)['votes_candidate']
    .sum()
)

# One row per (year, county): the candidate with the highest summed vote count.
winner_rows = candidate_totals.groupby(['year', 'fips_code'])['votes_candidate'].idxmax()
winner_df = candidate_totals.loc[winner_rows, ['year', 'fips_code', 'party']].dropna()

# The model is a two-class problem: keep only counties won by a major party.
valid_parties = ['REPUBLICAN','DEMOCRAT']
winner_df = winner_df[winner_df['party'].isin(valid_parties)]

In [3]:
unemployment_data = pd.read_csv("Unemployment(UnemploymentMedianIncome).csv")

# Wide columns holding one unemployment rate per year.
unemployment_cols = [
    name for name in unemployment_data.columns
    if 'Unemployment_rate_' in name
]

# Reshape wide -> long: one row per (area, year) with the rate in `unemp_rate`.
unemp_long = (
    pd.melt(
        unemployment_data,
        id_vars=['FIPS_Code','State','Area_Name'],
        value_vars=unemployment_cols,
        var_name='year_var',
        value_name='unemp_rate',
    )
    # The four-digit year is embedded in the original column name.
    .assign(year=lambda frame: frame['year_var'].str.extract(r'(\d{4})', expand=False).astype(int))
    # Rows without a rate carry no information for the model.
    .dropna(subset=['unemp_rate'])
    # Align the key name with the election data so the frames can be merged.
    .rename(columns={'FIPS_Code': 'fips_code'})
)

In [4]:
population_data = pd.read_csv('combined_population_with_fips.csv')

# Wide columns holding one population estimate per year.
pop_cols = [
    name for name in population_data.columns
    if 'POPESTIMATE' in name
]

# Reshape wide -> long and reduce to the merge keys plus the estimate.
pop_long = (
    pd.melt(
        population_data,
        id_vars=['STATE','COUNTY','STNAME','FIPS'],
        value_vars=pop_cols,
        var_name='pop_year_var',
        value_name='population_est',
    )
    # The four-digit year is embedded in the original column name.
    .assign(year=lambda frame: frame['pop_year_var'].str.extract(r'(\d{4})', expand=False).astype(int))
    .dropna(subset=['population_est'])
    # FIPS values that cannot be parsed as numbers become NaN rather than raising.
    .assign(fips_code=lambda frame: pd.to_numeric(frame['FIPS'], errors='coerce'))
    .loc[:, ['fips_code','year','population_est']]
)


In [5]:
file_path = "Education.xlsx"

# Columns to keep for the model: share of adults with at least a bachelor's
# degree, one column per survey period.
bachelors_cols = [
    "Percent of adults with a bachelor's degree or higher, 1990",
    "Percent of adults with a bachelor's degree or higher, 2000",
    "Percent of adults with a bachelor's degree or higher, 2008-12",
    "Percent of adults with a bachelor's degree or higher, 2018-22",
]

# Share of adults whose highest attainment is a high school diploma, same periods.
hs_cols = [
    "Percent of adults with a high school diploma only, 1990",
    "Percent of adults with a high school diploma only, 2000",
    "Percent of adults with a high school diploma only, 2008-12",
    "Percent of adults with a high school diploma only, 2018-22",
]

edu_cols = ['fips_code', *bachelors_cols, *hs_cols]

# The column names are read from the fourth spreadsheet row (header=3); the key
# is renamed to match the other frames before narrowing to the model columns.
education_df = (
    pd.read_excel(file_path, engine='openpyxl', header=3)
    .rename(columns={'FIPS Code': 'fips_code'})
    [edu_cols]
)

In [6]:
# Start from the county winners and left-join the yearly covariates, so every
# (county, election year) row survives even when a covariate is missing.
master_df = (
    winner_df.copy()
    .merge(unemp_long, how='left', on=['fips_code','year'])
    .merge(pop_long, how='left', on=['fips_code','year'])
)

def get_edu_columns_for_year(election_year):
    """Return the education columns to use for a given election year.

    Parameters
    ----------
    election_year : number
        Year of the election.

    Returns
    -------
    tuple of (str, str)
        ``(bachelor's-or-higher column, high-school-only column)`` for the
        survey period assigned to that year:

        * exactly 2000         -> the 1990 columns
        * 2001 through 2007    -> the 2000 columns
        * 2008 through 2015    -> the 2008-12 columns
        * anything else        -> the 2018-22 columns (this includes 2016+
          and also any year before 2000)
    """
    columns_by_period = {
        "1990": (
            "Percent of adults with a bachelor's degree or higher, 1990",
            "Percent of adults with a high school diploma only, 1990",
        ),
        "2000": (
            "Percent of adults with a bachelor's degree or higher, 2000",
            "Percent of adults with a high school diploma only, 2000",
        ),
        "2008-12": (
            "Percent of adults with a bachelor's degree or higher, 2008-12",
            "Percent of adults with a high school diploma only, 2008-12",
        ),
        "2018-22": (
            "Percent of adults with a bachelor's degree or higher, 2018-22",
            "Percent of adults with a high school diploma only, 2018-22",
        ),
    }

    # Default bucket; overridden below when the year falls in an earlier window.
    period = "2018-22"
    if election_year == 2000:
        period = "1990"
    elif 2000 < election_year < 2008:
        period = "2000"
    elif 2008 <= election_year < 2016:
        period = "2008-12"

    return columns_by_period[period]

def _pick_named_column(frame, column_names):
    """For each row of ``frame``, take the value from the column named in ``column_names``.

    ``column_names`` is a Series with one column name per row of ``frame`` (same
    row order). Only a handful of distinct names occur, so one vectorised
    selection per distinct name replaces a Python-level pass over every row.
    Returns a float Series indexed like ``frame``.
    """
    picked = np.full(len(frame), np.nan, dtype=float)
    names = column_names.to_numpy()
    for name in pd.unique(names):
        rows = names == name
        picked[rows] = frame[name].astype(float).to_numpy()[rows]
    return pd.Series(picked, index=frame.index)


# Attach every education column to each county row; the period that applies to
# a given row is chosen afterwards from its election year.
master_df = pd.merge(master_df, education_df, on='fips_code', how='left')

# (bachelor's column, high-school column) for each row's election year.
edu_columns = master_df['year'].apply(get_edu_columns_for_year)
bachelors_rate = _pick_named_column(master_df, edu_columns.apply(lambda pair: pair[0]))
hs_rate = _pick_named_column(master_df, edu_columns.apply(lambda pair: pair[1]))

# Model feature: bachelor's-or-higher share plus high-school-only share.
master_df['edu_attainment_rate'] = bachelors_rate + hs_rate

# Keep only rows where the label and all three model features are present.
master_df = (
    master_df
    .dropna(subset=['party', 'unemp_rate', 'population_est', 'edu_attainment_rate'])
    .reset_index(drop=True)
)
master_df

Unnamed: 0,year,fips_code,party,State,Area_Name,year_var,unemp_rate,population_est,"Percent of adults with a bachelor's degree or higher, 1990","Percent of adults with a bachelor's degree or higher, 2000","Percent of adults with a bachelor's degree or higher, 2008-12","Percent of adults with a bachelor's degree or higher, 2018-22","Percent of adults with a high school diploma only, 1990","Percent of adults with a high school diploma only, 2000","Percent of adults with a high school diploma only, 2008-12","Percent of adults with a high school diploma only, 2018-22",edu_attainment_rate
0,2000,1001.0,REPUBLICAN,AL,"Autauga County, AL",Unemployment_rate_2000,4.1,44021.0,14.5,18.0,21.707831,29.558575,32.0,33.8,33.786706,31.146113,46.500000
1,2000,1003.0,REPUBLICAN,AL,"Baldwin County, AL",Unemployment_rate_2000,3.7,141342.0,16.8,23.1,27.741591,32.561579,31.8,29.6,28.816463,27.775383,48.600000
2,2000,1005.0,DEMOCRAT,AL,"Barbour County, AL",Unemployment_rate_2000,5.6,29015.0,11.8,10.9,14.524286,11.881188,27.1,32.4,33.294700,36.814710,38.900000
3,2000,1007.0,REPUBLICAN,AL,"Bibb County, AL",Unemployment_rate_2000,5.4,19913.0,4.7,7.1,8.996005,10.919937,33.8,35.7,41.609744,40.879121,38.500000
4,2000,1009.0,REPUBLICAN,AL,"Blount County, AL",Unemployment_rate_2000,3.5,51107.0,7.0,9.6,12.381469,14.741407,34.7,36.0,36.157404,35.313717,41.700000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18629,2020,56037.0,REPUBLICAN,WY,"Sweetwater County, WY",Unemployment_rate_2020,7.5,42673.0,13.3,17.0,17.006235,19.982605,37.5,34.7,36.002755,32.652026,52.634631
18630,2020,56039.0,DEMOCRAT,WY,"Teton County, WY",Unemployment_rate_2020,6.0,23497.0,30.0,45.8,48.994399,60.292061,28.5,18.9,23.746181,16.650814,76.942875
18631,2020,56041.0,REPUBLICAN,WY,"Uinta County, WY",Unemployment_rate_2020,6.4,20215.0,14.3,15.0,18.272985,19.105017,39.2,35.6,34.946031,36.319120,55.424137
18632,2020,56043.0,REPUBLICAN,WY,"Washakie County, WY",Unemployment_rate_2020,5.3,7760.0,18.4,18.7,23.339826,21.137475,34.5,33.8,32.388879,28.454990,49.592465


In [7]:
# Model
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder

# Features and target.
feature_cols = ['unemp_rate', 'population_est', 'edu_attainment_rate']
X = master_df[feature_cols]
y = master_df['party']

# Encode the party names as integer class labels.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 70/30 split, stratified so both parts keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=42,
    stratify=y_encoded,
)

# Hyper-parameter values tried by the grid search.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[None, 10, 20],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

rf = RandomForestClassifier(random_state=42)

# 5-fold cross-validated search over the grid, scored by accuracy, using all cores.
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Forest with the best cross-validated parameter combination.
best_rf = grid_search.best_estimator_

In [8]:
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)

# Score the tuned forest on the held-out split.
y_pred = best_rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
# These three are computed for a single positive class (scikit-learn's default),
# not averaged over classes. NOTE(review): in the recorded run they match the
# REPUBLICAN row of the classification report -- read them as per-class scores.
precision, recall, f1 = (
    scorer(y_test, y_pred)
    for scorer in (precision_score, recall_score, f1_score)
)

summary_lines = [
    "Model Performance:",
    f"Accuracy: {accuracy:.4f}",
    f"Precision: {precision:.4f}",
    f"Recall: {recall:.4f}",
    f"F1 Score: {f1:.4f}",
    "\nClassification Report:",
    classification_report(y_test, y_pred, target_names=le.classes_),
]
print("\n".join(summary_lines))

Model Performance:
Accuracy: 0.8344
Precision: 0.8462
Recall: 0.9676
F1 Score: 0.9028

Classification Report:
              precision    recall  f1-score   support

    DEMOCRAT       0.72      0.32      0.44      1145
  REPUBLICAN       0.85      0.97      0.90      4446

    accuracy                           0.83      5591
   macro avg       0.78      0.64      0.67      5591
weighted avg       0.82      0.83      0.81      5591



In [9]:
import joblib

# A single hypothetical county, with columns in the order the model was trained on.
new_data = pd.DataFrame([
    {'unemp_rate': 2.3, 'population_est': 59726, 'edu_attainment_rate': 60}
])

print("New Data Point:", new_data, "\n", sep="\n")

# Predict the encoded class, then map it back to the party name.
prediction_encoded = best_rf.predict(new_data)
prediction_party = le.inverse_transform(prediction_encoded)

print(f"The predicted winning party is: {prediction_party[0]}")
print("\n")

# Per-class probabilities, labelled with the party names known to the encoder.
prediction_prob = best_rf.predict_proba(new_data)
prob_df = pd.DataFrame(prediction_prob, columns=le.classes_)

print("Prediction Probabilities:", prob_df, "\n", sep="\n")

New Data Point:
   unemp_rate  population_est  edu_attainment_rate
0         2.3           59726                   60


The predicted winning party is: REPUBLICAN


Prediction Probabilities:
   DEMOCRAT  REPUBLICAN
0  0.219499    0.780501




In [10]:
# Bundle everything a consumer needs to score new rows with the saved model.
model_data = {
    'model': best_rf,
    # Column order the model was fitted with; inputs must follow it.
    'feature_order': feature_cols,
    # The forest was trained on the integer codes produced by `le`, so its
    # predictions are integers. Store the party names in code order (entry i
    # is the name for code i) so they can be decoded after loading.
    'class_labels': le.classes_.tolist(),
}

# Written one directory above the notebook.
joblib.dump(model_data, '../random_forest_model.pkl')


['../random_forest_model.pkl']