In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [2]:
# Load the voter file
df = pd.read_csv("voterfile .csv")

In [3]:
#show the data 
df.sample(5)

Unnamed: 0,optimus_id,age,party,ethnicity,maritalstatus,dwellingtype,income,education,cd,dma,...,intrst_musical_instruments_in_hh,donates_to_liberal_causes,donates_to_conservative_causes,home_owner_or_renter,g08_precinct_turnout,g10_precinct_turnout,g12_precinct_turnout,p08_precinct_turnout,p10_precinct_turnout,p12_precinct_turnout
18390,976753,72.0,Non-Partisan,European,Married,Single Family Dwelling Unit,125k-200k,Grad Degree - Likely,2.0,RENO DMA (EST.),...,,,,Likely Homeowner,0.46,0.56,0.77,0.19,0.35,0.38
27065,992000,49.0,American Independent,European,,,Unknown,,2.0,RENO DMA (EST.),...,,,,,0.76,0.69,0.86,0.21,0.33,0.21
48047,701538,33.0,Non-Partisan,European,,,Unknown,,3.0,LAS VEGAS DMA (EST.),...,,,,,0.72,0.65,0.78,0.2,0.33,0.18
33479,511355,29.0,Non-Partisan,European,,,Unknown,,1.0,LAS VEGAS DMA (EST.),...,,,,,0.55,0.44,0.65,0.1,0.19,0.11
33003,553397,54.0,Republican,European,,Single Family Dwelling Unit,125k-200k,Bach Degree - Extremely Likely,3.0,LAS VEGAS DMA (EST.),...,,,,Likely Homeowner,0.61,0.46,0.77,0.05,0.14,0.1


In [4]:
#data from the first five 
df.head()

Unnamed: 0,optimus_id,age,party,ethnicity,maritalstatus,dwellingtype,income,education,cd,dma,...,intrst_musical_instruments_in_hh,donates_to_liberal_causes,donates_to_conservative_causes,home_owner_or_renter,g08_precinct_turnout,g10_precinct_turnout,g12_precinct_turnout,p08_precinct_turnout,p10_precinct_turnout,p12_precinct_turnout
0,861681,69.0,Republican,European,Married,Single Family Dwelling Unit,75k-125k,Bach Degree - Extremely Likely,4.0,LAS VEGAS DMA (EST.),...,,,,Likely Homeowner,0.56,0.54,0.75,0.17,0.32,0.24
1,1084850,20.0,American Independent,European,,,Unknown,,2.0,RENO DMA (EST.),...,,,,,0.84,0.82,0.92,0.47,0.62,0.47
2,644435,28.0,Non-Partisan,European,,,Unknown,,3.0,LAS VEGAS DMA (EST.),...,,,,,0.49,0.34,0.7,0.04,0.09,0.06
3,57683,78.0,American Independent,European,Married,,Unknown,,3.0,LAS VEGAS DMA (EST.),...,,,,,0.84,0.79,0.91,0.24,0.46,0.3
4,167371,68.0,Democratic,,,,Unknown,,4.0,LAS VEGAS DMA (EST.),...,,,,,0.71,0.66,0.81,0.19,0.37,0.34


In [5]:
#data from the last five
df.tail()

Unnamed: 0,optimus_id,age,party,ethnicity,maritalstatus,dwellingtype,income,education,cd,dma,...,intrst_musical_instruments_in_hh,donates_to_liberal_causes,donates_to_conservative_causes,home_owner_or_renter,g08_precinct_turnout,g10_precinct_turnout,g12_precinct_turnout,p08_precinct_turnout,p10_precinct_turnout,p12_precinct_turnout
49995,251398,23.0,American Independent,European,,Single Family Dwelling Unit,125k-200k,Some College -Extremely Likely,3.0,LAS VEGAS DMA (EST.),...,,,,Likely Homeowner,0.68,0.56,0.77,0.12,0.22,0.12
49996,684299,24.0,Democratic,,,Small Mult or large mult w/apt number,35k-75k,Some College - Likely,1.0,LAS VEGAS DMA (EST.),...,,,,Likely Homeowner,0.5,0.38,0.67,0.05,0.09,0.06
49997,369815,28.0,Non-Partisan,European,,Small Mult or large mult w/apt number,35k-75k,HS Diploma - Likely,1.0,LAS VEGAS DMA (EST.),...,,,,,0.45,0.31,0.58,0.07,0.1,0.08
49998,276455,49.0,Republican,European,,Small Mult or large mult w/apt number,75k-125k,HS Diploma - Extremely Likely,4.0,LAS VEGAS DMA (EST.),...,,,,,0.52,0.35,0.68,0.06,0.1,0.06
49999,878074,69.0,American Independent,European,,Single Family Dwelling Unit,35k-75k,Some College - Likely,2.0,RENO DMA (EST.),...,,,,,0.79,0.81,0.87,0.37,0.62,0.44


In [6]:
#check shape of data
df.shape

(50000, 39)

In [7]:
#show the name of columns in our dataset
df.columns

Index(['optimus_id', 'age', 'party', 'ethnicity', 'maritalstatus',
       'dwellingtype', 'income', 'education', 'cd', 'dma',
       'occupationindustry', 'vh14p', 'vh12g', 'vh12p', 'vh10g', 'vh10p',
       'vh08g', 'vh08p', 'vh06g', 'vh06p', 'vh04g', 'vh04p', 'vh02g', 'vh02p',
       'vh00g', 'vh00p', 'net_worth', 'petowner_dog', 'intrst_nascar_in_hh',
       'intrst_musical_instruments_in_hh', 'donates_to_liberal_causes',
       'donates_to_conservative_causes', 'home_owner_or_renter',
       'g08_precinct_turnout', 'g10_precinct_turnout', 'g12_precinct_turnout',
       'p08_precinct_turnout', 'p10_precinct_turnout', 'p12_precinct_turnout'],
      dtype='object')

In [8]:
# Relevent Independent Variables
features = ['age', 'party', 'vh14p', 'vh12g', 'vh12p', 'vh10g', 'vh10p',
            'vh08g', 'vh08p', 'vh06g', 'vh06p', 'vh04g', 'vh04p', 'vh02g', 'vh02p',
            'vh00g', 'vh00p', 'g08_precinct_turnout', 'g10_precinct_turnout', 'g12_precinct_turnout',
            'p08_precinct_turnout', 'p10_precinct_turnout', 'p12_precinct_turnout'] 

In [9]:
df=df[['optimus_id'] + features]
df.sample(5)

Unnamed: 0,optimus_id,age,party,vh14p,vh12g,vh12p,vh10g,vh10p,vh08g,vh08p,...,vh02g,vh02p,vh00g,vh00p,g08_precinct_turnout,g10_precinct_turnout,g12_precinct_turnout,p08_precinct_turnout,p10_precinct_turnout,p12_precinct_turnout
44264,551098,53.0,Republican,0,0,0,1,0,1,0,...,1,0,1,0,0.51,0.39,0.7,0.06,0.11,0.07
47205,509629,34.0,Democratic,0,0,0,0,0,0,0,...,0,0,0,0,0.52,0.41,0.65,0.12,0.17,0.13
35875,712733,54.0,Democratic,0,0,0,0,0,0,0,...,1,0,0,0,0.67,0.57,0.78,0.11,0.23,0.12
38365,333623,29.0,Democratic,0,0,0,1,0,0,0,...,1,0,0,0,0.69,0.56,0.78,0.11,0.21,0.15
11767,874332,66.0,American Independent,0,1,1,1,0,1,1,...,1,0,1,0,0.62,0.54,0.75,0.17,0.25,0.21


In [10]:
#check any null value in our dataset
df.isnull().sum()

optimus_id               0
age                     17
party                    0
vh14p                    0
vh12g                    0
vh12p                    0
vh10g                    0
vh10p                    0
vh08g                    0
vh08p                    0
vh06g                    0
vh06p                    0
vh04g                    0
vh04p                    0
vh02g                    0
vh02p                    0
vh00g                    0
vh00p                    0
g08_precinct_turnout     1
g10_precinct_turnout     1
g12_precinct_turnout     1
p08_precinct_turnout     1
p10_precinct_turnout     1
p12_precinct_turnout     1
dtype: int64

In [11]:
#percentage of null value
(df.isnull().sum()/df.shape[0]) *100

optimus_id              0.000
age                     0.034
party                   0.000
vh14p                   0.000
vh12g                   0.000
vh12p                   0.000
vh10g                   0.000
vh10p                   0.000
vh08g                   0.000
vh08p                   0.000
vh06g                   0.000
vh06p                   0.000
vh04g                   0.000
vh04p                   0.000
vh02g                   0.000
vh02p                   0.000
vh00g                   0.000
vh00p                   0.000
g08_precinct_turnout    0.002
g10_precinct_turnout    0.002
g12_precinct_turnout    0.002
p08_precinct_turnout    0.002
p10_precinct_turnout    0.002
p12_precinct_turnout    0.002
dtype: float64

In [12]:
#missing valve is less than 0.5 % so drop the rows
df.dropna(inplace=True)

In [13]:
#percentage of null value
(df.isnull().sum()/df.shape[0]) *100

optimus_id              0.0
age                     0.0
party                   0.0
vh14p                   0.0
vh12g                   0.0
vh12p                   0.0
vh10g                   0.0
vh10p                   0.0
vh08g                   0.0
vh08p                   0.0
vh06g                   0.0
vh06p                   0.0
vh04g                   0.0
vh04p                   0.0
vh02g                   0.0
vh02p                   0.0
vh00g                   0.0
vh00p                   0.0
g08_precinct_turnout    0.0
g10_precinct_turnout    0.0
g12_precinct_turnout    0.0
p08_precinct_turnout    0.0
p10_precinct_turnout    0.0
p12_precinct_turnout    0.0
dtype: float64

In [14]:
# Separate categorical and numerical features
cat_features = ['party']
num_features = [col for col in features if col not in cat_features]

In [15]:
num_transformer = SimpleImputer(strategy='mean')  # Handling missing numerical values
cat_transformer = OneHotEncoder(handle_unknown='ignore')  # Encoding categorical variables

In [16]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_transformer, num_features),
        ('cat', cat_transformer, cat_features)
    ]
)

In [17]:
X=df.drop(columns=['optimus_id'],axis=1)

In [18]:
X

Unnamed: 0,age,party,vh14p,vh12g,vh12p,vh10g,vh10p,vh08g,vh08p,vh06g,...,vh02g,vh02p,vh00g,vh00p,g08_precinct_turnout,g10_precinct_turnout,g12_precinct_turnout,p08_precinct_turnout,p10_precinct_turnout,p12_precinct_turnout
0,69.0,Republican,0,0,0,1,0,1,0,1,...,1,0,1,0,0.56,0.54,0.75,0.17,0.32,0.24
1,20.0,American Independent,0,0,0,0,0,0,0,0,...,0,0,0,0,0.84,0.82,0.92,0.47,0.62,0.47
2,28.0,Non-Partisan,0,0,0,0,0,0,0,1,...,0,0,0,0,0.49,0.34,0.70,0.04,0.09,0.06
3,78.0,American Independent,0,0,0,0,0,0,0,0,...,0,0,0,0,0.84,0.79,0.91,0.24,0.46,0.30
4,68.0,Democratic,0,0,0,1,0,1,1,1,...,1,1,1,1,0.71,0.66,0.81,0.19,0.37,0.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
49995,23.0,American Independent,0,0,0,0,0,0,0,0,...,1,0,0,0,0.68,0.56,0.77,0.12,0.22,0.12
49996,24.0,Democratic,0,0,0,0,0,0,0,0,...,1,0,0,0,0.50,0.38,0.67,0.05,0.09,0.06
49997,28.0,Non-Partisan,0,0,0,0,0,0,0,0,...,0,0,1,0,0.45,0.31,0.58,0.07,0.10,0.08
49998,49.0,Republican,0,0,1,1,1,0,0,1,...,1,1,1,0,0.52,0.35,0.68,0.06,0.10,0.06


In [19]:
y=df['vh14p']

In [20]:
y.shape

(49982,)

In [21]:
# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,random_state=42)

In [22]:
# Define model pipeline
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=100, random_state=42))
])

In [23]:
y_train.shape

(39985,)

In [24]:
X_train.shape

(39985, 23)

In [25]:
# Train the model
model.fit(X_train, y_train)

In [26]:
# Make predictions
vote_predictions = model.predict(X_test)
# Probability of voting
vote_probabilities = model.predict_proba(X_test)[:, 1]  

In [27]:
# Evaluate the model
accuracy = accuracy_score(y_test, vote_predictions)
roc_auc = roc_auc_score(y_test, vote_probabilities)
print(f"Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")

Accuracy: 1.0000
ROC AUC Score: 1.0000


In [28]:
# Prepare the final output DataFrame
output_df = X_test.copy()
output_df['optimus_id'] = df.loc[X_test.index, 'optimus_id']
output_df['vote'] = vote_predictions
output_df['vote_prob'] = vote_probabilities

In [29]:
# Save to CSV
output_df.to_csv("final_voter_turnout_predictions.csv", index=False)
print("Predictions saved to final_voter_turnout_predictions.csv")

Predictions saved to final_voter_turnout_predictions.csv
