# 1. Importing all necessary libraries
# Bringing in the essentials — pandas for data handling and
# scikit-learn for all the modeling, preprocessing, and evaluation tools.

# In [21]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression

# 2. Load Data
# Loading the voter file.

# In [None]:
# Read the voter file into a DataFrame. The relative path is resolved against
# the current working directory.
# NOTE(review): the filename contains a space before ".csv" — confirm it
# matches the file on disk.
df = pd.read_csv(
    filepath_or_buffer="voterfile .csv",
)

# 3. Dropping all irrelevant Rows and Columns
# Dropping 'vh14p' because this is May 2014, and we wouldn't have data from the 2014 primary yet.
# Just being cautious with future leakage.

# In [None]:
# Remove the 'vh14p' column (errors='ignore' makes this a no-op when the
# column is absent), then discard every row whose 'vh12g' value is missing.
df = (
    df
    .drop(columns=['vh14p'], errors='ignore')
    .dropna(subset=['vh12g'])
)

# 4. Defining target
# We are predicting whether a voter voted in the 2012 elections.

# In [None]:
# Name of the label column; every other column stays in the feature frame X.
target = 'vh12g'
y = df[target]
X = df.drop(columns=target)

# Keep the identifier column available as its own Series.
optimus_id = X['optimus_id']

# 5. Numerical and Categorical Columns
# splitting data into numerical vs categorical for preprocessing.


# In [None]:
# Numeric features: every int64/float64 column except 'optimus_id' and 'cd',
# which are kept out of the feature list.
# NOTE(review): the exclusion is applied to the numeric list only — if either
# column were read as object dtype it would land in categorical_cols; confirm.
numerical_cols = [
    name
    for name in X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    if name not in ('optimus_id', 'cd')
]

# Categorical features: every object-dtype column.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()

# 6. Preprocessing and Combining
# Working with numerical and categorical pipelines

# In [None]:
# Numeric branch: fill missing values with the column median.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps categories that were not seen
# during fitting from raising at transform time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each column list through its own branch. No `remainder` is passed, so
# the ColumnTransformer default applies to columns in neither list.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
])

# 7. Pipeline with Logistic Regression

# In [None]:
# Chain the preprocessing step and the classifier into one estimator, so that
# fitting and predicting both run the same column transformations.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    # max_iter=1000 gives the solver more iterations than scikit-learn's
    # default before it stops.
    ('classifier', LogisticRegression(max_iter=1000))
])  # the closing parenthesis was missing, leaving the Pipeline(...) call unterminated

# 8. Train and Predictions

# In [None]:
# Fit on the full feature frame and score the same rows. fit() returns the
# fitted pipeline, so the probability call is chained onto it.
# NOTE(review): column 1 of predict_proba is taken as the "voted" class — this
# assumes a two-class target whose positive label sorts last; confirm.
vote_prob = model.fit(X, y).predict_proba(X)[:, 1]

# Hard label: 1 when the probability is at least 0.5, otherwise 0.
vote = (vote_prob >= 0.5).astype(int)

# 9. Output and saving

# In [None]:
# Columns written out: the identifier, every feature handed to the model, and
# the two prediction columns.
used_features = [*numerical_cols, *categorical_cols]
final_columns = ['optimus_id', *used_features, 'vote', 'vote_prob']

# assign() returns a new frame, so X itself is left untouched; the column
# selection then fixes the output order.
output_df = X.assign(vote=vote, vote_prob=vote_prob)[final_columns]

# Write the predictions without the DataFrame index.
output_df.to_csv("predictions.csv", index=False)