## Figure out the column we want to predict

In [1]:
from functools import reduce
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.feature_selection import mutual_info_regression
from sklearn.impute import SimpleImputer
from sklearn.inspection import permutation_importance
from sklearn.linear_model import LassoCV, LinearRegression, LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, f1_score, accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
import csv
import gc
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import shap
import xgboost as xgb

In [2]:
# Contests and the election years available for each office.
# (The name in parentheses is presumably the winner of that contest -- TODO confirm.)
# President: 2016 (Trump), 2020 (Biden), 2024 (Trump)
# Governor: 2018 (Whitmer), 2022 (Whitmer)
# Secretary of State: 2018 (Benson), 2022 (Benson)
# Attorney General: 2018 (Nessel), 2022 (Nessel)
# U.S. Senate: 2014 (Peters), 2018 (Stabenow), 2020 (Peters), 2024 (Slotkin)
# U.S. House: every cycle
# State Senate: 2014, 2018, 2022
# State House: every cycle

# Every year listed in YEARS needs the two preceding elections for the same
# office: the `*_prev` and `*_change_prev` feature columns describe the shift
# between those two earlier "historical" elections.

# Active configuration: the offices and years loaded by the cells below.
OFFICES = ['U.S. House']
YEARS = ['2018', '2020', '2022', '2024']

# Alternative configurations -- enable exactly one OFFICES/YEARS pair at a time.
# OFFICES = ['U.S. House', 'State House']
# YEARS = ['2018', '2020', '2022', '2024']

# OFFICES = ['U.S. Senate']
# YEARS = ['2020', '2024']

# OFFICES = ['State Senate']
# YEARS = ['2022']

# OFFICES = ['President']
# YEARS = ['2024']

# Not enough data
# # OFFICES = ['Governor', 'Secretary of State', 'Attorney General']
# # YEARS = ['2018', '2022']

In [3]:
import warnings
# NOTE(review): with no category or module filter this silences every warning
# for the rest of the session, including deprecation notices from the imported
# libraries. Consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings("ignore")

In [4]:
# Display every column of wide DataFrames instead of truncating the middle ones.
pd.set_option("display.max_columns", None)

### If I store dem shares/changes as positive numbers in 02_vote file, then this file needs to be updated

In [5]:
def formatOfficeName(office):
    """Return the office label with spaces as underscores and periods removed.

    Example: 'U.S. House' -> 'US_House'. The result is the token used in the
    generated CSV file names and as the per-office key of the loaded data.
    """
    # Single pass over the characters: ' ' maps to '_', '.' is deleted.
    return office.translate(str.maketrans(' ', '_', '.'))


def categorize_partisan_change_amount(row):
    """Return the signed net partisan shift for one row.

    Computed as ``rep_share_change_prev - dem_share_change_prev``:
    a negative value is a move toward Democrats (left), a positive value a
    move toward Republicans (right).
    """
    rep_shift = row["rep_share_change_prev"]
    dem_shift = row["dem_share_change_prev"]
    return rep_shift - dem_shift


def categorize_partisanship(row):
    """Label a row by which party dominated the previous election.

    Parties are checked in the order Democrat, Republican, other. For the
    first party whose previous vote share is at least 0.667 the label is
    "strong <party>"; at least 0.501 gives "leans <party>". If no party
    reaches 0.501 the row is "neutral".
    """
    # (share column, label at >= 0.667, label at >= 0.501), in priority order.
    tiers = (
        ("dem_share_prev", "strong democrat", "leans democrat"),
        ("rep_share_prev", "strong republican", "leans republican"),
        ("oth_share_prev", "strong independent", "leans independent"),
    )

    for share_column, strong_label, leans_label in tiers:
        share = row[share_column]
        if share >= 0.667:
            return strong_label
        if share >= 0.501:
            return leans_label

    return "neutral"


def categorize_partisan_change(row):
    """Bucket the net partisan shift of one row into a labelled tier.

    The shift is ``rep_share_change_prev - dem_share_change_prev`` (other
    parties are ignored): negative means a move toward Democrats, positive a
    move toward Republicans.

    Tiers by magnitude of the shift (strictly greater than the bound):

        > 0.5   "++++++++"      > 0.1   "++++"
        > 0.35  "+++++++"       > 0.05  "+++"
        > 0.25  "++++++"        > 0.01  "++"
        > 0.15  "+++++"         > 0.005 "+"

    prefixed with "more republican " or "more democrat ". A magnitude of
    0.005 or less, or a NaN shift, yields "no change".

    The "no change" cutoff is 0.005 so that it matches the smallest tier;
    a cutoff of 0.01 would make the single "+" tier unreachable except at
    exactly +/-0.01.
    """
    # Model with a number line, ignore "other" parties
    # negative = left = more dem, positive = right = more rep
    change = row["rep_share_change_prev"] - row["dem_share_change_prev"]
    magnitude = abs(change)

    # Written as "not >" so that NaN (which fails every comparison) is
    # reported as "no change" rather than falling into a tier.
    if not magnitude > 0.005:
        return "no change"

    direction = "more republican" if change > 0 else "more democrat"

    # (exclusive lower bound on the magnitude, number of '+' marks), largest first.
    tiers = (
        (0.5, 8),
        (0.35, 7),
        (0.25, 6),
        (0.15, 5),
        (0.1, 4),
        (0.05, 3),
        (0.01, 2),
    )
    for lower_bound, plus_count in tiers:
        if magnitude > lower_bound:
            return f"{direction} {'+' * plus_count}"

    # 0.005 < magnitude <= 0.01
    return f"{direction} +"


def cleanColumnNames(df):
    """Normalise the column labels of ``df`` in place and return ``df``.

    Labels are lower-cased, periods are removed, and spaces and slashes are
    replaced by underscores (e.g. 'City/Township Code' -> 'city_township_code').
    """
    # Literal (non-regex) substitutions, applied in this order.
    substitutions = (('.', ''), (' ', '_'), ('/', '_'))

    cleaned = df.columns.str.lower()
    for old, new in substitutions:
        cleaned = cleaned.str.replace(old, new, regex=False)

    df.columns = cleaned
    return df


def formatColumnTypes(df):
    """Coerce columns of ``df`` to numeric where possible, in place.

    Every column except 'standardized_id_num' is passed through
    ``pd.to_numeric``; a column that cannot be converted is left untouched.
    Any column that is still of ``object`` dtype afterwards (including
    'standardized_id_num') is cast to ``str``.

    Returns the same ``df`` object.
    """
    for col in df.columns:
        if col != 'standardized_id_num':
            # ``errors='ignore'`` is deprecated in pandas; catching the
            # conversion error gives the same "leave as-is" result.
            try:
                df[col] = pd.to_numeric(df[col])
            except (ValueError, TypeError):
                pass

        if df[col].dtype == 'object':
            df[col] = df[col].astype(str)

    return df

In [6]:
def makeData(years=YEARS, offices=OFFICES, historic=True):
    """Load the generated tract-level CSV for every (year, office) pair.

    For each pair the file
    ``data/generated_data/df_06_tract_{year}_{office}.csv`` is read (with the
    office name passed through ``formatOfficeName``), bookkeeping columns are
    dropped if present, column names are normalised and column types coerced.

    Parameters
    ----------
    years : iterable of str
        Election years to load.
    offices : iterable of str
        Office labels to load, e.g. 'U.S. House'.
    historic : bool
        When true, add the derived columns ``partisanship_lean_prev``,
        ``partisanship_lean_change_prev`` and
        ``partisanship_lean_change_amount_prev``.

    Returns
    -------
    dict
        ``{year: {formatted_office_name: DataFrame}}``.
    """
    # Raw-file columns that are not needed downstream; missing ones are ignored.
    drop_columns = [
        'Census County Code', 'City/Township Code', 'District Code', 'Election Type', 'Election Year',
        'Michigan County Code', 'Office Description', 'Precinct Label', 'Precinct Number',
        'Status Code', 'Ward Number', 'aland_tract', 'awater_tract', 'registered_voters', 'standardized_id',
        'tractce_tract',
    ]

    dfs = {}

    for year in years:
        print(f'Processing year {year}...')
        frames_for_year = {}
        dfs[year] = frames_for_year

        for office_label in offices:
            office = formatOfficeName(office_label)
            print(f'Processing office {office}...')

            csv_path = f'data/generated_data/df_06_tract_{year}_{office}.csv'
            raw = pd.read_csv(csv_path)
            df = formatColumnTypes(
                cleanColumnNames(raw.drop(columns=drop_columns, errors='ignore'))
            )

            if historic:
                # Row-wise features derived from the previous-election columns.
                df["partisanship_lean_prev"] = df.apply(categorize_partisanship, axis=1)
                df["partisanship_lean_change_prev"] = df.apply(categorize_partisan_change, axis=1)
                df["partisanship_lean_change_amount_prev"] = df.apply(categorize_partisan_change_amount, axis=1)

            frames_for_year[office] = df

    return dfs

# Load everything for the configured YEARS / OFFICES.
# Result is nested: dfs[year][formatted_office_name] -> DataFrame.
dfs = makeData()

Processing year 2018...
Processing office US_House...
Processing year 2020...
Processing office US_House...
Processing year 2022...
Processing office US_House...
Processing year 2024...
Processing office US_House...


#### Previous Cycle
Previous-cycle datasets include columns ending in <code>_votes</code> and <code>_share</code>, plus the <code>turnout_pct</code> column. All of these
<br>are "seen" results: they only become known after the election in question, i.e. after any prediction for that election has been made.

In [7]:
# Copy so that later changes do not modify the frame held in `dfs`.
# NOTE(review): the '2020' / 'US_House' keys are hard-coded; this raises
# KeyError if the configured YEARS / OFFICES do not include them.
df_2020 = dfs['2020']['US_House'].copy()

# Write a snapshot to the current working directory (row index omitted).
df_2020.to_csv('df_2020.csv', index=False)
# Display one random row. NOTE(review): no random_state is passed, so the
# row shown changes on every run.
df_2020.sample()

Unnamed: 0,office_code,county_name,city_township_description,standardized_id_num,dem_votes,oth_votes,rep_votes,total_votes,dem_share,rep_share,oth_share,turnout_pct,dem_share_prev,rep_share_prev,oth_share_prev,dem_share_change_prev,rep_share_change_prev,oth_share_change_prev,dem_votes_change_prev,rep_votes_change_prev,oth_votes_change_prev,registered_voters_change_prev,turnout_pct_change_prev,nearest_bound_school_district,nearest_bound_census_tract,nearest_bound_zipcode,geometry,geoid_tract,geoidfq_tract,name_tract,geometry_tract,partisanship_lean_prev,partisanship_lean_change_prev,partisanship_lean_change_amount_prev
3325,6,ST. CLAIR,KIMBALL TOWNSHIP,1474316000003,580.0,0.0,1381.0,1961.0,0.295767,0.704233,0.0,1.105411,0.337441,0.615166,0.047393,-0.003822,-0.014823,0.018644,33.0,51.0,27.0,0.0,0.06257,5299,26147634100,48074,"POLYGON ((-82.6169679864893 42.96863308309992,...",26147630000.0,1400000US26147634100,6341.0,"POLYGON ((-82.624515 42.991076, -82.58455 42.9...",leans republican,more democrat ++,-0.011001


In [10]:
# For historic data, add the current-cycle (non-`_prev`) change.
# For 2020, that is the change between 2018 and 2020, rather than the change between 2016 and 2018.
# This is the target column we want to predict for new data.

#### Upcoming Cycle
Columns ending in <code>_prev</code> (including those ending in <code>_change_prev</code>) describe the *previous* two elections: the earlier shares and the shifts between them. They are "seen" data,
<br>computed before the upcoming cycle that is being predicted.

In [12]:
# Here we will predict the current-cycle (non-`_prev`) change between 2022 and 2024, treating 2024 as a "future" cycle.
# The goal is to estimate what that shift may be.

In [8]:
# Mock prediction without unseen data.
unseen_columns = [
    'dem_votes', 'oth_votes', 'rep_votes', 'total_votes', 
    'dem_share', 'rep_share', 'oth_share', 'turnout_pct',
]

unneeded_columns_for_preds = [
    'nearest_bound_school_district', 'nearest_bound_census_tract', 'nearest_bound_zipcode',
    'geometry', 'geoid_tract', 'geoidfq_tract', 'name_tract', 'geometry_tract',
]

drop_cols = unseen_columns + unneeded_columns_for_preds

df_2024 = dfs['2024']['US_House'].copy()
df_pred_2024 = df_2024.drop(columns=drop_cols)

df_pred_2024.to_csv('df_pred_2024.csv', index=False)
df_pred_2024.sample()

Unnamed: 0,office_code,county_name,city_township_description,standardized_id_num,dem_share_prev,rep_share_prev,oth_share_prev,dem_share_change_prev,rep_share_change_prev,oth_share_change_prev,dem_votes_change_prev,rep_votes_change_prev,oth_votes_change_prev,registered_voters_change_prev,turnout_pct_change_prev,partisanship_lean_prev,partisanship_lean_change_prev,partisanship_lean_change_amount_prev
716,6,KALAMAZOO,KALAMAZOO CITY CITY,774216000024,0.793701,0.179528,0.026772,-0.025823,0.016252,0.009571,151.0,53.0,14.0,0.0,0.173429,strong democrat,more republican ++,0.042075
