In [None]:
from functools import reduce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Lift the cap on how many columns pandas shows when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
def calcPartisanChange(row):
    """Return the Republican share change minus the Democratic share change.

    `row` is any object that can be indexed with the keys
    "rep_share_change" and "dem_share_change" and whose values support
    subtraction.
    """
    return row["rep_share_change"] - row["dem_share_change"]


def removeUncommonColumns(nested_dict):
    """Trim every DataFrame down to the columns shared by all of them.

    Parameters
    ----------
    nested_dict : dict
        Two-level mapping ``{year: {office: DataFrame}}``.

    Returns
    -------
    dict
        The same ``nested_dict`` object, with each DataFrame replaced by a
        selection of the shared columns. Every frame ends up with the same
        column order: the order in which the shared columns appear in the
        first DataFrame. A mapping that holds no DataFrames is returned
        unchanged.
    """
    print("Removing uncommon columns...")

    all_dfs = [df for offices in nested_dict.values() for df in offices.values()]
    if not all_dfs:
        # Nothing to compare; avoid indexing into an empty list.
        return nested_dict

    common_cols = set(all_dfs[0].columns)
    for df in all_dfs[1:]:
        common_cols &= set(df.columns)

    # Selecting columns by iterating the set would produce an arbitrary order
    # that can differ from one interpreter run to the next. Take the order
    # from the first frame instead; dict.fromkeys drops repeated labels while
    # keeping first-seen order.
    ordered_cols = list(dict.fromkeys(
        col for col in all_dfs[0].columns if col in common_cols
    ))

    for offices in nested_dict.values():
        for office, df in offices.items():
            # Re-assigning an existing key while iterating is safe: the
            # dictionary does not change size.
            offices[office] = df[ordered_cols]

    return nested_dict


def _censusDatasetLabel(census_dataset):
    """Return the lowercased label part of a '<table code>_<label>' dataset name."""
    _, separator, label = census_dataset.lower().partition('_')
    if not separator or not label:
        raise ValueError(
            f"Census dataset name {census_dataset!r} must look like '<table code>_<label>'"
        )
    return label


def makeDatasets():
    """Build one modelling DataFrame per (year, office) pair.

    For every year in ``YEARS`` and office in ``OFFICES`` this:

    1. reads ``data/generated_data/df_06_tract_<year>_<office>.csv`` and
       zero-pads 'standardized_id_num' to 13 characters;
    2. reads one CSV per entry of ``census_datasets`` and left-joins each onto
       the tract data on 'geoidfq_tract';
    3. drops the identifier / raw-result columns listed in ``drop_cols``
       (missing ones are ignored);
    4. coerces every column except 'standardized_id_num' to numeric
       (unparseable values become NaN) and fills NaN with the column median;
    5. adds 'partisanship_change_amount' via calcPartisanChange.

    Finally all frames are trimmed to their shared columns with
    removeUncommonColumns.

    Returns
    -------
    dict
        ``{year: {office: DataFrame}}``.

    Raises
    ------
    ValueError
        If a census dataset name has no '<table code>_<label>' structure.
    """
    print('Making datasets...')

    # Loop-invariant: columns removed from every merged frame.
    drop_cols = [
        'Census County Code', 'City/Township Code', 'City/Township Description', 'County Name', 'District Code',
        'Election Type', 'Election Year', 'Michigan County Code', 'Office Code', 'Office Description',
        'Precinct Label', 'Precinct Number', 'Status Code', 'Ward Number', 'aland_tract', 'awater_tract',
        'county', 'county_fips', 'dem_share', 'dem_votes', 'electionye', 'geoid_tract', 'geoidfq_tract',
        'geometry', 'geometry_tract', 'locale_full', 'name_tract', 'nearest_bound_census_tract',
        'nearest_bound_school_district', 'nearest_bound_zipcode', 'nearest_tract', 'objectid',
        'office', 'oth_share', 'oth_votes', 'precinct_num', 'precinct_wp_id', 'registered_voters',
        'registered_voters_change', 'rep_share', 'rep_votes', 'shapestarea', 'shapestlength', 'standardized_id',
        'subdivision_fips', 'total_votes', 'tractce_tract', 'turnout_pct', 'ward_num'
    ]
    # Columns kept as strings; everything else is coerced to numeric.
    string_columns = ["standardized_id_num"]

    df_datasets = {}

    for year in YEARS:
        print(f'Processing year {year}...')
        df_datasets[year] = {}

        for office in OFFICES:
            print(f'Processing office {office}...')

            df_precincts = pd.read_csv(f'data/generated_data/df_06_tract_{year}_{office}.csv')
            df_precincts['standardized_id_num'] = df_precincts['standardized_id_num'].astype(str).str.zfill(13)

            print('Loading census data...')
            census_dataset_dfs = []
            for census_dataset in census_datasets:
                # Split on the first underscore so table codes of any length
                # work. The previous prefix test only recognised names that
                # start with 's' or 'b'; any other name silently reused the
                # label of the preceding entry and re-loaded that file.
                census_dataset_label = _censusDatasetLabel(census_dataset)

                df_census_dataset = pd.read_csv(
                    f'data/generated_data/df_06_{census_dataset_label}_{year}_{office}.csv'
                ).rename(columns={f'geoid_{census_dataset_label}': 'geoidfq_tract'})

                census_dataset_dfs.append(df_census_dataset)

            # Left joins keep every tract row from df_precincts.
            df = reduce(
                lambda left, right: pd.merge(left, right, on='geoidfq_tract', how='left'),
                [df_precincts, *census_dataset_dfs],
            )
            df = df.drop(columns=drop_cols, errors='ignore')

            string_df = df[string_columns]

            numeric_df = df.drop(columns=string_columns).apply(pd.to_numeric, errors='coerce')
            # NOTE(review): the median is taken over all rows, i.e. before any
            # train/test split that happens downstream.
            numeric_df = numeric_df.fillna(numeric_df.median())

            df = pd.concat([string_df, numeric_df], axis=1)
            # calcPartisanChange only indexes by column name and subtracts, so
            # it can be applied to the whole frame instead of row by row.
            df["partisanship_change_amount"] = calcPartisanChange(df)

            df_datasets[year][office] = df

    df_datasets = removeUncommonColumns(df_datasets)

    print('Done making datasets.')
    return df_datasets


def makeFeaturesTargets(df, drop_share_changes=False):
    """Split a dataset into a feature matrix X and a target y.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'standardized_id_num', 'partisanship_change_amount' and
        the confounder columns listed in the body.
    drop_share_changes : bool, default False
        When True, also remove 'dem_share_change', 'rep_share_change' and
        'oth_share_change' (whichever are present) from the features.
        calcPartisanChange defines the target as
        rep_share_change - dem_share_change, so leaving those columns in the
        features lets a linear model reconstruct the target from them. The
        default keeps them, which matches the earlier behaviour.

    Returns
    -------
    X : pandas.DataFrame
        Features, without the id, target and confounder columns and without
        columns that are entirely NaN.
    y : pandas.DataFrame
        Single-column frame holding 'partisanship_change_amount'.

    Raises
    ------
    KeyError
        If the id, target or one of the confounder columns is missing.
    """
    y = df[['partisanship_change_amount']]
    X = df.drop(columns=['standardized_id_num', 'partisanship_change_amount'])

    # Remove confounders first. Dropping all-NaN columns before this step
    # would raise KeyError whenever one of the confounders is itself all-NaN.
    X = X.drop(columns=[
        'dem_share_prev', 'rep_share_prev', 'oth_share_prev',
        'dem_votes_change', 'rep_votes_change', 'oth_votes_change',
        'turnout_pct_change'
    ])

    if drop_share_changes:
        share_change_cols = ['dem_share_change', 'rep_share_change', 'oth_share_change']
        X = X.drop(columns=[col for col in share_change_cols if col in X.columns])

    # Drop all-NaN columns
    X = X.dropna(axis=1, how='all')

    return X, y


def fitModel(X, y, test_size=0.2, random_state=42):
    """Fit a preprocessing + linear-regression pipeline on a train split.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns of dtype ``object`` are routed to the
        categorical branch, numeric columns to the numeric branch.
    y : array-like
        Target values aligned with the rows of ``X``.
    test_size : float, default 0.2
        Passed to ``train_test_split``.
    random_state : int, default 42
        Passed to ``train_test_split`` so the split is repeatable.

    Returns
    -------
    tuple
        ``(model, X_train, X_test, y_train, y_test, numeric_cols)`` where
        ``model`` is the fitted Pipeline and ``numeric_cols`` is the list of
        numeric column names found in ``X``.
    """
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    # The categorical branch only imputes; it does not encode.
    # NOTE(review): OneHotEncoder is imported by the notebook but unused here,
    # so object columns reach the regressor un-encoded. Confirm whether
    # encoding was intended.
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
    ])

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # or 'median'
        ('scaler', StandardScaler())
    ])

    # The 'cat' block is listed before the 'num' block. The returned
    # numeric_cols list therefore matches the regressor's inputs one-to-one
    # only when there are no categorical columns.
    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer, numeric_cols)
    ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    model.fit(X_train, y_train)

    return model, X_train, X_test, y_train, y_test, numeric_cols


def makePredictions(X_test, y_test, fitted_model=None):
    """Predict on ``X_test`` and score the predictions against ``y_test``.

    Parameters
    ----------
    X_test : features to predict on.
    y_test : true target values for ``X_test``.
    fitted_model : object with a ``predict`` method, optional
        Model used for prediction. When omitted, the notebook-level variable
        ``model`` is used, which is what callers relied on before this
        parameter existed.

    Returns
    -------
    tuple
        ``(mse, y_pred)``: the mean squared error and the predictions.
    """
    if fitted_model is None:
        # Backward-compatible fallback to the notebook-global model.
        fitted_model = model
    y_pred = fitted_model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    return mse, y_pred


def plotAccuracy(y_test, y_pred):
    """Scatter true values against predicted values on the current axes.

    Returns the ``matplotlib.pyplot`` module so the caller can call
    ``.show()`` on the result.
    """
    plt.scatter(y_test, y_pred)

    # Label and decorate the axes the scatter was just drawn on.
    axes = plt.gca()
    axes.set_xlabel("True Values")
    axes.set_ylabel("Predicted Values")
    axes.set_title("Prediction Accuracy")
    axes.grid(True)

    return plt


def featureCoeff(model, numeric_cols, top_n=None):
    """Return the features with the largest positive regression coefficients.

    Parameters
    ----------
    model : object
        Fitted pipeline exposing ``named_steps['regressor'].coef_``.
    numeric_cols : sequence of str
        Feature names, one per coefficient and in the same order.
    top_n : int, optional
        Maximum number of rows to return. Defaults to the notebook-level
        ``TOP_N_FEATURES``.

    Returns
    -------
    pandas.DataFrame
        Columns 'feature', 'coefficient' and 'abs_coefficient', restricted to
        coefficients greater than zero and sorted by 'coefficient' in
        descending order.

    Raises
    ------
    ValueError
        If the number of names differs from the number of coefficients.
    """
    if top_n is None:
        top_n = TOP_N_FEATURES

    coefficients = model.named_steps['regressor'].coef_.flatten()
    feature_names = list(numeric_cols)

    if len(feature_names) != len(coefficients):
        # Fail with the actual counts rather than pandas' generic
        # "arrays must be of the same length" message.
        raise ValueError(
            f"Got {len(feature_names)} feature names for {len(coefficients)} "
            "coefficients; the names must match the regressor's inputs one-to-one."
        )

    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients)
    })

    positive = coef_df[coef_df['coefficient'] > 0]
    return positive.sort_values(by='coefficient', ascending=False).head(top_n)


def plotFeatureCoeff(top_features):
    """Draw a horizontal bar chart of feature coefficients on a new figure.

    ``top_features`` must provide 'feature' and 'coefficient' columns. The
    first row is drawn at the top of the chart. Returns the
    ``matplotlib.pyplot`` module so the caller can call ``.show()``.
    """
    figure = plt.figure(figsize=(12, 7))
    axes = figure.gca()

    axes.barh(top_features['feature'], top_features['coefficient'])
    axes.set_xlabel('Coefficient Value')
    axes.set_title(f'Top {TOP_N_FEATURES} Most Influential Features (Linear Regression)')

    # Dashed reference line at zero plus light vertical grid lines.
    axes.axvline(x=0, color='gray', linestyle='--')
    axes.grid(True, axis='x', linestyle=':', alpha=0.7)

    # Flip the y axis so the first row ends up on top.
    axes.invert_yaxis()
    figure.tight_layout()
    return plt

### Execution

In [None]:
# Election years to process. Kept as strings: they are concatenated into CSV
# file names and used as dictionary keys.
YEARS = ['2018', '2020', '2022']
# Offices to process; also part of the CSV file names and dictionary keys.
OFFICES = ['US_House']
# Number of top features reported by featureCoeff and quoted in the
# plotFeatureCoeff title.
TOP_N_FEATURES = 25

# Census tables that makeDatasets joins onto the tract data. Each entry has
# the form '<table code>_<label>'.
census_datasets = [
    'b02001_race', 'b04007_ancestry', 'b05012_nativity_us', 'b08303_travel_time_work', 
    'b25003_housing_rentership', 'dp04_housing_characteristics', 'dp05_age_race', 's0101_age_sex', 
    's1101_households_families', 's1201_marital_status', 's1501_educational_attainment', 's1701_income_poverty', 
    's1903_median_income', 's2101_veteran_status', 's2201_food_stamps', 's2301_employment_status', 
    's2401_occupation_sex', 's2403_industry_sex', 's2501_occupancy_characteristics', 
    's2503_financial_characteristics', 's2701_health_insurance',
]

In [None]:
# Build every (year, office) dataset once, up front; the loop below only reads
# from the result.
df_datasets = makeDatasets()  # outside the loop, processes all years and offices

# Fit, score and plot one linear model per (year, office) pair.
# Every name assigned inside the loop is a notebook-level variable that is
# overwritten on each iteration, so once the loop ends `model`, `X`, `y`, ...
# hold the values from the last pair processed.
for year in YEARS:
    for office in OFFICES:
        df = df_datasets[year][office]
        X, y = makeFeaturesTargets(df)
        model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
        # Called without a model argument: makePredictions picks up the
        # notebook-level `model` assigned on the previous line.
        mse, y_pred = makePredictions(X_test, y_test)
        print("Mean Squared Error:", mse)
        # The plot helpers return the pyplot module, so `plt` is rebound to
        # the module it already referred to.
        plt = plotAccuracy(y_test, y_pred)
        plt.show()
        top_features = featureCoeff(model, numeric_cols)
        print(top_features)
        plt = plotFeatureCoeff(top_features)
        plt.show()

### Predict all data

In [None]:
# Pull the 2020 US House dataset out of the nested dictionary.
df_2020 = df_datasets['2020']['US_House']
# Display a randomly chosen row; no seed is passed, so the row can differ
# between runs.
df_2020.sample()

In [None]:
# Target (single-column frame) and features for the full 2020 dataset. This
# repeats the steps of makeFeaturesTargets except for the all-NaN column drop.
y_2020 = df_2020[['partisanship_change_amount']]
X_2020 = df_2020.drop(columns=['standardized_id_num', 'partisanship_change_amount'])

# Dropping all-NaN columns is disabled here.
# NOTE(review): presumably so X_2020 keeps every column the fitted model
# expects; confirm before re-enabling.
# X_2020 = X_2020.dropna(axis=1, how='all')

# These confound results
X_2020 = X_2020.drop(columns=[
    'dem_share_prev', 'rep_share_prev', 'oth_share_prev', 
    'dem_votes_change', 'rep_votes_change', 'oth_votes_change',
    # 'dem_share_change', 'rep_share_change', 'oth_share_change', # This line has huge predictive power
    'turnout_pct_change'
])

# Display a randomly chosen feature row.
X_2020.sample()

In [None]:
# `model` is the notebook-level variable last assigned by the training loop.
# NOTE(review): when the cells run top to bottom that is the model fitted for
# the final entry of YEARS, not one fitted on 2020; confirm this is intended.
# model.fit(X, y)
y_pred_all_2020 = model.predict(X_2020) 

# Error over every 2020 row (no train/test split is applied in this cell).
mse_2020 = mean_squared_error(y_2020, y_pred_all_2020)
print("Mean Squared Error:", mse_2020)

In [None]:
# True vs. predicted values for every 2020 row. Drawn with the plotAccuracy
# helper rather than a copy of its body; pyplot is already imported in the
# notebook's first cell, so it is not imported again here.
plotAccuracy(y_2020, y_pred_all_2020).show()