In [None]:
from functools import reduce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
# Show every column when a DataFrame is displayed (None disables the column cap).
pd.options.display.max_columns = None

### If the Dem share and share-change values are changed to positive numbers in the 02_vote file, update the code below accordingly:

In [None]:
# Start out empty to avoid load errors; the actual
# values are assigned at the execution step.
YEARS, OFFICES = [], []


def removeUncommonColumns(nested_dict):
    """Trim every DataFrame in a nested mapping down to the columns they all share.

    Parameters
    ----------
    nested_dict : dict
        Two-level mapping (outer key -> inner key -> DataFrame); the caller in
        this notebook passes ``{year: {office: DataFrame}}``.

    Returns
    -------
    dict
        The same ``nested_dict`` object, with each DataFrame replaced by a
        column subset. All frames end up with identical columns in identical
        order (the order in which they appear in the first frame).
    """
    print("Removing uncommon columns...")

    all_dfs = [df for inner in nested_dict.values() for df in inner.values()]
    if not all_dfs:
        # Nothing to compare; indexing all_dfs[0] below would raise IndexError.
        return nested_dict

    common_cols = set(all_dfs[0].columns)
    for df in all_dfs[1:]:
        common_cols &= set(df.columns)

    # Iterating the set directly would give a column order that changes from
    # one kernel run to the next (string hashing is randomized), so take the
    # order from the first frame instead. dict.fromkeys de-duplicates repeated
    # labels while preserving that order.
    ordered_cols = [col for col in dict.fromkeys(all_dfs[0].columns) if col in common_cols]

    for inner in nested_dict.values():
        for key in list(inner):
            inner[key] = inner[key][ordered_cols]

    return nested_dict


# Columns dropped from every merged frame before modeling. Judging by the names
# these are identifiers, geography fields and current-cycle vote results
# (NOTE(review): confirm against the notebooks that generate the CSVs).
_DATASET_DROP_COLS = [
    'Census County Code', 'City/Township Code', 'City/Township Description', 'County Name', 'District Code',
    'Election Type', 'Election Year', 'Michigan County Code', 'Office Code', 'Office Description',
    'Precinct Label', 'Precinct Number', 'Status Code', 'Ward Number', 'aland_tract', 'awater_tract',
    'county', 'county_fips', 'dem_share', 'dem_votes', 'electionye', 'geoid_tract', 'geoidfq_tract',
    'geometry', 'geometry_tract', 'locale_full', 'name_tract', 'nearest_bound_census_tract',
    'nearest_bound_school_district', 'nearest_bound_zipcode', 'nearest_tract', 'objectid',
    'office', 'oth_share', 'oth_votes', 'precinct_num', 'precinct_wp_id', 'registered_voters',
    'registered_voters_change_prev', 'rep_share', 'rep_votes', 'shapestarea', 'shapestlength', 'standardized_id',
    'subdivision_fips', 'total_votes', 'tractce_tract', 'turnout_pct', 'ward_num',
    'turnout_pct_change_curr', 'dem_votes_change_curr', 'dem_share_change_curr',
    'rep_votes_change_curr', 'registered_voters_change_curr', 'oth_votes_change_curr',
    'rep_share_change_curr', 'oth_share_change_curr',
]


def _censusDatasetLabel(census_dataset):
    """Return the label part of a census dataset name (the text after the first underscore)."""
    _, separator, label = census_dataset.lower().partition('_')
    if not separator or not label:
        raise ValueError(
            f"Census dataset name {census_dataset!r} must look like '<table_code>_<label>'"
        )
    return label


def _mergeOnTract(left, right):
    """Left-merge ``right`` onto ``left`` by 'geoidfq_tract', keeping the left copy of shared columns."""
    merged = pd.merge(left, right, on='geoidfq_tract', how='left', suffixes=('', '_dup'))
    # Drop the right-hand duplicates immediately so that a later merge cannot
    # produce a second, clashing '<col>_dup' column.
    return merged.loc[:, ~merged.columns.str.endswith('_dup')]


def makeDatasets(years, offices, remove_xtra_cols=False, census_dataset_names=None):
    """Build one modeling DataFrame per (year, office) from the generated CSV files.

    For each combination this reads
    ``data/generated_data/df_06_tract_{year}_{office}.csv``, left-merges one
    ``df_06_{label}_{year}_{office}.csv`` file per census dataset on
    ``geoidfq_tract``, drops the columns in ``_DATASET_DROP_COLS``, coerces every
    column except ``standardized_id_num`` to numeric and fills missing values
    with the column median. Finally all frames are trimmed to their common
    columns via ``removeUncommonColumns``.

    Parameters
    ----------
    years : iterable
        Years to load; each value is interpolated into the file names and used
        as the outer key of the result.
    offices : iterable
        Offices to load; each value is interpolated into the file names and
        used as the inner key of the result.
    remove_xtra_cols : list or False, optional
        Extra column names to drop from every frame (missing names are
        ignored). Falsy values skip this step.
    census_dataset_names : list of str, optional
        Dataset names of the form ``'<table_code>_<label>'``. Defaults to the
        notebook-level ``census_datasets`` list, which must then be defined
        before this function is called.

    Returns
    -------
    dict
        ``{year: {office: DataFrame}}``.

    Raises
    ------
    ValueError
        If a census dataset name has no ``'_<label>'`` part.
    FileNotFoundError
        Raised by ``pd.read_csv`` when an expected CSV file does not exist.
    """
    if census_dataset_names is None:
        census_dataset_names = census_datasets

    print('Making datasets...')
    df_datasets = {}

    for year in years:
        print(f'Processing year {year}...')
        df_datasets[year] = {}

        for office in offices:
            print(f'Processing office {office}...')

            df_precincts = pd.read_csv(f'data/generated_data/df_06_tract_{year}_{office}.csv')
            # The id is used as a fixed-width, 13-character string key.
            df_precincts['standardized_id_num'] = df_precincts['standardized_id_num'].astype(str).str.zfill(13)

            print('Loading census data...')
            dfs = [df_precincts]
            for census_dataset in census_dataset_names:
                # The label is everything after the first underscore. Slicing at a
                # fixed offset only worked for 's....' and 'b.....' codes; for any
                # other prefix (e.g. 'dp02_...') the previous dataset's label was
                # silently reused, so the same file was merged again and the
                # intended table was never loaded.
                label = _censusDatasetLabel(census_dataset)
                df_census_dataset = pd.read_csv(f'data/generated_data/df_06_{label}_{year}_{office}.csv')
                dfs.append(df_census_dataset.rename(columns={f'geoid_{label}': 'geoidfq_tract'}))

            df = reduce(_mergeOnTract, dfs)
            # Also covers the case of no census datasets, where no merge ran.
            df = df.loc[:, ~df.columns.str.endswith('_dup')]

            df = df.drop(columns=_DATASET_DROP_COLS, errors='ignore')

            # Categorical columns
            string_columns = ["standardized_id_num"]
            string_df = df[string_columns]

            # Numeric columns: coerce unparseable values to NaN, then fill NaN with
            # the column median (an all-NaN column stays NaN).
            numeric_df = df.drop(columns=string_columns, errors='ignore')
            numeric_df = numeric_df.apply(pd.to_numeric, errors='coerce')
            numeric_df = numeric_df.fillna(numeric_df.median())

            df = pd.concat([string_df, numeric_df], axis=1)

            # Compute change in leaning across previous two cycles
            # df["partisanship_change_amount_prev"] = df.apply(lambda row: calcPartisanChange(row), axis=1)

            # For running this function separately
            # on other years and offices outside of
            # the main execution loop.
            # For example, 2024 has columns that other
            # years do not, which causes errors.
            if remove_xtra_cols:
                df = df.drop(columns=remove_xtra_cols, errors='ignore')

            df_datasets[year][office] = df

    df_datasets = removeUncommonColumns(df_datasets)

    print('Done making datasets.')
    return df_datasets


def makeFeaturesTargets(df):
    """Split a dataset frame into a feature matrix ``X`` and a target frame ``y``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the target column ``'partisan_temp'``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``y`` is the single-column DataFrame
        ``df[['partisan_temp']]`` and ``X`` is ``df`` without the id, target and
        excluded columns listed below, and without columns that are entirely NaN.

    Raises
    ------
    KeyError
        If ``'partisan_temp'`` is missing from ``df``.
    """
    y = df[['partisan_temp']]

    excluded_cols = [
        # Identifier, the target itself, and the target's current-cycle change.
        'standardized_id_num', 'partisan_temp', 'partisan_temp_change_curr',
        # Computed previous-cycle features that are removed as not useful.
        # The x_share_change_prev columns are deliberately kept: they have HUGE
        # predictive power and describe share changes over the previous two
        # cycles, which is NOT unseen "future" data.
        'dem_share_prev', 'rep_share_prev', 'oth_share_prev',
        'dem_votes_change_prev', 'rep_votes_change_prev', 'oth_votes_change_prev',
        'partisan_temp_change_prev', 'turnout_pct_change_prev',
    ]
    # errors='ignore': a frame may legitimately lack some of these columns (for
    # example after trimming to the columns common to all cycles); an absent
    # column needs no removal and should not abort the run.
    X = df.drop(columns=excluded_cols, errors='ignore')

    # Drop all-nan columns
    X = X.dropna(axis=1, how='all')

    return X, y


def fitModel(X, y):
    """Fit a preprocessing + linear-regression pipeline on an 80/20 train/test split.

    Numeric columns are mean-imputed and standardized. Object columns are
    imputed with the most frequent value and one-hot encoded; without the
    encoding step the regressor would receive raw strings and fail to fit.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix. Columns that are neither numeric nor object dtype are
        not passed to the regressor (ColumnTransformer drops the remainder).
    y : pandas.DataFrame or pandas.Series
        Target values aligned with ``X``.

    Returns
    -------
    tuple
        ``(model, X_train, X_test, y_train, y_test, numeric_cols)`` where
        ``model`` is the fitted Pipeline and ``numeric_cols`` is the list of
        numeric column names in ``X``.
    """
    categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),  # or 'median'
        ('scaler', StandardScaler()),
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # handle_unknown='ignore' so categories seen only in the test split
        # (or in later prediction data) do not raise at transform time.
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ])

    # Numeric block first: the leading regression coefficients then line up
    # with numeric_cols, which is what the coefficient report relies on.
    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols),
    ])

    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression()),
    ])

    # Fixed seed keeps the split reproducible across runs.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    return model, X_train, X_test, y_train, y_test, numeric_cols


def makePredictions(X_test, model):
    """Return the result of ``model.predict`` for the given feature rows."""
    return model.predict(X_test)


def plotAccuracy(y_test, y_pred):
    """Scatter predicted values against true values on the current axes.

    Returns the ``matplotlib.pyplot`` module so the caller can call ``.show()``
    (or ``.close()``) on the result.
    """
    plt.scatter(y_test, y_pred)

    axes = plt.gca()
    axes.set_xlabel("True Values")
    axes.set_ylabel("Predicted Values")
    axes.set_title("Prediction Accuracy")
    axes.grid(True)

    return plt


def featureCoeff(model, numeric_cols):
    """List the features with a positive regression coefficient, largest first.

    Parameters
    ----------
    model : fitted pipeline
        Must expose ``named_steps['regressor'].coef_``.
    numeric_cols : sequence of str
        Feature names, in the same order as the regressor's coefficients.

    Returns
    -------
    pandas.DataFrame
        Columns ``feature``, ``coefficient`` and ``abs_coefficient``; only rows
        with ``coefficient > 0``, sorted by coefficient in descending order.
        Note that features with negative coefficients are excluded.

    Raises
    ------
    ValueError
        If the number of coefficients differs from the number of feature names.
    """
    coefficients = np.asarray(model.named_steps['regressor'].coef_).flatten()
    feature_names = list(numeric_cols)

    if len(coefficients) != len(feature_names):
        # Otherwise pandas reports a generic "arrays must be of the same length".
        raise ValueError(
            f'Model has {len(coefficients)} coefficients but {len(feature_names)} '
            'feature names were given; they must correspond one-to-one.'
        )

    coef_df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coefficients,
        'abs_coefficient': np.abs(coefficients),
    })

    # top_features = positive.sort_values(by='coefficient', ascending=False).head(TOP_N_FEATURES)
    positive = coef_df[coef_df['coefficient'] > 0]
    # A single stable sort: equal coefficients keep their input order.
    top_features = positive.sort_values(by='coefficient', ascending=False, kind='stable')
    return top_features


def plotFeatureCoeff(top_features):
    """Draw a horizontal bar chart of feature coefficients.

    Parameters
    ----------
    top_features : pandas.DataFrame
        Must have ``feature`` and ``coefficient`` columns; every row is plotted,
        first row at the top.

    Returns
    -------
    module
        The ``matplotlib.pyplot`` module, so the caller can call ``.show()``.
    """
    plt.figure(figsize=(12, 7))
    plt.barh(top_features['feature'], top_features['coefficient'])
    plt.xlabel('Coefficient Value')
    # Report the number of bars actually drawn; the frame is not truncated to a
    # fixed top-N, so a constant in the title would be misleading.
    plt.title(f'Top {len(top_features)} Most Influential Features (Linear Regression)')
    plt.axvline(x=0, color='gray', linestyle='--')
    plt.grid(True, axis='x', linestyle=':', alpha=0.7)
    # barh draws the first row at the bottom; invert so it appears at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()
    return plt


def mergeTopFeatures(top_features_lists):
    """Return the features present in every list, ordered by first appearance.

    Parameters
    ----------
    top_features_lists : sequence of list
        One list of feature names per model run.

    Returns
    -------
    list
        Features that occur in all of the lists, without duplicates. An empty
        input yields an empty list.
    """
    if not top_features_lists:
        return []

    first_list, *other_lists = top_features_lists

    # Get features appearing in all lists
    common_features = set(first_list)
    for feature_list in other_lists:
        common_features.intersection_update(feature_list)

    # Every common feature occurs in the first list, so its order of first
    # appearance overall is its order there. dict.fromkeys removes repeats
    # while keeping that order.
    return [feature for feature in dict.fromkeys(first_list) if feature in common_features]

#### Execution

In [None]:
# Election cycles and offices processed by the cells below.
YEARS = ['2022']
OFFICES = ['US_House']

# Used in the title of the feature-coefficient chart.
TOP_N_FEATURES = 100

# Census datasets to merge, named '<table code>_<label>'.
census_datasets = [
    # b* tables
    'b02001_race',
    'b04007_ancestry',
    'b05012_nativity_us',
    'b08303_travel_time_work',
    'b25003_housing_rentership',
    # dp* tables
    'dp02_selected_social_characteristics',
    'dp03_selected_economic_characteristics',
    'dp04_housing_characteristics',
    'dp05_age_race',
    # s* tables
    's0101_age_sex',
    's1101_households_families',
    's1201_marital_status',
    's1501_educational_attainment',
    's1701_income_poverty',
    's1903_median_income',
    's2101_veteran_status',
    's2201_food_stamps',
    's2301_employment_status',
    's2401_occupation_sex',
    's2403_industry_sex',
    's2501_occupancy_characteristics',
    's2701_health_insurance',
    's2503_financial_characteristics',
]

#### Feature Analysis

In [None]:
# cols = [col for col in X.columns if not col.startswith(('S', 'B', 'D'))]
# print(cols)

In [None]:
# df_datasets = makeDatasets(YEARS, OFFICES)
# df = df_datasets['2022']['US_House'].copy()
# X, y = makeFeaturesTargets(df)
# X

In [None]:
df_datasets = makeDatasets(YEARS, OFFICES)

# One list of positive-coefficient feature names per (year, office) model.
top_features_lists = []

# EXPERIMENTAL – columns listed here are removed before fitting. These seem to
# be "must have" columns, at least rep_share_change_prev, so none are active.
experimental_drop_cols = [
    # 'rep_share_change_prev', 'dem_share_change_prev', 'oth_share_change_prev',
]

for year in YEARS:
    for office in OFFICES:
        df = df_datasets[year][office].copy()
        X, y = makeFeaturesTargets(df)
        X = X.drop(columns=experimental_drop_cols)

        # Fit on an 80/20 split and evaluate on the held-out part.
        model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
        y_pred = makePredictions(X_test, model)

        mse = mean_squared_error(y_test, y_pred)
        print("Mean Squared Error:", mse)

        r2 = model.score(X_test, y_test)
        print("R2 Score:", r2)

        # plotAccuracy returns the pyplot module; show the figure directly.
        plotAccuracy(y_test, y_pred).show()

        top_features = featureCoeff(model, numeric_cols)
        print(f'Top 10 features: {top_features[:10]}')

        top_features_lists.append(top_features['feature'].tolist())

        plotFeatureCoeff(top_features).show()

#### Use only top_n features
Train from scratch on the top features detected from above steps

In [None]:
# Combine the per-cycle lists into the features shared by all of them,
# report how many there are, then keep at most the first 150.
merged_features = mergeTopFeatures(top_features_lists)
print(f'Num top features: {len(merged_features)}')

top_features_all = merged_features[:150]

In [None]:
# Drop specific features by name when they are present.
for excluded_feature in ('rep_share_change_prev', 'dem_share_change_prev', 'oth_share_change_prev'):
    if excluded_feature in top_features_all:
        top_features_all.remove(excluded_feature)

# top_n_features_all = top_features_all[:TOP_N_FEATURES]

print(f'Top feature count: {len(top_features_all)}')
print(f'Top 10 features: {top_features_all[:10]}')

#### Aggregate Historical Training
Do we have data leakage here? Maybe not, if we train on historical data and
<br>run a separate test on newer data in a following step. Say, here we
train 2018-2022, <br>and then in another cell test 2024 on the same model

In [None]:
# Later in the notebook we pull 2024 data, which lacks some columns that the
# 2018, 2020 and 2022 data have. Leaving those columns in causes errors when
# the 2024 data is processed, so they have to be removed here, where the model
# is trained.

# When using ALL features, some columns are not present in 2024 and must be removed
# missing_2024_cols = ['S1101_C03_014E', 'S1101_C02_014E', 'S1101_C04_014E']
missing_2024_cols = False

df_datasets = makeDatasets(YEARS, OFFICES, missing_2024_cols)

# Stack every (year, office) frame into one training table.
dfs = [df_datasets[year][office].copy() for year in YEARS for office in OFFICES]
df = pd.concat(dfs, axis=0, ignore_index=True)

X, y = makeFeaturesTargets(df)
# Restrict the features to the ones selected in the previous section.
X = X[top_features_all]

model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
y_pred = makePredictions(X_test, model)

mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = model.score(X_test, y_test)
print("R2 Score:", r2)

# plotAccuracy / plotFeatureCoeff return the pyplot module.
plotAccuracy(y_test, y_pred).show()

top_features = featureCoeff(model, numeric_cols)
print(f'Top 10 features: {top_features[:10]}')

plotFeatureCoeff(top_features).show()

#### Individual Cycle Future Prediction
This functionality requires objects and variables initialized in previous cells, including <code>model</code>
<br>and <code>numeric_cols</code>. When using aggregated cycle data, the model should be trained on historical
<br>elections only, previous to the upcoming election that is being predicted.

In [None]:
# Cycle to predict with the model trained in the previous section.
year = '2022'
office = 'US_House'

df_datasets = makeDatasets([year], [office])
df = df_datasets[year][office].copy()
X, y = makeFeaturesTargets(df)

# Use the same feature subset the model was trained on.
X = X[top_features_all]

y_pred = makePredictions(X, model)

mse = mean_squared_error(y, y_pred)
print("Mean Squared Error:", mse)

# Score on this cycle's X / y. Using X_test / y_test here would just repeat the
# R2 of the training cell's hold-out split and say nothing about this cycle.
r2 = model.score(X, y)
print("R2 Score:", r2)

# plotAccuracy / plotFeatureCoeff return the pyplot module.
plotAccuracy(y, y_pred).show()

# Coefficients belong to the already-trained model, so this repeats the
# report from the training section.
top_features = featureCoeff(model, numeric_cols)
print(f'Top 10 features: {top_features[:10]}')

plotFeatureCoeff(top_features).show()