In [None]:
# Reference notes: offices and the election years listed for each
# (names in parentheses are as recorded by the notebook author).
#   President:          2016 (Trump), 2020 (Biden), 2024 (Trump)
#   Governor:           2018 (Whitmer), 2022 (Whitmer)
#   Secretary of State: 2018 (Benson), 2022 (Benson)
#   Attorney General:   2018 (Nessel), 2022 (Nessel)
#   U.S. Senate:        2014 (Peters), 2018 (Stabenow), 2020 (Peters), 2024 (Slotkin)
#   U.S. House:         every cycle
#   State Senate:       2014, 2018, 2022
#   State House:        every cycle

# Active selection. The ranking loop reads one input file per
# (year, office) combination drawn from these two lists.
OFFICES = [
    'U.S. House',
    'State House',
]
YEARS = [
    '2018',
    '2020',
    '2022',
    '2024',
]

# Alternative selections (uncomment exactly one pair to override the above):
#
# OFFICES = ['U.S. Senate']
# YEARS = ['2020']
#
# OFFICES = ['State Senate']
# YEARS = ['2022']
#
# OFFICES = ['President']
# YEARS = ['2024']

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LassoCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
import shap
import numpy as np
import pandas as pd

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
# To lift the row limit as well: pd.options.display.max_rows = None
pd.options.display.max_columns = None

In [None]:
# Column to predict. Must be numeric, not categorical.
# The target may stay listed in any of the drop_features_* lists below:
# it is filtered back out when the final `drop_features` list is built,
# so it is never dropped before it can be extracted.
TARGET = 'partisanship_lean_curr'

# Identifier / geometry columns. These are useless as predictors and
# add noise to the models.
drop_features_required = [
    'standardized_id', 'standardized_id_num',
    'aland_tract', 'awater_tract', 'geoid_tract', 'geoidfq_tract',
    'geometry', 'geometry_tract', 'name_tract', 'tractce_tract',
    'nearest_bound_census_tract', 'nearest_bound_school_district', 'nearest_bound_zipcode',
]

# Drop these during train/test/prediction.
# Uncomment an entry to exclude that column from the feature set.
drop_features_optional = [
    # 'office_code',
    # 'dem_share_prev',
    # 'rep_share_prev', 'oth_share_prev',
    # 'dem_share_change_prev', 'rep_share_change_prev', 'oth_share_change_prev',
    # 'dem_votes_change_prev', 'rep_votes_change_prev', 'oth_votes_change_prev',
    # 'registered_voters_change_prev', 'turnout_pct_change_prev',
    # 'partisan_temp_prev', 'partisan_temp_change_prev',
    # 'partisanship_lean_prev', 'partisanship_lean_change_prev', 'partisanship_lean_change_amount_prev',
]

# Seen features that may or may not be used as
# targets as well.
drop_features_seen = [
    'dem_votes', 'oth_votes', 'rep_votes', 'total_votes',
    'dem_share', 'rep_share', 'oth_share',  'turnout_pct',
    'dem_share_change_curr','rep_share_change_curr', 'oth_share_change_curr',
    'dem_votes_change_curr','rep_votes_change_curr', 'oth_votes_change_curr',
    'partisan_temp', 'partisanship_lean_curr', 'registered_voters',
    'registered_voters_change_curr','turnout_pct_change_curr',
    'partisan_temp_category', 'partisan_temp_change_curr',
    'pedersen_index_percent', 'pedersen_index',
    'partisanship_lean_change_amount_curr',
]

# Keep the target for later extraction. As before, `drop_features_seen`
# ends up without the target (every occurrence is removed, not just the first).
drop_features_seen = [col for col in drop_features_seen if col != TARGET]

# Final list of columns to drop. The target is excluded regardless of which
# list it was placed in; otherwise a target listed under "required" or
# "optional" would be dropped and the later `df[TARGET]` lookup would fail.
drop_features = [
    col
    for col in drop_features_required + drop_features_optional + drop_features_seen
    if col != TARGET
]

In [None]:
# One importance table (one row per feature, one column per metric) is
# appended here for every (year, office) combination processed below.
top_features_list = []

# Rank all features for the target defined above
# using several different metrics as well
# as an average score across metrics to help
# test many different combinations of features
# and targets.
for year in YEARS:
    print(f'Processing year {year}...')

    for office in OFFICES:
        # This message reports the office (it previously said "year").
        print(f'Processing office {office}...')

        # File names use the office with spaces -> underscores and dots removed.
        office = office.replace(' ', '_').replace('.', '')

        df = pd.read_csv(f'data/generated_data/07_ml_features_{year}_{office}_with_geometry.csv', low_memory=False)
        df = df.drop(columns=drop_features)

        # Drop rows with a missing target BEFORE any encoding, so that a
        # categorical target's NaNs are never handed to the label encoder
        # (where they could no longer be removed by a later dropna).
        df_model = df.dropna(subset=[TARGET])

        # Target and features
        y = df_model[TARGET]

        # Categorical targets need to be encoded. The original index is kept
        # so that y stays aligned row-for-row with X.
        if y.dtype == 'object' or y.dtype.name == 'category':
            label_encoder = LabelEncoder()
            y = pd.Series(label_encoder.fit_transform(y), index=y.index, name=TARGET)

        X = df_model.drop(columns=[TARGET])

        # Keep only numeric features
        X_numeric = X.select_dtypes(include=[np.number]).copy()

        # Drop any columns with all NaNs or constant values
        X_numeric = X_numeric.dropna(axis=1, how='all')
        X_numeric = X_numeric.loc[:, X_numeric.nunique() > 1]

        # Fill remaining NaNs with mean
        X_numeric = X_numeric.fillna(X_numeric.mean(numeric_only=True))

        # Train-test split (used by the Random Forest and SHAP steps; the
        # other metrics below are computed on the full X_numeric / y).
        X_train, X_test, y_train, y_test = train_test_split(X_numeric, y, test_size=0.2, random_state=42)

        # Begin running models to compute the corresponding importance scores.

        print('1. Correlation')
        # Absolute correlation of each feature with the target.
        correlations =  X_numeric.corrwith(y).abs().sort_values(ascending=False)

        print('2. Random Forest')
        rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=10, max_features='sqrt')
        rf.fit(X_train, y_train)
        rf_importances = pd.Series(rf.feature_importances_, index=X_numeric.columns)

        print('3. LassoCV')
        # Features are standardized first; importance is the absolute coefficient.
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X_numeric)
        lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
        lasso.fit(X_scaled, y)
        lasso_importances = pd.Series(np.abs(lasso.coef_), index=X_numeric.columns)

        print('4. Mutual Information')
        mi = mutual_info_regression(X_numeric, y, random_state=42)
        mi_importances = pd.Series(mi, index=X_numeric.columns)

        print('5. SHAP')
        # Mean absolute SHAP value per feature, explaining the fitted forest
        # on the held-out rows with the training rows as background data.
        explainer = shap.Explainer(rf, X_train)
        shap_values = explainer(X_test)
        shap_importances = pd.Series(np.abs(shap_values.values).mean(0), index=X_numeric.columns)

        # The Series are aligned on feature name when the frame is built.
        df_importances = pd.DataFrame({
            'Correlation': correlations,
            'RandomForest': rf_importances,
            'LassoCV': lasso_importances,
            'MutualInfo': mi_importances,
            'SHAP': shap_importances
        })

        # df_importances['Average'] = df_importances.mean(axis=1)

        df_importances = df_importances.reset_index()
        df_importances.rename(columns={'index': 'Feature name'}, inplace=True)

        top_features_list.append(df_importances)

In [None]:
# Combine the feature columns
# Stack the per-(year, office) importance tables into one long frame.
df_combined = pd.concat(top_features_list, axis=0)

# Mean of every metric per feature across all stacked tables.
metric_means = df_combined.groupby('Feature name').mean(numeric_only=True)

# Add the row-wise mean over the metric columns as 'Average', restore
# 'Feature name' as a regular column, and rank features by that average.
# NOTE(review): the metrics are averaged as-is; they do not appear to be
# rescaled to a common range first - confirm that is intended.
df_aggregated = (
    metric_means
    .assign(Average=metric_means.mean(axis=1))
    .reset_index()
    .sort_values(by='Average', ascending=False)
)

df_aggregated.to_csv(f'data/generated_data/df_importances_{TARGET}.csv', index=False)

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt

# Choose top N features to plot (df_aggregated is already sorted by 'Average').
top_n = 20
df_plot = df_aggregated.head(top_n).set_index('Feature name')

# Drop 'Average' to plot metrics separately
metrics = df_plot.drop(columns='Average')

# Plot one horizontal bar per metric for each feature, on an explicit
# figure/axes pair rather than the implicit pyplot "current" figure.
fig, ax = plt.subplots(figsize=(12, 10))
metrics.plot(kind='barh', ax=ax, width=0.85)
ax.invert_yaxis()  # highest at top
ax.set_title(f'Top {top_n} Feature Importances by Metric')
ax.set_xlabel('Importance Score')
ax.set_ylabel('Feature')
ax.legend(title='Metric')
fig.tight_layout()

# Make sure the destination folder exists; savefig does not create it
# and would otherwise fail on a fresh checkout.
figure_path = Path(f'output/figures/features_ranking_{TARGET}.png')
figure_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(figure_path)

# Release the figure; replace with plt.show() to display it inline instead.
plt.close(fig)