In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import numpy as np
import pandas as pd
import geopandas as gpd
from functools import reduce

In [None]:
# Remove pandas' column-count limit so wide frames are displayed in full.
pd.options.display.max_columns = None

In [None]:
# Election years and offices to process. Both values are spliced into the
# names of the input CSV files read by the loading cell.
YEARS = ["2022"]
OFFICES = ["US_House"]

In [None]:
# Census datasets to join onto the tract-level election data.
# Each entry has the form "<table code>_<label>"; the label part is what
# appears in the per-dataset CSV filename and in its `geoid_<label>` column.
census_datasets = [
    # "b"-prefixed tables
    'b02001_race',
    'b04007_ancestry',
    'b05012_nativity_us',
    'b08303_travel_time_work',
    'b25003_housing_rentership',
    # "dp"-prefixed tables
    'dp04_housing_characteristics',
    'dp05_age_race',
    # "s"-prefixed tables
    's0101_age_sex',
    's1101_households_families',
    's1201_marital_status',
    's1501_educational_attainment',
    's1701_income_poverty',
    's1903_median_income',
    's2101_veteran_status',
    's2201_food_stamps',
    's2301_employment_status',
    's2401_occupation_sex',
    's2403_industry_sex',
    's2501_occupancy_characteristics',
    's2503_financial_characteristics',
    's2701_health_insurance',
]

def calcPartisanChange(row):
    """Return the change in Republican share minus the change in Democratic share.

    Args:
        row: Anything indexable by the keys ``"rep_share_change"`` and
            ``"dem_share_change"`` whose values support subtraction
            (e.g. a DataFrame row or a plain dict).

    Returns:
        ``row["rep_share_change"] - row["dem_share_change"]``; positive when
        the Republican share change exceeds the Democratic one.

    Raises:
        KeyError: If either key is missing from ``row``.
    """
    rep_delta = row["rep_share_change"]
    dem_delta = row["dem_share_change"]
    return rep_delta - dem_delta

# Build the modelling frame `df` for each (year, office) combination.
# NOTE: `df` is rebound on every pass, so the cells below only ever see the
# frame for the LAST combination of YEARS x OFFICES.
for year in YEARS:
    print(f'Processing year {year}')

    for office in OFFICES:
        print(f'Processing office {office}')

        df_precincts = pd.read_csv(f'data/generated_data/df_06_tract_{year}_{office}.csv')
        # Normalise the ID to a 13-character, zero-padded string. Done once:
        # nothing further down modifies this column.
        df_precincts['standardized_id_num'] = df_precincts['standardized_id_num'].astype(str).str.zfill(13)

        print('Loading census data')
        census_dataset_dfs = []
        for census_dataset in census_datasets:
            # Names look like "<table code>_<label>". Split at the first
            # underscore instead of using fixed offsets per prefix letter:
            # the old s/b-only branches left "dp..." names unhandled, so they
            # silently reused the previous dataset's label and re-loaded that
            # file instead of their own.
            census_dataset_label = census_dataset.lower().split('_', 1)[1]

            df_census_dataset = pd.read_csv(
                f'data/generated_data/df_06_{census_dataset_label}_{year}_{office}.csv'
            )
            # Give every census frame the same join-key column name.
            df_census_dataset = df_census_dataset.rename(
                columns={f'geoid_{census_dataset_label}': 'geoidfq_tract'}
            )

            census_dataset_dfs.append(df_census_dataset)

        dfs = [df_precincts]
        dfs.extend(census_dataset_dfs)

        # Left-join each census frame onto the election frame so every row of
        # df_precincts is kept even when a census table has no match.
        df = reduce(lambda left, right: pd.merge(left, right, on='geoidfq_tract', how='left'), dfs)

        # Identifier, geometry, raw-vote and bookkeeping columns that must not
        # become model features. errors='ignore' below tolerates any that are
        # absent from a given input file.
        drop_cols = [
            "City/Township Description", "District Code", "Election Type",
            "Michigan County Code",
            "Office Description", "Precinct Label", "Status Code",
            "County Name", "Precinct Number", "Election Year",
            "total_votes", "registered_voters", "turnout_pct",
            "nearest_bound_census_tract", "nearest_bound_school_district", "nearest_bound_zipcode",
            "Office Code", "Census County Code",
            "City/Township Code", "Ward Number",
            "county", "office", "electionye",
            "registered_voters_change",
            "dem_votes", "rep_votes", "oth_votes",
            "dem_share", "rep_share", "oth_share",
            "locale_full", "objectid", "precinct_num", "precinct_wp_id",
            "standardized_id", "geometry", "geometry_tract", "geoidfq_tract", "name_tract",
            "subdivision_fips", "ward_num", "county_fips",
            "nearest_tract", "awater_tract", "aland_tract", "tractce_tract", "shapestarea", "shapestlength", "geoid_tract",
        ]
        df = df.drop(columns=drop_cols, errors='ignore')

        string_columns = ["standardized_id_num"]
        string_df = df[string_columns]

        # Force everything else to numeric: unparseable values become NaN and
        # are then filled with the column median.
        # NOTE(review): the medians are computed over all rows (before any
        # train/test split) and also fill the *_share_change columns that the
        # target is derived from -- confirm this is intended.
        numeric_df = df.drop(columns=string_columns, errors='ignore')
        numeric_df = numeric_df.apply(pd.to_numeric, errors='coerce')
        numeric_df = numeric_df.fillna(numeric_df.median())

        df = pd.concat([string_df, numeric_df], axis=1)

        # The helper only does two column lookups and a subtraction, so
        # passing the whole frame computes every row in one vectorised step
        # instead of a Python-level call per row.
        df["partisanship_change_amount"] = calcPartisanChange(df)

print('DONE')

In [None]:
# Target and features
y = df[['partisanship_change_amount']]
X = df.drop(columns=['standardized_id_num', 'partisanship_change_amount'])

# Drop all-nan columns
X = X.dropna(axis=1, how='all')

# Target leakage: the target is computed as
# rep_share_change - dem_share_change, so leaving those two columns in X lets
# the regression reproduce y exactly and makes the error metric and the
# coefficient ranking meaningless.
# 'oth_share_change' is assumed to be the third-party counterpart from the
# same election result -- dropped only if the column exists.
X = X.drop(
    columns=['rep_share_change', 'dem_share_change', 'oth_share_change'],
    errors='ignore',
)

# These confound results
X = X.drop(columns=[
    'dem_share_prev', 'rep_share_prev', 'oth_share_prev',
    'dem_votes_change', 'rep_votes_change', 'oth_votes_change',
    'turnout_pct_change'
])

# Quick visual check of a few random feature rows.
X.sample(3)

In [None]:
from sklearn.impute import SimpleImputer

# Split the feature columns by dtype so each group gets its own preprocessing.
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Categorical features: fill gaps with the most frequent value, then one-hot
# encode. Without the encoder any object column would reach LinearRegression
# as raw strings and fitting would fail. handle_unknown='ignore' keeps
# predict() working when the test split contains a category the training
# split never saw.
# NOTE: the feature-analysis cell labels coefficients with `numeric_cols`
# only, which is valid only while `categorical_cols` is empty.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Numeric features: mean-impute, then standardise so the fitted coefficients
# are on a comparable scale across features.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),  # or 'median'
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Hold out 20% of the rows for evaluation; fixed seed for a repeatable split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Error on the held-out rows only.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

In [None]:
import matplotlib.pyplot as plt

# Held-out targets against the model's predictions for the same rows.
fig, ax = plt.subplots()
ax.scatter(y_test, y_pred)
ax.set(
    xlabel="True Values",
    ylabel="Predicted Values",
    title="Prediction Accuracy",
)
ax.grid(True)
plt.show()

### Feature analysis

In [None]:
# Maximum number of features kept in the coefficient table and named in the
# bar-chart title in the cells below.
top_n = 25

In [None]:
# Column that coef_df is ordered by (largest values first).
sort_column = 'coefficient'

# Labels for the coefficients: this relies on the regressor receiving exactly
# the numeric columns, in `numeric_cols` order.
feature_names = numeric_cols
# coef_ is 2-D because the target was passed as a one-column frame; flatten
# it to one value per feature.
coefficients = model.named_steps['regressor'].coef_.flatten()

coef_df = (
    pd.DataFrame({'feature': feature_names, 'coefficient': coefficients})
    .assign(abs_coefficient=np.abs(coefficients))
    .sort_values(by=sort_column, ascending=False)
)

# Keep only features with a positive coefficient, largest first, capped at
# top_n rows.
is_positive = coef_df['coefficient'] > 0
top_features = (
    coef_df.loc[is_positive]
    .sort_values(by='coefficient', ascending=False)
    .head(top_n)
)
print(top_features)

In [None]:
# Horizontal bar chart of the features selected into `top_features`.
fig, ax = plt.subplots(figsize=(12, 7))
bars = ax.barh(top_features['feature'], top_features['coefficient'])
ax.set_xlabel('Coefficient Value')
ax.set_title(f'Top {top_n} Most Influential Features (Linear Regression)')
# Dashed reference line at a coefficient of zero.
ax.axvline(x=0, color='gray', linestyle='--')
ax.grid(True, axis='x', linestyle=':', alpha=0.7)
# Flip the y-axis so the first row of top_features is drawn at the top.
ax.invert_yaxis()
fig.tight_layout()
plt.show()

### Predict all data

In [None]:
# Refit the same pipeline object on every row and predict those same rows,
# so the error printed here is in-sample. This overwrites the train-split
# fit held in `model` and the earlier value of `mse`.
y_pred_all = model.fit(X, y).predict(X)

mse = mean_squared_error(y, y_pred_all)
print("Mean Squared Error:", mse)

In [None]:
import matplotlib.pyplot as plt

# All targets against the predictions of the model refit on the full data.
fig, ax = plt.subplots()
ax.scatter(y, y_pred_all)
ax.set(
    xlabel="True Values",
    ylabel="Predicted Values",
    title="Prediction Accuracy",
)
ax.grid(True)
plt.show()