In [None]:
from functools import reduce
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split,cross_val_score, KFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and convergence warnings raised
# by the libraries used below — confirm that is intended.
warnings.filterwarnings("ignore")

# Never truncate the column list when a DataFrame is displayed.
pd.set_option("display.max_columns", None)

In [None]:
# Offices and election years of interest.
# NOTE(review): the cells visible in this notebook define their own
# `years` / `offices` lists and do not read these two constants.
OFFICES = ['U.S. House', 'State House']
YEARS = ['2018', '2020', '2022', '2024']

# Alternative office/year selections (uncomment one pair to use it).
# OFFICES = ['U.S. Senate']
# YEARS = ['2020']

# OFFICES = ['State Senate']
# YEARS = ['2022']

# OFFICES = ['President']
# YEARS = ['2024']



# Column that the models predict. Exactly one TARGET line should be active.
# TARGET = 'rep_share' # Good for rep-leaning regions
# TARGET = 'dem_share' # Good for dem-leaning regions
TARGET = 'partisan_temp' # Good!
# TARGET = 'partisanship_lean_change_amount_curr'
# TARGET = 'pedersen_index_percent'

# Number of top-ranked features kept for training (see getRankedFeatureList).
TOP_N_FEATURES = 100
# Number of coefficients reported/plotted (see featureCoeff).
TOP_N_FEATURES_TO_DISPLAY = 15

# Socioeconomic data as features in addition
# to the original and the engineered features.
# NOTE(review): this list is not referenced by the code visible here.
census_datasets = [
    'b02001_race', 'b04007_ancestry', 'b05012_nativity_us', 'b08303_travel_time_work', 'b25003_housing_rentership', 
    'dp02_selected_social_characteristics', 'dp03_selected_economic_characteristics', 'dp04_housing_characteristics', 'dp05_age_race', 
    's0101_age_sex', 's1101_households_families', 's1201_marital_status', 's1501_educational_attainment', 's1701_income_poverty', 
    's1903_median_income', 's2101_veteran_status', 's2201_food_stamps', 's2301_employment_status', 's2401_occupation_sex', 
    's2403_industry_sex', 's2501_occupancy_characteristics', 's2701_health_insurance', 's2503_financial_characteristics',
]

# These key-like columns just add noise.
drop_features_required = [
    'standardized_id', 'standardized_id_num',
    'aland_tract', 'awater_tract', 'geoid_tract', 'geoidfq_tract', 
    'geometry', 'geometry_tract', 'name_tract', 'tractce_tract',
    'nearest_bound_census_tract', 'nearest_bound_school_district', 'nearest_bound_zipcode',
]

# Optionally drop one or more of these during 
# train/test/prediction.
# Every entry is currently commented out, so the list is empty.
drop_features_optional = [
    # 'office_code', 
    # 'dem_share_prev', 
    # 'rep_share_prev', 'oth_share_prev', 
    # 'dem_share_change_prev', 'rep_share_change_prev', 'oth_share_change_prev', 
    # 'dem_votes_change_prev', 'rep_votes_change_prev', 'oth_votes_change_prev', 
    # 'registered_voters_change_prev', 'turnout_pct_change_prev', 
    # 'partisan_temp_prev', 'partisan_temp_change_prev', 
    # 'partisanship_lean_prev', 'partisanship_lean_change_prev', 'partisanship_lean_change_amount_prev',
]

# Seen features that may or may not be used as
# targets as well.
drop_features_seen = [
    'dem_votes', 'oth_votes', 'rep_votes', 'total_votes', 
    'dem_share', 'rep_share', 'oth_share',  'turnout_pct',
    'dem_share_change_curr','rep_share_change_curr', 'oth_share_change_curr', 
    'dem_votes_change_curr','rep_votes_change_curr', 'oth_votes_change_curr', 
    'partisan_temp', 'partisanship_lean_curr', 'registered_voters',
    'registered_voters_change_curr','turnout_pct_change_curr',
    'partisan_temp_category', 'partisan_temp_change_curr',
    'pedersen_index_percent', 'pedersen_index',
    'partisanship_lean_change_amount_curr',
]

# DO NOT EDIT BELOW THIS LINE
# The active TARGET is taken out of the "seen" list (mutating it in place),
# so it is not part of the combined drop list built below.
if TARGET in drop_features_seen:
    drop_features_seen.remove(TARGET) # Keep target in features for later extraction

# Combined list of columns excluded from the ranked feature list
# (used by getRankedFeatureList).
drop_features = drop_features_required + drop_features_optional + drop_features_seen

In [None]:
def removeUncommonColumns(nested_dict):
    ''' Trim every dataframe down to the columns shared by all of them.

        nested_dict maps year -> office -> DataFrame. Each dataframe in the
        dict is replaced by a selection restricted to the columns present
        in every dataframe; the same (mutated) dict is returned.

        The shared columns keep the order they have in the first dataframe,
        so the resulting layout is deterministic and identical across all
        datasets. An input that holds no dataframe is returned unchanged.'''
    print("Removing uncommon columns...")

    all_dfs = [df for by_office in nested_dict.values() for df in by_office.values()]
    if not all_dfs:
        # Nothing to compare; avoid indexing into an empty list.
        print('Done.')
        return nested_dict

    # Intersect the column sets of all dataframes.
    common_cols = set(all_dfs[0].columns)
    for df in all_dfs[1:]:
        common_cols &= set(df.columns)

    # Iterating a set has no stable order, so take the order from the first
    # dataframe instead (dict.fromkeys drops repeated labels, keeps order).
    ordered_cols = list(dict.fromkeys(
        col for col in all_dfs[0].columns if col in common_cols
    ))

    for by_office in nested_dict.values():
        for office in list(by_office):
            by_office[office] = by_office[office][ordered_cols]

    print('Done.')

    return nested_dict


def makeDatasets(years, offices):
    ''' Load the engineered-feature CSV (which also carries the target)
        for every year/office pair and keep only the columns that all
        loaded datasets share.

        Returns a dict: year -> normalized office name -> DataFrame, where
        the office name has spaces replaced by '_' and dots removed.'''
    print('Making datasets...')

    loaded = {}

    for year in years:
        print(f'Processing year {year}...')
        per_office = {}

        for raw_office in offices:
            office_key = raw_office.replace(' ', '_').replace('.', '')
            print(f'Processing office {office_key}...')

            # e.g. data/generated_data/07_ml_features_2018_US_House.csv
            csv_path = '_'.join(
                ('data/generated_data/07_ml_features', year, office_key)
            ) + '.csv'
            per_office[office_key] = pd.read_csv(csv_path, low_memory=False)

        loaded[year] = per_office

    loaded = removeUncommonColumns(loaded)
    print('Done.')

    return loaded


def makeFeaturesTargets(df):
    ''' Split a dataset into features X and target y with some cleanup.

        y is the single-column DataFrame df[[TARGET]]. X is df without the
        id column, the partisan_temp columns and the TARGET column itself,
        with the placeholder strings '-', '(X)', 'N/A', 'null' and ''
        replaced by pd.NA. Rows whose target is missing are removed from
        both, so X and y share the same index.

        Returns the tuple (X, y).'''
    print(f'Making features and target...')

    y = df[[TARGET]]

    # TARGET is listed explicitly: with a target other than 'partisan_temp'
    # the answer would otherwise remain inside X (target leakage).
    non_features = [
        'standardized_id_num', 'partisan_temp', 'partisan_temp_change_curr', TARGET,
    ]
    # Only drop what exists (columns may have been trimmed away upstream);
    # dict.fromkeys removes the duplicate when TARGET is 'partisan_temp'.
    present = [col for col in dict.fromkeys(non_features) if col in df.columns]
    X = df.drop(columns=present)
    X = X.replace(['-', '(X)', 'N/A', 'null', ''], pd.NA)

    # Inner-align on the index so rows without a target disappear from X too.
    X, y = X.align(y.dropna(), join='inner', axis=0)

    print('Done.')
    return X, y


def fitModel(X, y, k=5):
    ''' Build the preprocessing + LinearRegression pipeline, report
        k-fold cross-validation scores and fit it on an 80/20 split.

        Categorical columns are the fixed names listed below (when present
        in X); numeric columns are all number-dtype columns not in that
        list. Columns that fall in neither group are not routed to any
        transformer, so ColumnTransformer's default remainder handling
        applies to them.

        Parameters:
            X: feature DataFrame.
            y: target aligned with X.
            k: number of cross-validation folds.

        Returns (model, X_train, X_test, y_train, y_test, numeric_cols);
        the model is fitted on the training part of the split only.'''
    # Local import: cross_validate is not among the notebook's top-level imports.
    from sklearn.model_selection import cross_validate

    print(f'Fitting model...')

    # Define non-numeric features
    categorical_cols = [
        'office_code',
        'partisanship_lean_curr',
        'partisanship_lean_prev',
        'partisanship_lean_change_prev',
    ]

    # Format the columns
    categorical_cols = [col for col in categorical_cols if col in X.columns]
    numeric_cols = X.select_dtypes(include=['number']).columns.tolist()
    numeric_cols = [col for col in numeric_cols if col not in categorical_cols]

    # Set up the pipeline
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(drop='first', handle_unknown='ignore'))
    ])
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])
    preprocessor = ColumnTransformer(transformers=[
        ('cat', categorical_transformer, categorical_cols),
        ('num', numeric_transformer, numeric_cols)
    ])
    model = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('regressor', LinearRegression())
    ])

    # K-fold CV: score both metrics in ONE pass over the folds instead of
    # refitting the whole pipeline once per metric. The folds are the same
    # seeded KFold as before, so the reported numbers do not change.
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    cv_results = cross_validate(
        model, X, y, cv=kf,
        scoring=('r2', 'neg_mean_squared_error'),
        n_jobs=-1,
    )
    r2_scores = cv_results['test_r2']
    mse_scores = -cv_results['test_neg_mean_squared_error']

    print(f'Average R² across {k} folds: {r2_scores.mean():.4f} ± {r2_scores.std():.4f}')
    print(f'Average MSE across {k} folds: {mse_scores.mean():.4f} ± {mse_scores.std():.4f}')

    # Final fit
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    model.fit(X_train, y_train)

    print('Final model fitted on training split.')
    return model, X_train, X_test, y_train, y_test, numeric_cols


def makePredictions(X_test, model):
    ''' Return model.predict(X_test), logging start and end.'''
    print('Making predictions...')
    predictions = model.predict(X_test)
    print('Done.')
    return predictions


def plotAccuracy(y_test, y_pred):
    ''' Scatter predicted against true values on a new 12x9 figure with
        both axes fixed to [-1.25, 1.25].

        Returns the pyplot module so the caller can save/show/close.'''
    print('Plotting accuracy...')

    axis_bounds = (-1.25, 1.25)

    plt.figure(figsize=(12, 9))
    plt.scatter(y_test, y_pred, alpha=0.4)
    plt.xlim(*axis_bounds)
    plt.ylim(*axis_bounds)
    plt.xlabel("True Values")
    plt.ylabel("Predicted Values")
    plt.title("Prediction Accuracy")
    plt.grid(True)

    print('Done.')
    return plt


def featureCoeff(model, top_n=None):
    ''' Return the coefficients with the largest magnitude as a DataFrame.

        Parameters:
            model: fitted pipeline exposing named_steps['preprocessor']
                (with a transformers_ list) and named_steps['regressor']
                (with coef_).
            top_n: number of rows to return; defaults to
                TOP_N_FEATURES_TO_DISPLAY when omitted.

        Returns a DataFrame with the columns 'feature', 'coefficient' and
        'abs_coefficient', sorted by 'abs_coefficient' descending.

        Raises ValueError when the number of recovered feature names does
        not match the number of coefficients.'''
    print(f'Computing feature coefficients from pipeline...')

    if top_n is None:
        top_n = TOP_N_FEATURES_TO_DISPLAY

    regressor = model.named_steps['regressor']
    preprocessor = model.named_steps['preprocessor']
    coef = regressor.coef_.flatten()

    # Collect output feature names transformer by transformer, in the order
    # the preprocessor lists them.
    feature_names = []
    for _name, transformer, columns in preprocessor.transformers_:
        if transformer is None or (isinstance(transformer, str) and transformer == 'drop'):
            continue

        names = columns
        if hasattr(transformer, 'get_feature_names_out'):
            try:
                names = transformer.get_feature_names_out(columns)
            except Exception:
                # Best effort: fall back to the input column names when the
                # transformer cannot report output names (presumably e.g. a
                # transformer that was never fitted). Unlike a bare
                # `except:`, this lets KeyboardInterrupt/SystemExit through.
                names = columns
        feature_names.extend(names)

    if len(coef) != len(feature_names):
        raise ValueError(f"Mismatch: {len(coef)} coefficients vs {len(feature_names)} feature names")

    df = pd.DataFrame({
        'feature': feature_names,
        'coefficient': coef,
        'abs_coefficient': np.abs(coef)
    })

    top_features = df.sort_values(by='abs_coefficient', ascending=False).head(top_n)
    print('Done.')
    return top_features


def plotFeatureCoeff(features):
    ''' Draw a horizontal bar chart of features['coefficient'] labelled by
        features['feature'] on a new 12x18 figure, first row on top.

        Returns the pyplot module so the caller can save/show/close.'''
    print('Plotting feature coefficients...')

    plt.figure(figsize=(12, 18))
    plt.barh(features['feature'], features['coefficient'])

    # Labels and tick styling.
    plt.xlabel('Coefficient Value')
    plt.xticks(fontsize=7)
    plt.yticks(fontsize=7)
    plt.title('Most Influential Features (Linear Regression)')

    # Zero reference line and a light vertical grid.
    plt.axvline(x=0, color='gray', linestyle='--')
    plt.grid(True, axis='x', linestyle=':', alpha=0.7)

    # Flip the y axis so the first row of `features` is drawn at the top.
    plt.gca().invert_yaxis()
    plt.tight_layout()

    print('Done.')
    return plt


def get_feature_names(model):
    ''' Return (feature_names, cleaned_names) for the model's preprocessor.

        feature_names is what preprocessor.get_feature_names_out() reports;
        cleaned_names holds the same names with everything up to and
        including the first '__' removed.'''
    print("Getting feature names...")

    prefixed = model.named_steps['preprocessor'].get_feature_names_out()

    # Keep both the prefixed and the stripped variant.
    stripped = []
    for full_name in prefixed:
        _, separator, tail = full_name.partition('__')
        stripped.append(tail if separator else full_name)

    print(f"Done. Retrieved {len(prefixed)} feature names.")
    return prefixed, stripped


def mergeTopFeatures(top_features_lists):
    ''' Return the features common to every given top-feature collection.

        Each item is either a list of feature names or a DataFrame with a
        'feature' column. The result lists each common feature once, in
        the order of its first appearance in the first collection. An
        empty input yields an empty list.

        Raises ValueError for an item of any other kind.'''
    print(f'Creating common top features using clean names...')

    # Normalize every item to a plain list of names.
    normalized_lists = []
    for item in top_features_lists:
        if isinstance(item, list):
            normalized_lists.append(item)
        elif hasattr(item, 'columns') and 'feature' in item.columns:
            normalized_lists.append(item['feature'].tolist())
        else:
            raise ValueError("Each item must be a list or a DataFrame with a 'feature' column")

    if not normalized_lists:
        # Nothing to intersect.
        print('Done.')
        return []

    first, *others = normalized_lists
    common_features = set(first).intersection(*others)

    # Every common feature occurs in the first list, so walking that list
    # gives the first-seen order; dict.fromkeys removes repeats in order.
    merged_common_ordered = list(dict.fromkeys(
        name for name in first if name in common_features
    ))

    print('Done.')
    return merged_common_ordered


def one_hot_encode_selected(df, columns_to_encode):
    ''' Return a copy of df in which the given columns are replaced by
        their pd.get_dummies indicator columns (prefixed with the source
        column name, appended after the remaining columns).

        With no columns to encode, a plain copy of df is returned.'''
    result = df.copy()

    if columns_to_encode:
        indicators = pd.get_dummies(result[columns_to_encode], prefix=columns_to_encode)
        remaining = result.drop(columns=columns_to_encode)
        result = pd.concat([remaining, indicators], axis=1)

    return result

In [None]:
def getRankedFeatureList(target=TARGET):
    ''' Read the importance table for `target` and return the names of the
        TOP_N_FEATURES highest-'Average' features that are not listed in
        drop_features.'''
    importances = pd.read_csv(f'data/generated_data/df_importances_{target}.csv')

    # Highest average importance first.
    ranked = importances.sort_values(by='Average', ascending=False)

    # Leave out everything on the drop list, then cut to the top N.
    is_allowed = ~ranked['Feature name'].isin(drop_features)
    top_rows = ranked[is_allowed].head(TOP_N_FEATURES)

    return top_rows['Feature name'].tolist()

# Computed once here and read as a global by the cells below.
ranked_features_top_list = getRankedFeatureList()

In [None]:
def predDatasetsIndiv(df_datasets, years, offices):
    ''' Fit and evaluate one model per (year, office) dataset.

        For each dataset: build X/y, restrict X to the ranked top features,
        fit via fitModel, print MSE and R2 on the test split, and save the
        accuracy scatter plot and the top-coefficient bar chart under
        output/figures/.'''
    for year in years:
        print(f'Processing year {year}...')

        for raw_office in offices:
            office_key = raw_office.replace(' ', '_').replace('.', '')
            print(f'Processing office {office_key}...')

            frame = df_datasets[year][office_key].copy()
            X, y = makeFeaturesTargets(frame)
            X = X[ranked_features_top_list]

            print(f"Training over {len(X.columns)} features...")

            fitted = fitModel(X, y)
            model, X_test, y_test = fitted[0], fitted[2], fitted[4]
            y_pred = makePredictions(X_test, model)

            print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
            print("R2 Score:", model.score(X_test, y_test))

            # Accuracy scatter plot.
            accuracy_plot = plotAccuracy(y_test, y_pred)
            accuracy_plot.savefig(f'output/figures/regression_accuracy_{year}_{office_key}_individual.png')
            accuracy_plot.close()

            # Largest coefficients.
            coeff_plot = plotFeatureCoeff(featureCoeff(model))
            coeff_plot.savefig(f'output/figures/regression_top_features_{year}_{office_key}_individual.png')
            coeff_plot.close()

#### Predict Individual Datasets

In [None]:
# Per-dataset evaluation for U.S. House in 2018, 2020 and 2022.
years = ['2018', '2020', '2022']
offices = ['U.S. House']

# Load the feature CSVs, then fit, score and plot one model per year/office.
df_datasets = makeDatasets(years, offices)
predDatasetsIndiv(df_datasets, years, offices)

#### Fit/Train Final Model
Do we have data leakage here? Maybe not, provided we train on historical data and
<br>run a separate test on newer data in a following step. For example, here we
train on 2018-2022, <br>and then in another cell we test the same model on 2024.

In [None]:
def aggDatasets(datasets, years, offices):
    ''' Stack the datasets of the given years and offices into one frame.

        Parameters:
            datasets: dict year -> normalized office name -> DataFrame.
            years: years to include, in stacking order.
            offices: office names; spaces become '_' and dots are removed
                before the lookup (e.g. 'U.S. House' -> 'US_House').

        Returns a new DataFrame with a fresh 0..n-1 index. The inputs are
        copied, so the source dataframes are never modified.
        pd.concat raises ValueError when nothing was selected.'''
    dfs = []

    for year in years:
        print(f'Processing year {year}...')
        for office in offices:
            office = office.replace(' ', '_').replace('.', '')

            print(f'Processing office {office}...')
            # Read from the `datasets` argument, not from a notebook global.
            dfs.append(datasets[year][office].copy())

    df = pd.concat(dfs, axis=0, ignore_index=True)

    return df

In [None]:
def fitDatasetsAgg(df_datasets):
    ''' Fit the regression pipeline on aggregated data and report its fit.

        df_datasets is the data to train on: either an already aggregated
        DataFrame, or the nested year -> office -> DataFrame dict, in which
        case all contained dataframes are concatenated first.

        Prints MSE and R2 on the test split, saves (and shows) the accuracy
        scatter plot, saves the top-coefficient chart, and returns the
        fitted model.'''
    # Use the argument rather than whatever a notebook-global `df` holds.
    if isinstance(df_datasets, dict):
        frames = [frame for by_office in df_datasets.values() for frame in by_office.values()]
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df = df_datasets

    X, y = makeFeaturesTargets(df)
    X = X[ranked_features_top_list]

    print(f"Training over {len(X.columns)} features...")

    model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
    y_pred = makePredictions(X_test, model)

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    r2 = model.score(X_test, y_test)
    print("R2 Score:", r2)

    plt = plotAccuracy(y_test, y_pred)
    plt.savefig(f'output/figures/regression_accuracy_aggregate.png')
    plt.show()
    # plt.close()

    top_features = featureCoeff(model)
    plt = plotFeatureCoeff(top_features)
    plt.savefig(f'output/figures/regression_top_features_aggregate.png')
    # plt.show()
    plt.close()

    return model

In [None]:
def predDatasetsAgg(df_datasets):
    ''' Fit the regression pipeline on aggregated data, evaluate it on the
        test split and return the fitted model.

        df_datasets is the data to use: either an already aggregated
        DataFrame, or the nested year -> office -> DataFrame dict, in which
        case all contained dataframes are concatenated first.

        Note that this trains a new model through fitModel; it does not
        score a previously fitted one. Prints MSE and R2, saves (and shows)
        the accuracy scatter plot and saves the top-coefficient chart.'''
    # Use the argument rather than whatever a notebook-global `df` holds.
    if isinstance(df_datasets, dict):
        frames = [frame for by_office in df_datasets.values() for frame in by_office.values()]
        df = pd.concat(frames, axis=0, ignore_index=True)
    else:
        df = df_datasets

    X, y = makeFeaturesTargets(df)
    X = X[ranked_features_top_list]

    print(f"Training over {len(X.columns)} features...")

    model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
    y_pred = makePredictions(X_test, model)

    mse = mean_squared_error(y_test, y_pred)
    print("Mean Squared Error:", mse)

    r2 = model.score(X_test, y_test)
    print("R2 Score:", r2)

    plt = plotAccuracy(y_test, y_pred)
    plt.savefig(f'output/figures/regression_accuracy_aggregate.png')
    plt.show()
    # plt.close()

    top_features = featureCoeff(model)
    plt = plotFeatureCoeff(top_features)
    plt.savefig(f'output/figures/regression_top_features_aggregate.png')
    # plt.show()
    plt.close()

    return model

#### Fit aggregated data
This functionality produces the <code>model</code> object to be used later. Make sure that no dataset from this
<br>cell is included in holdout testing.

In [None]:
# Training data: U.S. House, 2018-2022, stacked into one dataframe.
years = ['2018', '2020', '2022']
offices = ['U.S. House']

df_datasets = makeDatasets(years, offices)
df = aggDatasets(df_datasets, years, offices)

# Features/target, restricted to the ranked top features; pd.NA -> np.nan.
X, y = makeFeaturesTargets(df)
X = X[ranked_features_top_list]
X = X.replace({pd.NA: np.nan})

# `model` stays a notebook global and is reused by the holdout cell.
model, X_train, X_test, y_train, y_test, numeric_cols = fitModel(X, y)
y_pred = makePredictions(X_test, model)

# Column order the model was trained on.
expected_columns = X.columns.tolist()

# Scores on the 20% test split returned by fitModel.
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error:", mse)

r2 = model.score(X_test, y_test)
print("R2 Score:", r2)

# plotAccuracy returns the pyplot module, so this rebinds `plt` to itself.
plt = plotAccuracy(y_test, y_pred)
plt.savefig(f'output/figures/regression_accuracy_aggregate.png')
plt.show()

top_features = featureCoeff(model)
plt = plotFeatureCoeff(top_features)
plt.savefig(f'output/figures/regression_top_features_aggregate.png')
plt.close()
# plt.show()

#### Holdout Prediction
This functionality requires the <code>model</code> object from a previous cell. Make sure this holdout
<br>dataset was not included in the model's training.

In [None]:
# Holdout data: U.S. House 2024. Uses the `model` fitted in an earlier cell.
years = ['2024']
offices = ['U.S. House']

df_datasets = makeDatasets(years, offices)
df = df_datasets['2024']['US_House']

# Untouched copy; the export at the end of the cell is built from it.
df_orig = df.copy()

X, y = makeFeaturesTargets(df)
X = X[ranked_features_top_list]

# Turn pd.NA into np.nan. No manual scaling or imputation happens here:
# the preprocessing is part of the fitted pipeline.
X = X.replace({pd.NA: np.nan})

# Predict, transformations handled by pipeline
y_pred = model.predict(X)

# MSE
mse = mean_squared_error(y, y_pred)
print("Mean Squared Error:", mse)

# R2
r2 = model.score(X, y)
print("R2 Score:", r2)

with open(f'output/reports/prediction_eval_{TARGET}_holdout_regression.txt', 'w') as f:
    output = f"R2 Score: {r2}"
    output += f"\nMSE: {mse}"
    f.write(output)

plt = plotAccuracy(y, y_pred)
plt.savefig(f'output/figures/regression_accuracy_{TARGET}_holdout.png')
plt.close()
# plt.show()

top_features = featureCoeff(model)
plt = plotFeatureCoeff(top_features)
plt.savefig(f'output/figures/regression_top_features_{TARGET}_holdout.png')
plt.close()
# plt.show()

# Flatten the predictions computed above instead of predicting a second time.
y_pred = np.asarray(y_pred).ravel()
# Index-aligned assignment: rows dropped from X (missing target) get NaN.
df_orig[f"predicted_{TARGET}"] = pd.Series(y_pred, index=X.index)

df_export = df_orig[["standardized_id_num", TARGET, f"predicted_{TARGET}"]].copy()
# Ids read as floats print with a trailing '.0'; strip only that suffix
# (not a '.0' elsewhere in the text) and left-pad to 13 characters.
df_export["standardized_id_num"] = df_export["standardized_id_num"].apply(
    lambda x: (str(x)[:-2] if str(x).endswith('.0') else str(x)).zfill(13)
)

filename = f"data/generated_data/predicted_{TARGET}_{years[0]}_{offices[0].replace('.', '').replace(' ', '_')}.csv"
df_export.to_csv(filename, index=False)

print(f"Predictions saved to {filename}")

### Grid search

In [None]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, r2_score

def gridSearch(df):
    ''' Tune several regressors on df and compare them on a held-out split.

        The data is split 80/20 (random_state=42). For every candidate a
        GridSearchCV (5-fold, scoring='r2') is fitted on the training part
        and its best estimator is scored on the test part.

        The tree-based candidates are seeded with random_state=42 so that
        repeated runs give the same ranking, matching the seed that
        fitBestModel uses for the final GradientBoostingRegressor.

        Returns (best_models, results_df):
            best_models: dict model name -> fitted GridSearchCV object.
            results_df: one row per model with 'Model', 'Best Params',
                'MSE' and 'R2', sorted by 'R2' descending.'''
    df = df.copy()

    X, y = makeFeaturesTargets(df)
    X = X[ranked_features_top_list]
    X = X.replace({pd.NA: np.nan})

    # Column groups are chosen by dtype here.
    numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()

    numeric_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer([
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    # name -> (estimator, parameter grid); grid keys address the
    # 'regressor' step of the pipeline built in the loop below.
    models = {
        'LinearRegression': (
            LinearRegression(),
            {}
        ),
        'Ridge': (
            Ridge(),
            {'regressor__alpha': [0.01, 0.1, 1.0, 10]}
        ),
        'Lasso': (
            Lasso(max_iter=10000),
            {'regressor__alpha': [0.01, 0.1, 1.0, 10]}
        ),
        'ElasticNet': (
            ElasticNet(max_iter=10000),
            {
            'regressor__alpha': [0.01, 0.1, 1.0],
            'regressor__l1_ratio': [0.2, 0.5, 0.8]
            }
        ),
        'DecisionTree': (
            DecisionTreeRegressor(random_state=42),
            {
            'regressor__max_depth': [5, 10, None],
            'regressor__min_samples_split': [2, 10]
            }
        ),
        'RandomForest': (
            RandomForestRegressor(n_jobs=-1, random_state=42),
            {
            'regressor__n_estimators': [50, 100],
            'regressor__max_depth': [5, 10, None]
            }
        ),
        'GradientBoosting': (
            GradientBoostingRegressor(random_state=42),
            {
            'regressor__n_estimators': [50, 100],
            'regressor__learning_rate': [0.05, 0.1],
            'regressor__max_depth': [3, 5]
            }
        ),
    }

    y = y.values.ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    best_models = {}
    results = []

    for name, (model, params) in models.items():
        print(f"🔍 Tuning: {name}")

        pipe = Pipeline([
            ('preprocessor', preprocessor),
            ('regressor', model)
        ])

        grid = GridSearchCV(pipe, param_grid=params, cv=5, scoring='r2', n_jobs=-1)
        grid.fit(X_train, y_train)

        best_model = grid.best_estimator_
        y_pred = best_model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        results.append({
            'Model': name,
            'Best Params': grid.best_params_,
            'MSE': mse,
            'R2': r2
        })

        # Keep the whole search object: cleanParams reads its best_params_.
        best_models[name] = grid

    results_df = pd.DataFrame(results).sort_values(by='R2', ascending=False)

    print('Done.')

    return best_models, results_df

In [None]:
# Training data for the model comparison: U.S. House, 2018-2022.
years = ['2018', '2020', '2022']
offices = ['U.S. House']

df_datasets = makeDatasets(years, offices)
df = aggDatasets(df_datasets, years, offices)

# best_models: name -> fitted GridSearchCV; results_df: per-model scores.
best_models, results_df = gridSearch(df)

In [None]:
def fitBestModel(df, params):
    ''' Fit a GradientBoostingRegressor pipeline on all rows of df.

        params are keyword arguments for GradientBoostingRegressor (which
        is always seeded with random_state=42). Numeric columns are
        mean-imputed and scaled, object/category columns are imputed with
        the most frequent value and one-hot encoded.

        Returns the fitted pipeline.'''
    data = df.copy()

    X, y = makeFeaturesTargets(data)
    X = X[ranked_features_top_list].replace({pd.NA: np.nan})

    # Split the columns by dtype.
    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

    num_steps = [
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ]
    cat_steps = [
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore')),
    ]

    column_prep = ColumnTransformer([
        ('num', Pipeline(num_steps), num_cols),
        ('cat', Pipeline(cat_steps), cat_cols),
    ])

    booster = GradientBoostingRegressor(random_state=42, **params)

    fitted = Pipeline([
        ('preprocessor', column_prep),
        ('regressor', booster),
    ])
    fitted.fit(X, y.values.ravel())

    print('Done.')

    return fitted

In [None]:
def cleanParams(best_models, model_name):
    ''' Return best_models[model_name].best_params_ with the
        'regressor__' text removed from every key, printing the result.'''
    tuned = best_models[model_name].best_params_

    clean_params = {}
    for key, value in tuned.items():
        clean_params[key.replace('regressor__', '')] = value

    print(f'Cleaned params for {model_name}: {clean_params}')

    return clean_params

In [None]:
# Training data for the final model: U.S. House, 2018-2022.
years = ['2018', '2020', '2022']
offices = ['U.S. House']

df_datasets = makeDatasets(years, offices)
df = aggDatasets(df_datasets, years, offices)

# Reuse the tuned GradientBoosting parameters from the grid search.
# Note: this rebinds the notebook-global `model`.
clean_params = cleanParams(best_models, 'GradientBoosting')
model = fitBestModel(df, clean_params)

In [None]:
def makePredsBestModel(df, model):
    ''' Predict TARGET for df with a fitted model and print MSE and R2.

        Only rows without any missing value in the ranked top features
        (and with a target) are scored.

        Returns (scored_df, y_pred, y_true), where scored_df is the scored
        subset of df with an added predicted_<TARGET> column.'''
    frame = df.copy()

    features, _ = makeFeaturesTargets(frame)
    features = features[ranked_features_top_list].replace({pd.NA: np.nan})
    features = features.dropna()

    # Align dataframe to cleaned X
    scored = frame.loc[features.index]

    predictions = model.predict(features)
    actual = scored[TARGET]

    scored[f"predicted_{TARGET}"] = predictions

    mse = mean_squared_error(actual, predictions)
    r2 = r2_score(actual, predictions)

    print(f"Mean Squared Error: {mse:.4f}")
    print(f"R² Score: {r2:.4f}")

    return scored, predictions, actual

In [None]:
# Holdout data: U.S. House 2024.
years = ['2024']
offices = ['U.S. House']

df_datasets = makeDatasets(years, offices)
df_holdout = df_datasets['2024']['US_House']

# Score the holdout with the current global `model`.
df_pred_best, y_pred_best, y_true_best = makePredsBestModel(df_holdout, model)

In [None]:
# Predicted vs. actual target for the holdout predictions made above.
plt.figure(figsize=(8, 6))
plt.scatter(y_true_best, y_pred_best, alpha=0.4)
# Points on this line are predicted exactly.
plt.plot([-1, 1], [-1, 1], color='red', linestyle='--')  # Diag. line
plt.xlabel("Actual")
plt.ylabel("Predicted")
plt.title("Predicted Temperature vs. Actual Temperature")
plt.grid(True)
plt.tight_layout()
# We need to label these better
plt.savefig(f'output/figures/regression_best_model_preds.png')
plt.close()
# plt.show()

#### Benchmark 1

In [None]:
# Naive benchmark: how well does a prediction that ignores all features do?
# Supported types: 'uniform', 'permutation', 'median'; anything else uses
# the target mean.
BENCHMARK_TYPE = 'uniform'

# Seeded generator so the random benchmarks give the same numbers each run.
rng = np.random.default_rng(42)

df_benchmark = df_holdout.copy()

X, y = makeFeaturesTargets(df_benchmark)
X = X[ranked_features_top_list]
X = X.replace({pd.NA: np.nan})
X = X.dropna()

# Align dataframe to cleaned X
df_benchmark = df_benchmark.loc[X.index]
y_true = df_benchmark[TARGET]

if BENCHMARK_TYPE == 'uniform':
    # Random uniform within observed range
    low, high = y_true.min(), y_true.max()
    y_dummy = rng.uniform(low, high, size=len(y_true))
elif BENCHMARK_TYPE == 'permutation':
    # Random shuffling of true values (permutation)
    y_dummy = rng.permutation(y_true.to_numpy())
elif BENCHMARK_TYPE == 'median':
    # Median
    y_dummy = np.full(len(y_true), np.median(y_true))
else:
    # Dummy target mean
    mean_value = y_true.mean()
    y_dummy = np.full(len(y_true), mean_value)

mse = mean_squared_error(y_true, y_dummy)
r2 = r2_score(y_true, y_dummy)

print(f"📉 Dummy {BENCHMARK_TYPE.capitalize()} Squared Error: {mse:.4f}")
print(f"📈 Dummy {BENCHMARK_TYPE.capitalize()} R² Score: {r2:.4f}")

plt.figure(figsize=(8, 6))
plt.scatter(y_true, y_dummy, alpha=0.4)
plt.plot([-1, 1], [-1, 1], color='red', linestyle='--')  # Diag. line
plt.xlabel("Actual")
plt.ylabel("Benchmark")
plt.title("Benchmark vs. Actual")
plt.grid(True)
plt.tight_layout()
# We need to name this better
plt.savefig(f'output/figures/regression_benchmark_1.png')
plt.close()
# plt.show()

#### Benchmark 2

In [None]:
# Rule-of-thumb benchmark: predict TARGET from the listed feature(s) alone
# with a plain linear regression.
RULE_OF_THUMB_FEATURES = ['partisan_temp_prev']

# These names are re-imported here; the earlier cells already import them.
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# NOTE(review): this uses the global `df`, which the preceding cells bind to
# the 2018-2022 aggregate, whereas Benchmark 1 uses `df_holdout` — confirm
# which dataset this benchmark is meant to be evaluated on.
df_benchmark = df.copy()

# Rule-of-thumb feature
rule_features = RULE_OF_THUMB_FEATURES

X_rule = df_benchmark[rule_features].copy()
y_true = df_benchmark[TARGET].copy()

# Drop rows with missing data
mask = X_rule.notna().all(axis=1) & y_true.notna()
X_rule = X_rule.loc[mask]
y_true = y_true.loc[mask]

# Fit linear model
# NOTE(review): the model is fitted and scored on the same rows, so the
# MSE/R2 below are in-sample figures.
rule_model = LinearRegression()
rule_model.fit(X_rule, y_true)
y_pred = rule_model.predict(X_rule)

mse = mean_squared_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

print("Linear Rule-of-Thumb Benchmark")
print(f"Feature(s) used: {rule_features}")
print(f"Mean Squared Error: {mse:.4f}")
print(f"R² Score: {r2:.4f}")

plt.figure(figsize=(8, 6))
plt.scatter(y_true, y_pred, alpha=0.4)
plt.plot([-1, 1], [-1, 1], color='red', linestyle='--')  # Diagonal reference
plt.xlabel("Actual")
plt.ylabel("Predicted (Rule-of-Thumb)")
plt.title("Rule-of-Thumb Prediction vs. Actual")
plt.grid(True)
plt.tight_layout()
# We need to name this better
plt.savefig(f'output/figures/regression_benchmark_2.png')
plt.close()
# plt.show()