PEHA Script for Hemauer, Saunders, and Desmarais

Last updated: 05/29/2025

In [None]:
### Boehmke et al. 2017 PEHA

from sklearn import linear_model
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report
import time
import pandas as pd
import random

# Seed Python's built-in RNG; the train/test split further down is pinned
# separately through its own random_state argument.
random.seed(1337)

# Data
# Forward slashes are used so the relative path resolves on Windows, macOS
# and Linux alike (a backslash-separated path only works on Windows).
boehmke_2017_full = pd.read_stata("boehmke_analysis/replication_data/boehmke2017.dta")

covariates = ["srcs_decay","nbrs_lag","rpcpinc","totpop","legp_squire",
                "citi6010","unif_rep","unif_dem","time","time_sq","time_cube"]
# Keep only the identifier, outcome and covariate columns, then drop every row
# that has a missing value in any of them.
boehmke_2017 = boehmke_2017_full[["state", "year", "statepol", "adopt"] + covariates].dropna()

# Define X and y
# X holds the covariates plus 'state'; the outcome and the year/statepol
# columns are removed from the feature matrix.
X = boehmke_2017.drop(columns=['adopt', 'year', 'statepol']).copy()
X = pd.get_dummies(X, columns=['state'], drop_first = True)  # drop_first avoids perfect multicollinearity
y = boehmke_2017['adopt']

# Split into train and test sets
# 80/20 split, stratified on y so both parts keep the same outcome proportions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1337, stratify = y)

# Define parameter grid for Logistic Regression
# Base params common to all penalized fits
common_params = {
    'C': [0.001, 0.01, 0.1, 1, 10],  # inverse regularization strength
    'class_weight': [None, 'balanced'],
    'fit_intercept': [True, False]
}

# Unpenalized fits ignore C, so they reuse the common settings without it;
# crossing them with C would only refit the same model once per C value.
unpenalized_params = {name: values for name, values in common_params.items() if name != 'C'}

# Build full param grid
# Each dict is an independent sub-grid. Parameters are listed only where the
# solver/penalty combination actually uses them, so no candidate is a
# duplicate of another one with an ignored setting changed.
param_grid = [
    # lbfgs with a penalty supports l2 only (its unpenalized fit is in the last sub-grid)
    {
        **common_params,
        'solver': ['lbfgs'],
        'penalty': ['l2']
    },
    # liblinear supports l1 and l2 only (no elasticnet or unpenalized fit)
    {
        **common_params,
        'solver': ['liblinear'],
        'penalty': ['l1', 'l2']
    },
    # saga with a pure l1 or l2 penalty; l1_ratio is not used by these
    {
        **common_params,
        'solver': ['saga'],
        'penalty': ['l1', 'l2']
    },
    # saga with elasticnet: the only combination that reads l1_ratio
    {
        **common_params,
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'l1_ratio': [0, 0.25, 0.5, 0.75, 1]
    },
    # Unpenalized fits. The penalty is given as None because the string 'none'
    # was deprecated in scikit-learn 1.2 and removed in 1.4 (needs scikit-learn >= 1.2).
    {
        **unpenalized_params,
        'solver': ['lbfgs', 'saga'],
        'penalty': [None]
    }
]

# Set up GridSearchCV
# Base estimator: every candidate in param_grid is a variation of this model.
base_model = linear_model.LogisticRegression(random_state = 1337, max_iter = 2500)
grid_search = GridSearchCV(
    base_model,
    param_grid,
    scoring = 'average_precision',  # candidates are ranked by average precision
    n_jobs = -1,                    # use all available cores
    refit = True,                   # refit the best candidate on the full training set
    cv = 5,
    verbose = 2
)

# Fit grid search (timed)
start_time = time.time()
grid_search.fit(X_train, y_train)
end_time = time.time()

print(f"GridSearchCV took {end_time - start_time:.2f} seconds")
print("Best parameters found:", grid_search.best_params_)

# Predict with best estimator
# Because refit is True, predict() uses the refitted best model.
y_pred = grid_search.predict(X_test)

# Evaluation on the held-out test set
macro_f1 = f1_score(y_test, y_pred, average = "macro")
print("F1 Score:", macro_f1)

balanced_acc = balanced_accuracy_score(y_test, y_pred)
print("Balanced Accuracy Score:", balanced_acc)

report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)