RF Script for Hemauer, Saunders, and Desmarais

Last updated: 05/24/2025

In [None]:
import random
import time
from pathlib import Path

import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV, train_test_split

# Seeds Python's stdlib RNG only. The sklearn / imblearn components below do
# not read it; they are made reproducible via their explicit random_state.
random.seed(1337)

# Data
# Built with pathlib so the relative path resolves on any OS, not just Windows.
data_path = Path("boehmke_analysis") / "replication_data" / "boehmke2017.dta"
boehmke_2017_full = pd.read_stata(data_path)

covariates = ["srcs_decay", "nbrs_lag", "rpcpinc", "totpop", "legp_squire",
              "citi6010", "unif_rep", "unif_dem", "time", "time_sq", "time_cube"]

# Keep identifiers, the outcome ('adopt') and the covariates; drop incomplete rows.
boehmke_2017 = boehmke_2017_full[["state", "year", "statepol", "adopt"] + covariates].dropna()

# Treat 'state' as a categorical identifier (the outcome variable is 'adopt').
boehmke_2017['state'] = boehmke_2017['state'].astype('category')

# Encode 'state' as numeric codes for modeling.
# NOTE(review): 'year' and 'statepol' remain in X as predictors alongside the
# covariates -- confirm that using these identifiers as features is intended.
X = boehmke_2017.drop('adopt', axis=1).copy()
X['state'] = X['state'].cat.codes
y = boehmke_2017['adopt']

# Split into train and test sets, stratified so both keep the same class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1337, stratify=y
)

# SMOTE + Random Forest combined in an imblearn Pipeline.
# Oversampling must happen inside each cross-validation training fold.
# Resampling the whole training set before GridSearchCV puts synthetic points
# derived from training rows into the validation folds and balances those
# folds artificially, so the CV scores are optimistically biased and do not
# reflect the class balance of the held-out test set. The imblearn Pipeline
# applies the sampler during fit only; predict / predict_proba skip it.
smote = SMOTE(random_state=1337)
rf_boehmke_2017 = RandomForestClassifier(random_state=1337)
model = Pipeline(steps=[("smote", smote), ("rf", rf_boehmke_2017)])

# Hyperparameter grid; the 'rf__' prefix routes each parameter to the forest step.
# 'auto' is no longer an accepted value for max_features in current
# scikit-learn: every candidate using it fails parameter validation (one third
# of all fits in the original grid). For classifiers it was an alias for
# 'sqrt', so removing it does not shrink the effective search space.
param_grid = {
    'rf__n_estimators': [100, 200, 300, 500],
    'rf__max_depth': [10, 20, 30, 50],
    'rf__min_samples_split': [2, 5, 10],
    'rf__min_samples_leaf': [1, 2, 4],
    'rf__max_features': ['sqrt', 'log2'],
    'rf__bootstrap': [True, False],
}

grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2,
    # Average precision summarizes the precision-recall curve of the positive
    # class, so it stays informative when positives are rare.
    scoring='average_precision',
)

start_time = time.time()

# Fit on the original (un-resampled) training data; SMOTE runs inside each fold.
grid_search.fit(X_train, y_train)

end_time = time.time()
print(f"GridSearchCV took {end_time - start_time:.2f} seconds")

# Predict on the untouched test set (the sampler step is bypassed at predict time).
y_pred = grid_search.best_estimator_.predict(X_test)

# Test Statistics
f1_macro = f1_score(y_test, y_pred, average='macro')
balanced_acc = balanced_accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"F1 Macro: {f1_macro}")
print(f"Balanced Accuracy: {balanced_acc}")
print("Classification Report:\n", report)

Fitting 5 folds for each of 864 candidates, totalling 4320 fits


1440 fits failed out of a total of 4320.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
866 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ndhem\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\sklearn\model_selection\_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
    ~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\ndhem\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\sklearn\base.py", line 1382, in wrapper
    estimator._validate_params()
    ~~~~~~~~~~~~~~~~~~~~~~~~~~^^
  File "C:\U

GridSearchCV took 4010.10 seconds
F1 Macro: 0.5762065050049979
Balanced Accuracy: 0.5574436297092547
Classification Report:
               precision    recall  f1-score   support

         0.0       0.96      0.98      0.97      8448
         1.0       0.30      0.13      0.18       444

    accuracy                           0.94      8892
   macro avg       0.63      0.56      0.58      8892
weighted avg       0.92      0.94      0.93      8892

