In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, balanced_accuracy_score, classification_report
from imblearn.over_sampling import SMOTE
import pandas as pd
import random

# Seeds only Python's built-in `random` module. The scikit-learn and
# imbalanced-learn calls later in this cell get their reproducibility from
# their own explicit `random_state = 1337` arguments.
random.seed(1337)

# Data
# The path is relative to the current working directory; the read fails with
# FileNotFoundError when the notebook is started from a different directory.
boehmke_2017_full = pd.read_stata("boehmke_analysis/replication_data/boehmke2017.dta")
# Keep `state`, `year`, `statepol`, the `adopt` target and the covariate
# columns, then drop every row that has a missing value in any of them.
# NOTE(review): `covariates` is not defined in this cell -- it must already
# exist in the session (presumably a list of column names; confirm).
boehmke_2017 = boehmke_2017_full[["state", "year", "statepol", "adopt"] + covariates].dropna()

# Features are every column except the `adopt` label; the label is the target.
# NOTE(review): `state` and `year` therefore stay in X as predictors -- confirm
# that this is intended and that both are numeric before resampling/fitting.
X = boehmke_2017.drop(columns='adopt')
y = boehmke_2017['adopt']

# Hold out 20% of the rows for testing, stratified on the label so that both
# partitions keep the same class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1337,
)

# Oversample the training partition only; X_test / y_test are not resampled.
smote = SMOTE(random_state=1337)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Fit a Random Forest Classifier with hyperparameter tuning using GridSearchCV.
# The grid below has 4 * 4 * 3 * 3 * 2 * 2 = 576 candidates, i.e. 2880 fits
# with 5-fold cross-validation.
param_grid = {
    'n_estimators': [100, 200, 300, 500],
    'max_depth': [10, 20, 30, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    # 'auto' is intentionally absent: it was deprecated in scikit-learn 1.1 and
    # removed in 1.3 (it now fails parameter validation), and for a classifier
    # it was only ever an alias for 'sqrt', so it added duplicate candidates.
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf_boehmke_2017 = RandomForestClassifier(random_state = 1337)

# Candidates are ranked by average precision; all available cores are used.
grid_search = GridSearchCV(
    estimator = rf_boehmke_2017,
    param_grid = param_grid,
    cv = 5,
    n_jobs = -1,
    verbose = 2,
    scoring = 'average_precision'
)

# NOTE(review): the data passed here was oversampled with SMOTE *before* the
# cross-validation split, so every validation fold contains synthetic samples
# interpolated from rows in the training folds. The CV scores used for model
# selection are therefore optimistic. Moving SMOTE into an
# `imblearn.pipeline.Pipeline` (with `classifier__`-prefixed grid keys) would
# resample inside each fold instead.
grid_search.fit(X_train_resampled, y_train_resampled)

# Score the held-out test rows with the refitted best model from the search.
y_pred = grid_search.best_estimator_.predict(X_test)

# Test-set metrics: macro-averaged F1, balanced accuracy, and the per-class
# precision / recall / F1 table.
f1_macro = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
balanced_acc = balanced_accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred)

# Report the results.
print(f"F1 Macro: {f1_macro}")
print(f"Balanced Accuracy: {balanced_acc}")
print("Classification Report:\n", report)

FileNotFoundError: [Errno 2] No such file or directory: 'boehmke_analysis/replication_data/boehmke2017.dta'