In [None]:
# Races available per office in the generated data:
#   President    : 2016 (Trump), 2020 (Biden), 2024 (Trump)
#   U.S. Senate  : 2014 (Peters), 2018 (Stabenow), 2020 (Peters), 2024 (Slotkin)
#   U.S. House   : every cycle
#   State Senate : 2014, 2018, 2022
#   State House  : every cycle

# Column the classifiers predict.
TARGET = 'partisanship_lean_curr'

# When FEATURES_ALREADY_RANKED is True, the modelling cells read
# data/generated_data/df_importances_{TARGET}.csv and keep only the first
# TOP_N_FEATURES feature names listed there.
FEATURES_ALREADY_RANKED = True
TOP_N_FEATURES = 20

# Active selection iterated by the INDIVIDUAL TESTS cell.
YEARS = [
    '2018',
    '2020',
    '2022',
    '2024',
]
OFFICES = [
    'U.S. House',
    'State House',
]

# Alternative selections -- uncomment exactly one pair to override the above.
# OFFICES = ['U.S. Senate']
# YEARS = ['2020']

# OFFICES = ['State Senate']
# YEARS = ['2022']

# OFFICES = ['President']
# YEARS = ['2024']

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import r2_score, accuracy_score, f1_score, accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

pd.set_option("display.max_columns", None)

### XGBoost Classifier

In [None]:
# Column groups used to build `drop_features`, the list of columns removed
# from the feature matrix before modelling.

# Identifier, geometry and lookup-key columns. These key-like columns just add
# noise, so they are always dropped.
drop_features_required = [
    'standardized_id', 'standardized_id_num',
    'aland_tract', 'awater_tract', 'geoid_tract', 'geoidfq_tract', 
    'geometry', 'geometry_tract', 'name_tract', 'tractce_tract',
    'nearest_bound_census_tract', 'nearest_bound_school_district', 'nearest_bound_zipcode',
]

# Optionally drop one or more of these during
# train/test/prediction. Uncomment an entry to exclude that column;
# with everything commented out the list is empty and nothing extra is dropped.
drop_features_optional = [
    # 'office_code', 
    # 'dem_share_prev', 
    # 'rep_share_prev', 'oth_share_prev', 
    # 'dem_share_change_prev', 'rep_share_change_prev', 'oth_share_change_prev', 
    # 'dem_votes_change_prev', 'rep_votes_change_prev', 'oth_votes_change_prev', 
    # 'registered_voters_change_prev', 'turnout_pct_change_prev', 
    # 'partisan_temp_prev', 'partisan_temp_change_prev', 
    # 'partisanship_lean_prev', 'partisanship_lean_change_prev', 'partisanship_lean_change_amount_prev',
]

# Seen features that may or may not be used as
# targets as well. Every entry is dropped from the feature matrix except the
# one equal to TARGET, which is removed from this list further down the cell.
# NOTE(review): the names suggest these are same-election outcomes that would
# leak the target -- confirm against the feature-generation step.
drop_features_seen = [
    'dem_votes', 'oth_votes', 'rep_votes', 'total_votes', 
    'dem_share', 'rep_share', 'oth_share',  'turnout_pct',
    'dem_share_change_curr','rep_share_change_curr', 'oth_share_change_curr', 
    'dem_votes_change_curr','rep_votes_change_curr', 'oth_votes_change_curr', 
    'partisan_temp', 'partisanship_lean_curr', 'registered_voters',
    'registered_voters_change_curr','turnout_pct_change_curr',
    'partisan_temp_category', 'partisan_temp_change_curr',
    'pedersen_index_percent', 'pedersen_index',
    'partisanship_lean_change_amount_curr',
]

# Socioeconomic data as features in addition
# to the original and the engineered features.
# NOTE(review): this list is not referenced by any other code visible in this
# notebook -- confirm whether it is still needed.
census_datasets = [
    'b02001_race', 'b04007_ancestry', 'b05012_nativity_us', 'b08303_travel_time_work', 'b25003_housing_rentership', 
    'dp02_selected_social_characteristics', 'dp03_selected_economic_characteristics', 'dp04_housing_characteristics', 'dp05_age_race', 
    's0101_age_sex', 's1101_households_families', 's1201_marital_status', 's1501_educational_attainment', 's1701_income_poverty', 
    's1903_median_income', 's2101_veteran_status', 's2201_food_stamps', 's2301_employment_status', 's2401_occupation_sex', 
    's2403_industry_sex', 's2501_occupancy_characteristics', 's2701_health_insurance', 's2503_financial_characteristics',
]

# DO NOT EDIT BELOW THIS LINE
# The target has to stay in the frame so it can be split off as y later, so it
# is taken out of the "seen" drop list when it is listed there.
try:
    drop_features_seen.remove(TARGET)
except ValueError:
    pass  # TARGET was not in the list; nothing to keep back

# Final list of columns removed from the feature matrix.
drop_features = [
    *drop_features_required,
    *drop_features_optional,
    *drop_features_seen,
]

In [None]:
def makeDatasets(years, offices):
    """Load the engineered feature CSV (features plus target) for each year and office.

    Args:
        years: iterable of year strings used in the file names.
        offices: iterable of office display names (e.g. 'U.S. House'); spaces
            become underscores and periods are removed to form the file token.

    Returns:
        Nested dict ``{year: {office_token: DataFrame}}`` after the frames have
        been passed through ``removeUncommonColumns``.
    """
    print('Making datasets...')

    def _read_office(year, office_name):
        # Turn the display name into the token used in file names and dict keys.
        token = office_name.replace(' ', '_').replace('.', '')
        print(f'Processing office {token}...')
        csv_path = 'data/generated_data/07_ml_features_' + year + '_' + token + '.csv'
        return token, pd.read_csv(csv_path, low_memory=False)

    loaded = {}
    for year in years:
        print(f'Processing year {year}...')
        per_office = {}
        loaded[year] = per_office
        for office_name in offices:
            token, frame = _read_office(year, office_name)
            per_office[token] = frame

    loaded = removeUncommonColumns(loaded)
    print('Done.')

    return loaded


def aggDatasets(datasets, years, offices):
    """Stack the selected per-year, per-office frames into a single DataFrame.

    Args:
        datasets: nested mapping ``{year: {office_token: DataFrame}}``, the
            shape returned by ``makeDatasets``.
        years: iterable of year keys to include.
        offices: iterable of office display names; each is converted to its
            token (spaces -> underscores, periods removed) before lookup.

    Returns:
        A new DataFrame with the selected frames concatenated row-wise and a
        fresh 0..n-1 index. The input frames are copied, not modified.

    Raises:
        KeyError: if a requested year or office is missing from ``datasets``.
        ValueError: if the selection is empty.
    """
    dfs = []

    for year in years:
        print(f'Processing year {year}...')
        for office in offices:
            office = office.replace(' ', '_').replace('.', '')

            print(f'Processing office {office}...')
            # Read from the `datasets` argument. The previous version read the
            # notebook-global `df_datasets`, silently ignoring its parameter.
            dfs.append(datasets[year][office].copy())

    if not dfs:
        raise ValueError('No datasets selected: years and offices must both be non-empty.')

    return pd.concat(dfs, axis=0, ignore_index=True)


def removeUncommonColumns(nested_dict):
    """Trim every frame to the columns shared by all frames, to prevent errors downstream.

    Args:
        nested_dict: mapping ``{year: {office: DataFrame}}``.

    Returns:
        The same ``nested_dict`` object, with each frame replaced by a version
        holding only the common columns. All frames end up with the same
        column order, taken from the first frame encountered. An input with
        no frames is returned unchanged.
    """
    print("Removing uncommon columns...")

    # Flatten and find common columns
    all_dfs = [df for year in nested_dict for df in nested_dict[year].values()]

    if all_dfs:
        common_cols = set(all_dfs[0].columns)
        for df in all_dfs[1:]:
            common_cols &= set(df.columns)

        # Iterating a set gives an arbitrary order that can change between
        # runs, so take the order from the first frame to keep it reproducible.
        ordered_cols = [col for col in all_dfs[0].columns if col in common_cols]

        # Replacing values for existing keys is safe while iterating the dict.
        for year in nested_dict:
            for office in nested_dict[year]:
                nested_dict[year][office] = nested_dict[year][office][ordered_cols]

    print('Done.')

    return nested_dict

In [None]:
# INDIVIDUAL TESTS
# For each (year, office) dataset: fit an XGBoost classifier on a stratified
# 80/20 split, print a classification report, save a confusion-matrix figure
# and the test-set predictions, then score every feature on its own with a
# one-feature model and save that ranking.

# Imported once here rather than inside the loop body. LinearRegression was
# never imported before, so the regression branch of the per-feature ranking
# raised NameError whenever the target had a float dtype.
from sklearn.linear_model import LinearRegression, LogisticRegression
from xgboost import XGBClassifier

for year in YEARS:
    print(f'Processing year {year}...')

    for office in OFFICES:
        # Convert the display name to the token used in file names
        # (spaces -> underscores, periods removed).
        office = office.replace(' ', '_').replace('.', '')
        print(f'Processing office {office}...')

        df = pd.read_csv(f'data/generated_data/07_ml_features_{year}_{office}.csv')

        if FEATURES_ALREADY_RANKED:
            # Keep only the top-ranked features present in this file, plus
            # the ID and the target.
            print(f'Features already ranked, select top...')
            feature_importance_file = f'data/generated_data/df_importances_{TARGET}.csv'
            top_feature_columns = pd.read_csv(feature_importance_file)['Feature name'].head(TOP_N_FEATURES).tolist()
            required_columns = ['standardized_id_num', TARGET]
            selected_columns = [col for col in top_feature_columns + required_columns if col in df.columns]
            df = df[selected_columns].dropna(subset=[TARGET])

        # Clean ID: left-pad to 13 characters
        df['standardized_id_num'] = df['standardized_id_num'].astype(str).str.zfill(13)

        # Drop rows w/o targets
        df = df.dropna(subset=[TARGET])

        # Save standardized IDs before the ID column is dropped from the features
        X_stids = df['standardized_id_num']

        # Define y and encode it as the integer class indices XGBoost trains on
        y = df[TARGET]
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)

        # Drop unneeded columns after saving id
        cols_to_drop = [col for col in drop_features if col in df.columns]
        df = df.drop(columns=cols_to_drop)
        X = df.drop(columns=[TARGET])

        categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
        numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

        # NOTE(review): astype(str) presumably turns missing values into the
        # literal string 'nan', so the most_frequent imputer below would never
        # see them as missing -- confirm this is intended.
        for col in categorical_cols:
            X[col] = X[col].astype(str)

        categorical_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ])

        numeric_transformer = Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ])

        preprocessor = ColumnTransformer([
            ('cat', categorical_transformer, categorical_cols),
            ('num', numeric_transformer, numeric_cols)
        ])

        classifier = XGBClassifier(
            objective="multi:softmax",  # multi:softmax or multi:softprob
            num_class=len(y.unique()),
            use_label_encoder=False,
            n_estimators=200,
            max_depth=5,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1,
            eval_metric="mlogloss"
        )

        # `model` is the full preprocessing + classifier pipeline. The
        # per-feature loop below uses its own names and no longer overwrites it.
        model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', classifier)
        ])

        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y, random_state=42)

        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)

        # Map class indices back to the original target labels for reporting
        decoded_y_test = label_encoder.inverse_transform(y_test)
        decoded_y_pred = label_encoder.inverse_transform(y_pred)

        print(classification_report(decoded_y_test, decoded_y_pred))

        # Create the confusion matrix
        cm = confusion_matrix(decoded_y_test, decoded_y_pred)

        # Plot with seaborn for nice formatting
        plt.figure(figsize=(8, 6))
        sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                    xticklabels=label_encoder.classes_,
                    yticklabels=label_encoder.classes_)

        plt.xlabel('Predicted')
        plt.ylabel('True')
        plt.title('Confusion Matrix')
        plt.tight_layout()
        plt.savefig(f'output/figures/confusion_matrix_{year}_{office}.png')
        plt.close()  # avoid accumulating open figures across loop iterations

        # Save predictions to disk, keyed by the IDs of the test rows.
        results_df = pd.DataFrame({
            'standardized_id_num': X_stids.loc[X_test.index],
            'true_label': decoded_y_test,
            'predicted_label': decoded_y_pred
        })
        results_df['standardized_id_num'] = results_df['standardized_id_num'].astype(str).str.zfill(13)

        filename = f'data/generated_data/prediction_results_{year}_{office}_classification.csv'
        results_df.to_csv(filename, index=False)

        #############################
        # FEATURE PERFORMANCE
        #############################
        # Fit a one-feature model per column and record its test score.
        feature_performance = []
        metric = "Score"  # defined up front so the ranking works even if no feature is scored
        is_continuous = y.dtype.kind in 'fc'  # float or complex target dtype

        for feature in tqdm(X.columns):
            X_feature = X[[feature]].copy()

            # Handle missing values, and one-hot encode text columns
            if X_feature[feature].dtype == 'object':
                X_feature = X_feature.fillna(X_feature.mode().iloc[0])
                X_feature = pd.get_dummies(X_feature, drop_first=True)
            else:
                X_feature = X_feature.fillna(X_feature.mean(numeric_only=True))

            # A single-valued text column leaves no columns after
            # drop_first=True; there is nothing to fit, so skip it.
            if X_feature.shape[1] == 0:
                continue

            X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feature, y, test_size=0.2, random_state=42)

            if is_continuous:  # Regression
                feature_model = LinearRegression()
                feature_model.fit(X_train_feat, y_train_feat)
                feature_pred = feature_model.predict(X_test_feat)
                score = r2_score(y_test_feat, feature_pred)  # R² for regression
            else:  # Classification
                feature_model = LogisticRegression(max_iter=200)  # or DecisionTreeClassifier()
                feature_model.fit(X_train_feat, y_train_feat)
                feature_pred = feature_model.predict(X_test_feat)
                score = accuracy_score(y_test_feat, feature_pred)  # Acc. for classification

            feature_performance.append({"Feature name": feature, metric: score})

        # Explicit columns keep sort_values valid when the list is empty.
        feature_performance_df = pd.DataFrame(
            feature_performance, columns=["Feature name", metric]
        ).sort_values(by=metric, ascending=False)
        filename = f'data/generated_data/feature_rankings_{year}_{office}.csv'
        feature_performance_df.to_csv(filename, index=False)

In [None]:
# AGGREGATE TRAIN/TEST
# Take several years and aggregate into a larger single dataset
# for maximum training data, perhaps leaving out one holdout set.
# This cell defines `xgb_pipeline`, which the holdout cell later uses to predict.
df_datasets = makeDatasets(['2018', '2020', '2022'], ['U.S. House'])
df = aggDatasets(df_datasets, ['2018', '2020', '2022'], ['U.S. House'])

from xgboost import XGBClassifier

# If features have been ranked, use only those features ranked
# high for the target in question (plus the ID and the target itself).
if FEATURES_ALREADY_RANKED:
    print(f'Features already ranked, select top...')
    feature_importance_file = f'data/generated_data/df_importances_{TARGET}.csv'
    top_feature_columns = pd.read_csv(feature_importance_file)['Feature name'].head(TOP_N_FEATURES).tolist()
    required_columns = ['standardized_id_num', TARGET]
    selected_columns = [col for col in top_feature_columns + required_columns if col in df.columns]
    df = df[selected_columns].dropna(subset=[TARGET])

# Ensure clean 13-character left-padded ID
df['standardized_id_num'] = df['standardized_id_num'].astype(str).str.zfill(13)

# Drop rows w/o targets
df = df.dropna(subset=[TARGET])

# Save standardized IDs before the ID column is dropped from the features
X_stids = df['standardized_id_num']

# Define y and encode it as integer class indices
y = df[TARGET]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Drop unneeded columns after saving id
cols_to_drop = [col for col in drop_features if col in df.columns]
df = df.drop(columns=cols_to_drop)
X = df.drop(columns=[TARGET])

# Define categorical and numeric columns for formatting.
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# NOTE(review): astype(str) presumably turns missing values into the literal
# string 'nan', so the most_frequent imputer below would never see them as
# missing -- confirm this is intended.
for col in categorical_cols:
    X[col] = X[col].astype(str)

# Categorical columns: impute with the most frequent value, then one-hot
# encode; categories unseen during fit are ignored at predict time.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

# Numeric columns: impute with the column mean, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

xgb_classifier = XGBClassifier(
    objective="multi:softmax",  # multi:softmax or multi:softprob
    num_class=len(y.unique()),
    use_label_encoder=False,
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric="mlogloss"
)

# Preprocessing and classifier are fitted together as one pipeline.
xgb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier)
])

# 80/20 split, stratified on the target labels, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, stratify=y, random_state=42)

xgb_pipeline.fit(X_train, y_train)
y_pred = xgb_pipeline.predict(X_test)

# Map class indices back to the original target labels for reporting
decoded_y_test = label_encoder.inverse_transform(y_test)
decoded_y_pred = label_encoder.inverse_transform(y_pred)

# Print classification report
class_report = classification_report(decoded_y_test, decoded_y_pred, output_dict=True)
print(class_report)

# Also write the report to disk as a Markdown table rounded to 3 decimals
df_class_report = pd.DataFrame(class_report).transpose()
df_class_report = df_class_report.round(3)
with open(f'output/reports/classification_report_{TARGET}_aggregate_classification.md', 'w') as f:
    f.write("# Classification Report\n\n")
    f.write(df_class_report.to_markdown())

# Create the confusion matrix
cm = confusion_matrix(decoded_y_test, decoded_y_pred)

# Plot matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_, 
            yticklabels=label_encoder.classes_)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig(f'output/figures/confusion_matrix_{TARGET}_aggregate.png')
plt.close()
# plt.show()

# Save predictions to disk, keyed by the IDs of the test rows
results_df = pd.DataFrame({
    'standardized_id_num': X_stids.loc[X_test.index],
    'true_label': decoded_y_test,
    'predicted_label': decoded_y_pred
})
results_df['standardized_id_num'] = results_df['standardized_id_num'].astype(str).str.zfill(13)

filename = f'data/generated_data/prediction_results_{TARGET}_aggregate_classification.csv'
results_df.to_csv(filename, index=False)


#############################
# FEATURE PERFORMANCE
#############################
# Fit a one-feature logistic regression per column and record its test accuracy.
# NOTE(review): `y_pred` is reassigned inside this loop, so after the cell it
# no longer holds the XGBoost predictions made above.
from sklearn.linear_model import LogisticRegression

feature_performance = []

for feature in tqdm(X.columns):
    X_feature = X[[feature]].copy()
    
    # Handle missing values, and one-hot encode text columns
    if X_feature[feature].dtype == 'object':
        X_feature = X_feature.fillna(X_feature.mode().iloc[0])
        X_feature = pd.get_dummies(X_feature, drop_first=True)
    
        # Need at least 1 column after one-hot encoding
        if X_feature.shape[1] == 0:
            continue  # Leave as is
    else:
        X_feature = X_feature.fillna(X_feature.mean(numeric_only=True))
    
    # Skip features that have no columns left to fit on
    if X_feature.shape[1] == 0:
        continue
    
    # Unlike the main split above, this split is not stratified.
    X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feature, y, test_size=0.2, random_state=42)
    
    model = LogisticRegression(max_iter=200)  # or DecisionTreeClassifier()
    model.fit(X_train_feat, y_train_feat)
    y_pred = model.predict(X_test_feat)
    score = accuracy_score(y_test_feat, y_pred)
    metric = "Score"
        
    feature_performance.append({"Feature name": feature, metric: score})
    
# Highest-scoring features first.
# NOTE(review): `metric` is only assigned inside the loop, so this line fails
# if no feature was scored.
feature_performance_df = pd.DataFrame(feature_performance).sort_values(by=metric, ascending=False)
filename = f'data/generated_data/feature_rankings_{TARGET}_aggregate_classification.csv'
feature_performance_df.to_csv(filename, index=None)

In [None]:
# AGGREGATE BENCHMARKING
# Baseline for the aggregate 2018-2022 U.S. House data: predict the current
# target from the previous values of the same measure (the rounded average of
# the two previous-lean columns when both exist, otherwise the one available)
# and score that against the true target.

RULE_OF_THUMB_FEATURES = ['partisanship_lean_prev', 'partisanship_lean_prev_prev']

df_datasets = makeDatasets(['2018', '2020', '2022'], ['U.S. House'])
df = aggDatasets(df_datasets, ['2018', '2020', '2022'], ['U.S. House'])

# Check and compute average rule-of-thumb if both fields exist
rule_cols = RULE_OF_THUMB_FEATURES
available_rule_cols = [col for col in rule_cols if col in df.columns]

if len(available_rule_cols) == 0:
    raise Exception("No rule-of-thumb columns found.")

# Calculate rule-of-thumb prediction (average if 2, fallback to 1)
if len(available_rule_cols) == 2:
    df['rule_of_thumb'] = df[available_rule_cols].mean(axis=1)
else:
    df['rule_of_thumb'] = df[available_rule_cols[0]]

# Round the benchmark so it can be compared with the target labels
df['rule_of_thumb_rounded'] = df['rule_of_thumb'].round()

# Filter to rows with non-null target and benchmark
mask = df['rule_of_thumb_rounded'].notna() & df[TARGET].notna()
y_true = df.loc[mask, TARGET]
y_benchmark = df.loc[mask, 'rule_of_thumb_rounded']

# Use an encoder local to this cell. The previous version refitted the
# `label_encoder` left behind by another cell, which required that cell to
# have run first and overwrote the label mapping it held.
benchmark_encoder = LabelEncoder()
benchmark_encoder.fit(df[TARGET].dropna().astype(str).unique())

# Encode labels. transform() raises ValueError if the benchmark produces a
# value that is not one of the target labels.
# NOTE(review): labels are compared by their string form; the rounded
# benchmark is float, so this presumably relies on TARGET also being
# float-typed (e.g. '1.0' rather than '1') -- confirm.
y_true_encoded = benchmark_encoder.transform(y_true.astype(str))
y_benchmark_encoded = benchmark_encoder.transform(y_benchmark.astype(str))

decoded_y_true = benchmark_encoder.inverse_transform(y_true_encoded)
decoded_y_benchmark = benchmark_encoder.inverse_transform(y_benchmark_encoded)

print(classification_report(decoded_y_true, decoded_y_benchmark))

# Create the confusion matrix. Passing `labels` fixes the row/column order to
# the encoder's classes so it always lines up with the tick labels below, even
# if some class is missing from the filtered rows.
cm = confusion_matrix(decoded_y_true, decoded_y_benchmark, labels=benchmark_encoder.classes_)

# Plot with seaborn for nice formatting
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=benchmark_encoder.classes_,
            yticklabels=benchmark_encoder.classes_)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig(f'output/figures/confusion_matrix_{TARGET}_aggregate_benchmark.png')
plt.close()
# plt.show()

# Save benchmark results (original, un-encoded values)
benchmark_df = pd.DataFrame({
    'standardized_id_num': df.loc[mask, 'standardized_id_num'],
    'true_label': y_true,
    'benchmark_label': y_benchmark
})

filename = f"data/generated_data/benchmark_results_{TARGET}_aggregate_classification.csv"
benchmark_df.to_csv(filename, index=False)
print(f"Saved benchmark predictions to {filename}")

In [None]:
# HOLDOUT PREDICTION
# Score the pipeline fitted in the AGGREGATE TRAIN/TEST cell (`xgb_pipeline`)
# on the 2024 U.S. House data. That cell must have been run first.
df_datasets = makeDatasets(['2024'], ['U.S. House'])
df_holdout = aggDatasets(df_datasets, ['2024'], ['U.S. House'])

# For benchmarking: an untouched copy with all columns, used by the
# benchmarking cell that follows.
df_holdout_benchmark = df_holdout.copy()

if FEATURES_ALREADY_RANKED:
    print(f'Features already ranked, select top...')
    feature_importance_file = f'data/generated_data/df_importances_{TARGET}.csv'
    top_feature_columns = pd.read_csv(feature_importance_file)['Feature name'].head(TOP_N_FEATURES).tolist()
    required_columns = ['standardized_id_num', TARGET]
    selected_columns = [col for col in top_feature_columns + required_columns if col in df_holdout.columns]
    df_holdout = df_holdout[selected_columns].dropna(subset=[TARGET])

# Clean ID: left-pad to 13 characters
df_holdout['standardized_id_num'] = df_holdout['standardized_id_num'].astype(str).str.zfill(13)

# Drop rows w/o targets
df_holdout = df_holdout.dropna(subset=[TARGET])

# Save standardized IDs before the ID column is dropped from the features
X_stids = df_holdout['standardized_id_num']

# Define y and encode
# NOTE(review): this fits a new encoder on the holdout labels, and the same
# encoder is used below to decode the pipeline's predictions. That is only
# correct if the holdout contains exactly the same set of labels the pipeline
# was trained on; otherwise the class indices map to the wrong labels.
# Consider reusing the encoder fitted at training time -- confirm.
y = df_holdout[TARGET]
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Drop unneeded columns after saving id
cols_to_drop = [col for col in drop_features if col in df_holdout.columns]
df_holdout = df_holdout.drop(columns=cols_to_drop)
X = df_holdout.drop(columns=[TARGET])

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Same string cast as applied to the training features
for col in categorical_cols:
    X[col] = X[col].astype(str)

# --- STEP 2: Predict using the trained pipeline ---
# The pipeline is not refitted here; all holdout rows are used for evaluation.
print("Predicting on 2024 holdout set...")
y_pred_encoded = xgb_pipeline.predict(X)

decoded_y_test = label_encoder.inverse_transform(y_encoded)
decoded_y_pred = label_encoder.inverse_transform(y_pred_encoded)

# Print classification report
class_report = classification_report(decoded_y_test, decoded_y_pred, output_dict=True)
print(class_report)

# Also write the report to disk as a Markdown table rounded to 3 decimals
df_class_report = pd.DataFrame(class_report).transpose()
df_class_report = df_class_report.round(3)
with open(f'output/reports/classification_report_{TARGET}_holdout_classification.md', 'w') as f:
    f.write("# Classification Report\n\n")
    f.write(df_class_report.to_markdown())

# Create the confusion matrix
cm = confusion_matrix(decoded_y_test, decoded_y_pred)

# Plot with seaborn for nice formatting
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', 
            xticklabels=label_encoder.classes_, 
            yticklabels=label_encoder.classes_)

plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.tight_layout()
plt.savefig(f'output/figures/confusion_matrix_{TARGET}_holdout.png')
plt.close()
# plt.show()

# Save predictions to disk. The ID index is reset so it lines up
# positionally with the label arrays.
results_df = pd.DataFrame({
    'standardized_id_num': X_stids.reset_index(drop=True),
    'true_label': decoded_y_test,
    'predicted_label': decoded_y_pred
})
results_df['standardized_id_num'] = results_df['standardized_id_num'].astype(str).str.zfill(13)

filename = f'data/generated_data/prediction_results_{TARGET}_holdout_classification.csv'
results_df.to_csv(filename, index=False)

In [None]:
# INDIVIDUAL BENCHMARKING
# Rule-of-thumb baseline on the holdout data: predict the current target from
# the previous values of the same measure and score it against the true target.
# Requires `df_holdout_benchmark` from the holdout cell (2024 U.S. House).
RULE_OF_THUMB_FEATURES = ['partisanship_lean_prev', 'partisanship_lean_prev_prev']

# Identify the dataset explicitly for the output file name. The previous
# version used `year` and `office` left over from the INDIVIDUAL TESTS loop,
# so the file was named after whichever pair that loop processed last rather
# than after the holdout data benchmarked here.
holdout_year = '2024'
holdout_office = 'US_House'

# Check and compute average rule-of-thumb if both fields exist
rule_cols = RULE_OF_THUMB_FEATURES
available_rule_cols = [col for col in rule_cols if col in df_holdout_benchmark.columns]

if len(available_rule_cols) == 0:
    raise Exception("No rule-of-thumb columns found.")

# Calculate rule-of-thumb prediction (average if 2, fallback to 1)
if len(available_rule_cols) == 2:
    df_holdout_benchmark['rule_of_thumb'] = df_holdout_benchmark[available_rule_cols].mean(axis=1)
else:
    df_holdout_benchmark['rule_of_thumb'] = df_holdout_benchmark[available_rule_cols[0]]

# Round the benchmark so it can be compared with the target labels
df_holdout_benchmark['rule_of_thumb_rounded'] = df_holdout_benchmark['rule_of_thumb'].round()

# Filter to rows with non-null target and benchmark
mask = df_holdout_benchmark['rule_of_thumb_rounded'].notna() & df_holdout_benchmark[TARGET].notna()
y_true = df_holdout_benchmark.loc[mask, TARGET]
y_benchmark = df_holdout_benchmark.loc[mask, 'rule_of_thumb_rounded']

# Use an encoder local to this cell instead of refitting the shared
# `label_encoder`, so this cell neither depends on nor alters another cell's
# label mapping.
benchmark_encoder = LabelEncoder()
benchmark_encoder.fit(df_holdout_benchmark[TARGET].dropna().astype(str).unique())

# Encode labels. transform() raises ValueError if the benchmark produces a
# value that is not one of the target labels.
# NOTE(review): labels are compared by their string form; the rounded
# benchmark is float, so this presumably relies on TARGET also being
# float-typed (e.g. '1.0' rather than '1') -- confirm.
y_true_encoded = benchmark_encoder.transform(y_true.astype(str))
y_benchmark_encoded = benchmark_encoder.transform(y_benchmark.astype(str))

print("\nRule-of-Thumb Benchmark (Average of Previous Values):")
print(classification_report(y_true_encoded, y_benchmark_encoded, target_names=benchmark_encoder.classes_))

# Save benchmark results (original, un-encoded values)
benchmark_df = pd.DataFrame({
    'standardized_id_num': df_holdout_benchmark.loc[mask, 'standardized_id_num'],
    'true_label': y_true,
    'benchmark_label': y_benchmark
})

filename = f"data/generated_data/benchmark_results_{TARGET}_{holdout_year}_{holdout_office}_classification.csv"
benchmark_df.to_csv(filename, index=False)
print(f"Saved benchmark predictions to {filename}")

#### Measure Feature Importance

In [None]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.linear_model import LogisticRegression
# from xgboost import XGBClassifier

# def build_pipeline(estimator):
#     return Pipeline(steps=[
#         ('preprocessor', preprocessor),
#         ('classifier', estimator)
#     ])

# models = {
#     'Random Forest': RandomForestClassifier(random_state=0),
#     'Logistic Regression': LogisticRegression(max_iter=1000),
#     'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
# }

# for name, estimator in models.items():
#     print(f"Training: {name}")
#     pipeline = build_pipeline(estimator)
#     pipeline.fit(X_train, y_train)
#     y_pred = pipeline.predict(X_test)

#     print(classification_report(y_test, y_pred))
#     print("="*60)

##### Hyperparameter Search – Basic

In [None]:
# from sklearn.model_selection import RandomizedSearchCV
# from scipy.stats import randint, uniform
# from sklearn.experimental import enable_halving_search_cv  # noqa
# from sklearn.model_selection import HalvingRandomSearchCV

# param_dist = {
#     "classifier__max_depth": randint(3, 10), # 3, 10
#     "classifier__learning_rate": uniform(0.01, 0.2),
#     "classifier__n_estimators": randint(50, 200), # 100, 500
#     "classifier__subsample": uniform(0.6, 0.4),
#     "classifier__colsample_bytree": uniform(0.6, 0.4),
# }

# search = HalvingRandomSearchCV(
#     model,
#     param_distributions=param_dist,
#     # n_iter=3,
#     cv=3,
#     scoring="accuracy",
#     verbose=2,
#     n_jobs=-1,
#     random_state=42,
# )

# search.fit(X_train, y_train)
# print("Best accuracy:", search.best_score_)
# print("Best params:", search.best_params_)

##### Hyperparameter Search – Comprehensive

In [None]:
# from sklearn.model_selection import GridSearchCV

# def grid_search_pipeline(estimator, param_grid, name):
#     pipeline = build_pipeline(estimator)
#     grid_search = GridSearchCV(pipeline, param_grid, cv=5, scoring='f1_weighted', n_jobs=-1)
#     grid_search.fit(X_train, y_train)
#     return {
#         'name': name,
#         'best_estimator': grid_search.best_estimator_,
#         'best_score': grid_search.best_score_,
#         'best_params': grid_search.best_params_
#     }

In [None]:
# searches = [
#     print(f'Grid search: RandomForestClassifier()')
#     grid_search_pipeline(RandomForestClassifier(), {
#         'classifier__n_estimators': [100, 200],
#         'classifier__max_depth': [5, 10]
#     }, 'Random Forest'),

#     print(f'Grid search: LogisticRegression()')
#     grid_search_pipeline(LogisticRegression(max_iter=1000), {
#         'classifier__C': [0.1, 1, 10]
#     }, 'Logistic Regression'),

#     print(f'Grid search: XGBClassifier()')
#     grid_search_pipeline(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
#         'classifier__n_estimators': [100, 200],
#         'classifier__max_depth': [3, 6]
#     }, 'XGBoost')
# ]

In [None]:
# from sklearn.inspection import permutation_importance

# final_model = searches[0]['best_estimator_']  # Example: pick best model
# result = permutation_importance(final_model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=-1)

# import pandas as pd
# perm_df = pd.DataFrame({
#     'feature': final_model.named_steps['preprocessor'].get_feature_names_out(),
#     'importance': result.importances_mean
# }).sort_values(by='importance', ascending=False)

In [None]:
# import shap

# # Extract trained classifier from pipeline
# xgb_model = searches[2]['best_estimator_'].named_steps['classifier']
# explainer = shap.Explainer(xgb_model)
# shap_values = explainer(xgb_model.get_booster().predict(X_test))

# # SHAP summary plot
# shap.summary_plot(shap_values, X_test)

In [None]:
# leaderboard = pd.DataFrame([{
#     'Model': s['name'],
#     'Best F1 Score': s['best_score'],
#     'Best Params': s['best_params']
# } for s in searches]).sort_values(by='Best F1 Score', ascending=False)

# print(leaderboard)