In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.metrics import classification_report

In [None]:
import warnings

# Silence every warning for the remainder of the kernel session.
warnings.simplefilter("ignore")

# Never truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

### Misc. Models
Poor performance

In [None]:
# Random Forest baseline: load the data, build a preprocessing + classifier
# pipeline, fit on an 80/20 split and print a classification report.

# Load data
df = pd.read_csv("X22.csv")

target_col = "partisan_temp_category"
# Features are every column except the target, the row identifier and the two
# partisan_temp columns (presumably the raw values the category is derived
# from -- confirm). errors='ignore' tolerates any of them being absent.
X = df.drop(columns=["partisan_temp", "partisan_temp_change_curr", "standardized_id_num", target_col], errors='ignore')
y = df[target_col]

from sklearn.preprocessing import OrdinalEncoder  # NOTE(review): currently unused in this cell

# Ordinal score for each category label. It is stored on `df` only; `X` was
# already built above, so this column is not a model feature.
mapping = {
    "scorching democrat": -4,
    "blazing democrat": -3,
    "hot democrat": -2,
    "warm democrat": -1,
    "neutral": 0,
    "warm republican": 1,
    "hot republican": 2,
    "blazing republican": 3,
    "scorching republican": 4,
}
df["partisan_temp_score"] = df["partisan_temp_category"].map(mapping)

# Identify column types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Fix mixed-type issues: make every present value a string, but keep missing
# values as NaN. A plain astype(str) would turn NaN into the literal "nan",
# which the most_frequent imputer below would then treat as a real category.
for col in categorical_cols:
    X[col] = X[col].astype(str).where(X[col].notna())


# Define transformers
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # unseen test categories encode as all zeros
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

# Define model pipeline
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(n_estimators=200, random_state=42))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report (Random Forest):")
print(classification_report(y_test, y_pred))

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import classification_report

# Logistic Regression baseline: same data and preprocessing as the Random
# Forest baseline, with a multinomial logistic-regression classifier.

# Load data
df = pd.read_csv("X22.csv")

target_col = "partisan_temp_category"
# Features are every column except the target, the row identifier and the two
# partisan_temp columns; errors='ignore' tolerates any of them being absent.
X = df.drop(columns=["partisan_temp", "partisan_temp_change_curr", "standardized_id_num", target_col], errors='ignore')
y = df[target_col]

# Identify feature types
categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Fix mixed-type issues: make every present value a string, but keep missing
# values as NaN. A plain astype(str) would turn NaN into the literal "nan",
# which the most_frequent imputer below would then treat as a real category.
for col in categorical_cols:
    X[col] = X[col].astype(str).where(X[col].notna())

# Define transformers
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # unseen test categories encode as all zeros
])

numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

# Combine transformers
preprocessor = ColumnTransformer(transformers=[
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

# Define model pipeline.
# multi_class is intentionally not passed: with the lbfgs solver a multiclass
# target is already fit as a multinomial model, and the explicit argument is
# deprecated in recent scikit-learn releases.
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(solver='lbfgs', max_iter=1000))
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train and evaluate
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Classification Report (Logistic Regression):")
print(classification_report(y_test, y_pred))

### XGBoost
Decent performance

In [None]:
# Feature-subset switch for the XGBoost setup cell below.
# If analysis file has been run to make preds,
# then top features can be used instead of all features.
# When True, the setup cell reads 'feature_rankings.csv' and keeps only the
# first TOP_N_FEATURES listed features (plus the id and target columns).
FEATURES_ALREADY_RANKED = False
# How many of the top-ranked features to keep; only consulted when
# FEATURES_ALREADY_RANKED is True.
TOP_N_FEATURES = 4

In [None]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

# XGBoost setup: load the data, optionally restrict it to the top-ranked
# features, build X / y, and assemble the preprocessing + XGBoost pipeline.
df = pd.read_csv("X22.csv")

target_col = "partisan_temp_category"

if FEATURES_ALREADY_RANKED:
    rankings = pd.read_csv('feature_rankings.csv')
    # The ranking cell in this notebook writes the header as 'Feature Name'
    # while this cell historically read 'Feature name'; accept either spelling.
    feature_name_col = next(
        (col for col in rankings.columns if col.strip().lower() == 'feature name'),
        None,
    )
    if feature_name_col is None:
        raise KeyError(
            "feature_rankings.csv has no 'Feature name' column; "
            f"found columns: {list(rankings.columns)}"
        )
    top_feature_columns = rankings[feature_name_col].head(TOP_N_FEATURES).tolist()
    required_columns = ['standardized_id_num', target_col]
    selected_columns = [col for col in top_feature_columns + required_columns if col in df.columns]
    df = df[selected_columns]

# Rows without a target label cannot be used for training or scoring. Drop
# them on both code paths (previously only the ranked path did), and copy so
# the column assignment below does not write into a slice of the loaded frame.
df = df.dropna(subset=[target_col]).copy()

# Normalise ids to 13-character zero-padded strings and keep them aside so
# predictions can be joined back to their rows after the split.
df['standardized_id_num'] = df['standardized_id_num'].astype(str).str.zfill(13)
X_stids = df['standardized_id_num']

# Columns excluded from the feature matrix. Entries that are commented out
# ('dem_share_prev', 'partisan_temp_prev', 'rep_share_prev') remain features.
X = df.drop(columns=[
    target_col, 'standardized_id_num', # DO NOT CHANGE
    'dem_share_change_prev', 
    # 'dem_share_prev', 
    'dem_votes_change_prev', 'office_code', 'oth_share_change_prev', 'oth_share_prev', 'oth_votes_change_prev', 
    'partisan_temp', 'partisan_temp_change_curr', 'partisan_temp_change_prev',
    # 'partisan_temp_prev', 
    'partisanship_lean_change_amount_prev', 'partisanship_lean_change_prev', 'partisanship_lean_curr', 
    'partisanship_lean_prev', 'registered_voters_change_prev', 'rep_share_change_prev', 
    # 'rep_share_prev', 
    'rep_votes_change_prev', 
    'turnout_pct_change_prev',
], errors='ignore')

y = df[target_col]

# XGBoost needs integer class ids; label_encoder maps them back to names later.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

categorical_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = X.select_dtypes(include=['number']).columns.tolist()

# Make every present categorical value a string but keep missing values as
# NaN. A plain astype(str) would turn NaN into the literal "nan", which the
# most_frequent imputer below would then treat as a real category.
for col in categorical_cols:
    X[col] = X[col].astype(str).where(X[col].notna())

categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))  # unseen test categories encode as all zeros
])

numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer([
    ('cat', categorical_transformer, categorical_cols),
    ('num', numeric_transformer, numeric_cols)
])

# The bare classifier gets its own name so that `model` only ever refers to
# the full pipeline used by the later cells.
xgb_classifier = XGBClassifier(
    objective="multi:softmax",  # multi:softmax or multi:softprob
    num_class=len(label_encoder.classes_),
    use_label_encoder=False,
    n_estimators=200,
    max_depth=5,
    learning_rate=0.1,
    random_state=42,
    n_jobs=-1,
    eval_metric="mlogloss"
)

model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier)
])

In [None]:
# Stratified 80/20 hold-out split on the label-encoded target; the fixed seed
# keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# Display one randomly chosen training row as a quick sanity check.
X_train.sample(n=1)

In [None]:
# Fit the pipeline on the training split and predict the held-out rows.
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Map the integer class ids back to their original string labels.
decoded_y_test, decoded_y_pred = (
    label_encoder.inverse_transform(encoded) for encoded in (y_test, y_pred)
)

print(classification_report(decoded_y_test, decoded_y_pred))

# Save predictions to disk.
# Ids are looked up by the test rows' index labels and zero-padded to 13 characters.
test_ids = X_stids.loc[X_test.index].astype(str).str.zfill(13)
results_df = pd.DataFrame({
    'standardized_id_num': test_ids,
    'true_label': decoded_y_test,
    'predicted_label': decoded_y_pred
})
results_df.to_csv("prediction_results.csv", index=False)

#### Measure Feature Importance

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score, accuracy_score, f1_score, accuracy_score, classification_report, confusion_matrix
from tqdm import tqdm

# Rank every feature by how well a simple model trained on that feature alone
# predicts the target, then write the ranking to 'feature_rankings.csv'.
from sklearn.linear_model import LinearRegression  # used by the regression branch below

feature_performance = []
is_continuous = y.dtype.kind in 'fc'  # float or complex dtype => treat as regression

# Resolve the score column once, up front, so it is defined even when every
# feature ends up being skipped.
metric = "LinearRegression" if is_continuous else "LogisticRegression"

for feature in tqdm(X.columns):
    X_feature = X[[feature]].copy()

    # Handle missing values
    if X_feature[feature].dtype == 'object':
        modes = X_feature[feature].mode()
        if not modes.empty:  # mode() is empty when the column holds no values at all
            X_feature = X_feature.fillna(modes.iloc[0])
        X_feature = pd.get_dummies(X_feature, drop_first=True)
    else:
        X_feature = X_feature.fillna(X_feature.mean(numeric_only=True))

    # Skip features that encode to nothing (e.g. a single category) or that
    # are still missing after imputation (e.g. an all-NaN column).
    if X_feature.shape[1] == 0 or X_feature.isna().any().any():
        continue

    X_train_feat, X_test_feat, y_train_feat, y_test_feat = train_test_split(X_feature, y, test_size=0.2, random_state=42)

    # Use loop-local names here: rebinding the notebook-level `model` would
    # replace the XGBoost pipeline that later cells still rely on.
    if is_continuous:  # Regression, scored with R^2
        feature_model = LinearRegression()
        scorer = r2_score
    else:  # Classification, scored with accuracy
        feature_model = LogisticRegression(max_iter=200)  # or DecisionTreeClassifier()
        scorer = accuracy_score

    feature_model.fit(X_train_feat, y_train_feat)
    feature_pred = feature_model.predict(X_test_feat)
    score = scorer(y_test_feat, feature_pred)

    feature_performance.append({"Feature Name": feature, metric: score})

# Explicit columns keep the sort valid even for an empty result list.
# NOTE(review): readers of feature_rankings.csv must agree with this header
# spelling ('Feature Name') -- verify the consumers of the file.
feature_performance_df = pd.DataFrame(
    feature_performance, columns=["Feature Name", metric]
).sort_values(by=metric, ascending=False)
feature_performance_df.to_csv('feature_rankings.csv', index=False)

feature_performance_df.head(15)

In [None]:
# feature_performance_df.head(15)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier

def build_pipeline(estimator):
    """Return a preprocessing + classifier Pipeline wrapping ``estimator``.

    Each pipeline gets its own unfitted copy of the notebook-level
    ``preprocessor``. Without the copy every pipeline would share, and refit in
    place, the same ColumnTransformer instance.

    Parameters
    ----------
    estimator : scikit-learn compatible classifier
        Installed as the final ``'classifier'`` step.

    Returns
    -------
    sklearn.pipeline.Pipeline
        Unfitted pipeline with steps ``'preprocessor'`` and ``'classifier'``.
    """
    from sklearn.base import clone

    return Pipeline(steps=[
        ('preprocessor', clone(preprocessor)),
        ('classifier', estimator)
    ])

# Candidate classifiers, each trained with default-ish settings on the same
# split for a quick side-by-side comparison.
models = {
    'Random Forest': RandomForestClassifier(random_state=0),
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss')
}

for name, estimator in models.items():
    print(f"Training: {name}")
    pipeline = build_pipeline(estimator)
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    # y_train / y_test are label-encoded integers; decode both sides so the
    # report shows the category names, matching the main XGBoost report.
    print(classification_report(
        label_encoder.inverse_transform(y_test),
        label_encoder.inverse_transform(y_pred),
    ))
    print("="*60)

##### Feature Selection – Basic

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint, uniform

from sklearn.experimental import enable_halving_search_cv  # noqa
from sklearn.model_selection import HalvingRandomSearchCV

# Sampling distributions for the hyperparameters of the pipeline's
# 'classifier' step. scipy's uniform(loc, scale) draws from [loc, loc + scale];
# randint(low, high) draws integers from low up to high - 1.
param_dist = dict(
    classifier__max_depth=randint(3, 10),
    classifier__learning_rate=uniform(0.01, 0.2),
    classifier__n_estimators=randint(50, 200),  # author's note on an alternative range: 100, 500
    classifier__subsample=uniform(0.6, 0.4),
    classifier__colsample_bytree=uniform(0.6, 0.4),
)

# Successive-halving random search over `model`; the classifier__ prefixes
# require `model` to be a pipeline with a 'classifier' step.
search_options = dict(
    param_distributions=param_dist,
    cv=3,
    scoring="accuracy",
    verbose=2,
    n_jobs=-1,
    random_state=42,
)
search = HalvingRandomSearchCV(model, **search_options)

search.fit(X_train, y_train)
print("Best accuracy:", search.best_score_)
print("Best params:", search.best_params_)

##### Feature Selection – Comprehensive

In [None]:
from sklearn.model_selection import GridSearchCV

def grid_search_pipeline(estimator, param_grid, name):
    """Grid-search ``estimator`` inside the shared preprocessing pipeline.

    Runs a 5-fold GridSearchCV scored by weighted F1 on the notebook-level
    ``X_train`` / ``y_train``.

    Parameters
    ----------
    estimator : classifier to wrap via ``build_pipeline``.
    param_grid : dict of pipeline parameter names to candidate value lists.
    name : label stored with the result.

    Returns
    -------
    dict
        Keys ``'name'``, ``'best_estimator'``, ``'best_score'`` and
        ``'best_params'``.
    """
    tuner = GridSearchCV(
        build_pipeline(estimator),
        param_grid,
        scoring='f1_weighted',
        cv=5,
        n_jobs=-1,
    )
    tuner.fit(X_train, y_train)

    return dict(
        name=name,
        best_estimator=tuner.best_estimator_,
        best_score=tuner.best_score_,
        best_params=tuner.best_params_,
    )

In [None]:
# (display name, estimator, parameter grid) for each model to tune.
# The order matters: later cells index `searches` by position
# (0 = Random Forest, 2 = XGBoost).
search_configs = [
    ('Random Forest', RandomForestClassifier(), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [5, 10]
    }),
    ('Logistic Regression', LogisticRegression(max_iter=1000), {
        'classifier__C': [0.1, 1, 10]
    }),
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss'), {
        'classifier__n_estimators': [100, 200],
        'classifier__max_depth': [3, 6]
    }),
]

# The progress prints cannot live inside a list literal (that is a syntax
# error), so the searches run in an explicit loop instead.
searches = []
for name, estimator, param_grid in search_configs:
    print(f'Grid search: {type(estimator).__name__}()')
    searches.append(grid_search_pipeline(estimator, param_grid, name))

In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the first grid-search result (Random Forest).
# This is an example pick by position, not necessarily the top scorer.
# The result dicts store the fitted pipeline under 'best_estimator'.
final_model = searches[0]['best_estimator']
result = permutation_importance(final_model, X_test, y_test, n_repeats=10, random_state=0, n_jobs=-1)

import pandas as pd
# The whole pipeline is scored on the raw X_test, so there is one importance
# per input column of X_test, not per one-hot-expanded preprocessor output.
perm_df = pd.DataFrame({
    'feature': X_test.columns,
    'importance': result.importances_mean
}).sort_values(by='importance', ascending=False)

perm_df.head(15)

In [None]:
import shap

# Extract the tuned XGBoost pipeline (third grid-search entry) and its parts.
# The result dicts store the fitted pipeline under 'best_estimator'.
best_xgb_pipeline = searches[2]['best_estimator']
xgb_model = best_xgb_pipeline.named_steps['classifier']
fitted_preprocessor = best_xgb_pipeline.named_steps['preprocessor']

# The classifier was trained on the preprocessor's output, so SHAP must be
# given that same transformed matrix, not the raw X_test.
X_test_transformed = fitted_preprocessor.transform(X_test)
if hasattr(X_test_transformed, "toarray"):  # one-hot output may be a sparse matrix
    X_test_transformed = X_test_transformed.toarray()
X_test_shap = pd.DataFrame(
    X_test_transformed,
    columns=fitted_preprocessor.get_feature_names_out(),
    index=X_test.index,
)

explainer = shap.Explainer(xgb_model)
shap_values = explainer(X_test_shap)

# SHAP summary plot
shap_matrix = shap_values.values
if shap_matrix.ndim == 3:
    # Multiclass output is (samples, features, classes); summary_plot takes
    # one (samples, features) matrix per class for its multi-class view.
    # NOTE(review): confirm against the installed shap version.
    per_class_values = [shap_matrix[:, :, k] for k in range(shap_matrix.shape[2])]
    class_names = list(label_encoder.inverse_transform(xgb_model.classes_))
    shap.summary_plot(per_class_values, X_test_shap, class_names=class_names)
else:
    shap.summary_plot(shap_values, X_test_shap)

In [None]:
# Tabulate the grid-search results, best weighted-F1 score first.
column_for_key = {
    'name': 'Model',
    'best_score': 'Best F1 Score',
    'best_params': 'Best Params',
}
leaderboard = pd.DataFrame(
    [{column: s[key] for key, column in column_for_key.items()} for s in searches]
)
leaderboard = leaderboard.sort_values(by='Best F1 Score', ascending=False)

print(leaderboard)

### Run after 08_analysis
Top features, clusters, etc.