In [1]:
!pip -q install scikit-learn pandas matplotlib seaborn gradio

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr

In [2]:
# Colab-only setup: mount Google Drive and switch the working directory to the
# project's data folder, so the cells below can open files by relative path.
from google.colab import drive
drive.mount('/content/drive', force_remount=True) # Mount your drive
# `%cd` is an IPython magic (not plain Python); the directory change persists
# for every cell executed after this one.
%cd /content/drive/MyDrive/ADS2_Aaron/data

Mounted at /content/drive
/content/drive/MyDrive/ADS2_Aaron/data


In [3]:
import pickle

# Load the saved bundle (a nested dict) from the current working directory.
# NOTE(security): pickle.load can execute arbitrary code while deserialising,
# so only open bundles that come from a trusted source.
with open("01_baseline_svm_and_rf.pkl", "rb") as file:
    data = pickle.load(file)

# Classifiers stored under data['models'].
rf_classifier, svm_classifier = (
    data['models'][key] for key in ('random_forest', 'svm')
)

# Data splits stored under data['data'].
X_train, X_test, y_train, y_test, X_train_unprocessed = (
    data['data'][key]
    for key in ('X_train', 'X_test', 'y_train', 'y_test', 'X_train_unprocessed')
)
# Column order of the processed training frame; preprocess_input uses it as a
# fallback when the random forest does not expose feature_names_in_.
feature_order = [*X_train.columns]

# Transformers stored under data['preprocessors']; preprocess_input only ever
# calls .transform on them (presumably they were fitted at train time -- verify).
num_imputer, scaler, cat_imputer, label_encoder, onehot_encoder = (
    data['preprocessors'][key]
    for key in ('num_imputer', 'scaler', 'cat_imputer', 'label_encoder', 'onehot_encoder')
)

# Column groupings stored under data['cols'].
numeric_features, categorical_features, label_encode_col, onehot_cols = (
    data['cols'][key]
    for key in ('numeric_features', 'categorical_features', 'label_encode_col', 'onehot_cols')
)

# Name of the target column (not referenced by the other visible cells).
target_col = "class"

In [4]:
def preprocess_input(user_dict):
    """
    Convert one raw record into a single-row, model-ready DataFrame.

    The record is pushed through the module-level transformers in this order:
    numeric columns -> num_imputer -> scaler; categorical columns ->
    cat_imputer, then label_encoder on `label_encode_col` and onehot_encoder
    on `onehot_cols`.

    Parameters
    ----------
    user_dict : dict
        Maps raw column names to values. It must contain every name in
        `numeric_features` and `categorical_features`; other keys are ignored.

    Returns
    -------
    pandas.DataFrame
        One row whose columns are `rf_classifier.feature_names_in_` when that
        attribute exists, otherwise `feature_order`. Columns produced here but
        not expected are dropped; expected columns not produced are filled
        with 0.
    """
    raw_row = pd.DataFrame([user_dict])

    # Numeric block: impute, then scale.
    numeric_imputed = pd.DataFrame(
        num_imputer.transform(raw_row[numeric_features]),
        columns=numeric_features,
    )
    numeric_block = pd.DataFrame(
        scaler.transform(numeric_imputed),
        columns=numeric_features,
    )

    # Categorical block: impute first, then encode.
    cat_imputed = pd.DataFrame(
        cat_imputer.transform(raw_row[categorical_features]),
        columns=categorical_features,
    )
    # No handling for categories the label encoder has not seen: whatever
    # label_encoder.transform raises for them propagates to the caller.
    cat_imputed[label_encode_col] = label_encoder.transform(
        cat_imputed[label_encode_col]
    )
    onehot_block = pd.DataFrame(
        onehot_encoder.transform(cat_imputed[onehot_cols]),
        columns=onehot_encoder.get_feature_names_out(onehot_cols),
    )

    # Every piece is a one-row frame indexed [0], so a single column-wise
    # concat lines them up: numeric, label-encoded column, one-hot columns.
    assembled = pd.concat(
        [numeric_block, cat_imputed[[label_encode_col]], onehot_block],
        axis=1,
    )

    # Align to the columns the random forest reports, falling back to the
    # processed training frame's column order.
    expected_cols = list(getattr(rf_classifier, "feature_names_in_", feature_order))
    return assembled.reindex(columns=expected_cols, fill_value=0)


In [7]:
import plotly.express as px

# Prediction helpers: one summary line per model with the thresholded label
# and that model's own P(>50K).
def _describe_prediction(name, clf, X, threshold, positive=">50K"):
    """
    Build the one-line summary for a single classifier.

    Parameters
    ----------
    name : str
        Short model name used as the line prefix (e.g. "RF", "SVM").
    clf : object
        Classifier exposing `classes_` and `predict_proba`.
    X : pandas.DataFrame
        Single-row, already preprocessed feature frame.
    threshold : float
        The prediction is True when P(positive) is strictly greater than this.
    positive : str
        Label of the positive class inside `clf.classes_`.

    Returns
    -------
    str
        "<name> Prediction: <bool>  |  P(<positive>) = <p> | Threshold = <t>".

    Raises
    ------
    ValueError
        If `positive` is not one of `clf.classes_`.
    """
    # Use THIS classifier's class order: predict_proba columns follow
    # clf.classes_, which need not match another model's ordering.
    classes = list(clf.classes_)
    if positive not in classes:
        raise ValueError(
            f"{name} classifier has no {positive!r} class; classes are {classes}"
        )
    # One predict_proba call serves both the label and the printed probability.
    p = clf.predict_proba(X)[0][classes.index(positive)]
    label = p > threshold
    return f"{name} Prediction: {label}  |  P({positive}) = {p:.2f} | Threshold = {threshold}"


def predict_compare(
    age=37,
    hours_per_week=40,
    education_num=10,
    capital_gain=0,
    capital_loss=0,
    sex="Male",
    workclass="Federal-gov",
    occupation="Adm-clerical",
    marital_status="Divorced",
    native_country="Cambodia",
    threshold=0.5,
    model="Both",
):
    """
    Predict with the Random Forest and/or the SVM for one record.

    The feature arguments are packed into a raw record keyed by the training
    column names, preprocessed with `preprocess_input`, and scored by the
    selected model(s).

    Parameters
    ----------
    age, hours_per_week, education_num, capital_gain, capital_loss : number
        Numeric inputs.
    sex, workclass, occupation, marital_status, native_country : str
        Categorical inputs.
    threshold : float
        Decision threshold applied to P(>50K); it is NOT a model feature.
    model : str
        "RF", "SVM" or "Both".

    Returns
    -------
    str
        One summary line per selected model, joined by a newline. An
        unrecognised `model` value yields an empty string.

    Raises
    ------
    ValueError
        If a selected classifier has no ">50K" class.
    """
    raw = {
        "age": age,
        "hours-per-week": hours_per_week,
        "education-num": education_num,
        "sex": sex,
        "workclass": workclass,
        "occupation": occupation,
        "marital-status": marital_status,
        "native-country": native_country,
        "capital-gain": capital_gain,
        "capital-loss": capital_loss,
    }
    Xf = preprocess_input(raw)

    selected = []
    if model in ("RF", "Both"):
        selected.append(("RF", rf_classifier))
    if model in ("SVM", "Both"):
        selected.append(("SVM", svm_classifier))

    # Joining (rather than prefixing "\n") avoids a stray leading newline when
    # only the SVM is selected.
    return "\n".join(
        _describe_prediction(name, clf, Xf, threshold) for name, clf in selected
    )

In [9]:
# Widget options: the distinct non-null values of each raw (unprocessed)
# training column, sorted. NaNs are dropped here because this frame is the
# un-imputed one.
(
    workclass_choices,
    sex_choices,
    occupation_choices,
    marital_status_choices,
    native_country_choices,
) = (
    sorted(pd.unique(X_train_unprocessed[column].dropna()))
    for column in ("workclass", "sex", "occupation", "marital-status", "native-country")
)

# Widgets are listed in the same order as predict_compare's positional
# parameters; gr.Interface passes their values to `fn` in this order.
input_components = [
    gr.Slider(17, 90, value=37, step=1, label="age"),
    gr.Slider(1, 80, value=40, step=1, label="hours_per_week"),
    gr.Slider(1, 16, value=10, step=1, label="education_num"),
    gr.Slider(0, 10000, value=0, step=1000, label="capital_gain"),
    gr.Slider(0, 5000, value=0, step=100, label="capital_loss"),
    gr.Radio(sex_choices, value="Male", label="sex"),
    gr.Dropdown(workclass_choices, value="Federal-gov", label="workclass"),
    gr.Dropdown(occupation_choices, value="Adm-clerical", label="occupation"),
    gr.Dropdown(marital_status_choices, value="Divorced", label="marital_status"),
    gr.Dropdown(native_country_choices, value="Cambodia", label="native_country"),
    gr.Slider(0.0, 1.0, value=0.5, step=0.01, label="threshold"),
    gr.Radio(["Both", "RF", "SVM"], value="Both", label="Models to run"),
]

demo = gr.Interface(
    fn=predict_compare,
    inputs=input_components,
    outputs=[gr.Textbox(label="Summary")],
    title="Income Classifier: RF vs SVM (Output Comparison)",
    description="Compares predictions and P(>50K) from Random Forest and SVM at a chosen threshold.",
)

# NOTE(review): debug=True presumably keeps this cell running until it is
# interrupted, which would explain the KeyboardInterrupt in the saved output
# -- confirm against the Gradio docs.
demo.launch(share=True, show_error=True, debug=True)

KeyboardInterrupt: 