In [None]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
import joblib


# =========================
# 1) Load embeddings + labels
# =========================
# Precomputed feature matrix; rows are expected to line up with the CSV rows.
X = np.load("bert_embeddings.npy")  # e.g., (50000, 384)

df = pd.read_csv("IMDB Dataset.csv")  # must align with embeddings row order
# Binary target: 1 where the sentiment text equals "positive"
# (case-insensitive), 0 for every other value.
y = (df["sentiment"].str.lower() == "positive").astype(int).to_numpy()

# Raise explicitly rather than `assert`: assertions are removed when Python
# runs with -O, which would silently drop this early, descriptive check.
if len(y) != X.shape[0]:
    raise ValueError(f"y length {len(y)} != X rows {X.shape[0]}")


# =========================
# 2) Split data
# =========================
def _stratified_split(features, labels, holdout_fraction):
    """Split features/labels with a fixed seed, stratifying on the labels."""
    return train_test_split(
        features,
        labels,
        test_size=holdout_fraction,
        random_state=42,
        stratify=labels,
    )


# 80/20 train/test split of the full data set.
X_train, X_test, y_train, y_test = _stratified_split(X, y, 0.2)

# For final XGBoost early stopping refit (NOT used in CV):
# carve a 10% validation slice out of the training portion.
X_tr, X_val, y_tr, y_val = _stratified_split(X_train, y_train, 0.1)

# Shared 3-fold stratified, shuffled splitter.
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

def eval_binary(y_true, y_pred):
    """Score binary predictions against the true labels.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels.

    Returns:
        A dict with the keys "Accuracy", "Precision", "Recall" and "F1",
        in that order. Precision, recall and F1 are computed with
        ``zero_division=0``.
    """
    scores = {"Accuracy": accuracy_score(y_true, y_pred)}
    zero_safe_scorers = (
        ("Precision", precision_score),
        ("Recall", recall_score),
        ("F1", f1_score),
    )
    for label, scorer in zero_safe_scorers:
        scores[label] = scorer(y_true, y_pred, zero_division=0)
    return scores


# =========================
# 3) Define models + SMALL randomized searches
# =========================
def _scaled(classifier):
    """Return a Pipeline that standardizes features, then fits *classifier*."""
    return Pipeline([
        ("scaler", StandardScaler()),
        ("clf", classifier),
    ])


# Each entry maps a display name to the estimator, the parameter grid that is
# sampled for it, and how many parameter settings to sample (n_iter).
# NOTE(review): RandomForest and XGBoost are built with n_jobs=-1; if the
# search that consumes these configs is also parallel, cores may be
# oversubscribed — confirm.
models = {}

# Linear models sit behind a scaler; grid keys use the "clf__" step prefix.
models["LogReg"] = dict(
    estimator=_scaled(LogisticRegression(max_iter=3000)),
    param_distributions={
        "clf__C": [0.1, 1, 10],
        "clf__solver": ["lbfgs", "liblinear"],
    },
    n_iter=6,
)

models["LinearSVM"] = dict(
    estimator=_scaled(LinearSVC()),
    param_distributions={
        "clf__C": [0.1, 1, 10],
    },
    n_iter=3,
)

# Tree ensembles are used without a scaling step.
models["RandomForest"] = dict(
    estimator=RandomForestClassifier(random_state=42, n_jobs=-1),
    param_distributions={
        "n_estimators": [200, 400],
        "max_depth": [None, 20],
        "max_features": ["sqrt"],
        "min_samples_split": [2, 5],
    },
    n_iter=6,
)

# XGBoost: search WITHOUT early stopping first (fast + clean with CV)
models["XGBoost"] = dict(
    estimator=XGBClassifier(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
        n_estimators=300,  # keep small for tuning speed
    ),
    param_distributions={
        "max_depth": [3, 5, 7],
        "learning_rate": [0.05, 0.1],
        "subsample": [0.8, 1.0],
        "colsample_bytree": [0.8, 1.0],
    },
    n_iter=8,
)


# =========================
# 4) Train, tune, evaluate
# =========================
# One result row per model: name, best sampled params, and test-set metrics.
rows = []
# Running winner by test-set F1. XGBoost is deliberately NOT considered here;
# see the comment inside the loop.
# NOTE(review): the winner is picked by test-set F1, so the reported F1 of the
# chosen model is optimistically biased — consider selecting on
# `search.best_score_` (CV F1) instead; confirm intent.
best_name, best_model, best_f1 = None, None, -1

best_xgb_params = None  # store best params for later early-stopping refit

for name, cfg in models.items():
    search = RandomizedSearchCV(
        estimator=cfg["estimator"],
        param_distributions=cfg["param_distributions"],
        n_iter=cfg["n_iter"],
        scoring="f1",
        cv=cv,
        n_jobs=-1,
        verbose=2,
        random_state=42,
        error_score="raise",  # fail loudly instead of scoring a broken fit
        refit=True,  # best_estimator_ is refit on all of X_train
    )

    # CV fit (fast)
    search.fit(X_train, y_train)

    # Evaluate on test
    y_pred = search.best_estimator_.predict(X_test)
    metrics = eval_binary(y_test, y_pred)

    rows.append({
        "Model": name,
        "BestParams": search.best_params_,
        **metrics
    })

    if name == "XGBoost":
        # Only the tuned params are kept. The XGBoost row is later overwritten
        # with the early-stopping refit's metrics, so tracking this search-time
        # estimator as "best" could leave best_model/best_f1 out of sync with
        # the results table. XGBoost competes for "best" only after its refit.
        best_xgb_params = search.best_params_
        continue

    # Track best overall model among the non-XGBoost candidates.
    if metrics["F1"] > best_f1:
        best_f1 = metrics["F1"]
        best_name = name
        best_model = search.best_estimator_


# =========================
# 5) Refit XGBoost with early stopping
# =========================
if best_xgb_params is not None:
    # Fixed settings for the final fit: a large tree budget that early
    # stopping (30 rounds) is allowed to cut short. The tuned params are
    # unpacked separately so a duplicated keyword still raises.
    xgb_fixed = dict(
        objective="binary:logistic",
        eval_metric="logloss",
        random_state=42,
        n_jobs=-1,
        tree_method="hist",
        n_estimators=2000,
        early_stopping_rounds=30,
    )
    xgb_final = XGBClassifier(**xgb_fixed, **best_xgb_params)

    # Train on the reduced training slice; the held-out validation slice is
    # the eval set monitored for early stopping.
    xgb_final.fit(X_tr, y_tr, eval_set=[(X_val, y_val)], verbose=False)

    y_pred_xgb = xgb_final.predict(X_test)
    metrics_xgb = eval_binary(y_test, y_pred_xgb)

    # Update the XGBoost row to reflect final early-stopping performance
    xgb_row = next((row for row in rows if row["Model"] == "XGBoost"), None)
    if xgb_row is not None:
        for metric_name in ("Accuracy", "Precision", "Recall", "F1"):
            xgb_row[metric_name] = metrics_xgb[metric_name]
        xgb_row["BestParams"] = {
            **best_xgb_params,
            "early_stopping_rounds": 30,
            "n_estimators(max)": 2000,
        }

    # Update best model if XGBoost final becomes best
    if metrics_xgb["F1"] > best_f1:
        best_name, best_model, best_f1 = "XGBoost", xgb_final, metrics_xgb["F1"]


# =========================
# 6) Outputs
# =========================
# Comparison table, highest test F1 first.
results = pd.DataFrame(rows)
results = results.sort_values("F1", ascending=False)

# Persist the table, then the winning estimator under a name-tagged file.
results.to_csv("embedding_model_comparison.csv", index=False)
best_model_path = f"best_embedding_model_{best_name}.joblib"
joblib.dump(best_model, best_model_path)

# Console summary.
print("Saved: embedding_model_comparison.csv")
print(f"Best embedding model: {best_name}, F1={best_f1:.4f}")
print(results)

Fitting 3 folds for each of 6 candidates, totalling 18 fits
[CV] END ......................clf__C=0.1, clf__solver=lbfgs; total time=   4.2s
[CV] END ......................clf__C=0.1, clf__solver=lbfgs; total time=   4.6s
[CV] END ......................clf__C=0.1, clf__solver=lbfgs; total time=   5.2s
[CV] END ........................clf__C=1, clf__solver=lbfgs; total time=   5.5s
[CV] END ........................clf__C=1, clf__solver=lbfgs; total time=   5.7s
[CV] END ........................clf__C=1, clf__solver=lbfgs; total time=   4.3s
[CV] END .......................clf__C=10, clf__solver=lbfgs; total time=   5.4s
[CV] END .......................clf__C=10, clf__solver=lbfgs; total time=   5.0s
[CV] END .......................clf__C=10, clf__solver=lbfgs; total time=   4.9s
[CV] END ..................clf__C=0.1, clf__solver=liblinear; total time=  17.7s
[CV] END ..................clf__C=0.1, clf__solver=liblinear; total time=  19.5s
[CV] END ..................clf__C=0.1, clf__solve