In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    recall_score,
    precision_score
)
import time
import warnings
warnings.filterwarnings("ignore")

# Read the merged capture and discard the " Label" column (the header really
# does start with a space); "Attempted Category" is used as the target.
df = pd.read_csv("Friday-Merged-Binary.csv").drop(columns=[" Label"])
y = df["Attempted Category"]
X = df.drop(columns=["Attempted Category"])

# Hold out 40% of the rows for testing; stratifying on y keeps the class
# proportions the same in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.4,
    random_state=42,
)

# Recode the label value -1 as 0 in both partitions; every other value is
# left as it is.
# NOTE(review): presumably -1 marks the negative class — confirm against the
# dataset description.
y_train, y_test = (labels.replace(-1, 0) for labels in (y_train, y_test))

# Standardize the features with statistics learned from the training rows
# only, then apply the same transform to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

print("Train shape:", X_train.shape, y_train.shape)
print("Test shape:", X_test.shape, y_test.shape)

# Base estimator whose hyperparameters are tuned below.
# NOTE(review): `use_label_encoder` may be ignored (with a warning) by newer
# XGBoost releases — confirm against the installed version before removing.
xgboost_model = XGBClassifier(
    random_state=0,
    eval_metric="logloss",
    use_label_encoder=False,
)

# Discrete candidate values for each tuned hyperparameter.
param_dist = dict(
    n_estimators=[100, 200, 300, 500],
    max_depth=[3, 4, 5, 6],
    learning_rate=[0.01, 0.05, 0.1, 0.2],
    subsample=[0.6, 0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    gamma=[0, 0.1, 0.3, 0.5],
    reg_alpha=[0, 0.1, 0.5],
    reg_lambda=[1, 1.5, 2],
)

# Randomized search: 20 sampled configurations, each scored by macro-F1 with
# 3-fold cross-validation, using every available core for the search.
# NOTE(review): the search-level n_jobs=-1 may compete for cores with
# XGBoost's own threading — consider pinning the estimator's n_jobs; verify
# against the XGBoost documentation.
random_search = RandomizedSearchCV(
    xgboost_model,
    param_dist,
    n_iter=20,
    scoring="f1_macro",
    cv=3,
    n_jobs=-1,
    verbose=1,
    random_state=0,
)

# Run the hyperparameter search on the training partition and time it.
# time.perf_counter() is used instead of time.time(): it is a monotonic,
# high-resolution clock meant for measuring intervals, so the reported
# duration cannot be skewed by system clock adjustments during the fit.
start = time.perf_counter()
random_search.fit(X_train, y_train)
end = time.perf_counter()
print("RandomizedSearchCV fitting time: {:.2f} seconds".format(end - start))

# Report the parameter combination selected by the search.
print("Best hyperparameters:")
print(random_search.best_params_)

# Predict on the held-out partition with the best estimator found by the
# search.
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)

# Compute every metric first. Precision and recall use the scorers' default
# averaging and report 0 instead of warning when a denominator is zero.
acc = accuracy_score(y_test, y_pred)
f1_macro = f1_score(y_test, y_pred, average="macro")
f1_micro = f1_score(y_test, y_pred, average="micro")
precision = precision_score(y_test, y_pred, zero_division=0)
recall = recall_score(y_test, y_pred, zero_division=0)

# Print the metrics in a fixed order, one "label value" pair per line.
for _metric_label, _metric_value in (
    ("Accuracy:", acc),
    ("F1 score (macro):", f1_macro),
    ("F1 score (micro):", f1_micro),
    ("Precision:", precision),
    ("Recall:", recall),
):
    print(_metric_label, _metric_value)


Train shape: (369259, 78) (369259,)
Test shape: (246174, 78) (246174,)
Fitting 3 folds for each of 20 candidates, totalling 60 fits
RandomizedSearchCV fitting time: 108.59 seconds
Best hyperparameters:
{'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.5, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.01, 'gamma': 0.3, 'colsample_bytree': 0.8}
Accuracy: 0.9986676090895058
F1 score (macro): 0.9985506916578982
F1 score (micro): 0.9986676090895058
Precision: 0.9996817856371675
Recall: 0.9966010672648788
