In [7]:
import pandas as pd
import optuna

from xgboost import  XGBClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report, accuracy_score, confusion_matrix


In [8]:
X_train = pd.read_csv("data/x_train.csv", header=None)
X_test = pd.read_csv("data/x_text.csv", header=None)
y_train = pd.read_csv("data/y_train.csv", header=None)
y_test = pd.read_csv("data/y_test.csv", header=None)



print("X train ",X_train.shape)
print("X test",X_test.shape)
print("Y_Train", y_train.shape)
print("Y_test", y_test.shape)

X train  (3417, 1000)
X test (375, 1000)
Y_Train (3417, 1)
Y_test (375, 1)


In [9]:
# replace 1 with 0 and -1 with 1.

# because XGBoost excepts: ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1], got [-1  1]
y_train.replace(1, 0, inplace=True)
y_train.replace(-1, 1, inplace=True)


y_test.replace(1, 0, inplace=True)
y_test.replace(-1, 1, inplace=True)

In [10]:
def objective(trial):
    # Define hyperparameters to tune
    param = {
        'objective': 'binary:logistic',
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 200),
        'subsample': trial.suggest_float('subsample', 0.6, 1.0),
        'gamma': trial.suggest_float('gamma', 0.0, 1.0),
        'scale_pos_weight': 1
    }

    model = XGBClassifier(**param, use_label_encoder=False, eval_metric='logloss')

    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred)

    return precision

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=10)

print(f"Best hyperparameters: {study.best_params}")
print(f"Best precision: {study.best_value}")

[I 2024-11-30 22:54:02,949] A new study created in memory with name: no-name-f30d9da2-990c-44cc-b039-a6f6ef7e7c1f


Parameters: { "use_label_encoder" } are not used.

[I 2024-11-30 22:56:07,629] Trial 0 finished with value: 0.046511627906976744 and parameters: {'max_depth': 9, 'learning_rate': 0.28400861278242023, 'n_estimators': 150, 'subsample': 0.7829637164723547, 'gamma': 0.6441736721396305}. Best is trial 0 with value: 0.046511627906976744.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-30 22:57:58,117] Trial 1 finished with value: 0.078125 and parameters: {'max_depth': 7, 'learning_rate': 0.2587048337731825, 'n_estimators': 145, 'subsample': 0.8492330507799103, 'gamma': 0.44168416852096093}. Best is trial 1 with value: 0.078125.
Parameters: { "use_label_encoder" } are not used.

[I 2024-11-30 23:01:18,216] Trial 2 finished with value: 0.08163265306122448 and parameters: {'max_depth': 5, 'learning_rate': 0.07347185444552125, 'n_estimators': 140, 'subsample': 0.6478069602051036, 'gamma': 0.1913956509419259}. Best is trial 2 with value: 0.08163265306122448.
Parameters: { "use_label

Best hyperparameters: {'max_depth': 9, 'learning_rate': 0.27575078431012434, 'n_estimators': 69, 'subsample': 0.6785914805836307, 'gamma': 0.5737136862039166}
Best precision: 0.2
