In [1]:
import joblib
X_train_sm, X_val, X_test, y_train_sm, y_val, y_test = joblib.load('split_data.pkl')

In [2]:
print(f"Training set shape: {X_train_sm.shape, y_train_sm.shape}")
print(f"Validation set shape: {X_val.shape, y_val.shape}")
print(f"Test set shape: {X_test.shape, y_test.shape}")

Training set shape: ((12856, 6), (12856,))
Validation set shape: ((1414, 6), (1414,))
Test set shape: ((1414, 6), (1414,))


In [6]:
import xgboost as xgb

In [7]:
xgb.XGBClassifier().get_params() 

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [8]:
xgb_model = xgb.XGBClassifier(
    learning_rate=0.1,
    max_depth=5,
    subsample = 1,
    min_child_weight=1,
    n_estimators=100,
    reg_alpha=0,
    random_state=42,

)
#xgb_model.fit(X_train_sm, y_train_sm)
xgb_model.fit(X_train_sm, y_train_sm)
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, mean_squared_error

# Predict on validation data
y_val_pred = xgb_model.predict(X_val)

# Evaluate model performance
accuracy = accuracy_score(y_val, y_val_pred)
precision = precision_score(y_val, y_val_pred)
recall = recall_score(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
mse = mean_squared_error(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)

print(f"Initial model accuracy: {accuracy}")
print(f"Initial model precision: {precision}")
print(f"Initial model recall: {recall}")
print(f"Initial model F1-score: {f1}")
print(f"Initial model MSE on : {mse}")
print(f"Confusion Matrix:\n{conf_matrix}")

Initial model accuracy: 0.9533239038189534
Initial model precision: 0.36046511627906974
Initial model recall: 0.7380952380952381
Initial model F1-score: 0.484375
Initial model MSE on : 0.04667609618104668
Confusion Matrix:
[[1317   55]
 [  11   31]]


In [9]:
from sklearn.model_selection import GridSearchCV

# Define the initial XGBoost model
xgb_model = xgb.XGBClassifier(objective='binary:logistic', random_state=42)

# Define the parameter grid to search
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15, 20],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [ 0.4, 0.6, 0.8],
    'colsample_bytree': [0.4, 0.6, 0.8],
     'reg_alpha' : [0.0, 10.0],
    'min_child_weight': [1, 5],
}

# Set up GridSearchCV
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring='accuracy', 
    cv=5,              
    verbose=2,
    n_jobs=-1          
)

# Fit GridSearchCV on the training data
grid_search.fit(X_train_sm, y_train_sm)

# Best model from grid search
best_xgb_model = grid_search.best_estimator_

# Save the best model for future use
import joblib
joblib.dump(best_xgb_model, 'xgb_best_model.pkl')

# Print the best parameters found
print(f"Best parameters: {grid_search.best_params_}")


Fitting 5 folds for each of 1296 candidates, totalling 6480 fits
Best parameters: {'colsample_bytree': 0.6, 'learning_rate': 0.2, 'max_depth': 20, 'min_child_weight': 1, 'n_estimators': 300, 'reg_alpha': 0.0, 'subsample': 0.8}


In [10]:
# Predict using the best model
y_val_pred_opt = best_xgb_model.predict(X_val)

# Evaluate the optimized model
accuracy_opt = accuracy_score(y_val, y_val_pred_opt)
precision_opt = precision_score(y_val, y_val_pred_opt)
recall_opt = recall_score(y_val, y_val_pred_opt)
f1_opt = f1_score(y_val, y_val_pred_opt)
mse = mean_squared_error(y_val, y_val_pred)
conf_matrix_opt = confusion_matrix(y_val, y_val_pred_opt)

print(f"Optimized model accuracy: {accuracy_opt}")
print(f"Optimized model precision: {precision_opt}")
print(f"Optimized model recall: {recall_opt}")
print(f"Optimized model F1-score: {f1_opt}")
print(f"Initial model MSE on : {mse}")
print(f"Confusion Matrix:\n{conf_matrix_opt}")



Optimized model accuracy: 0.978076379066478
Optimized model precision: 0.6122448979591837
Optimized model recall: 0.7142857142857143
Optimized model F1-score: 0.6593406593406593
Initial model MSE on : 0.04667609618104668
Confusion Matrix:
[[1353   19]
 [  12   30]]


In [11]:
#Make predictions on the test dataset
y_test_pred = best_xgb_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print(f"Model Accuracy on Test Data: {test_accuracy:.4f}")

Model Accuracy on Test Data: 0.9837
