# In [None]:
### LITHOLOGICAL PREDICTION MODELS 

## IMPORT THE NECESSARY LIBRARIES

import joblib
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, classification_report, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit, RepeatedStratifiedKFold, GridSearchCV

# LOAD THE LABELLED DATASET FROM EXCEL FILE
# (replace the placeholder with the path of the workbook that holds the labelled samples)

data = pd.read_excel('name_of_the_excel_sheet_with_dataset')

# DEFINE THE INPUTS (X) AND THE OUTPUT (y)
# Both are selected with a list of column names, so each one stays a DataFrame.

X = data[['name_of_the_input_columns']]
y = data[['name_of_the_output_column']]

# DEFINE THE TRAIN AND TEST SET WHILE PRESERVING THE PERCENTAGE OF SAMPLES FOR EACH CLASS
# One stratified shuffle split holds out 20% of the rows for testing; random_state is fixed.

sss = StratifiedShuffleSplit(n_splits=1, test_size=0.20, random_state=33)
sss.get_n_splits(X, y)  # reports the configured number of splits; the value is not used

# n_splits=1, so the splitter yields exactly one (train, test) pair of positional indices.
train_index, test_index = next(sss.split(X, y))
print("TRAIN:", train_index, "TEST:", test_index)
X_train = X.iloc[train_index]
X_test = X.iloc[test_index]
y_train = y.iloc[train_index]
y_test = y.iloc[test_index]
X_train

# TRAIN AND SEE THE ACCURACY FOR EACH XGBOOST MODEL WITH DIFFERENT HYPERPARAMETERS

model_2 = XGBClassifier()

# Hyperparameter grid: every combination of n_estimators (50..99) and max_depth (1, 2, 3) is tried.
n_estimators = range(50, 100)
depth = [1, 2, 3]
grid = dict(n_estimators=n_estimators, max_depth=depth)

# Repeats Stratified K-Fold n times with different randomization in each repetition.
# This cross-validation object is a variation of KFold that returns stratified folds.
# The folds are made by preserving the percentage of samples for each class.
# n_splits: number of folds, must be at least 2.
# n_repeats: number of times the cross-validator is repeated.
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

# Search over the XGBoost classifier created above (model_2); n_jobs=-1 uses all available cores.
grid_search = GridSearchCV(estimator=model_2, param_grid=grid, n_jobs=-1, cv=cv)
grid_result = grid_search.fit(X_train, y_train)

# Report the best combination, then the mean and standard deviation of the
# cross-validated test score for every combination in the grid.
print(f"Best: {grid_result.best_score_:f} using {grid_result.best_params_}")
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print(f"{mean:f} ({stdev:f}) with: {param!r}")

# FIT THE BEST XGB MODEL TO SAVE IT AFTER GRID SEARCH

grid_search.best_params_
# fit() returns the estimator itself, so the refit on the training set and the
# rebinding of model_2 are done in one statement.
model_2 = grid_search.best_estimator_.fit(X_train, y_train)

# PREDICT THE TRAIN AND TEST OUTPUT TO SEE WHETHER THE XGB MODEL OVERFITS OR NOT

train_pred_2, y_pred_2 = model_2.predict(X_train), model_2.predict(X_test)

# VISUALIZE THE RESULT AND CALCULATE THE ACCURACY, FIRST FOR TEST DATA AND THEN FOR TRAIN DATA

# NOTE(review): the tick labels assume six classes labelled 1-6 - confirm against the dataset.
class_ticks = [1, 2, 3, 4, 5, 6]

# Each entry: (label printed before the score, inputs, true outputs, predicted outputs).
evaluation_sets = (
    ('test acc:', X_test, y_test, y_pred_2),
    ('train acc:', X_train, y_train, train_pred_2),
)

for acc_label, features, truth, predicted in evaluation_sets:
    # Heatmap of the confusion matrix with the counts annotated in every cell.
    cm = confusion_matrix(truth, predicted)
    ax = sns.heatmap(
        cm,
        annot=True,
        annot_kws={"size": 13.5},
        fmt=".0f",
        cmap='RdYlBu',
    )
    ax.xaxis.set_ticklabels(class_ticks)
    ax.yaxis.set_ticklabels(class_ticks)
    ax.set_title('Confusion matrix', fontsize=17)
    ax.set_xlabel('Predictions', fontsize=17)
    ax.set_ylabel('True values', fontsize=17)
    plt.show()
    # score() is evaluated on the same inputs/outputs that produced the matrix above.
    print(acc_label, model_2.score(features, truth))

# SAVE THE XGBOOST MODEL FOR FUTURE USE
# (replace the placeholder with the file name the fitted model should be written to)

filename = 'name_of_the_XGB_model.sav'
joblib.dump(value=model_2, filename=filename)