In [None]:
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pandas as pd

# Load the processed training set directly from the project's GitHub repository.
DATA_URL = "https://raw.githubusercontent.com/lmitchell33/Fluid-Solutions-ML/refs/heads/main/data/processed/FINAL_TRAINING_DATA.csv"
data = pd.read_csv(DATA_URL)

In [None]:
# Separate the frame into X (model inputs) and y (the target label).
# 'Unnamed: 0' is dropped together with the label so it is not fed to the model
# (presumably a leftover index column from the CSV export -- TODO confirm).
feature_frame = data.drop(columns=['label', 'Unnamed: 0'])
X = feature_frame.to_numpy()
y = data['label'].to_numpy()

In [None]:
# Models only understand numeric targets, so encode the categorical labels as integer ids.
label_encoder = LabelEncoder()
y_labeled = label_encoder.fit_transform(y)

# Mapping recorded by the original author (LabelEncoder numbers classes in sorted order):
#   high   -> 0
#   low    -> 1
#   normal -> 2

# Build the class-name -> encoded-id lookup and display it to confirm the mapping above.
class_names = label_encoder.classes_
label_mapping = {
    name: code
    for name, code in zip(class_names, label_encoder.transform(class_names))
}
label_mapping

In [None]:
# Names of the input columns, in the same order as the columns of X.
feature_names = data.columns.drop(['label', 'Unnamed: 0']).to_list()
feature_names

In [None]:
# Randomly split the data into training and testing subsets.
# test_size=0.20 makes this an 80/20 train/test split; random_state=42 keeps the shuffle reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y_labeled, test_size=0.20, random_state=42)

In [None]:
# Sanity check: display the encoded training labels produced by the split.
y_train

In [None]:
# Parameter grid for the grid search (XGBoost parameters this time).
# Every combination of the values below is evaluated.
param_grid = dict(
    n_estimators=[50, 100, 150],     # number of boosting rounds; the regular RF used 100
    max_depth=[3, 4, 5],             # maximum tree depth; the regular RF used None
    eta=[0.01, 0.05, 0.1],           # learning rate (step-size shrinkage)
    subsample=[0.8, 1.0],            # fraction of rows sampled per tree
    colsample_bytree=[0.8, 1.0],     # fraction of columns sampled per tree
    gamma=[0, 0.1, 0.2],             # minimum loss reduction required to split
    min_child_weight=[1, 3],         # minimum sum of instance weight in a child
)

In [None]:
# Declare the base model and tune it with an exhaustive, cross-validated grid search.
# NOTE (original author): the model automatically uses logloss as its validation metric
# because no eval_metric is passed -- TODO confirm for the installed xgboost version.
# No `scoring` is given, so GridSearchCV ranks candidates with the estimator's own score().
xgb_rf = xgb.XGBClassifier(random_state=42)
grid_search = GridSearchCV(
    estimator=xgb_rf,
    param_grid=param_grid,
    cv=5,        # 5-fold cross-validation for every candidate
    n_jobs=-1,   # run the fits on all available cores
    verbose=2,   # log each fit so progress is visible
)
grid_search.fit(X_train, y_train)

In [None]:
# Show the hyperparameter combination with the best mean cross-validation score.
print(f"Best parameters: {grid_search.best_params_}")

In [None]:
# Keep handles to the winning hyperparameters and to the estimator that
# GridSearchCV exposes for them, so later cells can reuse both.
best_params = grid_search.best_params_
best_xg_boost = grid_search.best_estimator_

In [None]:
# Evaluate the tuned model on the held-out test split.
y_pred = best_xg_boost.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
print("Test Set Accuracy:", test_accuracy)

In [None]:
# Per-class precision / recall / F1 on the held-out test split.
# The encoder's class names are passed as target_names so the rows read as the
# original string labels instead of the opaque encoded ids. `labels` is pinned to
# every encoded id so the names stay aligned even if a class is missing from the
# test split.
class_ids = list(range(len(label_encoder.classes_)))
class_names = [str(name) for name in label_encoder.classes_]
print(
    "Classification Report:\n",
    classification_report(y_test, y_pred, labels=class_ids, target_names=class_names),
)

In [None]:
# Persist the tuned model in XGBoost's JSON format.
from pathlib import Path

# The original path was hard-coded to "/content" (presumably Google Colab's working
# directory), which fails on any machine where that directory does not exist.
# Keep the same location when it is available and otherwise fall back to the
# current working directory so the notebook also runs elsewhere.
model_dir = Path("/content") if Path("/content").is_dir() else Path.cwd()
model_path = model_dir / "xgboost_model.json"
best_xg_boost.save_model(str(model_path))

In [None]:
# Create a confusion matrix to see how well the model distinguishes between classes.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Mapping recorded by the original author:
#   high   -> 0
#   low    -> 1
#   normal -> 2
# The axis tick labels are taken from the fitted encoder, so the plot shows the
# class names directly and cannot drift from the mapping noted above.

# Pin the label order to every encoded id so the matrix is always
# n_classes x n_classes, even if a class is absent from the test split.
class_ids = list(range(len(label_encoder.classes_)))
cm = confusion_matrix(y_test, y_pred, labels=class_ids)

disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=label_encoder.classes_)
disp.plot()
plt.title("Confusion Matrix")
plt.show()

In [None]:
# Plot the learning curve to see how the model fits as the training set grows
# (a persistent gap between the two curves points to overfitting).
from sklearn.model_selection import learning_curve
import numpy as np

# Fresh, unfitted estimator configured with the tuned hyperparameters.
best_model = xgb.XGBClassifier(**best_params, random_state=42)

# Ten training-set sizes from 10% to 100%, each scored with 5-fold cross-validation.
train_sizes, train_scores, cv_scores = learning_curve(
    best_model,
    X,
    y_labeled,
    cv=5,
    scoring='accuracy',
    train_sizes=np.linspace(0.1, 1.0, 10),
    n_jobs=-1,
)

# Aggregate across folds: one mean / standard deviation per training-set size.
train_scores_mean = train_scores.mean(axis=1)
train_scores_std = train_scores.std(axis=1)
cv_scores_mean = cv_scores.mean(axis=1)
cv_scores_std = cv_scores.std(axis=1)

# (mean, std, colour, legend label) for each curve.
curves = [
    (train_scores_mean, train_scores_std, "r", "Training score"),
    (cv_scores_mean, cv_scores_std, "g", "Cross-validation score"),
]

fig, ax = plt.subplots(figsize=(8, 6))

# Shaded +/- one standard deviation bands first, then the mean lines on top.
for curve_mean, curve_std, curve_color, curve_label in curves:
    ax.fill_between(train_sizes, curve_mean - curve_std,
                    curve_mean + curve_std, alpha=0.1, color=curve_color)
for curve_mean, curve_std, curve_color, curve_label in curves:
    ax.plot(train_sizes, curve_mean, 'o-', color=curve_color, label=curve_label)

ax.set_xlabel("Number of Training Examples")
ax.set_ylabel("Accuracy Score")
ax.set_title("Learning Curve")
ax.legend(loc="best")
ax.grid(True)
plt.show()

In [None]:
# Feature-importance plot (found in the XGBoost documentation).
# The model was fitted on a plain NumPy array, so the booster only knows the
# features as "f0", "f1", ...; translate those keys back to the real column
# names collected in `feature_names` so the plot is readable.
booster = best_xg_boost.get_booster()
# "weight" (how many times a feature is used to split) is plot_importance's default type.
importance_scores = booster.get_score(importance_type="weight")
if booster.feature_names is None:
    # Keys look like "f<column index>"; strip the "f" and look up the column name.
    importance_scores = {
        feature_names[int(key[1:])]: score
        for key, score in importance_scores.items()
    }
xgb.plot_importance(importance_scores)