#### 

## Train an XGBoost model
#### Goals

* Build only an XGBoost model without leveraging a GNN.
* Establish a baseline performance using the XGBoost model.

__NOTE__: This XGBoost model does not leverage embeddings from the GNN (GraphSAGE) model.

#### Dataset names

In [1]:
# Identifiers of the datasets this notebook can be run on
TABFORMER, SPARKOV = "TabFormer", "Sparkov"

### Select the dataset to train the model on
__Note__: This notebook works for both the __TabFormer__ and __Sparkov__ datasets.
Make sure that the right dataset is selected.
For the TabFormer dataset, set

```code
    DATASET = TABFORMER
```
and for the Sparkov dataset, set

```code
    DATASET = SPARKOV
```

In [2]:
# Dataset used by the rest of the notebook: its value selects the data
# directory that training/test files are read from and models are saved to.
# Change this to either TABFORMER or SPARKOV
DATASET = TABFORMER

### Import necessary libraries, packages, and functions

In [3]:

import os
from collections import defaultdict

import cudf
import cupy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.metrics import auc, f1_score, precision_score, recall_score

from cuml.metrics import confusion_matrix, precision_recall_curve, roc_auc_score
from cuml.metrics.accuracy import accuracy_score

##### Path to pre-processed data and directory to save models

In [4]:
# Directory of the pre-processed files for each dataset name.
# Any name missing from the mapping resolves to the TabFormer directory.
dateset_name_to_path = defaultdict(
    lambda: "../data/TabFormer",
    {
        'TabFormer': '../data/TabFormer',
        'Sparkov': '../data/Sparkov',
    },
)

# Root directory of the selected dataset
dataset_dir = dateset_name_to_path[DATASET]

# Sub-directory the XGBoost input CSVs are read from, and the one models are saved in
xgb_data_dir, models_dir = (
    os.path.join(dataset_dir, sub_dir) for sub_dir in ('xgb', 'models')
)

# File name under which the trained model is stored inside models_dir
model_file_name = 'xgboost_model.json'

##### Load and prepare training and validation data

In [5]:

from cuml.model_selection import train_test_split

# Load the pre-processed training data into a GPU dataframe
train_data_path = os.path.join(xgb_data_dir, "training.csv")
df = cudf.read_csv(train_data_path)

# Target column: the last column of the CSV is used as the label
target_col_name = df.columns[-1]

# Split the dataframe into features (X) and labels (y)
y = df[target_col_name]
X = df.drop(target_col_name, axis=1)

# Split the data into training and validation sets (80% / 20%).
# A fixed seed keeps the split, and therefore the scores of the
# hyperparameter search that uses it, reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Convert the training and validation data to DMatrix
dtrain = xgb.DMatrix(data=X_train, label=y_train)
deval = xgb.DMatrix(data=X_val, label=y_val)


#### Parameter grid to search for the best hyper-parameters for the input data

In [None]:
import itertools

# Candidate values for each hyperparameter explored by the manual search.
# The key order is significant: every generated combination lists its
# values in this same order.
param_grid = dict(
    max_depth=[5, 6],
    learning_rate=[0.3, 0.4, 0.45],
    n_estimators=[100, 150],
    gamma=[0, 0.1],
)

# Cartesian product of all candidate values, one tuple per combination
param_combinations = [*itertools.product(*param_grid.values())]

# Report how many combinations will be tried (optional)
print(f"Total number of parameter combinations: {len(param_combinations)}")

#### Grid search for the best hyperparameters

In [7]:
best_score = float("inf")  # Lowest validation logloss seen so far
best_params = None  # Hyperparameters that produced best_score
best_num_boost_round = None  # Number of boosting rounds that produced best_score

# Settings shared by every candidate model
fixed_params = {
    'eval_metric': 'logloss',
    'objective': 'binary:logistic',  # For binary classification
    'tree_method': 'hist',  # GPU support
    'device': 'cuda',
}

# Datasets scored after every boosting round. When several are given,
# xgboost uses the last entry (here the validation set) for early stopping.
evals = [(dtrain, 'train'), (deval, 'eval')]

for params_comb in param_combinations:

    # Each combination is ordered like the keys of param_grid
    max_depth, learning_rate, n_estimators, gamma = params_comb

    # Create a dictionary of parameters for this candidate
    params = {
        'max_depth': max_depth,
        'learning_rate': learning_rate,
        'gamma': gamma,
        **fixed_params,
    }

    # Train the model using xgb.train and the Booster; n_estimators is the
    # upper bound on boosting rounds, early stopping may end training sooner
    bst = xgb.train(
        params,
        dtrain,
        num_boost_round=n_estimators,
        evals=evals,
        early_stopping_rounds=10,
        verbose_eval=False,
    )

    # Get the evaluation score (logloss) on the validation set
    score = bst.best_score

    # Update the best parameters if the current model is better
    if score < best_score:
        best_score = score
        best_params = params
        # best_iteration is the 0-based index of the best round, so the
        # number of rounds needed to reproduce that model is one more.
        best_num_boost_round = bst.best_iteration + 1

In [None]:
# Display the outcome of the grid search: the selected hyperparameters, their
# validation score, and the boosting-round value recorded for them.
best_params, best_score, best_num_boost_round

### Train the model with the best hyperparameters

In [9]:
# Fit the final model on the complete training data (X, y) with the
# hyperparameters and boosting-round count selected by the grid search.
# Note: `dtrain` is rebound here and now holds the full training data.
dtrain = xgb.DMatrix(X, label=y)
final_model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=best_num_boost_round,
)


In [10]:

# Save the best model. The target directory (and any missing parents) is
# created if needed; exist_ok avoids the separate existence check and the
# error it would raise if the directory appears between check and creation.
os.makedirs(models_dir, exist_ok=True)
final_model.save_model(os.path.join(models_dir, model_file_name))

___
### Evaluate the model on the same unseen data that is used for testing the GNN-based XGBoost model

##### Load the saved model

In [11]:

# Restore the trained booster from the file it was saved to
saved_model_path = os.path.join(models_dir, model_file_name)
best_model_loaded = xgb.Booster()
best_model_loaded.load_model(saved_model_path)


#### Load and prepare unseen test data

In [12]:

# Read the held-out test set into a GPU dataframe
test_data_path = os.path.join(xgb_data_dir, "test.csv")
test_df = cudf.read_csv(test_data_path)

# Build the prediction input from the feature columns only (label column removed)
test_features = test_df.drop(columns=target_col_name)
dnew = xgb.DMatrix(test_features)


##### Predict targets

In [13]:

# Predicted score for every test row
y_pred_prob = best_model_loaded.predict(dnew)

# Turn scores into 0/1 labels: a score of at least 0.5 counts as positive
is_positive = y_pred_prob >= 0.5
y_pred = is_positive.astype(int)

#### Compute metrics to evaluate model performance

In [None]:

# Ground-truth labels of the test set
y_test = test_df[target_col_name].values

# ---- Metrics computed with cuML ----

# Fraction of correctly classified rows
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Counts of true vs. predicted labels
conf_mat = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_mat, sep='\n')

# Area under the ROC curve, computed from the predicted scores
r_auc = roc_auc_score(y_test, y_pred_prob)
print(f'ROC AUC Score: {r_auc:.4f}')

# ---- Metrics computed with scikit-learn ----

# Copy the labels to a NumPy array before calling the scikit-learn metrics
y_test = cupy.asnumpy(y_test)

# Precision of the thresholded predictions
precision = precision_score(y_test, y_pred)
print(f'Precision: {precision:.4f}')

# Recall of the thresholded predictions
recall = recall_score(y_test, y_pred)
print(f'Recall: {recall:.4f}')

# F1 score of the thresholded predictions
f1 = f1_score(y_test, y_pred)
print(f'F1 Score: {f1:.4f}')


#### Plot Precision-Recall curve
* A Precision-Recall curve shows the trade-off between precision and recall at various classification thresholds, which helps assess model performance, especially on imbalanced data.

In [15]:

# Precision and recall at each candidate threshold.
# Note: this rebinds `precision` and `recall` to the curve values.
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob)

# Area under the Precision-Recall curve (recall on the x-axis, precision on the y-axis)
pr_auc = auc(x=recall, y=precision)

##### Plot precision-recall curve

In [None]:

# Draw precision against recall on a fresh figure; the legend shows the PR AUC
ax = plt.figure().add_subplot()
ax.plot(recall, precision, label=f'PR AUC = {pr_auc:.2f}')
ax.set_xlabel('Recall')
ax.set_ylabel('Precision')
ax.set_title('Precision-Recall Curve')
ax.legend(loc='best')
ax.grid(True)
plt.show()

##### Plot precision-recall curve with thresholds

In [None]:
# Draw precision and recall as functions of the decision threshold.
# The last precision/recall entry is dropped so both series line up with `thresholds`.
ax = plt.figure().add_subplot()
for series_label, series in (("Precision", precision), ("Recall", recall)):
    ax.plot(thresholds, series[:-1], label=series_label)
ax.set_xlabel("Threshold")
ax.set_ylabel("Score")
ax.set_title("Precision-Recall Curve with Thresholds")
ax.legend()
ax.grid()
plt.show()

In [18]:
# One can choose the optimal classification threshold based on the F1 score

## Copyright and License
<hr/>
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

<br/>

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.