## Inference on TabFormer Data
This notebook loads a pre-trained GNN (GraphSAGE) model and an XGBoost model and runs inference on raw data.

### Goals
* Outline the steps to transform new raw data before feeding it into the models.
* Simulate the use of trained models on new data during inference.

#### Import packages

In [1]:
import pandas as pd
import cudf
import pickle
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import SAGEConv
import os
import xgboost as xgb

##### Paths to the pre-trained GraphSAGE and XGBoost models

In [2]:
# Root of the TabFormer dataset; the trained models are stored in its 'models' sub-directory
dataset_base_path = '../data/TabFormer'
model_root_dir = os.path.join(dataset_base_path, 'models')

# File names of the GraphSAGE node embedder and of the XGBoost classifier
gnn_model_path, xgb_model_path = (
    os.path.join(model_root_dir, model_file_name)
    for model_file_name in ('node_embedder.pth', 'embedding_based_xgb_model.json')
)

#### Definition of the trained GraphSAGE model

In [3]:
class GraphSAGE(torch.nn.Module):
    """
    Stack of `SAGEConv` layers followed by a linear output layer.

    Parameters:
    ----------
    in_channels : int
        Input size of the first `SAGEConv` layer.
    hidden_channels : int
        Output size of every `SAGEConv` layer and input size of the linear layer.
    out_channels : int
        Output size of the linear layer.
    n_hops : int
        One `SAGEConv` layer is always created; `n_hops - 1` further layers are appended.
    dropout_prob : float
        Dropout probability applied after each conv layer while the module is in
        training mode.
    """

    # Rate that `forward` previously hard-coded. It is the fallback for model objects
    # that were pickled before `dropout_prob` was stored on the instance (unpickling
    # restores attributes without calling `__init__`).
    _LEGACY_DROPOUT_PROB = 0.5

    def __init__(self, in_channels, hidden_channels, out_channels, n_hops, dropout_prob=0.25):
        super().__init__()

        # Keep the requested rate so that `forward` can honour it
        self.dropout_prob = dropout_prob

        # list of conv layers
        self.convs = nn.ModuleList()
        # add first conv layer to the list
        self.convs.append(SAGEConv(in_channels, hidden_channels))
        # add the remaining conv layers to the list
        for _ in range(n_hops - 1):
            self.convs.append(SAGEConv(hidden_channels, hidden_channels))

        # output layer
        self.fc = nn.Linear(hidden_channels, out_channels)

    def forward(self, x, edge_index, return_hidden=False):
        """
        Run the conv stack on `x` / `edge_index`.

        Returns the output of the last conv layer (after ReLU and dropout) when
        `return_hidden` is True, otherwise the output of the linear layer.
        """
        # Dropout is a no-op in eval mode (`training=False`), whatever the rate is
        drop_p = getattr(self, "dropout_prob", self._LEGACY_DROPOUT_PROB)

        for conv in self.convs:
            x = conv(x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, p=drop_p, training=self.training)

        if return_hidden:
            return x
        return self.fc(x)


### Load the models

##### Load the pre-trained GraphSAGE model

In [None]:
# Load GNN model for generating node embeddings.
# The file holds a fully pickled module object (it is used directly as a model below),
# so `weights_only=False` is needed: PyTorch >= 2.6 defaults to `weights_only=True`
# and refuses to unpickle arbitrary classes.
# NOTE: unpickling can execute arbitrary code -- only load model files from a trusted source.
gnn_model = torch.load(
    gnn_model_path,
    # Remap the stored tensors so the file also loads on a machine without a GPU
    map_location='cuda' if torch.cuda.is_available() else 'cpu',
    weights_only=False,
)
gnn_model.eval()  # Set the model to evaluation mode

##### Load the pre-trained XGBoost model

In [5]:
# Restore the XGBoost node classifier straight from its saved model file
loaded_bst = xgb.Booster(model_file=xgb_model_path)

#### Define a function to evaluate the XGBoost model

In [6]:
from cuml.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
import cupy as cp
from torch.utils.dlpack import to_dlpack

def evaluate_xgboost(bst, embeddings, labels):
    """
    Evaluates the performance of the XGBoost model by calculating different metrics.

    Prints accuracy, precision, recall, F1 score, ROC AUC and the confusion matrix.
    Predictions above 0.5 are counted as the positive class.

    Parameters:
    ----------
    bst : xgboost.Booster
        The trained XGBoost model to be evaluated.
    embeddings : torch.Tensor
        The input feature embeddings for transaction nodes.
    labels : torch.Tensor
        The target labels (Fraud or Non-fraud) of the transactions, with the same
        length as the number of rows in `embeddings`.

    Returns:
    -------
    The confusion matrix computed by `cuml.metrics.confusion_matrix`
    (it is also printed).
    """

    # Convert embeddings to cuDF DataFrame. Only the values are needed, so drop any
    # autograd history before handing the buffer over through DLPack.
    embeddings_cudf = cudf.DataFrame(cp.from_dlpack(to_dlpack(embeddings.detach())))

    # Create DMatrix for the test embeddings
    dtest = xgb.DMatrix(embeddings_cudf)

    # Predict using XGBoost and threshold the scores into class labels
    preds = bst.predict(dtest)
    pred_labels = (preds > 0.5).astype(int)

    # Move labels to CPU once; every metric below reuses this array
    labels_cpu = labels.cpu().numpy()

    # Compute evaluation metrics (ROC AUC uses the raw scores, the others the labels)
    accuracy = accuracy_score(labels_cpu, pred_labels)
    precision = precision_score(labels_cpu, pred_labels, zero_division=0)
    recall = recall_score(labels_cpu, pred_labels, zero_division=0)
    f1 = f1_score(labels_cpu, pred_labels, zero_division=0)
    roc_auc = roc_auc_score(labels_cpu, preds)

    print("Performance of XGBoost model trained on node embeddings")
    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    conf_mat = confusion_matrix(labels_cpu, pred_labels)
    print('Confusion Matrix:', conf_mat)

    # The docstring has always promised the matrix; actually hand it back
    return conf_mat

___
### Evaluate the XGBoost model on untransformed test data (saved in the preprocessing notebook)

##### Read untransformed data

In [None]:
# Opt in to the pandas option that disables silent downcasting
pd.set_option('future.no_silent_downcasting', True)

# The untransformed test split lives under <dataset>/xgb/
untransformed_path_parts = ('xgb', 'untransformed_test.csv')
path_to_untransformed_data = os.path.join(dataset_base_path, *untransformed_path_parts)
untransformed_df = pd.read_csv(path_to_untransformed_data)

# Preview the first rows (`head` shows 5 by default)
untransformed_df.head()

#### Load the data transformer and transform the data using the loaded transformer

In [8]:
# NOTE: unpickling can execute arbitrary code -- only load a preprocessor file you trust.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as preprocessor_file:
    loaded_transformer = pickle.load(preprocessor_file)

# Feed every column except the last one to the fitted transformer
feature_frame = untransformed_df.loc[:, untransformed_df.columns[:-1]]
transformed_data = loaded_transformer.transform(feature_frame)

##### Evaluate the model on the transformed data

In [9]:
# Use the GPU when one is available, otherwise fall back to the CPU
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Convert data to torch tensors: features as float32, last column as int64 labels
label_column = untransformed_df.columns[-1]
X = torch.tensor(transformed_data).to(dtype=torch.float32, device=device)
y = torch.tensor(untransformed_df[label_column].values).to(dtype=torch.long, device=device)

In [10]:
# Generate node embeddings using the GNN model.
# No edges are supplied, hence the empty (2, 0) edge index.
# This is inference only, so skip building the autograd graph.
with torch.no_grad():
    test_embeddings = gnn_model(
        X,  # already on `device`
        torch.empty((2, 0), dtype=torch.long, device=device),  # PyG expects int64 edge indices
        return_hidden=True,
    )

In [None]:
# Evaluate the XGBoost model on the test embeddings. The function prints its metrics,
# so the trailing semicolon suppresses any echoed return value in the cell output.
evaluate_xgboost(bst=loaded_bst, embeddings=test_embeddings, labels=y);

___
## Predictions on raw input
This section demonstrates how the models are used on raw transactions during inference.

##### Read raw data

In [12]:
# Read example raw inputs
raw_file_path = os.path.join(dataset_base_path, 'xgb', 'example_transactions.csv')
data = pd.read_csv(raw_file_path)

# Keep every column except the last one
input_columns = data.columns[:-1]
data = data[input_columns]

# Untouched copy of the raw inputs; `data` itself is transformed in the cells below
original_data = data.copy()

### Transform raw data
* Perform the same set of transformations on the raw data as was done on the training data.

#### Rename columns before the data is fed into the pre-fitted data transformer

In [13]:
# Column names used throughout the rest of the notebook
COL_USER = 'User'
COL_CARD = 'Card'
COL_AMOUNT = 'Amount'
COL_MCC = 'MCC'
COL_TIME = 'Time'
COL_DAY = 'Day'
COL_MONTH = 'Month'
COL_YEAR = 'Year'

COL_MERCHANT = 'Merchant'
COL_STATE = 'State'
COL_CITY = 'City'
COL_ZIP = 'Zip'
COL_ERROR = 'Errors'
COL_CHIP = 'Chip'

# Rename columns before the data is fed into the data transformer.
# Rebinding `data` (instead of `_ = data.rename(..., inplace=True)`) avoids in-place
# mutation and no longer overwrites IPython's `_` last-output variable with None.
data = data.rename(columns={
    "Merchant Name": COL_MERCHANT,
    "Merchant State": COL_STATE,
    "Merchant City": COL_CITY,
    "Errors?": COL_ERROR,
    "Use Chip": COL_CHIP,
})


#### Handle unknown values as was done for the training data

In [14]:
UNKNOWN_STRING_MARKER = 'XX'
UNKNOWN_ZIP_CODE = 0
MAX_NR_CARDS_PER_YEAR = 9

# Placeholder written into each column wherever the value is missing
missing_value_fill = {
    COL_STATE: UNKNOWN_STRING_MARKER,
    COL_ERROR: UNKNOWN_STRING_MARKER,
    COL_ZIP: UNKNOWN_ZIP_CODE,
}
for column_name, fill_value in missing_value_fill.items():
    data[column_name] = data[column_name].fillna(fill_value)

#### Convert column type and remove "$" and "," as was done for the training data

In [15]:

# Strip the literal "$" from the amounts, then parse them as floats
amount_text = data[COL_AMOUNT].str.replace("$", "", regex=False)
data[COL_AMOUNT] = amount_text.astype("float")

# State and merchant are handled as strings
for text_column in (COL_STATE, COL_MERCHANT):
    data[text_column] = data[text_column].astype('str')

# Drop the commas from the error descriptions
data[COL_ERROR] = data[COL_ERROR].str.replace(",", "", regex=False)

#### Combine User and Card to generate unique numbers as was done for the training data

In [16]:

# Fold the user id into the card id: user * MAX_NR_CARDS_PER_YEAR + card.
# NOTE: this overwrites the Card column, so run this cell only once per load of `data`.
combined_card_id = data[COL_USER] * MAX_NR_CARDS_PER_YEAR + data[COL_CARD]
data[COL_CARD] = combined_card_id.astype('int')

##### Check if the transactions have unknown users or merchants

In [17]:
# Find the known merchants and (users, cards), i.e. the merchants and (users, cards) that are in training data.
# They are read from the per-column mappings of the ordinal encoder inside the
# 'binary' step of the loaded transformer.
known_merchants = set()
known_cards = set()

ordinal_mappings = loaded_transformer.named_transformers_['binary'].named_steps['binary'].ordinal_encoder.mapping
for column_mapping in ordinal_mappings:
    encoded_column = column_mapping['col']
    if encoded_column == COL_MERCHANT:
        known_merchants = set(column_mapping['mapping'].keys())
    elif encoded_column == COL_CARD:
        known_cards = set(column_mapping['mapping'].keys())

In [18]:
# True where the combined (user, card) id is among the ids collected in `known_cards`
data['Is_card_known'] = data[COL_CARD].isin(known_cards)

In [19]:
# True where the merchant is among the merchants collected in `known_merchants`
data['Is_merchant_known'] = data[COL_MERCHANT].isin(known_merchants)

#### Use the same set of predictor columns as used for training

In [20]:
# Numerical and nominal input columns, in the order they are passed to the transformer
numerical_predictors = [COL_AMOUNT]
nominal_predictors = [
    COL_ERROR,
    COL_CARD,
    COL_CHIP,
    COL_CITY,
    COL_ZIP,
    COL_MCC,
    COL_MERCHANT,
]

predictor_columns = [*numerical_predictors, *nominal_predictors]

##### Load the data transformer and transform the raw data

In [21]:
# NOTE: unpickling can execute arbitrary code -- only load a preprocessor file you trust.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as preprocessor_file:
    loaded_transformer = pickle.load(preprocessor_file)

# Transform only the predictor columns of the raw transactions
raw_predictors = data[predictor_columns]
transformed_data = loaded_transformer.transform(raw_predictors)

#### Run prediction

In [22]:
# Set the device to GPU if available, otherwise default to CPU
use_gpu = torch.cuda.is_available()
device = torch.device('cuda' if use_gpu else 'cpu')

# Convert the transformed features to a float32 tensor on that device
X = torch.tensor(transformed_data).to(dtype=torch.float32, device=device)


In [23]:

# Generate node embeddings using the GraphSAGE model.
# No edges are supplied, hence the empty (2, 0) edge index.
# This is inference only, so skip building the autograd graph.
with torch.no_grad():
    transaction_embeddings = gnn_model(
        X,  # already on `device`
        torch.empty((2, 0), dtype=torch.long, device=device),  # PyG expects int64 edge indices
        return_hidden=True,
    )

# Hand the embeddings to cuDF through DLPack so XGBoost can consume them
embeddings_cudf = cudf.DataFrame(cp.from_dlpack(to_dlpack(transaction_embeddings)))

In [24]:
# Score the transaction embeddings with the XGBoost model ...
embedding_dmatrix = xgb.DMatrix(embeddings_cudf)
preds = loaded_bst.predict(embedding_dmatrix)

# ... and turn the scores into 0/1 labels (1 where the score exceeds 0.5)
pred_labels = (preds > 0.5).astype(int)

#### If a transaction has an unknown (user, card) or an unknown merchant, mark it as fraud

In [25]:
# Name of the target column
target_col_name = 'Is Fraud?'

data[target_col_name] = pred_labels

# A transaction is flagged when the model predicts fraud OR its (user, card) is unknown
# OR its merchant is unknown. Vectorised column operations replace the row-wise
# `apply(axis=1)`; they yield the same booleans and also work on an empty frame.
data[target_col_name] = (
    data[target_col_name].eq(1)
    | data['Is_card_known'].eq(False)
    | data['Is_merchant_known'].eq(False)
)

#### Label the raw data as Fraud or Non-Fraud, based on prediction

In [26]:

# Translate the boolean fraud flag into text: False -> 'No' (non-Fraud), True -> 'Yes' (Fraud)
binary_to_fraud = {True: 'Yes', False: 'No'}
fraud_text = data[target_col_name].map(binary_to_fraud).astype('str')

# Store the text labels on the working frame and on the untouched raw inputs
data[target_col_name] = fraud_text
original_data[target_col_name] = data[target_col_name]

#### Raw data with predicted labels (Fraud or Non-Fraud)

In [None]:
# Display the raw example transactions together with the predicted 'Is Fraud?' column
original_data

## Copyright and License
<hr/>
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

<br/>

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.