## This notebook loads a pre-trained XGBoost model and runs inference on raw data
__NOTE__: This XGBoost model does not leverage embeddings from the GNN (GraphSAGE) model.

### Goals
* Outline the steps to transform new raw data before feeding it into the model.
* Simulate the use of the trained model on new data during inference.

#### Import packages

In [1]:
import pickle
import json
import os
import xgboost as xgb
from cuml.metrics import confusion_matrix
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    roc_auc_score)
import numpy as np
import pandas as pd

##### Path to the pre-trained XGBoost model and data

In [2]:
# Root folder of the Sparkov dataset; every other path in this notebook is built from it.
dataset_base_path = '../data/Sparkov'

# Name and directory of the serialized XGBoost model
model_file_name = 'xgboost_model.json'
model_root_dir = os.path.join(dataset_base_path, 'models')

# Full path handed to the model loader
xgb_model_path = os.path.join(model_root_dir, model_file_name)

#### Load the model

In [3]:
# Build the booster straight from the saved model file (used for node classification)
loaded_bst = xgb.Booster(model_file=xgb_model_path)

#### Load column names and other global variables saved during training

In [4]:
# Location of the JSON file holding the saved variables
variables_path = os.path.join(dataset_base_path, 'variables.json')

# Parse the saved name -> value pairs
with open(variables_path, 'r') as json_file:
    column_names = json.load(json_file)

# Expose every saved entry as a notebook-level variable.
# NOTE(review): later cells use names such as COL_AMOUNT and COL_FRAUD that are
# presumably defined by this update -- confirm against variables.json.
globals().update(column_names)

___
#### Evaluate the XGBoost model on untransformed test data (saved in the preprocessing notebook)

##### Read untransformed data

In [None]:
# Opt in to pandas' future behaviour of not silently downcasting
pd.set_option('future.no_silent_downcasting', True)

# Untransformed test data lives under the dataset's 'xgb' folder
path_to_untransformed_data = os.path.join(
    dataset_base_path, 'xgb', 'untransformed_test.csv')
untransformed_df = pd.read_csv(path_to_untransformed_data)

# Preview the first five rows
untransformed_df.head()

#### Load the data transformer and transform the data using the loaded transformer

In [6]:
# NOTE(review): pickle.load can execute arbitrary code; only load a
# preprocessor file that comes from a trusted source.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as f:
    loaded_transformer = pickle.load(f)

# Every column except the last one is passed to the transformer
# (the last column is read as the target further down).
feature_frame = untransformed_df.loc[:, untransformed_df.columns[:-1]]
transformed_data = loaded_transformer.transform(feature_frame)

##### Evaluate the model on the transformed data

In [7]:
# Predictor columns used for training, grouped by kind
numerical_predictors = [COL_AMOUNT, COL_SPEED, COL_AGE]
nominal_predictors = [COL_CARD, COL_ZIP, COL_MCC, COL_MERCHANT, COL_JOB]
predictor_columns = numerical_predictors + nominal_predictors

target_column = [COL_FRAUD]

# Transformed column names: the transformer reports names containing a '__'
# separator; keep the segment that follows the first separator.
columns_of_transformed_data = [
    feature_name.split('__')[1]
    for feature_name in loaded_transformer.get_feature_names_out(predictor_columns)
]

In [8]:
# Features (X): transformed values labelled with the transformed column names
X = pd.DataFrame(data=transformed_data, columns=columns_of_transformed_data)

# Target (y): values of the last column of the untransformed frame
label_column = untransformed_df.columns[-1]
y = untransformed_df[label_column].values

In [9]:
# Wrap features and labels in a DMatrix, then score it with the loaded booster
eval_dmatrix = xgb.DMatrix(data=X, label=y)
y_pred_prob = loaded_bst.predict(eval_dmatrix)

# Scores at or above 0.5 become class 1, everything else class 0
y_pred = (y_pred_prob >= 0.5).astype(int)


#### Compute metrics to evaluate model performance

In [None]:

# Label-based metrics use the thresholded predictions;
# ROC AUC is computed from the raw predicted scores.
accuracy = accuracy_score(y, y_pred)
conf_mat = confusion_matrix(y, y_pred)
r_auc = roc_auc_score(y, y_pred_prob)
precision = precision_score(y, y_pred)
recall = recall_score(y, y_pred)
f1 = f1_score(y, y_pred)

# Report
print(f'Accuracy: {accuracy:.4f}')
print('Confusion Matrix:')
print(conf_mat)
print(f'ROC AUC Score: {r_auc:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')



___
### Prediction on raw inputs
* The purpose is to demonstrate the use of the model during inference

##### Read raw data

In [11]:

raw_file_path = os.path.join(dataset_base_path, 'xgb', 'example_transactions.csv')

# Read the example transactions and keep every column except the last one.
# NOTE(review): the dropped trailing column is presumably the label -- confirm.
data = pd.read_csv(raw_file_path).pipe(lambda frame: frame[frame.columns[:-1]])

# Untouched copy of the inputs; the predicted labels are attached to it at the end
original_data = data.copy()

##### Check if the transactions have unknown users or merchants

In [12]:
# Known merchants and (users, cards) are the ones present in the training data.
# They are read from the per-column mappings of the ordinal encoder inside the
# fitted 'binary' step of the transformer.
ordinal_mappings = (
    loaded_transformer
    .named_transformers_['binary']
    .named_steps['binary']
    .ordinal_encoder
    .mapping
)

# Column name -> set of category values recorded for that column
categories_by_column = {
    enc['col']: set(enc['mapping'].keys()) for enc in ordinal_mappings
}

# Fall back to an empty set when the encoder has no entry for a column
known_merchants = categories_by_column.get(COL_MERCHANT, set())
known_cards = categories_by_column.get(COL_CARD, set())

In [13]:
# Flag each transaction whose (user, card) value is among the known cards
card_values = data[COL_CARD]
data['Is_card_known'] = card_values.map(lambda card: card in known_cards)

In [14]:
# Flag each transaction whose merchant is among the known merchants
merchant_values = data[COL_MERCHANT]
data['Is_merchant_known'] = merchant_values.map(lambda merchant: merchant in known_merchants)

##### Compute transaction speed from ('lat', 'long'), ('merch_lat', 'merch_long') and 'unix_time'

In [15]:

import math


def haversine(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    Parameters
    ----------
    lat1, lon1 : float
        Latitude and longitude of the first point, in degrees.
    lat2, lon2 : float
        Latitude and longitude of the second point, in degrees.

    Returns
    -------
    float
        Haversine distance in kilometres on a sphere of radius 6371.0 km.
        NaN inputs propagate to a NaN result.
    """
    # Radius of Earth in km
    R = 6371.0

    # Convert degrees to radians
    lat1 = math.radians(lat1)
    lon1 = math.radians(lon1)
    lat2 = math.radians(lat2)
    lon2 = math.radians(lon2)

    # Differences in coordinates
    dlat = lat2 - lat1
    dlon = lon2 - lon1

    # Haversine formula
    a = math.sin(dlat / 2)**2 + math.cos(lat1) * math.cos(lat2) * math.sin(dlon / 2)**2

    # Floating-point rounding can leave `a` marginally above 1 for (nearly)
    # antipodal points, which would make sqrt(1 - a) raise a domain error.
    # The comparison is False for NaN, so NaN still propagates.
    if a > 1.0:
        a = 1.0

    c = 2 * math.atan2(math.sqrt(a), math.sqrt(1 - a))

    # Distance in kilometers
    return R * c


# Scratch copy of the columns needed for the speed feature
temp_df = data[['unix_time', 'lat', 'long', 'merch_lat', 'merch_long']].copy()

# NOTE(review): 'unix_time' is scaled by 1e9 and used directly as the duration,
# i.e. this is not the elapsed time between two transactions. Presumably this
# mirrors the preprocessing applied at training time -- confirm before changing.
temp_df['tx_duration'] = temp_df['unix_time'] / 1e9

# Distance between the cardholder location and the merchant location, per row
temp_df['distance_km'] = temp_df[['lat', 'long', 'merch_lat', 'merch_long']].apply(
    lambda row: haversine(*row), axis=1)

data['speed'] = temp_df['distance_km'].div(temp_df['tx_duration'])

# The scratch frame is no longer needed
del temp_df

#### Convert 'dob' to 'age' w.r.t. a reference date

In [16]:

# Parse the date of birth into timestamps
data['dob'] = pd.to_datetime(data['dob'])

# Age is measured against a fixed reference date, in years of 365.25 days
reference_date = pd.to_datetime('2024-10-30')
one_nanosecond = np.timedelta64(1, 'ns')
nanoseconds_in_year = 365.25 * 24 * 60 * 60 * 1e9

# Elapsed time -> number of nanoseconds -> fractional years
data['age'] = data['dob'].apply(
    lambda birth_date: (reference_date - birth_date) / one_nanosecond / nanoseconds_in_year)

##### Set of predictor columns used for training the model

In [17]:

# Predictors are listed with the numerical columns first, then the nominal ones
numerical_predictors = [COL_AMOUNT, COL_SPEED, COL_AGE]
nominal_predictors = [COL_CARD, COL_ZIP, COL_MCC, COL_MERCHANT, COL_JOB]
predictor_columns = [*numerical_predictors, *nominal_predictors]

# Target column, kept as a one-element list
target_column = [COL_FRAUD]

##### Transform input data using the pre-fitted data transformer

In [18]:
# NOTE(review): pickle.load can execute arbitrary code; only load a
# preprocessor file that comes from a trusted source.
preprocessor_path = os.path.join(dataset_base_path, 'preprocessor.pkl')
with open(preprocessor_path, 'rb') as f:
    loaded_transformer = pickle.load(f)

# Apply the pre-fitted transformer to the predictor columns of the raw inputs
raw_predictors = data[predictor_columns]
transformed_data = loaded_transformer.transform(raw_predictors)

#### Prepare data and predict if the transactions are fraud

In [19]:

# Transformed inputs labelled with the transformed column names
X = pd.DataFrame(data=transformed_data, columns=columns_of_transformed_data)

# Score the transactions; scores at or above 0.5 become label 1
inference_dmatrix = xgb.DMatrix(X)
pred_probs = loaded_bst.predict(inference_dmatrix)
pred_labels = (pred_probs >= 0.5).astype(int)

# Name of the target column
target_col_name = 'Is Fraud?'

# Attach the predicted labels to the working frame
data[target_col_name] = pred_labels


#### If a transaction has an unknown (user, card) or merchant, mark it as fraud

In [20]:

# A transaction is flagged as fraud when the model predicted 1, or when its
# card or its merchant is not known. Column-wise comparisons evaluate all rows
# at once and stay well-defined when `data` has no rows.
data[target_col_name] = (
    data[target_col_name].eq(1)
    | data['Is_card_known'].eq(False)
    | data['Is_merchant_known'].eq(False)
)


#### Label the raw data as Fraud or Non-Fraud, based on prediction

In [21]:

# Text labels for the verdict: False -> 'No' (non-fraud), True -> 'Yes' (fraud)
binary_to_text = {False: 'No', True: 'Yes'}
fraud_text = data[target_col_name].map(binary_to_text).astype('str')

# Store the text labels on the working frame and on the untouched copy of the inputs
data[target_col_name] = fraud_text
original_data[target_col_name] = fraud_text

#### Transactions with predicted labels

In [None]:
# Display the example transactions together with their predicted 'Is Fraud?' labels
original_data

## Copyright and License
<hr/>
Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.

<br/>

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
 
 http://www.apache.org/licenses/LICENSE-2.0
 
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License.