# This notebook is for inference on test data given in the interview

## Load libraries

In [1]:
import utils
from feature_engineering import FeatureEngineering
import metrics

In [2]:
import pandas as pd
import numpy as np
import metrics
import utils
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
import matplotlib.pyplot as plt
import pickle
import glob
import joblib

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight
import xgboost as xgb

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


## Declare path to test data

In [4]:
# Folder containing the test CSV files; the loading cell below globs inside it.
path_to_test_data = "testdata"

## Internal processing

In [5]:
# Internal file locations, all relative to the notebook's working directory.
path_to_data4dev = "assets/data4dev.csv"
path_to_data_scaler = "assets/data_scaler.pkl"  # scaler loaded in the scaling cell
# Copies of the inference data written by this notebook and later passed to
# the report helpers.
path_to_data_inf_org = "assets/path_to_data_orig_inf.csv"
path_to_data_inf_transformed = "assets/path_to_data_dev_inf.csv"
# Serialized model file for each supported model key.
model_path = dict(xgb="results/xgb_model.pkl")


In [6]:
# Load the raw test set from `path_to_test_data`.
# glob returns matches in arbitrary (filesystem-dependent) order, so the file
# lists are sorted to keep the row order of `df` -- and of the CSV written
# below -- reproducible across machines and runs.
nonfraud_files = sorted(glob.glob(f"{path_to_test_data}/*_nonfraud_*.csv"))
fraud_files = sorted(glob.glob(f"{path_to_test_data}/*_fraud*.csv"))

# Fail early with a clear message when the folder is wrong or empty.
if not nonfraud_files and not fraud_files:
    raise FileNotFoundError(
        f"No '*_nonfraud_*.csv' or '*_fraud*.csv' files found in '{path_to_test_data}'"
    )

df = utils.load_data(nonfraud_path=nonfraud_files, fraud_path=fraud_files)
# The untransformed data is stored because report generation reads it back.
df.to_csv(path_to_data_inf_org, index=False)

In [None]:
# Run the project's feature-engineering step on the raw frame and persist the
# result so the report helpers can read it back from disk.
fe = FeatureEngineering(df)
transformed_df = fe.transform()
transformed_df.to_csv(
    path_to_data_inf_transformed,
    index=False,
)

In [None]:
# Split the transformed frame into model inputs and the ground-truth label.
# 'id' and 'fraud' are excluded from the feature matrix.
col_to_drop = ['id', 'fraud']
y = transformed_df['fraud']
X = transformed_df.drop(columns=col_to_drop)


In [None]:
# Standardize the features with the previously saved scaler.
# NOTE(review): joblib.load unpickles the file, so only load trusted artefacts.
data_scaler = joblib.load(
    path_to_data_scaler
)
X_standardized = data_scaler.transform(
    X
)

In [None]:
# Choose the trained model to evaluate; the key must exist in `model_path`.
model_to_use = 'xgb'
clf = utils.load_model(model_path[model_to_use])
model = model_to_use + '_test'  # run label; not used further in this cell

# Every branch must leave 1-D positive-class scores in `y_pred_proba`,
# because the threshold search in the next section depends on it.
if model_to_use in ['lr', 'dt']:
    y_pred = clf.predict(X_standardized)
    # Column 1 of predict_proba is the score of the positive class.
    y_pred_proba = clf.predict_proba(X_standardized)[:, 1]
elif model_to_use == 'xgb':
    # The loaded xgboost model predicts from a DMatrix; feature names are taken
    # from the unscaled frame because scaling returns a plain array.
    dtest = xgb.DMatrix(X_standardized, label=y, feature_names=X.columns.tolist())
    y_pred_proba = clf.predict(dtest)
else:
    # Previously an unknown key fell through silently and failed later with a
    # NameError on `y_pred_proba`.
    raise ValueError(f"Unsupported model_to_use: {model_to_use!r}")

# Evaluate models and generate report

In [None]:
# Choose the decision threshold that maximizes the F2 score, which weights
# recall more heavily than precision.
# Reference: https://www.giskard.ai/glossary/f-score#:~:text=The%20F%2D2%20score%20is,2%20*%20precision%20%2B%20recall).
# NOTE(review): the threshold is tuned on the same labelled test data that is
# evaluated below, so the reported metrics are optimistic. Ideally the
# threshold is chosen on a validation split and only applied here.
from sklearn.metrics import precision_recall_curve, fbeta_score

precision, recall, thresholds = precision_recall_curve(y, y_pred_proba)

# F2 = 5 * P * R / (4 * P + R).
# When precision and recall are both 0 the denominator is 0; plain division
# would give NaN, and np.argmax returns the index of the first NaN. Those
# points are given an F2 of 0 instead.
f2_denominator = 4 * precision + recall
f2_scores = np.divide(
    5 * precision * recall,
    f2_denominator,
    out=np.zeros_like(f2_denominator, dtype=float),
    where=f2_denominator > 0,
)

# `precision` and `recall` have one more entry than `thresholds`: the final
# (precision=1, recall=0) point has no threshold, so it is dropped before
# looking up the best one.
optimal_threshold = thresholds[np.argmax(f2_scores[:-1])]

# Classify predictions with the optimal threshold
y_pred = (y_pred_proba >= optimal_threshold).astype(int)


In [None]:
# ROC AUC and PR AUC are ranking metrics: they must be computed from the
# continuous positive-class scores. Feeding them the thresholded 0/1 labels
# collapses each curve to a single operating point.
# NOTE(review): assumes metrics.calc_auc_roc / calc_auc_pr accept continuous
# scores as their second argument, like the sklearn scorers -- confirm.
positive_scores = np.asarray(y_pred_proba)
if positive_scores.ndim == 2:
    # The report cell may already have expanded the scores to two columns.
    positive_scores = positive_scores[:, 1]

# Threshold-dependent metrics use the hard labels in `y_pred`.
print("F1:", metrics.calc_f1(y, y_pred))
print("AUC ROC:", metrics.calc_auc_roc(y, positive_scores))
print("AUC PR:", metrics.calc_auc_pr(y, positive_scores))
print("Sensitivity (Recall):", metrics.calc_sensitivity(y, y_pred))
print("Specificity:", metrics.calc_specificity(y, y_pred))
metrics.generate_confusion_matrix(y, y_pred, 'result_on_test/confusion_matrix.png')

The performance on the test set shows some false negatives and false positives. Given that recall is 0.7, I think there is room for improvement. I already tried to find the optimal threshold using the F2 score, which balances precision and recall but puts more weight on recall.

In [None]:
# Write the per-sample prediction report to result_on_test/prediction_report.csv.
# The report helper is given a two-column (class 0, class 1) probability matrix.
if model_to_use in ['lr', 'dt']:
    y_pred_proba = clf.predict_proba(X_standardized)
elif model_to_use == 'xgb' and np.ndim(y_pred_proba) == 1:
    # The ndim guard keeps this cell idempotent: without it, re-running the
    # cell would reshape the already two-column array into (2n, 2).
    positive_proba = np.asarray(y_pred_proba).reshape((-1, 1))
    # Class-0 column is the complement of the positive score, matching the
    # layout of predict_proba in the other branch (it used to be all zeros).
    # NOTE(review): assumes the booster outputs P(fraud), e.g. a
    # binary:logistic objective -- confirm.
    y_pred_proba = np.hstack([1.0 - positive_proba, positive_proba])

utils.generate_prediction_report(path_to_data_inf_org, path_to_data_inf_transformed,
                                 y, y_pred, y_pred_proba,
                                 'result_on_test/prediction_report.csv')

In [None]:
# Collect the misclassified samples (false negatives and false positives).
# I hope we can make it better at Vipps MobilePay
false_predictions = metrics.identify_false_predictions(
    path_to_data_inf_org,
    path_to_data_inf_transformed,
    y,
    y_pred,
)
# Displayed transposed.
false_predictions.T