### Classification
What we aim to do here is 
1. Train a classification model with the reconstruction error as a feature to it 
2. Check the performance on testing dataset for model with feature


In [42]:
import json
import os
import time
from datetime import datetime
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.base import BaseEstimator, TransformerMixin
from category_encoders import TargetEncoder
import xgboost as xgb
from sklearn.metrics import average_precision_score
import joblib
import category_encoders as ce


print(f"{datetime.now()} AutoEncoder::Status - Training Script Init. Packages loaded")

cat_cols = ['step', 'type']
categorical_transformer = TargetEncoder()

preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, cat_cols)
        ],
        remainder='passthrough'  # Numerical columns remain unchanged
    )

# Define the model with preprocessing
model = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss'))
])

print(f"{datetime.now()} AutoEncoder::Status - Functions defined.")

2023-10-09 23:11:18.153388 AutoEncoder::Status - Training Script Init. Packages loaded
2023-10-09 23:11:18.154828 AutoEncoder::Status - Functions defined.


In [43]:
print(f"{datetime.now()} AutoEncoder::Status - training data read.")
df_train=pd.read_csv('train_with_enc_field.csv')
print(f'Train data size is {df_train.shape}')
print(f"Train data label distribution is {df_train['isFraud'].value_counts()}")


X_train = df_train[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest', 'log_amount', 'enc_field']]
y_train = df_train['isFraud'] 

print(f"{datetime.now()} AutoEncoder::Status - training data processed.")
print(f'Train data size is {X_train.shape}')

2023-10-09 23:11:21.069379 AutoEncoder::Status - training data read.
Train data size is (120731, 10)
Train data label distribution is isFraud
0    114982
1      5749
Name: count, dtype: int64
2023-10-09 23:11:21.165572 AutoEncoder::Status - training data processed.
Train data size is (120731, 9)


In [44]:
print(f"{datetime.now()} AutoEncoder::Status - training flow init.")
model.fit(X_train, y_train)

2023-10-09 23:11:22.574328 AutoEncoder::Status - training flow init.


In [45]:
y_train_pred = model.predict_proba(X_train)[:, 1]
aucpr_train = average_precision_score(y_train, y_train_pred)
print(f'Training AUC-PR: {aucpr_train:.20f}')

Training AUC-PR: 0.99999999999999988898


In [46]:
ohe_feature_names = model.named_steps['preprocessor'].named_transformers_['cat'].get_feature_names_out()
feature_names = np.concatenate([['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest', 'log_amount', 'enc_field'], ohe_feature_names])

feature_importances = model.named_steps['classifier'].feature_importances_
sorted_idx = feature_importances.argsort()

# Print feature importances
print("Feature Importances:")
for index in sorted_idx[::-1]:
    print(f"{feature_names[index]}: {feature_importances[index]:.3f}")

Feature Importances:
enc_field: 0.965
newbalanceDest: 0.020
newbalanceOrig: 0.007
amount: 0.004
step: 0.003
oldbalanceOrg: 0.001
oldbalanceDest: 0.001
log_amount: 0.000
type: 0.000


In [47]:
df_test=pd.read_csv('test_enc_field.csv')
X_test = df_test[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest', 'log_amount', 'enc_field']]
y_test = df_test['isFraud']
y_test_pred = model.predict_proba(X_test)[:, 1]
aucpr_test = average_precision_score(y_test, y_test_pred)
print(f'Training AUC-PR: {aucpr_train:.20f}')

Training AUC-PR: 0.99999999999999988898


In [48]:
X_train = df_train[['step', 'type', 'amount', 'oldbalanceOrg', 'newbalanceOrig','oldbalanceDest', 'newbalanceDest', 'log_amount']]
y_train = df_train['isFraud'] 

print(f"{datetime.now()} AutoEncoder::Status - training data processed.")
print(f'Train data size is {X_train.shape}')

2023-10-09 23:11:46.069765 AutoEncoder::Status - training data processed.
Train data size is (120731, 8)


In [49]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_test_pred>0.5)
print(cm)

[[49278     0]
 [    0  2464]]
