# Importing Classes

In [None]:
# Importing pandas and other dependencies required for feature engineering.
import pandas as pd
import numpy as np
import datetime

# Importing classes and functions from scikit-learn for data preprocessing and modeling
from sklearn.compose import ColumnTransformer   # For transforming specific columns in a DataFrame
from sklearn.pipeline import Pipeline           # For creating a data processing pipeline
from sklearn.impute import SimpleImputer        # For handling missing values
from sklearn.preprocessing import StandardScaler, OrdinalEncoder   # For feature scaling and ordinal encoding
from sklearn.feature_selection import SelectPercentile, chi2    # For feature selection using chi-squared test
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score

# Importing XGBoost for training the model
import xgboost as xgb

# Importing matplotlib and seaborn for plotting.
from matplotlib import pyplot
import seaborn as sns

# Importing Joblib for saving model
import joblib
import os

# Mute warnings
import warnings
warnings.filterwarnings("ignore")

# Reading Data 

In [None]:
# Load the credit card table from the remote CSV.
credit_cards_url = "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/credit_cards.csv"
credit_cards_df = pd.read_csv(credit_cards_url)

In [None]:
# Load the profiles table from the remote CSV; "birthdate" is parsed into
# datetimes so it can be used in date arithmetic later on.
profiles_url = "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/profiles.csv"
profiles_df = pd.read_csv(profiles_url, parse_dates=["birthdate"])

In [None]:
# Load the transactions table from the remote CSV, parsing "datetime" into
# datetimes, then prefix the location columns with "transaction_".
transactions_url = "https://repo.hops.works/master/hopsworks-tutorials/data/card_fraud_data/transactions.csv"
transactions_raw = pd.read_csv(transactions_url, parse_dates=["datetime"])
trans_df = transactions_raw.rename(
    columns={
        "city": "transaction_city",
        "country": "transaction_country",
    }
)

# Feature Engineering

In [None]:
# Attach profile attributes to each credit card (default inner join on the card number).
credit_cards_extended = pd.merge(credit_cards_df, profiles_df, on="cc_num")

In [None]:
# Enrich every transaction with its card and profile attributes
# (default inner join on the card number).
all_data_df = pd.merge(trans_df, credit_cards_extended, on="cc_num")

In [None]:
# Rename the transaction timestamp and give the remaining attributes their
# "fg2_" feature names.
# Each source column appears exactly once in the mapping: a dict literal keeps
# only the LAST value for a repeated key, so listing "City"/"Country" twice
# would silently discard the first pair of target names.
# NOTE(review): "City"/"Country" presumably originate from the profile data — confirm.
all_data_df = all_data_df.rename(
    columns={
        "datetime": "transaction_time",
        "sex": "fg2_sex",
        "City": "fg2_city",
        "Country": "fg2_country",
    }
)

In [None]:
# Whole days between the transaction and the card's expiry date
# (positive while the card is still valid, negative once it has expired).
# Measured against the transaction timestamp rather than the wall clock so the
# feature is reproducible and does not drift with the date the notebook is run.
# "expires" is parsed as month/two-digit-year, i.e. the first day of that month.
# NOTE(review): assumes "transaction_time" is a timezone-naive datetime column — confirm.
# NOTE(review): if another pipeline computes this feature for serving, keep both definitions in sync.
card_expiry_date = pd.to_datetime(all_data_df["expires"], format="%m/%y")
all_data_df["fg2_days_to_expiry"] = (card_expiry_date - all_data_df["transaction_time"]).dt.days

In [None]:
# Card holder's age at the moment of the transaction, in whole days (not years).
# Measured against the transaction timestamp rather than the wall clock so the
# value matches the feature name and is identical on every run.
# NOTE(review): assumes "transaction_time" and "birthdate" are timezone-naive datetime columns — confirm.
all_data_df["fg2_age_at_transaction"] = (
    all_data_df["transaction_time"] - all_data_df["birthdate"]
).dt.days

# Creating Train-Test Split

In [None]:
# Hold out 20% of the rows for testing.
# random_state pins the shuffle so the split (and every metric derived from it)
# is reproducible across kernel restarts; stratify keeps the fraud/non-fraud
# ratio the same in both partitions, which matters for an imbalanced label.
train, test = train_test_split(
    all_data_df,
    test_size=0.2,
    random_state=42,
    stratify=all_data_df["fraud_label"],
)

In [None]:
# Columns fed to the model as input features.
train_feature_columns = [
    'category',
    'amount',
    'transaction_city',
    'transaction_country',
    'fg2_days_to_expiry',
    'fg2_age_at_transaction',
    'fg2_sex',
    'fg2_city',
    'fg2_country',
]

# Training features and target (the target is kept as a one-column DataFrame).
X_train = train[train_feature_columns]
Y_train = train[["fraud_label"]]

In [None]:
# Columns fed to the model as input features.
test_feature_columns = [
    'category',
    'amount',
    'transaction_city',
    'transaction_country',
    'fg2_days_to_expiry',
    'fg2_age_at_transaction',
    'fg2_sex',
    'fg2_city',
    'fg2_country',
]

# Held-out features and target (the target is kept as a one-column DataFrame).
X_test = test[test_feature_columns]
Y_test = test[["fraud_label"]]

# Creating scikit learn pipeline for model training

In [None]:
# Numeric features: fill missing values with the column median, then standardize.
numeric_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical features: map each category to an integer code; categories not
# seen during fitting are encoded as -1 instead of raising an error.
categorical_encoder = OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)
categorical_transformer = Pipeline(steps=[("encoder", categorical_encoder)])

In [None]:
# Route each group of columns through its matching transformer.
numeric_features = ['amount', 'fg2_days_to_expiry', 'fg2_age_at_transaction']
categorical_features = [
    'category',
    'transaction_city',
    'transaction_country',
    'fg2_sex',
    'fg2_city',
    'fg2_country',
]

preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)

# Training the model

In [None]:
# Chain preprocessing and an XGBoost classifier (default hyperparameters)
# into a single estimator.
pipeline_steps = [
    ("preprocessor", preprocessor),
    ("classifier", xgb.XGBClassifier()),
]

# fit() returns the fitted pipeline itself, so build and train in one expression.
xgb_classifier = Pipeline(steps=pipeline_steps).fit(X_train, Y_train)

# Predict labels for the held-out set; the metric cells below consume y_pred.
y_pred = xgb_classifier.predict(X_test)

# Testing Model 

In [None]:
# Run the fitted pipeline on the held-out features and display the predicted
# labels as strings.
predictions = xgb_classifier.predict(X=X_test)
predictions.astype(str)

In [None]:
# Macro-averaged F1: the unweighted mean of the per-class F1 scores.
macro_f1 = f1_score(Y_test, y_pred, average='macro')

metrics = {"f1_score": macro_f1}
metrics

In [None]:
# Build the confusion matrix from the true labels (Y_test) and the predicted
# labels (y_pred); rows are true classes, columns are predicted classes.
results = confusion_matrix(y_true=Y_test, y_pred=y_pred)

# Show the raw counts
print(results)

# Plotting Results

In [None]:
# Wrap the confusion matrix in a DataFrame so the heatmap gets readable
# row (true class) and column (predicted class) labels.
df_cm = pd.DataFrame(
    results,
    index=['True Normal', 'True Fraud'],
    columns=['Pred Normal', 'Pred Fraud'],
)

# Draw on an explicitly created figure/axes so the plot never lands on a
# leftover axes from an earlier cell; `fig` is reused when saving the image.
fig, ax = pyplot.subplots()

# fmt="d" prints the cell counts as plain integers; seaborn's default
# annotation format (".2g") would show large counts in scientific notation.
cm = sns.heatmap(df_cm, annot=True, fmt="d", ax=ax)
ax.set_title("Confusion matrix")

# pyplot.show() renders the figure on the notebook's inline backend, where
# Figure.show() would only emit a "non-GUI backend" warning.
pyplot.show()

# Saving Model

In [None]:
# Directory layout for the exported model artifacts.
model_dir = "fraud_batch_model"
images_dir = os.path.join(model_dir, "images")

# Create the images directory (and its parent model directory) if missing;
# already-existing directories are left untouched.
os.makedirs(name=images_dir, exist_ok=True)

In [None]:
# Directory where the model artifacts are written
model_dir = "fraud_batch_model"

# Create the directory if needed. makedirs(exist_ok=True) is a no-op when the
# directory already exists and also creates missing parents, avoiding the
# check-then-create pattern.
os.makedirs(model_dir, exist_ok=True)

# Persist the whole fitted pipeline (preprocessor + XGBoost classifier) so the
# saved artifact can score raw feature rows directly.
# NOTE: joblib files are pickles — only load them from trusted sources.
joblib.dump(xgb_classifier, os.path.join(model_dir, "xgb_classifier.pkl"))

# Save the confusion matrix heatmap as an image in the model directory
fig.savefig(os.path.join(model_dir, "confusion_matrix.png"))