# Credit Card Fraud Analytics
## Chapter 7
### Predictive Analytics for the Modern Enterprise 

This is a Jupyter notebook that can be used to follow along with the code examples for Chapter 7, Credit Card Fraud Analysis. We will focus on TensorFlow examples in this notebook.

The notebook has been tested using the following prerequisites:

- Python V3.9.13 - https://www.python.org/
- Anaconda Navigator V3 for Python 3.9 - https://www.anaconda.com/
- Jupyter - V6.4.12 - https://jupyter.org/
- Desktop computer - macOS Ventura V13.1

Documentation reference for Scikit Learn: https://scikit-learn.org/stable/

### Pre-requisites


In order to start using TensorFlow, we need to install TensorFlow 2.0 in Python. Use the following command:

```bash 
pip3 install tensorflow
```

- Original Dataset can be found here: [Dataset 1 - Credit Card fraud data on Kaggle ](https://www.kaggle.com/mlg-ulb/creditcardfraud)
- Local copy of the dataset can be downloaded here: https://github.com/paforme/predictiveanalytics/blob/main/Chapter7/Datasets/creditcard.csv.zip

In [None]:
#Data processing imports
import numpy as np
import pandas as pd
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

#Tensorflow imports
import tensorflow as tflow
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras import losses
from tensorflow.keras import metrics
from tensorflow.keras.callbacks import EarlyStopping

#Visualization imports
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
import plotly.express as px

#Utilities
import os, tempfile, datetime

In [None]:
#Load the Credit Card dataset
#Point this path at the location where you downloaded and unzipped the dataset
url = "./Datasets/creditcard.csv"
data_df = pd.read_csv(filepath_or_buffer=url) #read the CSV into a dataframe
data_df #last expression of the cell, so the notebook displays the dataframe

In [None]:
#Remove the Time column. Time is meaningful in the context of the user or
#the terminal, but since we don't know much about the other columns, their
#relationship with time is not relevant here.
#drop() returns a new dataframe, so data_df itself is left untouched.
data_cc = data_df.drop(columns=['Time'])

In [None]:
#Use this code if you want to explore the features.
#The loop will be CPU heavy for the machine running the notebook.

#Columns to plot: the first 28 columns of data_cc
features = data_cc.iloc[:,0:28].columns

#plotly's `labels` argument renames columns, not the values inside a column,
#so the 0/1 class values are translated to readable names before plotting
class_names = data_cc['Class'].map({0: "Legitimate", 1: "Fraudulent"})

for f in features: #One histogram per feature

    #Data frame with the feature and the corresponding readable class name
    graph = pd.DataFrame({str(f): data_cc[f], "Class": class_names})

    #Plot the feature distribution, coloured by class, with a marginal box plot
    fig = px.histogram(
        graph,
        x=str(f),
        title="Feature " + str(f) + " Distribution",
        color="Class",
        marginal="box",
    )

    #Semi-transparent bars so the two classes stay visible where they overlap
    fig.update_traces(opacity=0.75)
    fig.show()

In [None]:
#Separate the target column from the input columns
labels = data_cc[['Class']]
features = data_cc.drop(columns=['Class'])

#Create Training, Validation and Test sets
#Both splits use the same hold-out fraction and the same seed
split_options = dict(test_size=0.20, random_state=110)

#First hold out 20% of all rows as the test set ...
train_features, test_features, train_label, test_label = train_test_split(
    features, labels, **split_options)

#... then hold out 20% of the remaining rows as the validation set
train_features, val_features, train_label, val_label = train_test_split(
    train_features, train_label, **split_options)


In [None]:
#The Normalization layer from tensorflow is used later on instead of the code below
#Scale the features with Mean 0 and Standard deviation as 1 - so they can be interpreted on the same scale.
#scaler = StandardScaler()
#train_features = scaler.fit_transform(train_features) #Fit the scaler on training data
#val_features = scaler.transform(val_features)
#test_features = scaler.transform(test_features)

#Clip features to remove outliers
#train_features = np.clip(train_features, -5, 5)
#val_features = np.clip(val_features, -5, 5)
#test_features = np.clip(test_features, -5, 5)

In [None]:
#Measure the imbalance between the two classes
class_column = data_cc['Class']
fraud = int((class_column == 1).sum()) #rows labelled 1
legit = int((class_column == 0).sum()) #rows labelled 0
total = len(data_cc)

for caption, count in (
    ("Total fraudulent transactions: ", fraud),
    ("Total legitimate transactions: ", legit),
    ("Total transactions: ", total),
):
    print(caption, str(count))

In [None]:
#Initial bias = log of the fraud-to-legit ratio, used to improve training speed
initial_bias = np.log([fraud/legit])

#Wrap the bias in a keras constant initializer
bias = keras.initializers.Constant(initial_bias)

#Metrics used to measure the performance of the model
MON = [
    metrics.AUC(name='prc', curve='PR'),
    metrics.Precision(name='precision'),
    metrics.Recall(name='recall'),
    metrics.BinaryAccuracy(name='accuracy'),
]

In [None]:
#Pre-processing layer which normalizes continuous features;
#it is adapted on the training features only
normalizer = tflow.keras.layers.Normalization(axis=1)
normalizer.adapt(np.array(train_features))

#Build the model as an ordered stack of layers
n_inputs = train_features.shape[-1]
cc_model = tflow.keras.Sequential([
    normalizer, #Pre-processing layer
    layers.Dense(16, activation='relu', input_shape=(n_inputs,)),
    #layers.Dense(16, activation='relu', input_shape=(n_inputs,)), #Uncomment this for Multiple Hidden Layer NN
    layers.Dropout(0.5),
    #Single sigmoid output whose bias starts at the precomputed initial bias
    layers.Dense(1, activation='sigmoid', bias_initializer=bias),
])

In [None]:
# Training parameters
learning_rate = 0.001
CYCLES = 100
BATCH = 2048

# Compile the model with the Adam optimizer and binary cross-entropy loss
adam = optimizers.Adam(learning_rate=learning_rate)
cc_model.compile(optimizer=adam, loss=losses.BinaryCrossentropy(), metrics=MON)

# Early Stopping callback
early_stopping_options = {
    'monitor': 'val_prc',          # Watch the validation precision-recall curve
    'verbose': 1,                  # Verbosity level
    'patience': 10,                # Number of epochs with no improvement
    'mode': 'max',                 # The monitored value should be maximized
    'restore_best_weights': True,  # Restore the best model weights
}
early_stopping = EarlyStopping(**early_stopping_options)

In [None]:
#Look at what the model looks like:
#print the summary of cc_model
cc_model.summary()

In [None]:
#Store the model's initial weights in a freshly created temporary folder
weights_dir = tempfile.mkdtemp()
init_weights = os.path.join(weights_dir, 'iw')
cc_model.save_weights(init_weights)

In [None]:
#Go back to the saved initial weights (use this on a second run)
cc_model.load_weights(init_weights)

#Initialize tensorboard callback with a time-stamped log directory
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/pafme/{run_stamp}"
tensorboard_callback = tflow.keras.callbacks.TensorBoard(
    log_dir=log_dir, histogram_freq=1)

#Fit the model on the training data, validating on the validation set
history = cc_model.fit(
    x=train_features,
    y=train_label,
    validation_data=(val_features, val_label),
    epochs=CYCLES,
    batch_size=BATCH,
    callbacks=[early_stopping, tensorboard_callback],
)

In [None]:
#Load the tensorboard notebook extension and open it on the logs/pafme folder
%load_ext tensorboard
%tensorboard --logdir logs/pafme

In [None]:
#Generate predictions for the test dataset with the baseline model (cc_model)
test_predictions_baseline = cc_model.predict(test_features, batch_size=BATCH)

In [None]:
#Function to create a confusion matrix and test summary
#def plot_matrix(actual, predictions, threshold=0.5):
#    matrix = confusion_matrix(actual, predictions > threshold)
#    plt.figure(figsize=(4,4))
#    myplot = sns.heatmap(matrix, annot=True, fmt="d", cmap="YlGnBu")

#    plt.title('Actual/Prediction @{:.2f}'.format(threshold))
#    plt.ylabel('Actual')
#    plt.xlabel('Prediction')
    
#    print('\033[1m' + 'Fraud summary (val = 1): ' '\033[0m')
#    print('- Total: ', np.sum(matrix[1])),
#    print('- Detected: ', matrix[1][1]),
#    print('- Missed: ', matrix[1][0]),
    
#    print('\033[1m' + '\nLegit summary (val = 0): ' '\033[0m')
#    print('- Total: ', np.sum(matrix[0])),
#    print('- Detected: ', matrix[0][0]),
#    print('- Missed: ', matrix[0][1])    


In [None]:
#plot_matrix(test_label, test_predictions_baseline)

In [None]:
def plot_matrix(actual, predictions, threshold=0.5):
    """Plot a confusion matrix heatmap and print a per-class summary.

    actual: true class labels (0 = legitimate, 1 = fraud).
    predictions: predicted scores; a score above `threshold` counts as fraud.
    threshold: decision threshold applied to `predictions`.
    """
    #Flatten both inputs so column vectors (a one-column dataframe, or the
    #two-dimensional output of predict) are handled like 1-D sequences
    actual_flat = np.ravel(actual)
    predicted_class = (np.ravel(predictions) > threshold).astype(int)

    #Pin the label order so the matrix is always 2x2, even when one of the
    #classes is absent from the data being evaluated; without it the row and
    #column lookups below would fail on a 1x1 matrix
    matrix = confusion_matrix(actual_flat, predicted_class, labels=[0, 1])

    sns.set(font_scale=1.2)
    plt.figure(figsize=(8, 6))
    sns.heatmap(matrix, annot=True, fmt="d", cmap="YlGnBu",
                xticklabels=['Legitimate', 'Fraud'],
                yticklabels=['Legitimate', 'Fraud'])
    plt.title('CC Fraud Transactions (Threshold: {:.2f})'.format(threshold))
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()

    #Rows are the actual class, columns the predicted class
    fraud_total, fraud_detected, fraud_missed = matrix[1].sum(), matrix[1][1], matrix[1][0]
    legit_total, legit_detected, legit_missed = matrix[0].sum(), matrix[0][0], matrix[0][1]

    print('\033[1m' + 'Fraud summary (actual = 1): ' '\033[0m')
    print('- Total: ', fraud_total)
    print('- Detected: ', fraud_detected)
    print('- Missed: ', fraud_missed)

    print('\033[1m' + '\nLegitimate summary (actual = 0): ' '\033[0m')
    print('- Total: ', legit_total)
    print('- Detected: ', legit_detected)
    print('- Missed: ', legit_missed)

#Confusion matrix for the baseline model on the test set
plot_matrix(test_label, test_predictions_baseline)

### Introducing Class weights

In [None]:
#Weight the classes to handle imbalance of data
#class-weight = n_samples_total / (n_classes * n_samples_class)
n_classes = 2
weight = {
    0: total / (legit * n_classes),
    1: total / (fraud * n_classes),
}
legit_weight, fraud_weight = weight[0], weight[1]

print('Weight (legit) class 0:', legit_weight)
print('Weight (fraud) class 1:', fraud_weight)


In [None]:
#Step 1 - Define initial bias
bias = tflow.keras.initializers.Constant(initial_bias)

#Step 2 - Build the model
#The architecture must mirror cc_model, because init_weights was saved from
#cc_model and is loaded into this model below
weighted_model = tflow.keras.Sequential()
weighted_model.add(normalizer) #Add a pre-processing layer
weighted_model.add(layers.Dense(16, activation='relu',input_shape=(
    train_features.shape[-1],)))
#Uncomment this for Multiple Hidden Layer NN. Enable the matching layer in
#cc_model as well and save init_weights again, otherwise the saved weights
#no longer fit this model.
#weighted_model.add(layers.Dense(16, activation='relu'))
weighted_model.add(layers.Dropout(0.5))
weighted_model.add(keras.layers.Dense(1, activation='sigmoid',
                                      bias_initializer=bias))
#Start from the same initial weights as the baseline model
weighted_model.load_weights(init_weights)

In [None]:
#Step 3 - Setup Tensorboard
#Time-stamped Tensorboard log directory for this run
run_stamp = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_dir = f"logs/pafme/{run_stamp}"
tensorboard_callback = tflow.keras.callbacks.TensorBoard(
    log_dir=log_dir, histogram_freq=1)

#Step 4 - Compile the model
weighted_model.compile(
    optimizer=optimizers.Adam(learning_rate=0.001),
    loss=losses.BinaryCrossentropy(),
    metrics=MON,
)

#Reset to the saved initial weights before training
weighted_model.load_weights(init_weights)

#Step 5 - Fit the model
fit_options = dict(
    batch_size=BATCH,
    epochs=CYCLES,
    callbacks=[early_stopping, tensorboard_callback],
    validation_data=(val_features, val_label),
    class_weight=weight, #Class weights are passed here
)
weighted_history = weighted_model.fit(train_features, train_label, **fit_options)

In [None]:
#Load tensorboard
#Load the notebook extension, then open tensorboard on the logs/pafme folder
%load_ext tensorboard
%tensorboard --logdir logs/pafme

In [None]:
#Make predictions on the test dataset with the class-weighted model
test_predictions_weighted = weighted_model.predict(test_features, batch_size=BATCH)

In [None]:
#Plot the confusion matrix and summary for the weighted model's test predictions
plot_matrix(test_label, test_predictions_weighted)