In [1]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn import metrics
from datetime import datetime

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
# Draw matplotlib text, axis labels and tick labels in white (presumably for a dark
# notebook theme). Comment this cell out if you don't want that.
COLOR = 'white'
matplotlib.rcParams.update({
    'text.color': COLOR,
    'axes.labelcolor': COLOR,
    'xtick.color': COLOR,
    'ytick.color': COLOR,
})

In [None]:
# Read the training transactions CSV into a DataFrame (path is relative to the notebook).
df_transaction = pd.read_csv(
    filepath_or_buffer='../datasets/ieee-fraud-detection/train_transaction.csv',
)

# Preview the first rows as the cell output.
df_transaction.head()

In [None]:
# Columns kept from the raw table. 'isFraud' is the prediction target; the rest are model inputs.
features = [
    'isFraud',
    'TransactionDT',
    'TransactionAmt',
    'ProductCD',
    'P_emaildomain',
    'R_emaildomain',
]

In [None]:
# Restrict the raw table to the selected columns, then preview the result.
df = df_transaction.loc[:, features]
df.head(n=5)

In [None]:
# Column the model is trained to predict.
target = 'isFraud'

# Inputs that get label-encoded further down.
# NOTE(review): TransactionDT looks like a time offset rather than a category -
# confirm that label-encoding it is intended.
cat = [
    'TransactionDT',
    'ProductCD',
    'P_emaildomain',
    'R_emaildomain',
]

# Inputs used as raw numeric values.
num = ['TransactionAmt']

In [None]:
# Discard every row that has a missing value in any of the selected columns,
# then pull the target column out as a NumPy array.
df = df.dropna(axis=0, how='any')
y = df.loc[:, target].values

In [None]:
# Split the inputs into two NumPy arrays: categorical columns and numeric columns.
x_cat, x_num = (df.filter(items=columns).values for columns in (cat, num))

In [None]:
# Replace each categorical column, in place, with integer codes.
# The same encoder object is re-fitted for every column, so after the loop it only
# remembers the classes of the last column.
labelencoder_X = LabelEncoder()
for col_idx in range(len(cat)):
    raw_column = x_cat[:, col_idx]
    x_cat[:, col_idx] = labelencoder_X.fit_transform(raw_column)

In [None]:
# Assemble the model input matrix X: encoded categorical columns first, numeric columns last.
X = np.concatenate([x_cat, x_num], axis=1)

In [None]:
# Hold out 20% of the rows as the test set ...
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
# ... then hold out 20% of the remainder for validation
# (roughly 64% train / 16% validation / 20% test overall).
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

In [None]:
# Cast the training and validation arrays to float32 and turn the labels into
# column vectors of shape (n_samples, 1).
X_train = np.asarray(X_train, dtype='float32')
y_train = np.asarray(y_train, dtype='float32').reshape(-1, 1)
X_val = np.asarray(X_val, dtype='float32')
y_val = np.asarray(y_val, dtype='float32').reshape(-1, 1)

In [None]:
# Sanity check: shapes of the training inputs and labels.
(X_train.shape, y_train.shape)

In [None]:
# Baseline network: two hidden ReLU layers of 128 units and a single sigmoid output
# unit for the binary fraud / not-fraud decision.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation=tf.nn.relu),    # first hidden layer
    tf.keras.layers.Dense(128, activation=tf.nn.relu),    # second hidden layer
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),   # output layer
])

# Adam optimizer, binary cross-entropy loss (the target is binary), and accuracy
# reported as the metric.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Three passes (epochs) over the training data.
# The loss and accuracy reported during fitting are measured on the training data:
# a lower loss is good, but it may also indicate overfitting.
model.fit(X_train, y_train, epochs=3)

In [None]:
# Loss and accuracy on the validation split, i.e. on rows the model was not fitted on.
val_loss, val_acc = model.evaluate(x=X_val, y=y_val)
print(val_loss, val_acc)

In [None]:
# Same setup with a third hidden layer.
# Observation recorded by the author: adding the extra dense layer decreases loss and increases accuracy.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(X_train, y_train, epochs=3)

# Score the deeper model on the validation split.
val_loss, val_acc = model.evaluate(x=X_val, y=y_val)
print(val_loss, val_acc)

In [None]:
# Second experiment: rerun the whole pipeline with one extra categorical input, card4.
features = [
    'isFraud',
    'TransactionDT',
    'TransactionAmt',
    'ProductCD',
    'P_emaildomain',
    'R_emaildomain',
    'card4',
]

df = df_transaction.loc[:, features]
df.head()  # not rendered: only the final expression of a cell is displayed

target = 'isFraud'
cat = ['TransactionDT', 'ProductCD', 'P_emaildomain', 'R_emaildomain', 'card4']
num = ['TransactionAmt']

# Keep complete rows only, then extract the target vector.
df = df.dropna()
y = df.loc[:, target].values

# Categorical and numeric inputs as separate NumPy arrays.
x_cat, x_num = (df.filter(items=columns).values for columns in (cat, num))

# Label-encode each categorical column in place (the encoder is re-fitted per column).
labelencoder_X = LabelEncoder()
for col_idx in range(len(cat)):
    x_cat[:, col_idx] = labelencoder_X.fit_transform(x_cat[:, col_idx])

# Input matrix: encoded categorical columns first, numeric columns last.
X = np.concatenate([x_cat, x_num], axis=1)

# 20% held out for test, then 20% of the remainder held out for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=0
)

# float32 inputs and (n_samples, 1) float32 label columns.
X_train = np.asarray(X_train, dtype='float32')
y_train = np.asarray(y_train, dtype='float32').reshape(-1, 1)
X_val = np.asarray(X_val, dtype='float32')
y_val = np.asarray(y_val, dtype='float32').reshape(-1, 1)

(X_train.shape, y_train.shape)  # not rendered mid-cell

# Same two-hidden-layer architecture as the baseline model.
model = tf.keras.Sequential([
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(128, activation=tf.nn.relu),
    tf.keras.layers.Dense(1, activation=tf.nn.sigmoid),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.fit(X_train, y_train, epochs=3)

val_loss, val_acc = model.evaluate(x=X_val, y=y_val)
print(val_loss, val_acc)

# Observation recorded by the author: with the same number of layers, adding card4
# does not noticeably improve the model's predictive performance.

# Feature Importance
We are going to determine which feature is the most important for predicting the target feature "isFraud".

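Methodology:
Use the `get_weights` method of the Dense layer object. This method returns a list of two NumPy arrays: the first contains the weight values and the second contains the bias values for the layer. The weight array has shape (number of inputs, number of units), so each row holds the weights attached to one input.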

Example code:
The methodology below compares each feature against the "isFraud" target feature.

In [None]:
# Feature importance read off the first hidden layer only.
# NOTE: the inputs are not standardized before training, so weight magnitudes are only a
# rough indication of importance - treat the ranking as a heuristic.

# Kernel of the first Dense layer. Keras stores it with shape (n_inputs, n_units):
# row j holds the weights that connect input feature j to every hidden unit.
weights = model.layers[0].get_weights()[0]

# One score per input feature: the total absolute weight leaving that feature,
# i.e. the sum across units (axis=1). Summing over axis=0 would instead give one
# value per hidden unit.
importance_scores = np.abs(weights).sum(axis=1)

# Normalize the scores to sum to 1
importance_scores = importance_scores / importance_scores.sum()

# X was assembled as [categorical columns, numeric columns], so the names have to
# follow that order; features[1:] lists the columns in a different order.
input_feature_names = cat + num

# Print the importance score of each feature
print('Feature Importance Scores:')
for feature, score in zip(input_feature_names, importance_scores):
    print(f'{feature}: {score:.3f}')

In [None]:
# The methodology below computes importance scores layer by layer.

def get_feature_importance(layer):
    """Return normalized importance scores for the inputs of ``layer``.

    The score of an input is the sum of the absolute values of all weights that
    connect it to the layer's units, divided by the total over all inputs.

    Parameters
    ----------
    layer : object with a ``get_weights()`` method whose first element is the
        kernel, laid out as (n_inputs, n_units) - the layout Keras uses for Dense.

    Returns
    -------
    numpy.ndarray of shape (n_inputs,). The values sum to 1, or are all zero when
    every weight in the kernel is zero.
    """
    # Kernel only; the bias (second element, if present) does not depend on the inputs.
    kernel = layer.get_weights()[0]

    # Rows index inputs, columns index units, so the per-input total is taken across
    # columns (axis=1). axis=0 would produce one value per unit instead.
    scores = np.abs(kernel).sum(axis=1)

    # Normalize to sum to 1; skip the division for an all-zero kernel to avoid NaNs.
    total = scores.sum()
    if total == 0:
        return scores
    return scores / total

# X is built as [categorical columns, numeric columns], so this is the order in which
# the features enter the network; features[1:] lists them in a different order.
input_feature_names = cat + num

# Calculate the importance scores for every Dense layer, keyed by layer position
layer_importance = {}
for i, layer in enumerate(model.layers):
    if isinstance(layer, tf.keras.layers.Dense):
        layer_importance[f'layer_{i}'] = get_feature_importance(layer)

# Only the first Dense layer is fed by the dataset features; deeper layers are fed
# by the units of the layer before them, so feature names do not apply there.
first_dense_name = next(iter(layer_importance), None)

# Print the importance scores for each layer
for layer_name, importance_scores in layer_importance.items():
    print(f'{layer_name} Feature Importance Scores:')
    if layer_name == first_dense_name and len(importance_scores) == len(input_feature_names):
        labels = input_feature_names
    else:
        # Generic positional labels whenever the scores cannot be matched to features.
        labels = [f'input_{j}' for j in range(len(importance_scores))]
    # zip pairs labels with scores, so a layer that returns fewer scores than there
    # are feature names can no longer trigger an IndexError.
    for label, score in zip(labels, importance_scores):
        print(f'{label}: {score:.3f}')