In [1]:
# Standard libraries generally used
%matplotlib inline
import matplotlib.pyplot as plt
from IPython.display import display
import numpy as np
import pandas as pd
import seaborn as sns; sns.set(style="ticks", color_codes=True)
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import average_precision_score, precision_recall_curve
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
import scipy.stats
from sklearn.model_selection import RandomizedSearchCV


In [2]:
import torch

print(f'PyTorch version= {torch.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')  # CUDA optional - True when GPU present, CUDA installed

PyTorch version= 1.12.1
CUDA available= False


In [3]:
#****************** UPLOADING RELEVANT FILES *****************
df1 = pd.read_csv('/Users/mukulsherekar/Documents/Modeling Complex System/creditcard.csv')


In [4]:
# data check and basic properties
df1.head()
print(f'shape of orginal data set is {df1.shape}')
print(f'Number of rows (data points) are: {df1.shape[0]}')
print(f'Number of columns (features) are: {df1.shape[1]}')
print(df1.head())
print(df1.dtypes)
print(df1.isnull().any())
df1["is_duplicate"]= df1.duplicated()
print(f"#total= {len(df1)}")
print(f"#duplicated= {len(df1[df1['is_duplicate']==True])}")
print(df1.isna().sum())

shape of orginal data set is (284807, 31)
Number of rows (data points) are: 284807
Number of columns (features) are: 31
   Time        V1        V2        V3        V4        V5        V6        V7  \
0   0.0 -1.359807 -0.072781  2.536347  1.378155 -0.338321  0.462388  0.239599   
1   0.0  1.191857  0.266151  0.166480  0.448154  0.060018 -0.082361 -0.078803   
2   1.0 -1.358354 -1.340163  1.773209  0.379780 -0.503198  1.800499  0.791461   
3   1.0 -0.966272 -0.185226  1.792993 -0.863291 -0.010309  1.247203  0.237609   
4   2.0 -1.158233  0.877737  1.548718  0.403034 -0.407193  0.095921  0.592941   

         V8        V9  ...       V21       V22       V23       V24       V25  \
0  0.098698  0.363787  ... -0.018307  0.277838 -0.110474  0.066928  0.128539   
1  0.085102 -0.255425  ... -0.225775 -0.638672  0.101288 -0.339846  0.167170   
2  0.247676 -1.514654  ...  0.247998  0.771679  0.909412 -0.689281 -0.327642   
3  0.377436 -1.387024  ... -0.108300  0.005274 -0.190321 -1.175575  0.647

In [5]:
# replacing missing values with means
mean_V23 = df1['V23'].mean()
mean_V24 = df1['V24'].mean()
mean_V25 = df1['V25'].mean()
mean_V26 = df1['V26'].mean()
mean_V27 = df1['V27'].mean()
mean_V28 = df1['V28'].mean()

# Impute
df1['V23'] = df1['V23'].fillna(mean_V23)
df1['V24'] = df1['V24'].fillna(mean_V24)
df1['V25'] = df1['V25'].fillna(mean_V25)
df1['V26'] = df1['V26'].fillna(mean_V26)
df1['V27'] = df1['V27'].fillna(mean_V27)
df1['V28'] = df1['V28'].fillna(mean_V28)



In [6]:
df1.isna().sum()

Time            0
V1              0
V2              0
V3              0
V4              0
V5              0
V6              0
V7              0
V8              0
V9              0
V10             0
V11             0
V12             0
V13             0
V14             0
V15             0
V16             0
V17             0
V18             0
V19             0
V20             0
V21             0
V22             0
V23             0
V24             0
V25             0
V26             0
V27             0
V28             0
Amount          0
Class           0
is_duplicate    0
dtype: int64

In [7]:
df1 = df1.dropna(axis=0, subset=['Amount'])

In [8]:
df1 = df1.dropna(axis=0, subset=['Class'])

In [9]:
# checking for unique values
for f in list(df1.columns.values):
    
        print(df1[f].unique())

[0.00000e+00 1.00000e+00 2.00000e+00 ... 1.72787e+05 1.72788e+05
 1.72792e+05]
[-1.35980713  1.19185711 -1.35835406 ...  1.91956501 -0.24044005
 -0.53341252]
[-0.07278117  0.26615071 -1.34016307 ... -0.30125385  0.53048251
 -0.18973334]
[ 2.53634674  0.16648011  1.77320934 ... -3.24963981  0.70251023
  0.70333737]
[ 1.37815522  0.44815408  0.37977959 ... -0.55782812  0.68979917
 -0.50627124]
[-0.33832077  0.06001765 -0.50319813 ...  2.63051512 -0.37796113
 -0.01254568]
[ 0.46238778 -0.08236081  1.80049938 ...  3.0312601   0.62370772
 -0.64961669]
[ 0.23959855 -0.07880298  0.79146096 ... -0.29682653 -0.68617999
  1.57700625]
[ 0.0986979   0.08510165  0.24767579 ...  0.70841718  0.67914546
 -0.41465041]
[ 0.36378697 -0.25542513 -1.51465432 ...  0.43245405  0.39208671
  0.48617951]
[ 0.09079417 -0.16697441  0.20764287 ... -0.48478176 -0.39912565
 -0.91542665]
[-0.55159953  1.61272666  0.62450146 ...  0.41161374 -1.93384882
 -1.04045834]
[-0.61780086  1.06523531  0.06608369 ...  0.06311886

In [10]:
# *************************Question #2******************************************************
# counting class labels
labels = df1['Class'].values
frauds = []
for i in range(len(labels)):
  if labels[i] == 1:
    frauds.append(labels[i])

valids = []
for i in range(len(labels)):
  if labels[i] == 0:
    valids.append(labels[i])


print(f'the number of fraud transactions are {len(frauds)} out of {len(labels)} i.e {len(frauds)/len(labels)*100}%')
print(f'the number of valid transactions are {len(valids)} out of {len(labels)} i.e {len(valids)/len(labels)*100}%')

# Since the class is higly imbalanced, AUPRC should be an evaluation metric


the number of fraud transactions are 492 out of 284807 i.e 0.1727485630620034%
the number of valid transactions are 284315 out of 284807 i.e 99.82725143693798%


In [11]:
# ******************** Question #3 ***********************************************************

# Decistion Tree (DT) and Random Forest (RF) require no scaling needed because they scale invariant
# normalization means scaling to [0,1] which is a special case of min-max scaling
# standardization is more practical specially for optimization algorithms like gradient descent
# for logistic regression and SVM
# standardization means centering feature columns at mean 0 and SD=1 (imp for learing weights)
# DT & RF = no normalization or standarzation
# SVM and MLPC require standadization

In [12]:
# ******************************** Question #4 *************************************************************

# *********************************SETTING UP X & Y ********************************************************

dfX = df1.loc[:, df1.columns != 'Class']
dfy = df1.loc[:, df1.columns == 'Class'].values.ravel()

# Sanity check
print(f'N={len(dfX)}, M={len(dfX.columns)}')


# Set our main data structures X and y
X = dfX.values
y = dfy

N=284807, M=31


In [13]:
# splitting 50-50 for training and testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=0)


In [14]:
# 10-fold CV evaluation of a classifier
def eval_classifier(_clf, _X, _y):
    accuracies = []
    kf = StratifiedKFold(n_splits=10, shuffle=False, random_state=None)
    for train_index, test_index in kf.split(_X, _y):
        _clf.fit(_X[train_index], _y[train_index])
        y_pred = _clf.predict(_X[test_index])
        #average_precision += [average_precision_score(_y[test_index], y_pred)]
        precision, recall, thresholds = precision_recall_curve(_y[test_index], y_pred)
        auc_precision_recall = auc(recall, precision)
    return auc_precision_recall

In [15]:
# Evaluating the four classifiers wihtout pruning or regularization
# Decision Tree
auprc = eval_classifier(DecisionTreeClassifier(), X, y)
print(f'AUPRC of Decision Tree accuracy={auprc}')

AUPRC of Decision Tree accuracy=0.7249798959846111


In [16]:
# Random Forest
auprc = eval_classifier(RandomForestClassifier(), X, y)
print(f'AUPRC of Random Forest={auprc}')


AUPRC of Random Forest=0.8223097104010144


In [17]:
# standardizing dataset for SVC and MLPC usiing pipeline


In [18]:
# SVM
pipe_svm = make_pipeline(StandardScaler(),SVC())
                      
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
auprc = auc(recall, precision)
print(f'AUPRC of SVM={auprc}')


AUPRC of SVM=0.7700734295322329


In [19]:
# Neural Network

pipe_nn = make_pipeline(StandardScaler(), MLPClassifier())
                     
pipe_nn.fit(X_train, y_train)
y_pred = pipe_nn.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
auprc = auc(recall, precision)
print(f'AUPRC of MLPC ={auprc}')

AUPRC of MLPC =0.8437719410332563


In [20]:
# Evaluating classifiers with pruning and regularization

# Checking values of C for SVM

# SVM
pipe_svm = make_pipeline(StandardScaler(),SVC(C=0.1))
                      
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
auprc_low = auc(recall, precision)
print(f'AUPRC of SVM with C=0.1 is {auprc_low}')

pipe_svm = make_pipeline(StandardScaler(),SVC(C=0.5))
                      
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
auprc_med = auc(recall, precision)
print(f'AUPRC of SVM with C=0.5={auprc_med}')

pipe_svm = make_pipeline(StandardScaler(),SVC(C=1))
                      
pipe_svm.fit(X_train, y_train)
y_pred = pipe_svm.predict(X_test)
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
auprc_high = auc(recall, precision)
print(f'AUPRC of SVM with C=1 is {auprc_high}')
print(f'Best regularization parameter for SVC is {max(auprc_low, auprc_med, auprc_high)}')

AUPRC of SVM with C=0.1 is 0.5378270429357478
AUPRC of SVM with C=0.5=0.7347151969364136
AUPRC of SVM with C=1 is 0.7700734295322329
Best regularization parameter for SVC is 0.7700734295322329


In [22]:
# Grid Search for regularization parameter for Neural Network
param_range = [0.1,1]
np.random.seed(1)
#param_range.rvs(10)

param_grid = [{'mlpclassifier__alpha': param_range}]
    
rs = RandomizedSearchCV(estimator=pipe_nn, 
                  param_distributions=param_grid, 
                  scoring='accuracy', 
                  refit=True,
                  n_iter=10,
                  cv=10,
                  n_jobs=-1)
    
rs = rs.fit(X_train, y_train)

print(f'Better regularization parameter for MLPC is {rs.best_params_} with a score of {rs.best_score_}')




Better regularization parameter for MLPC is {'mlpclassifier__alpha': 0.1} with a score of 0.9994382140819867


In [24]:
# Pruning for Decision Tree
auprc_low = eval_classifier(DecisionTreeClassifier(ccp_alpha=0.1), X, y)
print(f'AUPRC of Decision Tree with pruning parameter 0.1 is {auprc_low}')

auprc_med = eval_classifier(DecisionTreeClassifier(ccp_alpha=0.5), X, y)
print(f'AUPRC of Decision Tree with pruning parameter 0.5 is {auprc_med}')

auprc_high = eval_classifier(DecisionTreeClassifier(ccp_alpha=1), X, y)
print(f'AUPRC of Decision Tree with pruning parameter 0.5 is {auprc_high}')

print(f'The best pruning parameter performance for Decision Tree classifier is {max(auprc_low, auprc_med, auprc_high)}')



AUPRC of Decision Tree with pruning parameter 0.1 is 0.5008602528089887
AUPRC of Decision Tree with pruning parameter 0.5 is 0.5008602528089887
AUPRC of Decision Tree with pruning parameter 0.5 is 0.5008602528089887
The best pruning parameter performance for Decision Tree classifier is 0.5008602528089887


In [25]:
# ************** Question: 5 & 6 PyTorch Neural Netowork ************************************
print(f'PyTorch version= {torch.__version__}')
print(f'CUDA available= {torch.cuda.is_available()}')  # CUDA optional - True when GPU present, CUDA installed

PyTorch version= 1.12.1
CUDA available= False


In [26]:
# setting up scaled X and y
X_tr_sc = StandardScaler().fit_transform(X_train)
X_ts_sc = StandardScaler().fit_transform(X_test)

In [27]:
class PyTorchMLP(torch.nn.Module):  # One hidden layer
    def __init__(self, n_hidden=10, epochs=100, eta=0.001, minibatch_size=50, seed=0):
        super(PyTorchMLP, self).__init__()
        self.random = np.random.RandomState(seed)  # shuffle mini batches
        self.n_hidden = n_hidden  # size of the hidden layer
        self.epochs = epochs  # number of iterations
        self.eta = eta  # learning rate
        self.minibatch_size = minibatch_size  # size of training batch - 1 would not work
        self.optimizer = None
        self.loss_func = torch.nn.CrossEntropyLoss()
        self.model = None

    def init_layers(self, _M:int, _K:int) -> None:
        # data structure
        self.model = torch.nn.Sequential(
            torch.nn.Linear(_M, self.n_hidden),
            torch.nn.Sigmoid(),
            torch.nn.Linear(self.n_hidden, self.n_hidden),
            torch.nn.Sigmoid(),
            torch.nn.Linear(self.n_hidden, _K),
        )
    
    def predict(self, _X):
        _X = torch.FloatTensor(_X)
        assert self.model is not None
        self.model.eval()
        with torch.no_grad():
            y_pred = np.argmax(self.model(_X), axis=1)
        self.model.train()
        return y_pred.numpy()

    def fit(self, _X_train, _y_train, info=False):
        import sys
        _X_train, _y_train = torch.FloatTensor(_X_train), torch.LongTensor(_y_train)
        n_features= _X_train.shape[1]
        n_output= np.unique(_y_train).shape[0]  # number of class labels
        
        self.init_layers(n_features, n_output)
        self.optimizer = torch.optim.Rprop(self.model.parameters(), lr=self.eta)  # connect model to optimizer

        for i in range(self.epochs):
            indices = np.arange(_X_train.shape[0])
            self.random.shuffle(indices)  # shuffle the data each epoch

            for start_idx in range(0, indices.shape[0] - self.minibatch_size + 1, self.minibatch_size):
                batch_idx = indices[start_idx:start_idx + self.minibatch_size]
                self.optimizer.zero_grad()
                
                net_out = self.model(_X_train[batch_idx])
                
                loss = self.loss_func(net_out, _y_train[batch_idx])
                loss.backward()
                self.optimizer.step()
                
                if info:
                    sys.stderr.write(f"\r{i+1:03d} Loss: {loss.item():6.5f}")
                    sys.stderr.flush()
        return self

In [28]:
# A derived class to have Dropout
class MLP2(PyTorchMLP):
    def init_layers(self, _M, _K):
        self.model = torch.nn.Sequential(
            torch.nn.Linear(_M, self.n_hidden),
            torch.nn.Sigmoid(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.n_hidden, self.n_hidden),
            torch.nn.ReLU(),
            torch.nn.Dropout(0.1),
            torch.nn.Linear(self.n_hidden, _K),
        )

In [29]:
# Train 
mlp1 = PyTorchMLP(n_hidden=10, epochs=100, eta=0.001, minibatch_size=X_tr_sc.shape[0]).fit(X_tr_sc,y_train)
mlp2 = MLP2(n_hidden=10, epochs=2000, eta=0.0001, minibatch_size=X_tr_sc.shape[0]).fit(X_tr_sc,y_train)

# Predict
y_pred_mlp1 = mlp1.predict(X_ts_sc)
y_pred_mlp2 = mlp2.predict(X_ts_sc)

precision1, recall1, thresholds1 = precision_recall_curve(y_test, y_pred_mlp1)
auprc_mlp1 = auc(recall1, precision1)
print(f'AUPRC using PyTorchMLP without dropout ={auprc_mlp1}')

precision2, recall2, thresholds2 = precision_recall_curve(y_test, y_pred_mlp2)
auprc_mlp2 = auc(recall2, precision2)
print(f'AUPRC using PyTorchMLP with dropout ={auprc_mlp2}')

AUPRC using PyTorchMLP without dropout =0.796767135062383
AUPRC using PyTorchMLP with dropout =0.7811755082670109


In [30]:
# Question: 6 Comparison of 10-fold Cross validation of Random Forrest & two PyTorch neural networks

# The AUPRC score of RF is 0.82 while those of Neural Netowrks are 0.79 and 0.78. Inspite of being more powerful
# classifier than RF, scores are lower. This could be due to overfitting in RF or underfitting in neural netowrks.
# Also, neural networks might need two hidden layers instead of 1. This could be due to amount of data suited for RF
# because smaller data sets might not suit neural network and can hamper its performance.

# Random Forrest is a machine learning algorithm while Neural Netowrk is a deep learning algorithm. RF is an ensemlbe
# of Decision Tree where is tree is independent. Neural Network is a collection of layers that are interconnected
# and cannot be separated. Maybe for this dataset, is more suited for 
