# Baseline Model

## Mount the drive


In [1]:
# Mount Google Drive at /content/drive so the project files stored there can be read and written from this Colab runtime
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Importing the Libraries

In [2]:
# Import the libraries
import pandas as pd
import numpy as np

In [3]:
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve
from sklearn.metrics import precision_score, recall_score, roc_auc_score, roc_curve
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import auc,RocCurveDisplay,PrecisionRecallDisplay

In [4]:
from sklearn.model_selection import cross_validate
from sklearn.model_selection import StratifiedKFold

## Importing the Helper functions

In [5]:
# Copy the helper modules from the mounted Drive into the working directory so that they can be imported below.
# NOTE(review): only helper_functions_ml is imported in the cells shown here; helper_functions.py is copied but not used.
!cp /content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Notebooks/helper_functions.py helper_functions.py
!cp /content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Notebooks/helper_functions_ml.py helper_functions_ml.py

In [7]:
# Import the required functions from the helper function file
from helper_functions_ml import createMetricsDF,computeModelMetrics


## Import the Data (feature engineered dataframe)

In [8]:
# Copy the feature-engineered training data (df.csv) from the mounted Drive into the working directory
!cp '/content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Classification_products/df.csv' 'df.csv'

In [9]:
# Read the training data and preview the first rows
df_train = pd.read_csv('df.csv')
print(df_train.head())

# 'Unnamed: 0' looks like the row index that was saved along with the CSV
# (its values mirror the index in the preview above); it is not a feature, so drop it.
df_train = df_train.drop(columns=['Unnamed: 0'])
print(df_train.shape)

   Unnamed: 0  TransactionID  isFraud  TransactionDT  TransactionAmt  \
0           0        2987000        0          86400        1.835691   
1           1        2987001        0          86401        1.462398   
2           2        2987002        0          86469        1.770852   
3           3        2987003        0          86499        1.698970   
4           4        2987004        0          86506        1.698970   

   ProductCD  card1  card2  card3  card4  ...  id_36  id_37  id_38  \
0          4  13926   -1.0  150.0      1  ...      2      2      2   
1          4   2755  404.0  150.0      2  ...      2      2      2   
2          4   4663  490.0  150.0      4  ...      2      2      2   
3          4  18132  567.0  150.0      2  ...      2      2      2   
4          1   4497  514.0  150.0      2  ...      0      1      1   

   DeviceType  DeviceInfo  P_emaildomain_addr1_card1  card1_card2  \
0           1        1735                      63363         3655   
1       

## Baseline model

In [None]:
# Baseline (majority-class) scenario: the model predicts every transaction as non-fraud
# y_baseline_train = np.zeros(len(y_train))
# y_baseline_dev = np.zeros(len(y_dev))


## Training, Testing and Evaluation of the model 

In [11]:
# Evaluate the majority-class baseline on every fold of a stratified K-fold split of the training data

## Creating the X (features) and y (target variable)
X = df_train.drop(columns=['isFraud'])
y = df_train['isFraud']

## Stratified K-fold
# We chose stratified K-fold because stratification addresses the imbalance present in the
# data set: it ensures the proportions of the two classes stay the same in both the train
# and dev sets. Also, since we have very little data for one class and a lot of features,
# different splits can lead to different values of the chosen metric. K-fold improves the
# stability of the evaluation by computing the metrics on each of the three splits.
numFolds = 3
skf = StratifiedKFold(n_splits=numFolds)

metrics = {'baseline': []}
train_ids = []
dev_ids = []
for train, dev in skf.split(X, y):
  # skf.split yields positional row numbers, so select with .iloc; plain y[...] would do a
  # label lookup and only works while the DataFrame keeps its default 0..n-1 index.
  y_train = y.iloc[train]
  y_dev = y.iloc[dev]

  # The baseline "model" always predicts the majority class (0 = not fraud)
  y_baseline_train = np.zeros(len(y_train))
  y_baseline_dev = np.zeros(len(y_dev))

  # There is nothing to train, so the constant baseline vector is passed both as the
  # predicted labels and as the predicted probabilities and compared with the actual labels.
  # NOTE(review): argument order (y_pred, y_pred_proba, y_true) is taken from the original
  # comment; confirm against helper_functions_ml.computeModelMetrics.
  # Because no positive is ever predicted, precision/F1 are undefined; scikit-learn reports
  # them as 0.0 and emits the `_warn_prf` warnings seen in this cell's output.
  train_baseline_acc, train_baseline_f1, train_baseline_prec, train_baseline_rec, train_baseline_rocauc = \
    computeModelMetrics(y_baseline_train, y_baseline_train, y_train)
  dev_baseline_acc, dev_baseline_f1, dev_baseline_prec, dev_baseline_rec, dev_baseline_rocauc = \
    computeModelMetrics(y_baseline_dev, y_baseline_dev, y_dev)

  baseline_metrics_df = createMetricsDF(train_baseline_acc, train_baseline_f1,
                                        train_baseline_prec, train_baseline_rec, train_baseline_rocauc,
                                        dev_baseline_acc, dev_baseline_f1,
                                        dev_baseline_prec, dev_baseline_rec, dev_baseline_rocauc)

  # Remember the indices of this fold so the same splits can be saved and reused later
  train_ids.append(train)
  dev_ids.append(dev)

  metrics['baseline'].append(baseline_metrics_df)

# One list of index arrays per split type, one entry per fold
train_dev_indices = {'train': train_ids, 'dev': dev_ids}


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
# Persist the fold indices to disk so that the same splits can be reused for all models.
# pickle is Python's built-in object serialization format.
import pickle

fileName = f'train_dev_indices_{numFolds}.pickle'
with open(fileName, mode='wb') as pickle_file:
    pickle.dump(train_dev_indices, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [13]:
# Show the baseline metrics: a list with one DataFrame per fold, each holding a training row and a development row
metrics['baseline']

[  Dataset Type  accuracy  f1 score  precision  recall  roc auc score
 0     training  0.965011       0.0        0.0     0.0            0.5
 1  development  0.965008       0.0        0.0     0.0            0.5,
   Dataset Type  accuracy  f1 score  precision  recall  roc auc score
 0     training  0.965011       0.0        0.0     0.0            0.5
 1  development  0.965008       0.0        0.0     0.0            0.5,
   Dataset Type  accuracy  f1 score  precision  recall  roc auc score
 0     training  0.965008       0.0        0.0     0.0            0.5
 1  development  0.965013       0.0        0.0     0.0            0.5]

In [15]:
# Copy the pickled fold indices into the project folder on the mounted Drive.
# The file name depends on numFolds, so both paths are built in Python and the copy is done with shutil.
import shutil  # standard-library file copy utilities

srcFileName = f'train_dev_indices_{numFolds}.pickle'
destFileName = f'/content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Classification_products/{srcFileName}'
shutil.copyfile(srcFileName, destFileName)

'/content/drive/MyDrive/PCMALAI_UCBerkeley_Capstone/Classification_products/train_dev_indices_3.pickle'

## Summary

In this notebook we built the baseline model that serves as the benchmark against which the performance of the other models will be evaluated. The baseline model classifies every transaction by always choosing the majority class (not fraud). This is why the baseline model achieves an accuracy score of ~96%, which is simply the percentage of observations that belong to the majority class (non-fraudulent transactions).

Hence any other model that we build must surpass this accuracy score and also achieve a high recall score, even if that comes at the cost of some precision. Since the data is extremely imbalanced, the ROC-AUC score along with the recall score will be used as the evaluation metrics, with accuracy tracked only to make sure the model performs better than the baseline model.

Recall is an important metric for this project because it indicates the ability of a model to find all the relevant cases (fraudulent transactions in our case) within a data set. Mathematically, recall is the number of true positives divided by the number of true positives plus the number of false negatives: $\text{Recall} = \frac{TP}{TP + FN}$. Hence, as the number of false negatives increases and/or the number of true positives decreases, the model's recall score drops.

