## Data Exploration and Data Preprocessing

In [1]:
#importing required libraries

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA, TruncatedSVD
import matplotlib.patches as mpatches
import time
import gc

In [2]:
#Importing Classifier Libraries

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.cluster import KMeans
from sklearn.cluster import SpectralClustering
import collections
from tqdm import tqdm_notebook
from sklearn.preprocessing import LabelEncoder

In [3]:
#importing Other Libraries

from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
#from imblearn.pipeline import make_pipeline as imbalanced_make_pipeline
#from imblearn.over_sampling import SMOTE
#from imblearn.under_sampling import NearMiss
#from imblearn.metrics import classification_report_imbalanced
from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score, accuracy_score, classification_report
from collections import Counter
from sklearn.model_selection import KFold, StratifiedKFold

#to ignore warnings
import warnings
warnings.filterwarnings("ignore")

In [4]:
#imporing dataset into dataframe using pandas

df_original = pd.read_csv('train_transaction.csv')
df_original.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,ProductCD,card1,card2,card3,card4,card5,...,V330,V331,V332,V333,V334,V335,V336,V337,V338,V339
0,2987000,0,86400,68.5,W,13926,,150.0,discover,142.0,...,,,,,,,,,,
1,2987001,0,86401,29.0,W,2755,404.0,150.0,mastercard,102.0,...,,,,,,,,,,
2,2987002,0,86469,59.0,W,4663,490.0,150.0,visa,166.0,...,,,,,,,,,,
3,2987003,0,86499,50.0,W,18132,567.0,150.0,mastercard,117.0,...,,,,,,,,,,
4,2987004,0,86506,50.0,H,4497,514.0,150.0,mastercard,102.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
#dataset columns

df_original.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt',
       'ProductCD', 'card1', 'card2', 'card3', 'card4', 'card5',
       ...
       'V330', 'V331', 'V332', 'V333', 'V334', 'V335', 'V336', 'V337', 'V338',
       'V339'],
      dtype='object', length=394)

In [6]:
#dats inspection to check null values present in features

df_original.isnull().sum()

TransactionID          0
isFraud                0
TransactionDT          0
TransactionAmt         0
ProductCD              0
                   ...  
V335              508189
V336              508189
V337              508189
V338              508189
V339              508189
Length: 394, dtype: int64

In [7]:
#dimensionality of the DataFrame

df_original.shape

(590540, 394)

### Here, original dataset have 394 features. All attributes do not provide the essential information. We need to drop unnecessary features.

1. The columns having unique values less than 2
2. The columns with more than 70% of the null values

In [8]:
gc.collect()

one_value_cols = [col for col in df_original.columns if df_original[col].nunique() <= 2]
many_null_cols = [col for col in df_original.columns if df_original[col].isnull().sum() / df_original.shape[0] > 0.7]
big_top_value_cols = [col for col in df_original.columns if df_original[col].value_counts(dropna=False, normalize=True).values[0] > 0.7]

cols_to_drop = list(set(many_null_cols  + big_top_value_cols  + one_value_cols))
cols_to_drop.remove('isFraud')

print('{} features are going to be dropped for being useless'.format(len(cols_to_drop)))

train = df_original.drop(cols_to_drop, axis=1)
train.head()

312 features are going to be dropped for being useless


Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card4,card5,addr1,dist1,...,V91,V96,V99,V127,V130,V282,V283,V285,V307,V310
0,2987000,0,86400,68.5,13926,,discover,142.0,315.0,19.0,...,0.0,1.0,0.0,117.0,0.0,1.0,1.0,0.0,117.0,0.0
1,2987001,0,86401,29.0,2755,404.0,mastercard,102.0,325.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,visa,166.0,330.0,287.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,mastercard,117.0,476.0,,...,0.0,48.0,10.0,1758.0,354.0,0.0,0.0,10.0,1758.0,354.0
4,2987004,0,86506,50.0,4497,514.0,mastercard,102.0,420.0,,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [9]:
#dimensionality after dropping  DataFrame

train.shape

(590540, 82)

### Here we have dropped 312 columns. Only 82 attributes are left. 

### Reducing memory usage

In [10]:
#function to reduce memory usage

def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: 
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

### One-hot encoding

In [11]:
#using tqdm_notebook to show progress meter

for col in tqdm_notebook(train.columns): 
    if train[col].dtype == 'object':
        le = LabelEncoder()
        le.fit(list(train[col].astype(str).values))
        train[col] = le.transform(list(train[col].astype(str).values))

HBox(children=(IntProgress(value=0, max=82), HTML(value='')))




In [12]:
train = reduce_mem_usage(train)

Mem. usage decreased to 94.61 Mb (74.4% reduction)


In [13]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card4,card5,addr1,dist1,...,V91,V96,V99,V127,V130,V282,V283,V285,V307,V310
0,2987000,0,86400,68.5,13926,,1,142.0,315.0,19.0,...,0.0,1.0,0.0,117.0,0.0,1.0,1.0,0.0,117.0,0.0
1,2987001,0,86401,29.0,2755,404.0,2,102.0,325.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,4,166.0,330.0,287.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,2,117.0,476.0,,...,0.0,48.0,10.0,1758.0,354.0,0.0,0.0,10.0,1758.0,354.0
4,2987004,0,86506,50.0,4497,514.0,2,102.0,420.0,,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [14]:
train.dropna(axis=1, how='any', thresh=250000, subset=None, inplace=False)
train.columns

Index(['TransactionID', 'isFraud', 'TransactionDT', 'TransactionAmt', 'card1',
       'card2', 'card4', 'card5', 'addr1', 'dist1', 'P_emaildomain', 'C1',
       'C2', 'C5', 'C6', 'C9', 'C11', 'C13', 'C14', 'D1', 'D2', 'D3', 'D4',
       'D5', 'D10', 'D11', 'D15', 'M4', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7',
       'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V19', 'V20', 'V29', 'V30',
       'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V42', 'V43', 'V44', 'V45',
       'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52', 'V53', 'V54', 'V61',
       'V62', 'V69', 'V70', 'V75', 'V76', 'V82', 'V83', 'V90', 'V91', 'V96',
       'V99', 'V127', 'V130', 'V282', 'V283', 'V285', 'V307', 'V310'],
      dtype='object')

In [15]:
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card4,card5,addr1,dist1,...,V91,V96,V99,V127,V130,V282,V283,V285,V307,V310
0,2987000,0,86400,68.5,13926,,1,142.0,315.0,19.0,...,0.0,1.0,0.0,117.0,0.0,1.0,1.0,0.0,117.0,0.0
1,2987001,0,86401,29.0,2755,404.0,2,102.0,325.0,,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,4,166.0,330.0,287.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,2,117.0,476.0,,...,0.0,48.0,10.0,1758.0,354.0,0.0,0.0,10.0,1758.0,354.0
4,2987004,0,86506,50.0,4497,514.0,2,102.0,420.0,,...,,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [16]:
train = train.fillna((train.mean()))
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card4,card5,addr1,dist1,...,V91,V96,V99,V127,V130,V282,V283,V285,V307,V310
0,2987000,0,86400,68.5,13926,362.5,1,142.0,315.0,19.0,...,0.0,1.0,0.0,117.0,0.0,1.0,1.0,0.0,117.0,0.0
1,2987001,0,86401,29.0,2755,404.0,2,102.0,325.0,118.5,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,4,166.0,330.0,287.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,2,117.0,476.0,118.5,...,0.0,48.0,10.0,1758.0,354.0,0.0,0.0,10.0,1758.0,354.0
4,2987004,0,86506,50.0,4497,514.0,2,102.0,420.0,118.5,...,0.42041,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [17]:
train = train.dropna()
train.head()

Unnamed: 0,TransactionID,isFraud,TransactionDT,TransactionAmt,card1,card2,card4,card5,addr1,dist1,...,V91,V96,V99,V127,V130,V282,V283,V285,V307,V310
0,2987000,0,86400,68.5,13926,362.5,1,142.0,315.0,19.0,...,0.0,1.0,0.0,117.0,0.0,1.0,1.0,0.0,117.0,0.0
1,2987001,0,86401,29.0,2755,404.0,2,102.0,325.0,118.5,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
2,2987002,0,86469,59.0,4663,490.0,4,166.0,330.0,287.0,...,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
3,2987003,0,86499,50.0,18132,567.0,2,117.0,476.0,118.5,...,0.0,48.0,10.0,1758.0,354.0,0.0,0.0,10.0,1758.0,354.0
4,2987004,0,86506,50.0,4497,514.0,2,102.0,420.0,118.5,...,0.42041,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0


In [18]:
label = train['isFraud']
amount = train['TransactionAmt']
time = train['TransactionDT']
train.drop(['isFraud', 'TransactionAmt', 'TransactionDT'], axis = 1, inplace = True)
train.columns

Index(['TransactionID', 'card1', 'card2', 'card4', 'card5', 'addr1', 'dist1',
       'P_emaildomain', 'C1', 'C2', 'C5', 'C6', 'C9', 'C11', 'C13', 'C14',
       'D1', 'D2', 'D3', 'D4', 'D5', 'D10', 'D11', 'D15', 'M4', 'V2', 'V3',
       'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11', 'V12', 'V13', 'V19',
       'V20', 'V29', 'V30', 'V35', 'V36', 'V37', 'V38', 'V39', 'V40', 'V42',
       'V43', 'V44', 'V45', 'V46', 'V47', 'V48', 'V49', 'V50', 'V51', 'V52',
       'V53', 'V54', 'V61', 'V62', 'V69', 'V70', 'V75', 'V76', 'V82', 'V83',
       'V90', 'V91', 'V96', 'V99', 'V127', 'V130', 'V282', 'V283', 'V285',
       'V307', 'V310'],
      dtype='object')

### Data Normalization

In [19]:
#importing StandardScaler from sklearn.preprocessing

from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [20]:
# Fit on training set only.
scaler.fit(train)

# Apply transform to both the training set and the test set.
train_img = scaler.transform(train)

In [21]:
train = pd.DataFrame(train_img)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,69,70,71,72,73,74,75,76,77,78
0,-1.732048,0.821695,-0.000349,-2.187837,-1.393797,0.252981,-0.421234,0.443858,-0.098021,-0.09226,...,-0.812137,-0.049845,-0.328818,-0.051825,-0.291777,0.198535,0.005707,-0.355732,-0.066413,-0.334838
1,-1.732042,-1.457558,0.264666,-1.222461,-2.367142,0.357241,-4e-06,-0.524573,-0.098021,-0.09226,...,-0.812137,-0.074703,-0.328818,-0.079436,-0.291777,0.198535,0.005707,-0.355732,-0.093053,-0.334838
2,-1.732036,-1.068263,0.813853,0.708291,-0.809791,0.409371,0.713335,0.685966,-0.098021,-0.09226,...,-0.812137,-0.074703,-0.328818,-0.079436,-0.291777,0.198535,0.005707,-0.355732,-0.093053,-0.334838
3,-1.73203,1.679858,1.305567,-1.222461,-2.002138,1.931567,-4e-06,1.775451,-0.090534,-0.066398,...,-0.812137,1.118474,3.345186,0.335428,0.828927,-0.887373,-0.636532,2.690803,0.307226,0.667998
4,-1.732024,-1.102133,0.967114,-1.222461,-2.367142,1.347711,-4e-06,-0.524573,-0.098021,-0.09226,...,-8.3e-05,-0.074703,-0.328818,-0.079436,-0.291777,0.198535,0.005707,-0.355732,-0.093053,-0.334838


### Dimensionality Reduction using PCA

In [22]:
from sklearn.decomposition import PCA
# Make an instance of the Model
pca = PCA(.95)

In [23]:
pca.fit(train)
train_img = pca.transform(train)

In [24]:
train = pd.DataFrame(train_img)
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,30,31,32,33,34,35,36,37,38,39
0,1.142189,-2.407466,-0.315179,0.472726,-1.871187,-0.170313,-0.24781,-0.146323,0.145267,-0.588239,...,0.213837,-0.954562,-0.123068,-0.420971,-1.126948,1.320462,-1.430072,0.590573,-0.227249,-0.271183
1,2.635649,-0.180293,-0.298392,-0.389802,1.197492,-1.549432,-0.760506,-1.528241,2.554283,1.151469,...,-1.174306,0.022074,0.072964,0.046722,0.432892,0.054302,-0.163587,0.407974,0.911103,0.436231
2,0.790683,-3.870323,0.346263,0.704673,-0.256064,-1.403109,0.075089,0.659549,0.013285,-0.37929,...,-1.293386,0.754749,-0.065459,-0.479369,-0.247473,-0.687304,-0.045933,0.602694,0.85636,0.272771
3,0.142006,-3.741732,-0.364204,1.853785,1.373877,1.554043,1.993759,-0.448794,-0.947588,-0.103242,...,-0.402596,0.004748,0.133949,0.009899,0.378988,-0.113081,-0.28801,0.056753,0.219213,0.128744
4,0.219761,0.189417,0.115989,-0.708838,-0.567218,-0.155321,-0.147346,-0.431944,0.576049,-0.475766,...,-1.128536,0.177251,0.006774,-0.024601,0.03092,-0.06117,-0.042112,-0.145206,0.063432,-0.33805


In [25]:
train['isFraud'] = label
train['Amount'] = amount
train['Time'] = time
train.columns

Index([        0,         1,         2,         3,         4,         5,
               6,         7,         8,         9,        10,        11,
              12,        13,        14,        15,        16,        17,
              18,        19,        20,        21,        22,        23,
              24,        25,        26,        27,        28,        29,
              30,        31,        32,        33,        34,        35,
              36,        37,        38,        39, 'isFraud',  'Amount',
          'Time'],
      dtype='object')

In [26]:
train.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,33,34,35,36,37,38,39,isFraud,Amount,Time
0,1.142189,-2.407466,-0.315179,0.472726,-1.871187,-0.170313,-0.24781,-0.146323,0.145267,-0.588239,...,-0.420971,-1.126948,1.320462,-1.430072,0.590573,-0.227249,-0.271183,0,68.5,86400
1,2.635649,-0.180293,-0.298392,-0.389802,1.197492,-1.549432,-0.760506,-1.528241,2.554283,1.151469,...,0.046722,0.432892,0.054302,-0.163587,0.407974,0.911103,0.436231,0,29.0,86401
2,0.790683,-3.870323,0.346263,0.704673,-0.256064,-1.403109,0.075089,0.659549,0.013285,-0.37929,...,-0.479369,-0.247473,-0.687304,-0.045933,0.602694,0.85636,0.272771,0,59.0,86469
3,0.142006,-3.741732,-0.364204,1.853785,1.373877,1.554043,1.993759,-0.448794,-0.947588,-0.103242,...,0.009899,0.378988,-0.113081,-0.28801,0.056753,0.219213,0.128744,0,50.0,86499
4,0.219761,0.189417,0.115989,-0.708838,-0.567218,-0.155321,-0.147346,-0.431944,0.576049,-0.475766,...,-0.024601,0.03092,-0.06117,-0.042112,-0.145206,0.063432,-0.33805,0,50.0,86506


In [27]:
train.to_csv('train_data.csv')