## Importing the Dependencies

In [1]:
import pandas as pd 
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


### Loading the DataSet to Pandas DataFrame

In [2]:
df = pd.read_csv('Fraud.csv')

In [3]:
# First five rows of the dataframe 

df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [4]:
# Last five rows of the dataframe 

df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [5]:
# Dataset information

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


In [18]:
# checking the missng values in each column 

In [6]:
df.isnull().sum()

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

In [4]:
df['isFlaggedFraud'].value_counts()

0    6362604
1         16
Name: isFlaggedFraud, dtype: int64

In [7]:
# Disttribution of normal and fraudlennt transaction

df['isFraud'].value_counts()

0    6354407
1       8213
Name: isFraud, dtype: int64

#### The Dataset is highly unalanced 
 
 
#### 0--> Normal Transaction
#### 1 --> Fraudlent Transaction

In [8]:
# Seperating the data for Analysis

In [14]:
normal = df[df.isFraud==0]
fraud = df[df.isFraud==1]

In [15]:
print(normal.shape,fraud.shape)

(6354407, 11) (8213, 11)


In [30]:
# Statical measures of the data

normal.amount.describe()


count            6354407.0
mean    178197.04172739814
std      596236.9813471739
min                   0.01
25%              13368.395
50%               74684.72
75%              208364.76
max            92445516.64
Name: amount, dtype: float64

In [32]:
#pd.set_option('display.float_format', str) to suppress scientific notation

fraud.amount.describe()


count               8213.0
mean     1467967.299140387
std     2404252.9472401612
min                    0.0
25%              127091.33
50%              441423.44
75%             1517771.48
max             10000000.0
Name: amount, dtype: float64

In [43]:
# compare the values for both transaction

In [37]:
df.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,243.23566306029812,178197.04172740743,832828.7117272627,855970.2281088118,1101420.87456938,1224925.6845631606,0.0
1,368.4138560818215,1467967.2991403872,1649667.6057116764,192392.63183611343,544249.6190746378,1279707.6171459882,0.0019481310118105


####  As our dataset is highly unbalanced so to counter this problem i will be using under-sampling method.

 Build a sample Dataset containing similar distribution of Normal transactions and Fraudlent transactions 

number of fraudlent transactions = 8213

In [38]:
normal_sample = normal.sample(n = 8213)

concatenating two Dataframe

In [39]:
new_data = pd.concat([normal_sample,fraud], axis = 0)

In [40]:
new_data.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
5481674,379,CASH_OUT,187721.03,C923391563,558.0,0.0,C1362851575,39076.81,226797.84,0,0
4716612,332,PAYMENT,24740.84,C2090805706,0.0,0.0,M1610988397,0.0,0.0,0,0
4724267,332,PAYMENT,4189.27,C250687407,31172.0,26982.73,M630568725,0.0,0.0,0,0
3004963,232,CASH_OUT,484976.82,C903961012,697537.0,212560.18,C2009468024,5008624.73,5493601.55,0,0
1682212,159,CASH_OUT,56239.37,C1341211289,62206.0,5966.63,C743557930,122028.27,178267.64,0,0


Converting Categorical Data to Numerical Data using get_dummies method

In [41]:
df2 = new_data.copy()
df2 = pd.get_dummies(df2,columns=['type']) # converting string to numerical data

In [42]:
df2.tail()

Unnamed: 0,step,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
6362615,743,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0,0,1,0,0,0
6362616,743,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0,0,0,0,0,1
6362617,743,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0,0,1,0,0,0
6362618,743,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0,0,0,0,0,1
6362619,743,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0,0,1,0,0,0


In [48]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

vif_data = pd.DataFrame()
vif_data["feature"] = X.columns
  
# calculating VIF for each feature
vif_data["VIF"] = [variance_inflation_factor(X.values, i)
                          for i in range(len(X.columns))]
  
print(vif_data)


           feature                VIF
0             step  1.063912931444907
1           amount 28.184601544304087
2    oldbalanceOrg  76.25115520842311
3   newbalanceOrig   45.5389086631395
4   oldbalanceDest 14.539205607604334
5   newbalanceDest 16.403415542875333
6     type_CASH_IN 1.6048691926089769
7    type_CASH_OUT  2.387344013530083
8       type_DEBIT  1.004126231397208
9     type_PAYMENT  1.276754432244521
10   type_TRANSFER 2.3139823824259795


In [43]:
df2['isFraud'].value_counts()

0    8213
1    8213
Name: isFraud, dtype: int64

In [44]:
df2.groupby('isFraud').mean()

Unnamed: 0_level_0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,isFlaggedFraud,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
isFraud,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
0,243.8784853281383,176140.45478387922,829716.2819152563,852196.9117314015,1121813.1239912335,1243607.762172166,0.0,0.2218434189699257,0.3500547911847071,0.0057226348471934,0.3400706197491781,0.0823085352489954
1,368.4138560818215,1467967.2991403872,1649667.6057116764,192392.63183611343,544249.6190746378,1279707.6171459882,0.0019481310118105,0.0,0.5011567027882625,0.0,0.0,0.4988432972117375


Splitting the Data into Features and Target

In [45]:
X = df2.drop(columns = ['isFraud','isFlaggedFraud','nameOrig','nameDest'],axis = 1)

In [46]:
Y = df2['isFraud']

In [89]:
X

Unnamed: 0,step,amount,oldbalanceOrg,newbalanceOrig,oldbalanceDest,newbalanceDest,type_CASH_IN,type_CASH_OUT,type_DEBIT,type_PAYMENT,type_TRANSFER
4499878,325,83824.67,27195.0,0.0,0.0,83824.67,0,1,0,0,0
5663570,396,18030.18,62751.0,44720.82,0.0,0.0,0,0,0,1,0
1825391,163,324560.98,50371.0,374931.98,47523.41,0.0,1,0,0,0,0
603342,34,22995.47,121303.0,98307.53,0.0,0.0,0,0,0,1,0
480588,19,59921.31,0.0,0.0,8436582.09,8472128.77,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,339682.13,339682.13,0.0,0.0,339682.13,0,1,0,0,0
6362616,743,6311409.28,6311409.28,0.0,0.0,0.0,0,0,0,0,1
6362617,743,6311409.28,6311409.28,0.0,68488.84,6379898.11,0,1,0,0,0
6362618,743,850002.52,850002.52,0.0,0.0,0.0,0,0,0,0,1


In [90]:
print(Y)

4499878    0
5663570    0
1825391    0
603342     0
480588     0
          ..
6362615    1
6362616    1
6362617    1
6362618    1
6362619    1
Name: isFraud, Length: 16426, dtype: int64


Split the data into Training and Testing Data

In [117]:
 X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.33, random_state=42)

In [118]:
 print(X.shape,X_test.shape,X_train.shape)

(16426, 11) (5421, 11) (11005, 11)


Model Training

Using Logistics Regression Model

In [119]:
model = LogisticRegression()

In [120]:
model.fit(X_train,Y_train)

LogisticRegression()

getting the accuracy score 

In [124]:
# Accuracy score of Training DataSet

In [129]:
train_score = model.score(X_train,Y_train)
print(' Accuracy score of Training Model :',train_score)

 Accuracy score of Training Model : 0.9147660154475239


In [125]:
# Accuracy score of Testing DataSet

In [130]:
test_score = model.score(X_test,Y_test)
print(' Accuracy score of Testing Model :',test_score)

 Accuracy score of Testing Model : 0.9096107729201255
