Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [3]:
#first five rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
# dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 59511 entries, 0 to 59510
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    59511 non-null  int64  
 1   V1      59511 non-null  float64
 2   V2      59511 non-null  float64
 3   V3      59511 non-null  float64
 4   V4      59511 non-null  float64
 5   V5      59511 non-null  float64
 6   V6      59511 non-null  float64
 7   V7      59511 non-null  float64
 8   V8      59511 non-null  float64
 9   V9      59511 non-null  float64
 10  V10     59511 non-null  float64
 11  V11     59511 non-null  float64
 12  V12     59511 non-null  float64
 13  V13     59511 non-null  float64
 14  V14     59511 non-null  float64
 15  V15     59511 non-null  float64
 16  V16     59510 non-null  float64
 17  V17     59510 non-null  float64
 18  V18     59510 non-null  float64
 19  V19     59510 non-null  float64
 20  V20     59510 non-null  float64
 21  V21     59510 non-null  float64
 22

In [9]:
# checking the number of missing values in each coloumn
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       1
V17       1
V18       1
V19       1
V20       1
V21       1
V22       1
V23       1
V24       1
V25       1
V26       1
V27       1
V28       1
Amount    1
Class     1
dtype: int64

In [10]:
# distribution of legit transaction and fraudlent transactions
credit_card_data['Class'].value_counts()

0.0    59348
1.0      162
Name: Class, dtype: int64

This dataset is highly unbalanced

0 ----> normal transactions

1 --> fraudlent transactions

In [11]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [12]:
print(legit.shape)
print(fraud.shape)

(59348, 31)
(162, 31)


In [13]:
#statistical measures of data
legit.Amount.describe()

count    59348.000000
mean        95.423013
std        270.229826
min          0.000000
25%          7.690000
50%         25.985000
75%         87.440000
max      19656.530000
Name: Amount, dtype: float64

In [14]:
fraud.Amount.describe()

count     162.000000
mean       93.565988
std       224.658775
min         0.000000
25%         1.000000
50%         4.245000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [15]:
#compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,31756.653956,-0.220094,-0.024129,0.716642,0.156674,-0.246262,0.108918,-0.094662,0.045945,0.072867,...,0.047016,-0.030245,-0.10724,-0.039144,0.006999,0.135867,0.01986,0.001431,0.004203,95.423013
1.0,28583.783951,-7.086497,5.090192,-9.571784,5.554124,-5.23016,-2.157161,-7.462325,3.43351,-3.352556,...,0.424429,0.851337,-0.20449,-0.262548,-0.080078,0.238594,0.124826,0.566296,0.036929,93.565988


Under - Sampling


Build a sample dataset containing similar distribution of normal transactions and fradulent transactions

In [16]:
legit_sample = legit.sample(n = 162)

Conacatenating two DataFrames


In [17]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [18]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
43755,41641,1.428597,-0.907639,-0.058047,-1.370968,-1.000969,-0.820064,-0.473898,-0.275356,-2.253413,...,-0.450328,-0.991395,0.143926,0.038056,0.266546,-0.425621,0.014823,0.019815,52.96,0.0
34953,37894,0.442845,1.492794,-1.605712,1.625162,1.35248,-1.869859,1.326924,-0.861025,0.126721,...,-0.239459,-0.062221,0.204475,0.046044,-0.485114,-0.485301,-0.415024,-0.468926,0.89,0.0
58549,48440,1.314607,-0.184961,0.120973,-0.463546,-0.768848,-1.214965,-0.25669,-0.190032,-1.080099,...,-0.271235,-0.912385,0.281809,0.476955,0.056322,-0.64452,0.02019,0.043905,20.0,0.0
52813,45656,-1.171559,1.745159,1.441715,-0.047893,-0.30012,-0.571349,-0.029771,-2.498441,-0.393213,...,2.098413,-1.220703,0.347408,0.680678,-0.14062,0.065747,0.376654,0.150813,2.69,0.0
35491,38143,-0.273273,1.224363,0.975346,-0.151459,0.564503,-0.46974,0.786911,-0.099961,-0.552009,...,-0.304431,-0.72704,-0.142015,-0.467661,-0.001782,0.119243,0.254813,0.092547,0.99,0.0


In [19]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
57248,47826,-0.887287,1.390002,1.219686,1.661425,1.009228,-0.733908,0.855829,7.7e-05,-1.275631,...,-0.083734,-0.34693,-0.050619,0.231044,-0.45076,-0.376205,0.034504,0.157775,7.58,1.0
57470,47923,0.364377,1.443523,-2.220907,2.036985,-1.237055,-1.728161,-2.058582,0.358895,-1.393306,...,0.40273,-0.132129,-0.032977,0.460861,0.560404,0.409366,0.539668,0.296918,0.76,1.0
57615,47982,-1.232804,2.244119,-1.703826,1.492536,-1.192985,-1.68611,-1.864612,0.856122,-1.973535,...,0.560475,0.165682,-0.013754,0.474935,-0.218725,0.302809,0.466031,0.250134,0.76,1.0
58422,48380,-2.790771,-1.464269,1.031165,1.921356,-0.090014,-0.483871,0.780731,-0.348776,0.609133,...,-0.392667,0.44002,0.777659,0.418552,0.244563,-0.159361,0.06054,0.356958,208.58,1.0
58761,48533,1.243848,0.524526,-0.538884,1.209196,0.479538,-0.197429,0.049166,0.037792,0.128119,...,-0.05166,-0.084089,-0.192846,-0.917392,0.681953,-0.194419,0.045917,0.040136,1.0,1.0


In [20]:
new_dataset['Class'].value_counts()

0.0    162
1.0    162
Name: Class, dtype: int64

In [21]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,32514.759259,-0.243978,0.031728,0.794961,0.341114,-0.212988,0.181539,-0.201063,-0.051174,0.034777,...,-0.027455,0.04218,-0.061652,-0.003204,0.016102,0.106899,0.018731,0.017528,0.00121,81.019815
1.0,28583.783951,-7.086497,5.090192,-9.571784,5.554124,-5.23016,-2.157161,-7.462325,3.43351,-3.352556,...,0.424429,0.851337,-0.20449,-0.262548,-0.080078,0.238594,0.124826,0.566296,0.036929,93.565988


Splitting the data into features and target

In [25]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [26]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
43755  41641  1.428597 -0.907639 -0.058047 -1.370968 -1.000969 -0.820064   
34953  37894  0.442845  1.492794 -1.605712  1.625162  1.352480 -1.869859   
58549  48440  1.314607 -0.184961  0.120973 -0.463546 -0.768848 -1.214965   
52813  45656 -1.171559  1.745159  1.441715 -0.047893 -0.300120 -0.571349   
35491  38143 -0.273273  1.224363  0.975346 -0.151459  0.564503 -0.469740   
...      ...       ...       ...       ...       ...       ...       ...   
57248  47826 -0.887287  1.390002  1.219686  1.661425  1.009228 -0.733908   
57470  47923  0.364377  1.443523 -2.220907  2.036985 -1.237055 -1.728161   
57615  47982 -1.232804  2.244119 -1.703826  1.492536 -1.192985 -1.686110   
58422  48380 -2.790771 -1.464269  1.031165  1.921356 -0.090014 -0.483871   
58761  48533  1.243848  0.524526 -0.538884  1.209196  0.479538 -0.197429   

             V7        V8        V9  ...       V20       V21       V22  \
43755 -0.4738

In [27]:
print(Y)

43755    0.0
34953    0.0
58549    0.0
52813    0.0
35491    0.0
        ... 
57248    1.0
57470    1.0
57615    1.0
58422    1.0
58761    1.0
Name: Class, Length: 324, dtype: float64


Split the data into training data and testing data

In [29]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y,random_state=2)

In [30]:
print(X.shape, X_train.shape, X_test.shape)

(324, 30) (259, 30) (65, 30)


Model Training

In [31]:
model = LogisticRegression()

In [32]:
# training the model with training data
model.fit(X_train, Y_train)

Model Evaluation

In [34]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [35]:
print('Accuracy on training data:', training_data_accuracy)

Accuracy on training data: 0.9498069498069498


In [36]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [37]:
print('Accuracy Score on test Data : ', test_data_accuracy)

Accuracy Score on test Data :  0.9230769230769231
