In [34]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [35]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [36]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [37]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
97137,66085,0.998571,-0.046655,0.644378,1.114925,-0.665698,-0.627024,-0.002057,0.01811,-0.052156,...,-0.171808,-0.784227,0.095822,0.455048,0.167887,-0.72906,0.004417,0.037612,89.0,0.0
97138,66085,-1.326193,0.549467,1.220272,1.286509,0.473532,-0.681876,-0.249255,0.444731,-0.768583,...,0.088777,0.029885,-0.123943,-0.092548,-0.159851,-0.360097,0.318036,0.007246,3.6,0.0
97139,66086,1.230983,-0.22452,-0.345196,0.212802,1.586953,3.997378,-1.145351,1.068038,0.584379,...,0.067612,0.229977,-0.119921,1.019614,0.667317,-0.226637,0.071064,0.028365,1.0,0.0
97140,66086,1.241193,0.767604,-0.210715,1.297487,0.152102,-1.162435,0.389686,-0.321743,-0.288129,...,-0.036601,0.032307,-0.136263,0.308814,0.73834,-0.331821,0.040823,0.054137,1.0,0.0
97141,66087,0.310485,-2.576074,1.002015,0.011196,-2.280745,0.465648,-0.860224,0.156411,0.087629,...,,,,,,,,,,


In [38]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 97142 entries, 0 to 97141
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    97142 non-null  int64  
 1   V1      97142 non-null  float64
 2   V2      97142 non-null  float64
 3   V3      97142 non-null  float64
 4   V4      97142 non-null  float64
 5   V5      97142 non-null  float64
 6   V6      97142 non-null  float64
 7   V7      97142 non-null  float64
 8   V8      97142 non-null  float64
 9   V9      97142 non-null  float64
 10  V10     97142 non-null  float64
 11  V11     97142 non-null  float64
 12  V12     97142 non-null  float64
 13  V13     97142 non-null  float64
 14  V14     97142 non-null  float64
 15  V15     97142 non-null  float64
 16  V16     97142 non-null  float64
 17  V17     97142 non-null  float64
 18  V18     97142 non-null  float64
 19  V19     97141 non-null  float64
 20  V20     97141 non-null  float64
 21  V21     97141 non-null  float64
 22

In [39]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [40]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,96919
1.0,222


This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [41]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [42]:
print(legit.shape)
print(fraud.shape)

(96919, 31)
(222, 31)


In [43]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,96919.0
mean,98.31027
std,265.983851
min,0.0
25%,7.58
50%,26.61
75%,89.345
max,19656.53


In [44]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,222.0
mean,114.488243
std,255.373074
min,0.0
25%,1.0
50%,7.805
75%,99.99
max,1809.68


In [45]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,41730.096658,-0.249732,-0.043534,0.694778,0.151455,-0.269573,0.098435,-0.093862,0.050744,-0.036243,...,0.043675,-0.031999,-0.108137,-0.036618,0.009741,0.131925,0.026549,-0.0007,0.001378,98.31027
1.0,36541.941441,-6.044462,4.134072,-7.932926,4.915738,-4.386432,-1.796113,-6.30049,2.722455,-2.896811,...,0.345305,0.715188,-0.125165,-0.26545,-0.105791,0.205945,0.103589,0.523395,0.037908,114.488243


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [46]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [47]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [48]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
12820,22500,0.020386,-2.141652,0.652948,0.850495,-1.26217,1.14294,-0.307323,0.293975,2.170156,...,0.049778,-0.589089,-0.335565,-0.264663,-0.280469,0.857479,-0.135752,0.085883,555.62,0.0
28338,34960,-0.51957,1.220854,1.883044,2.858822,0.100109,-0.134817,0.319055,-0.09462,-1.293369,...,0.075139,0.18195,0.076458,0.399632,-0.753255,-0.012098,-0.088337,0.240989,5.43,0.0
84480,60327,1.311626,0.075634,-1.292619,-0.483309,2.139444,3.181496,-0.505297,0.80491,-0.074048,...,-0.342791,-1.148403,0.095446,0.946111,0.369783,0.110646,-0.021801,0.023605,1.79,0.0
405,292,1.252189,-0.126779,0.280285,0.579416,-0.374125,-0.215217,-0.193078,0.011076,0.770448,...,-0.360296,-0.959573,-0.023837,-0.462201,0.381732,0.340518,-0.034929,0.007525,23.88,0.0
87747,61818,-1.090234,0.10346,1.664984,-1.861969,-1.019642,-0.329755,-0.700244,0.686723,-1.203913,...,0.513739,1.183572,-0.232971,0.029328,0.162284,-0.179175,0.209487,0.06427,34.0,0.0


In [49]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
95534,65358,1.193916,-0.571085,0.742522,-0.014588,-0.624561,0.832162,-0.83335,0.272897,1.169425,...,-0.049502,0.207265,-0.265272,-0.679294,0.511812,1.246604,-0.028671,-0.006112,31.91,1.0
95597,65385,-2.923827,1.524837,-3.018758,3.289291,-5.755542,2.218276,-0.509995,-3.569444,-1.016592,...,-0.511657,-0.122724,-4.288639,0.563797,-0.949451,-0.204532,1.510206,-0.324706,1354.25,1.0
96341,65728,1.227614,-0.668974,-0.271785,-0.58944,-0.604795,-0.350285,-0.486365,-0.010809,-0.794944,...,-0.026055,-0.295255,-0.180459,-0.436539,0.494649,-0.283738,-0.001128,0.035075,98.01,1.0
96789,65936,-3.593476,0.781442,-1.822448,0.605761,-1.194656,-0.517195,-1.722523,0.12889,0.014963,...,0.351792,0.391249,-0.252875,-0.498042,0.010172,0.909929,-1.478767,0.722673,101.5,1.0
96994,66037,0.286302,1.399345,-1.682503,3.864377,-1.185373,-0.341732,-2.53938,0.768378,-1.547882,...,0.352456,-0.243678,-0.194079,-0.172201,0.742237,0.12779,0.569731,0.291206,7.53,1.0


In [50]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,222


In [51]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,42829.03252,-0.24939,-0.072887,0.593993,0.085749,-0.244931,0.163452,-0.04151,0.057746,-0.122733,...,0.127697,0.016021,-0.12261,0.000993,-0.024737,0.141649,0.002455,0.008706,0.01495,117.192134
1.0,36541.941441,-6.044462,4.134072,-7.932926,4.915738,-4.386432,-1.796113,-6.30049,2.722455,-2.896811,...,0.345305,0.715188,-0.125165,-0.26545,-0.105791,0.205945,0.103589,0.523395,0.037908,114.488243


Splitting the data into Features & Targets

In [52]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [53]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
12820  22500  0.020386 -2.141652  0.652948  0.850495 -1.262170  1.142940   
28338  34960 -0.519570  1.220854  1.883044  2.858822  0.100109 -0.134817   
84480  60327  1.311626  0.075634 -1.292619 -0.483309  2.139444  3.181496   
405      292  1.252189 -0.126779  0.280285  0.579416 -0.374125 -0.215217   
87747  61818 -1.090234  0.103460  1.664984 -1.861969 -1.019642 -0.329755   
...      ...       ...       ...       ...       ...       ...       ...   
95534  65358  1.193916 -0.571085  0.742522 -0.014588 -0.624561  0.832162   
95597  65385 -2.923827  1.524837 -3.018758  3.289291 -5.755542  2.218276   
96341  65728  1.227614 -0.668974 -0.271785 -0.589440 -0.604795 -0.350285   
96789  65936 -3.593476  0.781442 -1.822448  0.605761 -1.194656 -0.517195   
96994  66037  0.286302  1.399345 -1.682503  3.864377 -1.185373 -0.341732   

             V7        V8        V9  ...       V20       V21       V22  \
12820 -0.3073

In [54]:
print(Y)

12820    0.0
28338    0.0
84480    0.0
405      0.0
87747    0.0
        ... 
95534    1.0
95597    1.0
96341    1.0
96789    1.0
96994    1.0
Name: Class, Length: 714, dtype: float64


Split the data into Training data & Testing Data

In [55]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [56]:
print(X.shape, X_train.shape, X_test.shape)

(714, 30) (571, 30) (143, 30)


Model Training

Logistic Regression

In [57]:
# training the Logistic Regression Model with Training Data
# Import Logistic Regression
from sklearn.linear_model import LogisticRegression

# Create the model
model = LogisticRegression()

# Train the model
model.fit(X_train, Y_train)

Model Evaluation

Accuracy Score

In [58]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [59]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.957968476357268


In [60]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [61]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9300699300699301
