# Import the dependencies


In [10]:
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [23]:
#import the dataset

credit_card_data=pd.read_csv("CreditCard.csv")

In [24]:
#show first 5 rows
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [25]:
#show last 5 rows
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [26]:
#check the shape
credit_card_data.shape

(284807, 31)

In [27]:
#dataset information
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [28]:
#checking the number of missing values in dataset
credit_card_data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

In [29]:
#distribution of legit and fraud data collection
credit_card_data['Class'].value_counts()

0    284315
1       492
Name: Class, dtype: int64

# This data set is highly unbalanced

'0' --> Normal transection

'1' --> fraud transection

In [30]:
#seperating the data for analysis
Legit = credit_card_data[credit_card_data.Class == 0]
Fraud = credit_card_data[credit_card_data.Class == 1]  



In [31]:
print(Legit.shape)
print(Fraud.shape)

(284315, 31)
(492, 31)


In [32]:
#statistical measure of data
Legit.Amount.describe()

count    284315.000000
mean         88.291022
std         250.105092
min           0.000000
25%           5.650000
50%          22.000000
75%          77.050000
max       25691.160000
Name: Amount, dtype: float64

In [33]:
Fraud.Amount.describe()

count     492.000000
mean      122.211321
std       256.683288
min         0.000000
25%         1.000000
50%         9.250000
75%       105.890000
max      2125.870000
Name: Amount, dtype: float64

In [34]:
#compare the values of both transection
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Under sampling
Build a dataset containing simillar distribution of legit and fraudulent transection

number of fraudulent transection == 492


In [35]:
Legit_sample=Legit.sample(n=492)

# 
concatanating two datasets

In [37]:
new_dataset=pd.concat([Legit_sample,Fraud],axis=0)

In [38]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
134804,80971.0,-0.634443,1.366636,0.724378,0.625088,1.044836,0.03828,0.699695,0.082398,-0.90238,...,-0.013207,-0.113368,-0.573596,-0.958664,0.597993,-0.149553,-0.045879,0.075302,1.0,0
77742,57190.0,1.146465,0.182321,0.234392,1.16284,-0.348032,-0.957186,0.264813,-0.229182,0.022778,...,0.054641,0.075209,-0.13975,0.392204,0.658677,-0.332807,0.002204,0.028512,54.88,0
25000,33472.0,1.441174,-0.575394,-0.153414,-0.616068,-0.74094,-0.848866,-0.364849,-0.188389,-0.512792,...,0.107262,0.279729,-0.217667,-0.100677,0.804315,-0.030289,-0.020786,-0.002571,20.0,0
215719,140189.0,2.076698,-1.383669,-1.952099,-1.873136,1.269575,3.678885,-1.579294,0.943834,-0.034977,...,0.344382,0.884022,0.141137,0.743452,-0.124536,-0.071273,0.023133,-0.0477,49.99,0
116787,74428.0,-0.812663,0.539251,1.239604,-0.480183,0.241125,-0.891298,0.548056,-0.013232,-0.273486,...,-0.057733,-0.327135,0.324109,0.274031,-0.919539,-0.220448,-0.14161,0.210567,1.29,0


In [49]:
new_dataset['Class'].value_counts()

0    492
1    492
Name: Class, dtype: int64

In [45]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94699.390244,-0.0316,-0.100788,0.046226,-0.024813,-0.04503,0.014086,0.09783,0.00768,0.005854,...,0.029981,-0.001213,-0.060797,0.029953,0.02103,-0.012746,-0.026921,-0.007528,-0.001816,106.606646
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


splitting data into features and target

In [51]:
X = new_dataset.drop(columns='Class',axis=1)
Y = new_dataset['Class']

In [52]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
134804   80971.0 -0.634443  1.366636  0.724378  0.625088  1.044836  0.038280   
77742    57190.0  1.146465  0.182321  0.234392  1.162840 -0.348032 -0.957186   
25000    33472.0  1.441174 -0.575394 -0.153414 -0.616068 -0.740940 -0.848866   
215719  140189.0  2.076698 -1.383669 -1.952099 -1.873136  1.269575  3.678885   
116787   74428.0 -0.812663  0.539251  1.239604 -0.480183  0.241125 -0.891298   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [53]:
print(Y)

134804    0
77742     0
25000     0
215719    0
116787    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


Split the data into training and testing data

In [54]:
X_train, X_test, Y_train, Y_test= train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [56]:
print(X.shape,X_train.shape,X_test.shape)

(984, 30) (787, 30) (197, 30)


Model training

Logistic Regression

In [63]:
model = LogisticRegression()

In [64]:
#training the logistic regression with training data

model.fit(X_train, Y_train)

LogisticRegression()

Model Evaluation

Accuracy Score

In [66]:
#accuracy on training data
X_train_prediction= model.predict(X_train)
training_data_accuracy= accuracy_score(X_train_prediction,Y_train)

In [67]:
print('Accuracy on training data is ',training_data_accuracy)

Accuracy on training data is  0.9453621346886912


In [68]:
#aacuracy on test data
X_test_prediction= model.predict(X_test)
testing_data_accuracy=accuracy_score(X_test_prediction,Y_test)

In [69]:
print("Accuracy of testing data is ",testing_data_accuracy)

Accuracy of testing data is  0.9187817258883249


This is a Machine Learning credit card fraud detection project in which we have successfully created a model that can detect that the transaction made by the person is Normal or fraudulent. In this project, we learned how to perform exploratory data analysis. And also we have learned how to handle highly unbalanced datasets using sampling. Also learned about Logistic Regression and how to create a Logistic Regression model.