<a href="https://colab.research.google.com/github/niharikaveer/Credit_Card_Fraud_Detection/blob/main/Credit_Card_Fraud_Detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Credit Card Fraud Detection**
This project focuses on detecting fraudulent transactions using machine learning techniques. It involves loading and exploring transaction data, preprocessing it for analysis, building a classification model such as Logistic Regression, and evaluating its performance using metrics like accuracy and precision. The goal is to develop an effective system that distinguishes between legitimate and fraudulent transactions, contributing to improved financial security and fraud prevention.

Importing the Dependencies

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# loading the dataset to a Pandas DataFrame
credit_card_data = pd.read_csv('/content/creditcard.csv')

In [3]:
# first 5 rows of the dataset
credit_card_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0.0
1,0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0.0
2,1,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0.0
3,1,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0.0
4,2,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0.0


In [4]:
credit_card_data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
85254,60679,1.258467,0.407363,0.160962,1.034153,0.175374,-0.344634,0.227623,-0.227186,-0.20363,...,-0.081358,-0.1625,-0.154661,-0.429501,0.697308,-0.377775,0.027767,0.023488,20.0,0.0
85255,60680,1.170321,-0.791258,0.341092,-0.498724,-1.234283,-1.024154,-0.282613,-0.248967,-0.867952,...,-0.577634,-1.495287,0.148887,0.380317,-0.089069,0.725803,-0.077233,0.033769,120.99,0.0
85256,60681,0.826449,-0.246364,-0.220287,1.144915,0.119542,0.013537,0.454082,-0.030595,-0.481902,...,0.142593,0.029994,-0.281769,-0.287336,0.598513,-0.330393,-0.020391,0.033028,192.0,0.0
85257,60682,1.201303,-0.309688,-0.314367,0.167426,1.540306,4.05973,-1.136824,1.090075,0.760974,...,-0.113604,-0.280918,-0.043147,1.001217,0.572592,-0.393931,0.074081,0.032089,12.99,0.0
85258,60684,-1.03541,1.27823,1.652904,0.574609,-0.222304,0.410386,-0.100852,0.859145,-0.699598,...,-0.104574,-0.408998,-0.128501,-0.383566,0.099796,-0.43213,,,,


In [5]:
# dataset informations
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 85259 entries, 0 to 85258
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    85259 non-null  int64  
 1   V1      85259 non-null  float64
 2   V2      85259 non-null  float64
 3   V3      85259 non-null  float64
 4   V4      85259 non-null  float64
 5   V5      85259 non-null  float64
 6   V6      85259 non-null  float64
 7   V7      85259 non-null  float64
 8   V8      85259 non-null  float64
 9   V9      85259 non-null  float64
 10  V10     85259 non-null  float64
 11  V11     85259 non-null  float64
 12  V12     85259 non-null  float64
 13  V13     85259 non-null  float64
 14  V14     85259 non-null  float64
 15  V15     85259 non-null  float64
 16  V16     85259 non-null  float64
 17  V17     85259 non-null  float64
 18  V18     85259 non-null  float64
 19  V19     85259 non-null  float64
 20  V20     85259 non-null  float64
 21  V21     85259 non-null  float64
 22

In [6]:
# checking the number of missing values in each column
credit_card_data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [7]:
# distribution of legit transactions & fraudulent transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,85054
1.0,204


This Dataset is highly unblanced

0 --> Normal Transaction

1 --> fraudulent transaction

In [8]:
# separating the data for analysis
legit = credit_card_data[credit_card_data.Class == 0]
fraud = credit_card_data[credit_card_data.Class == 1]

In [9]:
print(legit.shape)
print(fraud.shape)

(85054, 31)
(204, 31)


In [10]:
# statistical measures of the data
legit.Amount.describe()

Unnamed: 0,Amount
count,85054.0
mean,98.365392
std,268.470093
min,0.0
25%,7.69
50%,27.0
75%,89.8
max,19656.53


In [11]:
fraud.Amount.describe()

Unnamed: 0,Amount
count,204.0
mean,101.986814
std,229.00897
min,0.0
25%,1.0
50%,7.55
75%,99.99
max,1809.68


In [12]:
# compare the values for both transactions
credit_card_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,38709.224046,-0.247847,-0.0499,0.70098,0.151762,-0.267214,0.099795,-0.095774,0.047207,-0.013915,...,0.041531,-0.031152,-0.105502,-0.037276,0.009236,0.133378,0.026126,8.7e-05,0.00184,98.365392
1.0,34136.960784,-6.403734,4.412429,-8.46741,5.101621,-4.614515,-1.96404,-6.731357,2.942715,-3.057064,...,0.391757,0.754409,-0.150411,-0.240026,-0.093307,0.225443,0.088375,0.569295,0.040876,101.986814


Under-Sampling

Build a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

Number of Fraudulent Transactions --> 492

In [13]:
legit_sample = legit.sample(n=492)

Concatenating two DataFrames

In [14]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [15]:
new_dataset.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
69327,53369,-1.236824,0.284758,0.062131,-0.078666,0.306784,0.098485,0.421982,0.508116,-0.508881,...,-0.437453,-1.214831,0.811131,-0.94239,-0.576823,-0.155292,-0.195946,-0.145883,84.0,0.0
78565,57590,1.213593,0.145622,-0.439146,0.596789,0.536597,0.773631,-0.267771,0.255765,0.372582,...,-0.187961,-0.27435,-0.248692,-1.394462,0.636894,0.507221,0.014633,0.016014,10.7,0.0
41421,40674,-0.692841,0.166683,1.352646,0.448964,1.288304,-0.926343,0.581101,-0.369026,-0.477145,...,0.017163,0.21036,-0.260939,-0.019834,0.16877,0.610043,-0.120511,-0.106382,0.0,0.0
72931,54912,1.38178,-1.322602,0.726634,-1.324297,-1.789778,-0.346488,-1.355194,0.050327,-1.438478,...,-0.022659,0.245562,0.009262,0.050782,0.216952,-0.074398,0.054077,0.028182,50.0,0.0
10915,18716,-0.103683,0.471116,2.121714,-0.177422,0.145605,0.977728,-0.104169,-0.006835,1.473459,...,-0.23751,-0.017257,-0.285577,-0.903749,-0.523272,1.064352,-0.080944,-0.114839,1.63,0.0


In [16]:
new_dataset.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
82400,59385,-7.626924,-6.97642,-2.077911,3.416754,4.458758,-5.080408,-6.578948,1.760341,-0.599509,...,1.224795,-0.656639,-0.330811,-0.078946,0.270306,0.431119,0.821381,-1.056088,18.98,1.0
83053,59669,0.326007,1.286638,-2.007181,2.419675,-1.532902,-1.432803,-2.45953,0.617738,-1.125861,...,0.556895,0.169776,-0.174357,0.308061,0.710996,-0.23103,0.580495,0.300984,6.27,1.0
83297,59777,-8.257111,-4.814461,-5.365307,1.20423,-3.34742,-1.331601,-1.967893,1.295438,-1.674415,...,0.43639,-0.077553,-3.091624,-0.390201,-0.288689,-0.340004,0.039819,-1.0079,319.2,1.0
83417,59840,-3.215382,-0.364223,-1.261883,3.794949,0.711206,-1.316476,-5.165141,0.625278,-1.582301,...,0.401341,0.152191,-0.934675,-0.256148,-0.469403,-0.282496,0.866077,-0.433466,5.91,1.0
84543,60353,-3.975216,0.581573,-1.880372,4.319241,-3.02433,1.240793,-1.909559,0.660718,-2.752611,...,0.578984,1.397311,1.045322,-0.304,0.005295,0.235435,0.962015,-0.673557,454.82,1.0


In [17]:
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0.0,492
1.0,204


In [18]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0.0,38930.569106,-0.137477,-0.02929,0.764806,0.089964,-0.203702,0.146313,-0.130977,0.066634,0.044454,...,0.021724,-0.069324,-0.072955,-0.03041,-0.05493,0.146169,0.010634,0.012025,-0.003893,78.335691
1.0,34136.960784,-6.403734,4.412429,-8.46741,5.101621,-4.614515,-1.96404,-6.731357,2.942715,-3.057064,...,0.391757,0.754409,-0.150411,-0.240026,-0.093307,0.225443,0.088375,0.569295,0.040876,101.986814


Splitting the data into Features & Targets

In [19]:
X = new_dataset.drop(columns='Class', axis=1)
Y = new_dataset['Class']

In [20]:
print(X)

        Time        V1        V2        V3        V4        V5        V6  \
69327  53369 -1.236824  0.284758  0.062131 -0.078666  0.306784  0.098485   
78565  57590  1.213593  0.145622 -0.439146  0.596789  0.536597  0.773631   
41421  40674 -0.692841  0.166683  1.352646  0.448964  1.288304 -0.926343   
72931  54912  1.381780 -1.322602  0.726634 -1.324297 -1.789778 -0.346488   
10915  18716 -0.103683  0.471116  2.121714 -0.177422  0.145605  0.977728   
...      ...       ...       ...       ...       ...       ...       ...   
82400  59385 -7.626924 -6.976420 -2.077911  3.416754  4.458758 -5.080408   
83053  59669  0.326007  1.286638 -2.007181  2.419675 -1.532902 -1.432803   
83297  59777 -8.257111 -4.814461 -5.365307  1.204230 -3.347420 -1.331601   
83417  59840 -3.215382 -0.364223 -1.261883  3.794949  0.711206 -1.316476   
84543  60353 -3.975216  0.581573 -1.880372  4.319241 -3.024330  1.240793   

             V7        V8        V9  ...       V20       V21       V22  \
69327  0.4219

In [21]:
print(Y)

69327    0.0
78565    0.0
41421    0.0
72931    0.0
10915    0.0
        ... 
82400    1.0
83053    1.0
83297    1.0
83417    1.0
84543    1.0
Name: Class, Length: 696, dtype: float64


Split the data into Training data & Testing Data

In [22]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)

In [23]:
print(X.shape, X_train.shape, X_test.shape)

(696, 30) (556, 30) (140, 30)


Model Training

Logistic Regression

In [24]:
model = LogisticRegression()

In [25]:
# training the Logistic Regression Model with Training Data
model.fit(X_train, Y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model Evaluation

Accuracy Score

In [26]:
# accuracy on training data
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)

In [27]:
print('Accuracy on Training data : ', training_data_accuracy)

Accuracy on Training data :  0.9586330935251799


In [28]:
# accuracy on test data
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)

In [29]:
print('Accuracy score on Test Data : ', test_data_accuracy)

Accuracy score on Test Data :  0.9357142857142857
