## Author: Miral Patel

## Code Clause Internship February 2024

## Domain: Data Science

## Project Title - Credit Card Fraud Detection

### Importing Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

### Importing Dataset

In [2]:
data = pd.read_csv('creditcard.csv')

In [3]:
data

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,33469.0,-0.311372,0.759277,1.650994,1.091764,0.747545,0.924909,0.675067,-0.052870,-0.170135,...,-0.040939,0.357128,-0.406407,-0.871783,0.265729,-0.089932,-0.000652,-0.116323,20.00,0
24996,33469.0,-0.745983,0.518650,1.533865,1.335173,0.376647,0.019838,0.221768,0.328102,-0.390842,...,0.054737,0.314848,-0.034032,0.218806,-0.161277,-0.250516,0.402972,0.202283,18.04,0
24997,33470.0,-0.618962,1.302587,1.182654,-0.046862,0.267769,-0.527204,0.658724,0.029762,-0.331111,...,-0.259006,-0.529602,0.006277,0.042659,-0.155856,0.103680,0.378294,0.154449,1.79,0
24998,33471.0,1.377620,-0.279881,0.119446,-0.594892,-0.764533,-1.241617,-0.010524,-0.388173,-1.259304,...,-0.784787,-1.841549,0.264707,0.388256,0.013974,0.618726,-0.072529,0.013620,25.48,0


### Data Preprocessing

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 31 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Time    25000 non-null  float64
 1   V1      25000 non-null  float64
 2   V2      25000 non-null  float64
 3   V3      25000 non-null  float64
 4   V4      25000 non-null  float64
 5   V5      25000 non-null  float64
 6   V6      25000 non-null  float64
 7   V7      25000 non-null  float64
 8   V8      25000 non-null  float64
 9   V9      25000 non-null  float64
 10  V10     25000 non-null  float64
 11  V11     25000 non-null  float64
 12  V12     25000 non-null  float64
 13  V13     25000 non-null  float64
 14  V14     25000 non-null  float64
 15  V15     25000 non-null  float64
 16  V16     25000 non-null  float64
 17  V17     25000 non-null  float64
 18  V18     25000 non-null  float64
 19  V19     25000 non-null  float64
 20  V20     25000 non-null  float64
 21  V21     25000 non-null  float64
 22

### Checking For Null Or Missing Values

In [5]:
data.isnull().sum()

Time      0
V1        0
V2        0
V3        0
V4        0
V5        0
V6        0
V7        0
V8        0
V9        0
V10       0
V11       0
V12       0
V13       0
V14       0
V15       0
V16       0
V17       0
V18       0
V19       0
V20       0
V21       0
V22       0
V23       0
V24       0
V25       0
V26       0
V27       0
V28       0
Amount    0
Class     0
dtype: int64

### By Distributing Legit Transactions And Fradulent Transactions 

In [6]:
data['Class'].value_counts()

Class
0    24912
1       88
Name: count, dtype: int64

In [7]:
legit = data[data.Class == 0]
fraud = data[data.Class == 1]

In [8]:
legit

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.166480,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.167170,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.379780,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.108300,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.50,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.206010,0.502292,0.219422,0.215153,69.99,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,33469.0,-0.311372,0.759277,1.650994,1.091764,0.747545,0.924909,0.675067,-0.052870,-0.170135,...,-0.040939,0.357128,-0.406407,-0.871783,0.265729,-0.089932,-0.000652,-0.116323,20.00,0
24996,33469.0,-0.745983,0.518650,1.533865,1.335173,0.376647,0.019838,0.221768,0.328102,-0.390842,...,0.054737,0.314848,-0.034032,0.218806,-0.161277,-0.250516,0.402972,0.202283,18.04,0
24997,33470.0,-0.618962,1.302587,1.182654,-0.046862,0.267769,-0.527204,0.658724,0.029762,-0.331111,...,-0.259006,-0.529602,0.006277,0.042659,-0.155856,0.103680,0.378294,0.154449,1.79,0
24998,33471.0,1.377620,-0.279881,0.119446,-0.594892,-0.764533,-1.241617,-0.010524,-0.388173,-1.259304,...,-0.784787,-1.841549,0.264707,0.388256,0.013974,0.618726,-0.072529,0.013620,25.48,0


In [9]:
fraud

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
541,406.0,-2.312227,1.951992,-1.609851,3.997906,-0.522188,-1.426545,-2.537387,1.391657,-2.770089,...,0.517232,-0.035049,-0.465211,0.320198,0.044519,0.177840,0.261145,-0.143276,0.00,1
623,472.0,-3.043541,-3.157307,1.088463,2.288644,1.359805,-1.064823,0.325574,-0.067794,-0.270953,...,0.661696,0.435477,1.375966,-0.293803,0.279798,-0.145362,-0.252773,0.035764,529.00,1
4920,4462.0,-2.303350,1.759247,-0.359745,2.330243,-0.821628,-0.075788,0.562320,-0.399147,-0.238253,...,-0.294166,-0.932391,0.172726,-0.087330,-0.156114,-0.542628,0.039566,-0.153029,239.93,1
6108,6986.0,-4.397974,1.358367,-2.592844,2.679787,-1.128131,-1.706536,-3.496197,-0.248778,-0.247768,...,0.573574,0.176968,-0.436207,-0.053502,0.252405,-0.657488,-0.827136,0.849573,59.00,1
6329,7519.0,1.234235,3.019740,-4.304597,4.732795,3.624201,-1.357746,1.713445,-0.496358,-1.282858,...,-0.379068,-0.704181,-0.656805,-1.632653,1.488901,0.566797,-0.010016,0.146793,1.00,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18773,29753.0,0.269614,3.549755,-5.810353,5.809370,1.538808,-2.269219,-0.824203,0.351070,-3.759059,...,0.371121,-0.322290,-0.549856,-0.520629,1.378210,0.564714,0.553255,0.402400,0.68,1
18809,29785.0,0.923764,0.344048,-2.880004,1.721680,-3.019565,-0.639736,-3.801325,1.299096,0.864065,...,0.899931,1.481271,0.725266,0.176960,-1.815638,-0.536517,0.489035,-0.049729,30.30,1
20198,30852.0,-2.830984,0.885657,1.199930,2.861292,0.321669,0.289966,1.767760,-2.451050,0.069736,...,0.546589,0.334971,0.172106,0.623590,-0.527114,-0.079215,-2.532445,0.311177,104.81,1
23308,32686.0,0.287953,1.728735,-1.652173,3.813544,-1.090927,-0.984745,-2.202318,0.555088,-2.033892,...,0.262202,-0.633528,0.092891,0.187613,0.368708,-0.132474,0.576561,0.309843,0.00,1


In [10]:
print(legit.shape)
print(fraud.shape)

(24912, 31)
(88, 31)


### Statistical Measures Of Data

In [11]:
legit.Amount.describe()

count    24912.000000
mean        75.466638
std        217.958420
min          0.000000
25%          6.040000
50%         18.960000
75%         67.560000
max       7879.420000
Name: Amount, dtype: float64

In [12]:
fraud.Amount.describe()

count      88.000000
mean      100.010000
std       265.845031
min         0.000000
25%         1.000000
50%         1.000000
75%        99.990000
max      1809.680000
Name: Amount, dtype: float64

In [13]:
data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18902.256342,-0.205708,0.158976,0.77052,0.218704,-0.171282,0.092146,-0.09993,0.009893,0.480893,...,0.040015,-0.041217,-0.133011,-0.037927,0.012091,0.128848,0.0255,0.009969,0.004986,75.466638
1,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


### By Using Under sampling

In [14]:
legit_sample = legit.sample(n = 492)

In [15]:
new_dataset = pd.concat([legit_sample, fraud], axis=0)

In [16]:
new_dataset

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
22044,32002.0,0.695246,-1.163061,1.311907,1.147208,-1.615465,0.372051,-0.747365,0.301908,1.326068,...,0.199570,0.491201,-0.297715,0.631091,0.361018,0.642230,-0.017023,0.046957,216.06,0
11863,20391.0,1.254478,-0.257222,0.896744,-0.600173,-0.942879,-0.561004,-0.637062,-0.111637,3.081632,...,-0.176244,-0.017835,-0.081689,-0.002555,0.531783,-0.705114,0.066470,0.027809,11.85,0
2335,1874.0,-2.482454,2.308903,0.321264,-0.926674,-1.113441,-0.906290,-0.848198,-1.634175,-0.463774,...,2.442128,-0.725129,0.360794,0.737415,-0.326198,0.678931,-0.798529,-0.294767,12.99,0
6194,7214.0,1.172754,0.152589,0.650324,0.997200,-0.252587,-0.177117,-0.141007,-0.111947,1.660239,...,-0.342429,-0.517354,0.038920,0.073344,0.373193,0.256919,-0.030908,0.006106,16.99,0
21172,31517.0,-0.394646,-0.276688,1.866060,-0.743388,-1.166051,0.189757,-0.082329,0.129120,-1.540648,...,-0.224596,-0.403483,0.280851,0.018599,-0.740426,0.979488,0.064942,0.134891,119.95,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18773,29753.0,0.269614,3.549755,-5.810353,5.809370,1.538808,-2.269219,-0.824203,0.351070,-3.759059,...,0.371121,-0.322290,-0.549856,-0.520629,1.378210,0.564714,0.553255,0.402400,0.68,1
18809,29785.0,0.923764,0.344048,-2.880004,1.721680,-3.019565,-0.639736,-3.801325,1.299096,0.864065,...,0.899931,1.481271,0.725266,0.176960,-1.815638,-0.536517,0.489035,-0.049729,30.30,1
20198,30852.0,-2.830984,0.885657,1.199930,2.861292,0.321669,0.289966,1.767760,-2.451050,0.069736,...,0.546589,0.334971,0.172106,0.623590,-0.527114,-0.079215,-2.532445,0.311177,104.81,1
23308,32686.0,0.287953,1.728735,-1.652173,3.813544,-1.090927,-0.984745,-2.202318,0.555088,-2.033892,...,0.262202,-0.633528,0.092891,0.187613,0.368708,-0.132474,0.576561,0.309843,0.00,1


In [17]:
new_dataset['Class'].value_counts()

Class
0    492
1     88
Name: count, dtype: int64

In [18]:
new_dataset.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,18769.280488,-0.244903,0.180032,0.757572,0.302345,-0.088466,0.178659,-0.110363,-0.128379,0.436475,...,0.088144,-0.120927,-0.107109,-0.019058,0.006361,0.121012,0.010213,0.025032,0.017703,79.082195
1,17935.875,-8.613716,6.376169,-12.221731,6.231847,-6.027247,-2.48708,-8.308784,4.351326,-2.987199,...,0.714069,0.539387,-0.381823,-0.350615,-0.25297,0.346695,0.17976,0.856336,0.100578,100.01


Building a sample dataset containing similar distribution of normal transactions and Fraudulent Transactions

We can see Number of Fraudulent Transactions

### Splitting The Data Into Features & Targets

In [19]:
x = new_dataset.drop(columns = 'Class', axis = 1)
y = new_dataset['Class']

In [20]:
print(x)

          Time        V1        V2        V3        V4        V5        V6  \
22044  32002.0  0.695246 -1.163061  1.311907  1.147208 -1.615465  0.372051   
11863  20391.0  1.254478 -0.257222  0.896744 -0.600173 -0.942879 -0.561004   
2335    1874.0 -2.482454  2.308903  0.321264 -0.926674 -1.113441 -0.906290   
6194    7214.0  1.172754  0.152589  0.650324  0.997200 -0.252587 -0.177117   
21172  31517.0 -0.394646 -0.276688  1.866060 -0.743388 -1.166051  0.189757   
...        ...       ...       ...       ...       ...       ...       ...   
18773  29753.0  0.269614  3.549755 -5.810353  5.809370  1.538808 -2.269219   
18809  29785.0  0.923764  0.344048 -2.880004  1.721680 -3.019565 -0.639736   
20198  30852.0 -2.830984  0.885657  1.199930  2.861292  0.321669  0.289966   
23308  32686.0  0.287953  1.728735 -1.652173  3.813544 -1.090927 -0.984745   
23422  32745.0 -2.179135  0.020218 -2.182733  2.572046 -3.663733  0.081568   

             V7        V8        V9  ...       V20       V21   

In [21]:
print(y)

22044    0
11863    0
2335     0
6194     0
21172    0
        ..
18773    1
18809    1
20198    1
23308    1
23422    1
Name: Class, Length: 580, dtype: int64


### Split The Data Into Training & Testing Data

In [22]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 2)

In [23]:
print(x.shape, x_train.shape, x_test.shape)

(580, 30) (464, 30) (116, 30)


### Model training

### By Using Logistic Regression

In [24]:
model = LogisticRegression()

Training The Data With Logistic Regression

In [25]:
model.fit(x_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Model Evaluation

In [26]:
x_train_prediction = model.predict(x_train)
training_data_accuracy = accuracy_score(x_train_prediction, y_train)

Accuracy On Training Data

In [27]:
print('Accuracy on Training data: ',training_data_accuracy)

Accuracy on Training data:  0.9827586206896551


Accuracy On Test Data

In [28]:
x_test_prediction = model.predict(x_test)
test_data_accuracy = accuracy_score(x_test_prediction, y_test)

In [29]:
print('Accuracy score on Test Data: ',test_data_accuracy)

Accuracy score on Test Data:  0.9913793103448276


## Project Made By Miral Patel

## Thankyou