<a href="https://colab.research.google.com/github/piyushdhurwey-unique/credit-card-fraud-detection/blob/main/credit_card_fraud_detection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/

In [None]:
# Download the mlg-ulb/creditcardfraud dataset archive with the Kaggle CLI.
!kaggle datasets download mlg-ulb/creditcardfraud

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0
Downloading creditcardfraud.zip to /content
  0% 0.00/66.0M [00:00<?, ?B/s]
100% 66.0M/66.0M [00:00<00:00, 1.05GB/s]


In [None]:
import zipfile
# Unpack the downloaded archive into /content. The `with` block closes the
# archive on exit, so no explicit close() call is needed.
with zipfile.ZipFile('creditcardfraud.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the extracted transactions CSV into a DataFrame.
credit_card_data = pd.read_csv(filepath_or_buffer='/content/creditcard.csv')

In [None]:
# Preview the first five rows of the raw data.
credit_card_data.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [None]:
# (rows, columns) of the full dataset
credit_card_data.shape

(284807, 31)

In [None]:
# Column dtypes and non-null counts, to check for missing values
credit_card_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [None]:
# Class distribution: number of legit (0) and fraudulent (1) transactions
credit_card_data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


In [None]:
# Split the transactions by label for separate analysis: 0 = legit, 1 = fraud.
legit = credit_card_data.loc[credit_card_data['Class'] == 0]
fraud = credit_card_data.loc[credit_card_data['Class'] == 1]

In [None]:
# (rows, columns) of the legit subset, then of the fraud subset, one per line.
print(legit.shape, fraud.shape, sep='\n')

(284315, 31)
(492, 31)


In [None]:
# Summary statistics of the transaction amount for legit transactions.
legit['Amount'].describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [None]:
# Summary statistics of the transaction amount for fraudulent transactions.
fraud['Amount'].describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [None]:
# Per-class mean of every feature, to compare legit and fraudulent transactions.
credit_card_data.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


Undersampling: build a sample dataset containing a similar distribution of normal and fraudulent transactions.

In [None]:
# Undersample the legit class down to the number of fraud rows so the two
# classes are balanced. The sample size is derived from `fraud` instead of a
# hard-coded count, and the fixed seed makes the sample (and every result
# computed from it) reproducible across runs.
legit_sample = legit.sample(n=len(fraud), random_state=2)

In [None]:
# Stack the sampled legit rows on top of all the fraud rows.
new_dataset = pd.concat([legit_sample, fraud], axis='index')

In [None]:
# Preview the first five rows of the undersampled dataset.
new_dataset.head(n=5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
53048,45787.0,-1.006079,1.227957,1.500045,0.633745,0.366396,-0.436589,1.236379,-0.811133,0.458055,...,-0.001837,0.246267,-0.164172,0.400057,-0.23432,-0.594046,-1.110172,-0.277271,9.99,0
13968,24793.0,-0.418976,1.741306,0.235733,0.881549,0.687575,-0.62257,1.061386,-0.436813,1.358676,...,-0.119352,0.558946,0.038551,0.036985,-0.656003,-0.441979,0.585309,0.179297,0.89,0
201389,133863.0,-1.138285,0.152521,1.46876,-1.061703,0.437636,0.744244,0.569647,0.12681,-1.674669,...,-0.270325,-0.374726,-0.33415,0.176725,0.786827,-0.666789,-0.032832,0.082836,100.0,0
180058,124358.0,-3.960343,-1.2256,-1.732492,0.076198,2.730881,-1.524475,0.874361,0.087135,-0.703483,...,-0.592815,-0.201532,1.17666,-0.015626,1.444093,0.898164,0.151854,-0.285441,85.0,0
29084,35310.0,1.120689,0.069246,0.371138,1.485138,-0.432119,-0.648649,0.164699,-0.131184,0.380525,...,-0.109131,-0.243782,-0.123794,0.392123,0.69913,-0.345298,0.010969,0.025982,53.31,0


In [None]:
# Class counts in the undersampled dataset
new_dataset['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,492
1,492


In [None]:
# Per-class feature means of the undersampled dataset.
new_dataset.groupby(by='Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,93914.977642,0.002559,0.07873,-0.020739,0.043005,0.020637,-0.031303,0.043817,0.046896,0.052782,...,0.012678,-0.0381,-0.044446,0.005971,-0.052931,0.00802,-0.013552,-0.01624,0.003279,82.234756
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


In [None]:
# Features: every column except the label. Target: the 'Class' label column.
X = new_dataset.drop(columns=['Class'])
Y = new_dataset['Class']

In [None]:
# Show the feature matrix through the notebook's rich display (last expression
# of the cell) rather than a plain-text print of the DataFrame.
X

            Time        V1        V2        V3        V4        V5        V6  \
53048    45787.0 -1.006079  1.227957  1.500045  0.633745  0.366396 -0.436589   
13968    24793.0 -0.418976  1.741306  0.235733  0.881549  0.687575 -0.622570   
201389  133863.0 -1.138285  0.152521  1.468760 -1.061703  0.437636  0.744244   
180058  124358.0 -3.960343 -1.225600 -1.732492  0.076198  2.730881 -1.524475   
29084    35310.0  1.120689  0.069246  0.371138  1.485138 -0.432119 -0.648649   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [None]:
# Show the target labels as the cell's output value rather than via print().
Y

53048     0
13968     0
201389    0
180058    0
29084     0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


In [None]:
# Hold out 20% of the rows for testing; stratifying on Y keeps the class ratio
# the same in both splits, and the seed makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=2,
)

In [None]:
# Shapes of the full feature matrix, the training split and the test split.
print(X.shape, X_train.shape, X_test.shape)

(984, 30) (787, 30) (197, 30)


In [None]:
# Logistic regression on standardized features. The raw 'Time' and 'Amount'
# columns are on a far larger scale than the V* columns, which left the solver
# unconverged within its default iteration limit. The scaler is part of the
# pipeline, so it is fitted on whatever data `model.fit` receives (the
# training split) and reapplied automatically inside `model.predict`.
model = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000))

In [None]:
# Train the logistic regression model on the training split
model.fit(X_train,Y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Evaluation on the training split.
# accuracy_score is declared as (y_true, y_pred). Accuracy is symmetric, so the
# value is unchanged, but the conventional order matches the recall_score calls
# in this notebook and stays correct if the metric is swapped later.
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(Y_train, X_train_prediction)

In [None]:
# Report the logistic regression accuracy on the training split.
print('Accuracy on Training Data: ', training_data_accuracy)

Accuracy on Training Data:  0.9415501905972046


In [None]:
# Evaluation on the held-out test split; arguments follow accuracy_score's
# (y_true, y_pred) order (same value, since accuracy is symmetric).
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(Y_test, X_test_prediction)

In [None]:
# Report the logistic regression accuracy on the test split.
print('Accuracy on Test Data: ', test_data_accuracy)

Accuracy on Test Data:  0.9137055837563451


In [None]:
from sklearn.metrics import recall_score

# Recall of the logistic regression model on the training and test splits,
# i.e. the share of actual class-1 (fraud) rows that were predicted as fraud.
logreg_train_recall = recall_score(y_true=Y_train, y_pred=X_train_prediction)
logreg_test_recall = recall_score(y_true=Y_test, y_pred=X_test_prediction)

print(f'Recall on Training Data (Logistic Regression): {logreg_train_recall}')
print(f'Recall on Test Data (Logistic Regression): {logreg_test_recall}')

Recall on Training Data (Logistic Regression): 0.9187817258883249
Recall on Test Data (Logistic Regression): 0.8775510204081632


In [None]:
from sklearn.tree import DecisionTreeClassifier


In [None]:
# Unconstrained decision tree. The fixed seed makes the fitted tree, and the
# accuracies reported for it, reproducible across runs.
dt_model = DecisionTreeClassifier(random_state=42)
dt_model.fit(X_train, Y_train)


In [None]:
# Decision tree accuracy on the training split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
dt_X_train_prediction = dt_model.predict(X_train)
dt_training_data_accuracy = accuracy_score(Y_train, dt_X_train_prediction)


In [None]:
# Report the decision tree accuracy on the training split.
print(
    'Accuracy on Training Data (Decision Tree): ',
    dt_training_data_accuracy,
)

Accuracy on Training Data (Decision Tree):  1.0


In [None]:
# Decision tree accuracy on the held-out test split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
dt_X_test_prediction = dt_model.predict(X_test)
dt_test_data_accuracy = accuracy_score(Y_test, dt_X_test_prediction)


In [None]:
# Report the decision tree accuracy on the test split.
print(
    'Accuracy on Test Data (Decision Tree): ',
    dt_test_data_accuracy,
)

Accuracy on Test Data (Decision Tree):  0.8883248730964467


Overfitting: The training accuracy is 1.0 (100%), which means the model has fit the training data perfectly, including any noise or patterns unique to the training set. However, the test accuracy is noticeably lower at approximately 0.888 (88.8%), indicating that the model is not generalizing as well to new, unseen data. The unconstrained tree is too complex and has memorized the training examples rather than learning only the underlying patterns.

In [None]:
# Decision tree limited to depth 5 to reduce overfitting. The fixed seed makes
# the fitted tree, and the accuracies reported for it, reproducible across runs.
dt_model_tuned = DecisionTreeClassifier(max_depth=5, random_state=42)
dt_model_tuned.fit(X_train, Y_train)


In [None]:
# Depth-limited decision tree accuracy on the training split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
dt_tuned_X_train_prediction = dt_model_tuned.predict(X_train)
dt_tuned_training_data_accuracy = accuracy_score(Y_train, dt_tuned_X_train_prediction)
print('Accuracy on Training Data (Tuned Decision Tree): ', dt_tuned_training_data_accuracy)

Accuracy on Training Data (Tuned Decision Tree):  0.9783989834815756


In [None]:
# Depth-limited decision tree accuracy on the test split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
dt_tuned_X_test_prediction = dt_model_tuned.predict(X_test)
dt_tuned_test_data_accuracy = accuracy_score(Y_test, dt_tuned_X_test_prediction)
print('Accuracy on Test Data (Tuned Decision Tree): ', dt_tuned_test_data_accuracy)

Accuracy on Test Data (Tuned Decision Tree):  0.9086294416243654


In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest with default hyperparameters. The fixed seed makes the fitted
# forest reproducible across runs, consistent with the seeded tuned forest
# later in this notebook.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, Y_train)

In [None]:
# Random forest accuracy on the training split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
rf_X_train_prediction = rf_model.predict(X_train)
rf_training_data_accuracy = accuracy_score(Y_train, rf_X_train_prediction)

In [None]:
# Report the random forest accuracy on the training split.
print(
    'Accuracy on Training Data (Random Forest): ',
    rf_training_data_accuracy,
)

Accuracy on Training Data (Random Forest):  1.0


In [None]:
# Random forest accuracy on the held-out test split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
rf_X_test_prediction = rf_model.predict(X_test)
rf_test_data_accuracy = accuracy_score(Y_test, rf_X_test_prediction)

In [None]:
# Report the random forest accuracy on the test split.
print(
    'Accuracy on Test Data (Random Forest): ',
    rf_test_data_accuracy,
)

Accuracy on Test Data (Random Forest):  0.8984771573604061


In [None]:
# Random forest of 100 trees, each limited to depth 10, with a fixed seed.
rf_model_tuned = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    random_state=42,
)
rf_model_tuned.fit(X_train, Y_train)


In [None]:
# Tuned random forest accuracy on the training split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
rf_tuned_X_train_prediction = rf_model_tuned.predict(X_train)
rf_tuned_training_data_accuracy = accuracy_score(Y_train, rf_tuned_X_train_prediction)
print('Accuracy on Training Data (Tuned Random Forest): ', rf_tuned_training_data_accuracy)

Accuracy on Training Data (Tuned Random Forest):  0.9911054637865311


In [None]:
# Tuned random forest accuracy on the test split; arguments follow
# accuracy_score's (y_true, y_pred) order (same value, accuracy is symmetric).
rf_tuned_X_test_prediction = rf_model_tuned.predict(X_test)
rf_tuned_test_data_accuracy = accuracy_score(Y_test, rf_tuned_X_test_prediction)
print('Accuracy on Test Data (Tuned Random Forest): ', rf_tuned_test_data_accuracy)

Accuracy on Test Data (Tuned Random Forest):  0.9035532994923858
