# Downloading Dataset from Kaggle into Google Drive

In [1]:
from google.colab import files

# Upload kaggle.json (API token)
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"miss3persin","key":"195954e66e8429ca9aad405da9f6347b"}'}

In [2]:
# Set up Kaggle API credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [7]:
# List files to confirm availability

!kaggle datasets files -d mlg-ulb/creditcardfraud

name             size  creationDate         
--------------  -----  -------------------  
creditcard.csv  144MB  2019-09-20 00:04:39  


In [4]:
# Download dataset
!kaggle datasets download -d mlg-ulb/creditcardfraud --path /content

Dataset URL: https://www.kaggle.com/datasets/mlg-ulb/creditcardfraud
License(s): DbCL-1.0
Downloading creditcardfraud.zip to /content
 89% 59.0M/66.0M [00:00<00:00, 74.5MB/s]
100% 66.0M/66.0M [00:00<00:00, 82.3MB/s]


In [5]:
import zipfile

# Extract the downloaded ZIP file
with zipfile.ZipFile("/content/creditcardfraud.zip", 'r') as zip_ref:
    zip_ref.extractall("/content/creditcardfraud.csv")

In [6]:
from google.colab import drive
import os
import shutil

drive.mount('/content/drive')

# Set the destination path in Google Drive
drive_path = "/content/drive/MyDrive/credit_card_fraud_dataset"

# Create the folder if it doesn't exist
os.makedirs(drive_path, exist_ok=True)

# Move train.csv to Google Drive
shutil.move("/content/creditcardfraud.csv", drive_path)

print(f"creditcardfraud saved to {drive_path}")

Mounted at /content/drive
creditcardfraud saved to /content/drive/MyDrive/credit_card_fraud_dataset


# Imports

In [3]:
# Imports

from google.colab import drive
drive.mount('/content/drive')
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Data Collection and Pre-Analysis

In [5]:
data = pd.read_csv('/content/drive/MyDrive/credit_card_fraud_dataset/creditcardfraud.csv/creditcard.csv')

In [6]:
data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [7]:
data.shape

(284807, 31)

In [8]:
data.isnull().sum()

Unnamed: 0,0
Time,0
V1,0
V2,0
V3,0
V4,0
V5,0
V6,0
V7,0
V8,0
V9,0


In [10]:
data['Class'].unique()

array([0, 1])

0 --> Legit Transaction
1 --> Fraudulent Transaction

In [11]:
# check last 5 rows
data.tail()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
284802,172786.0,-11.881118,10.071785,-9.834783,-2.066656,-5.364473,-2.606837,-4.918215,7.305334,1.914428,...,0.213454,0.111864,1.01448,-0.509348,1.436807,0.250034,0.943651,0.823731,0.77,0
284803,172787.0,-0.732789,-0.05508,2.03503,-0.738589,0.868229,1.058415,0.02433,0.294869,0.5848,...,0.214205,0.924384,0.012463,-1.016226,-0.606624,-0.395255,0.068472,-0.053527,24.79,0
284804,172788.0,1.919565,-0.301254,-3.24964,-0.557828,2.630515,3.03126,-0.296827,0.708417,0.432454,...,0.232045,0.578229,-0.037501,0.640134,0.265745,-0.087371,0.004455,-0.026561,67.88,0
284805,172788.0,-0.24044,0.530483,0.70251,0.689799,-0.377961,0.623708,-0.68618,0.679145,0.392087,...,0.265245,0.800049,-0.163298,0.123205,-0.569159,0.546668,0.108821,0.104533,10.0,0
284806,172792.0,-0.533413,-0.189733,0.703337,-0.506271,-0.012546,-0.649617,1.577006,-0.41465,0.48618,...,0.261057,0.643078,0.376777,0.008797,-0.473649,-0.818267,-0.002415,0.013649,217.0,0


In [12]:
data['Class'].value_counts()

Unnamed: 0_level_0,count
Class,Unnamed: 1_level_1
0,284315
1,492


crazily unbalanced

In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 284807 entries, 0 to 284806
Data columns (total 31 columns):
 #   Column  Non-Null Count   Dtype  
---  ------  --------------   -----  
 0   Time    284807 non-null  float64
 1   V1      284807 non-null  float64
 2   V2      284807 non-null  float64
 3   V3      284807 non-null  float64
 4   V4      284807 non-null  float64
 5   V5      284807 non-null  float64
 6   V6      284807 non-null  float64
 7   V7      284807 non-null  float64
 8   V8      284807 non-null  float64
 9   V9      284807 non-null  float64
 10  V10     284807 non-null  float64
 11  V11     284807 non-null  float64
 12  V12     284807 non-null  float64
 13  V13     284807 non-null  float64
 14  V14     284807 non-null  float64
 15  V15     284807 non-null  float64
 16  V16     284807 non-null  float64
 17  V17     284807 non-null  float64
 18  V18     284807 non-null  float64
 19  V19     284807 non-null  float64
 20  V20     284807 non-null  float64
 21  V21     28

In [14]:
# seperating legit and fraud data for analysis

legit = data[data.Class == 0]
fraud = data[data.Class == 1]

In [15]:
print(legit.shape)
print(fraud.shape)

(284315, 31)
(492, 31)


In [16]:
# statistical measures for the Amount attribute in legit data

legit.Amount.describe()

Unnamed: 0,Amount
count,284315.0
mean,88.291022
std,250.105092
min,0.0
25%,5.65
50%,22.0
75%,77.05
max,25691.16


In [17]:
# statistical measures for the Amount attribute in fraud data

fraud.Amount.describe()

Unnamed: 0,Amount
count,492.0
mean,122.211321
std,256.683288
min,0.0
25%,1.0
50%,9.25
75%,105.89
max,2125.87


In [18]:
# compare the values for the two transactions (legit vs fraud)

data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94838.202258,0.008258,-0.006271,0.012171,-0.00786,0.005453,0.002419,0.009637,-0.000987,0.004467,...,-0.000644,-0.001235,-2.4e-05,7e-05,0.000182,-7.2e-05,-8.9e-05,-0.000295,-0.000131,88.291022
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Dealing with the Unbalanced Data

We'll use 'Undersampling'.

We'll build a sample dataset containing similar distribution of normal transaction and the fraudulent transaction.

In [19]:
legit_sample = legit.sample(n=492)

# I think the way it works is that we check the value with the least number of samples in the target class
# then we sample the total number it has at random from the value with the most samples and try to equate it.
# for instance, 1 is the least in this case with 492, so we sample 492 from 1 and use that with the 492 1s as
# the new dataset

In [20]:
# We'll concatenate the two DataFrames

new_data = pd.concat([legit_sample, fraud], axis=0) # axis=0 so that it adds each rows one by one under them (not by column in the case of axis=1)

In [21]:
new_data.head()

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
253877,156472.0,-0.311727,1.179052,-0.053679,-0.559815,0.21531,-1.060414,0.734378,0.17818,-0.392124,...,-0.195738,-0.463743,0.13024,0.010248,-0.438487,0.141793,0.130825,0.036802,8.99,0
251797,155505.0,1.987752,-0.40435,-0.862414,0.753974,-0.423668,-0.064805,-0.769267,0.136383,1.729364,...,0.117853,0.720794,0.038619,0.641467,0.006381,0.748253,-0.007092,-0.025121,5.47,0
83907,60082.0,-1.301618,-0.055829,1.990019,0.437839,0.990997,-1.145327,0.157142,0.069118,-0.045317,...,-0.090781,-0.442345,0.004918,0.361007,0.202732,-0.684291,0.070393,0.131099,7.6,0
83813,60040.0,-1.579847,-0.088035,1.054499,1.50796,0.296892,-0.507411,0.146583,0.470618,-0.970627,...,0.126745,-0.29331,0.343426,0.052542,-0.452358,-0.528696,0.042802,-0.066463,125.31,0
238598,149737.0,-2.605286,-2.059515,0.586613,-2.956155,-1.950341,-0.378391,1.923092,-0.222326,0.144511,...,-0.87493,-1.513414,1.409489,-0.050774,0.711124,-0.600977,0.002944,-0.487549,505.26,0


In [25]:
print(new_data.shape)
print(new_data['Class'].value_counts())

(984, 31)
Class
0    492
1    492
Name: count, dtype: int64


In [26]:
new_data.groupby('Class').mean()

Unnamed: 0_level_0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V20,V21,V22,V23,V24,V25,V26,V27,V28,Amount
Class,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,94935.613821,-0.02436,-0.134152,0.02514,-0.019265,0.080944,0.071185,0.060014,0.036072,0.002826,...,-0.031632,-0.014745,0.029307,0.025672,0.004275,-0.03195,-0.010321,0.001616,-0.024345,96.46626
1,80746.806911,-4.771948,3.623778,-7.033281,4.542029,-3.151225,-1.397737,-5.568731,0.570636,-2.581123,...,0.372319,0.713588,0.014049,-0.040308,-0.10513,0.041449,0.051648,0.170575,0.075667,122.211321


# Data Spliting

In [27]:
# seperating into features and target

X = new_data.drop('Class', axis=1)
y = new_data['Class']

In [28]:
print(X)

            Time        V1        V2        V3        V4        V5        V6  \
253877  156472.0 -0.311727  1.179052 -0.053679 -0.559815  0.215310 -1.060414   
251797  155505.0  1.987752 -0.404350 -0.862414  0.753974 -0.423668 -0.064805   
83907    60082.0 -1.301618 -0.055829  1.990019  0.437839  0.990997 -1.145327   
83813    60040.0 -1.579847 -0.088035  1.054499  1.507960  0.296892 -0.507411   
238598  149737.0 -2.605286 -2.059515  0.586613 -2.956155 -1.950341 -0.378391   
...          ...       ...       ...       ...       ...       ...       ...   
279863  169142.0 -1.927883  1.125653 -4.518331  1.749293 -1.566487 -2.010494   
280143  169347.0  1.378559  1.289381 -5.004247  1.411850  0.442581 -1.326536   
280149  169351.0 -0.676143  1.126366 -2.213700  0.468308 -1.120541 -0.003346   
281144  169966.0 -3.113832  0.585864 -5.399730  1.817092 -0.840618 -2.943548   
281674  170348.0  1.991976  0.158476 -2.583441  0.408670  1.151147 -0.096695   

              V7        V8        V9  .

In [29]:
print(y)

253877    0
251797    0
83907     0
83813     0
238598    0
         ..
279863    1
280143    1
280149    1
281144    1
281674    1
Name: Class, Length: 984, dtype: int64


In [30]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=2)

In [32]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(787, 30)
(197, 30)
(787,)
(197,)


# Model Training (Logistic Regression)

In [34]:
model = LogisticRegression()

In [36]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


# Evaluate the Model

Evaluate on training data

In [37]:
train_pred = model.predict(X_train)

train_acc = accuracy_score(train_pred, y_train)

print(f'Training accuracy: {train_acc}')

Training accuracy: 0.9466327827191868


Evaluate on training data

In [38]:
test_pred = model.predict(X_test)

test_acc = accuracy_score(test_pred, y_test)

print(f'Testing accuracy: {test_acc}')

Testing accuracy: 0.9187817258883249


# Usage

In [42]:
input_data = (0, -1.3598071336738, -0.0727811733098497, 2.53634673796914, 1.37815522427443, -0.338320769942518, 0.462387777762292, 0.239598554061257, 0.0986979012610507, 0.363786969611213, 0.0907941719789316, -0.551599533260813, -0.617800855762348, -0.991389847235408, -0.311169353699879, 1.46817697209427, -0.470400525259478, 0.207971241929242, 0.0257905801985591, 0.403992960255733, 0.251412098239705, -0.018306777944153, 0.277837575558899, -0.110473910188767, 0.0669280749146731, 0.128539358273528, -0.189114843888824, 0.133558376740387, -0.0210530534538215, 149.62)

input_as_np_array = np.asarray(input_data)

input_reshaped = input_as_np_array.reshape(1,-1)

prediction = model.predict(input_reshaped)

if prediction == 0:
  print('The Transaction is Legit')
else:
  print(' The Transaction is a Fraud')

The Transaction is Legit


