# MSc Project
## Adaptive Classifier for Concept Drift with Web Interface for Fraud Detection Stakeholders

In [1]:
import zipfile
import os
from pathlib import Path

import pandas as pd

# 1. Preprocessing Data


N.B.: Sections 1a, 1b, and 1c only need to be run once (the first time this notebook is run) with archive.zip. They can be commented out afterwards, and the "data/processed/paysim_subsample.csv" file can be loaded directly from Section 1d.
## 1a. Extract from archive

In [None]:
# !! this cell can be commented out after the first run, once it has served its purpose !!
# Unpack every member of the raw archive into the raw-data directory.

zip_file = "data/raw/archive.zip"
extract_dir = "data/raw/"

with zipfile.ZipFile(zip_file, mode='r') as archive:
    archive.extractall(path=extract_dir)

## 1b. Load and Inspect Data

In [3]:
# Show the contents of the current working directory (IPython shell escape).
!ls

app   deploy.sh  fraud-detection-msc  notebook.ipynb  README.md
data  doc.txt	 models		      outputs	      scripts


In [None]:
# Load the raw transaction log CSV into a DataFrame.
# Note: despite its name, `full_data_dir` holds the path to the CSV file itself.
full_data_dir = "data/raw/PS_20174392719_1491204439457_log.csv"
full_df = pd.read_csv(filepath_or_buffer=full_data_dir)

In [5]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it
# is called bare; wrapping it in print() would append a stray "None" line.
full_df.info()
# Preview the first rows as the cell's rich output.
full_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB
None


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [6]:
# Headline figures for the full dataset: number of rows and the mean of the
# binary `isFraud` label (i.e. the share of rows labelled as fraud).
full_row_count = len(full_df)
full_fraud_rate = full_df['isFraud'].mean()
print(f"Full Dataset: {full_row_count} row, Fraud Rate: {full_fraud_rate:.4f}")

Full Dataset: 6362620 row, Fraud Rate: 0.0013


In [7]:
# Count the missing (NaN) entries in each column of the full dataset.
full_df.isna().sum(axis="index")

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64

## 1c. Subsample Data
Select a subsample of 100,000 records

In [8]:
# Preserve each row's position in the full log as an explicit `original_index`
# column, so the temporal order can be restored after subsampling.
# The guard keeps the cell safe to re-run: calling reset_index(names=...) a second
# time would try to insert a column that already exists and raise a ValueError.
if 'original_index' not in full_df.columns:
    full_df = full_df.reset_index(names='original_index')
full_df.head()

Unnamed: 0,original_index,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [9]:
# Select all fraud cases: every row labelled isFraud == 1 is kept in the subsample.
fraud_df = full_df[full_df['isFraud']==1]
# info() prints its own summary and returns None, so it is not wrapped in print()
# (which would emit an extra "None" line).
fraud_df.info()
fraud_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 8213 entries, 2 to 6362619
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   original_index  8213 non-null   int64  
 1   step            8213 non-null   int64  
 2   type            8213 non-null   object 
 3   amount          8213 non-null   float64
 4   nameOrig        8213 non-null   object 
 5   oldbalanceOrg   8213 non-null   float64
 6   newbalanceOrig  8213 non-null   float64
 7   nameDest        8213 non-null   object 
 8   oldbalanceDest  8213 non-null   float64
 9   newbalanceDest  8213 non-null   float64
 10  isFraud         8213 non-null   int64  
 11  isFlaggedFraud  8213 non-null   int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 834.1+ KB
None


Unnamed: 0,original_index,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
2,2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
251,251,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0
252,252,1,CASH_OUT,2806.0,C2101527076,2806.0,0.0,C1007251739,26202.0,0.0,1,0
680,680,1,TRANSFER,20128.0,C137533655,20128.0,0.0,C1848415041,0.0,0.0,1,0


In [10]:
# Select non-fraud cases to bring the subsample up to (approximately) 100,000 records.
count_req_non_fraud = 100000 - len(fraud_df) # number of non-fraud cases required to make up 100000 total cases
count_total_non_fraud = len(full_df) - len(fraud_df) # total number of non-fraud cases in the dataset
non_fraud_ratio = count_req_non_fraud / count_total_non_fraud # fraction of required to total, to ensure proportional subsetting

# Draw the same fraction from every `step` group so the non-fraud rows are sampled
# proportionally across steps. The fraction is applied per group, so the total
# number of rows drawn may differ slightly from count_req_non_fraud.
# random_state is fixed so the draw is reproducible.
nonfraud_df = full_df[full_df['isFraud']==0].groupby('step').sample(frac=non_fraud_ratio, random_state=42)
# info() prints its own summary and returns None, so it is not wrapped in print().
nonfraud_df.info()
nonfraud_df.head()

<class 'pandas.core.frame.DataFrame'>
Index: 91786 entries, 1795 to 6362128
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   original_index  91786 non-null  int64  
 1   step            91786 non-null  int64  
 2   type            91786 non-null  object 
 3   amount          91786 non-null  float64
 4   nameOrig        91786 non-null  object 
 5   oldbalanceOrg   91786 non-null  float64
 6   newbalanceOrig  91786 non-null  float64
 7   nameDest        91786 non-null  object 
 8   oldbalanceDest  91786 non-null  float64
 9   newbalanceDest  91786 non-null  float64
 10  isFraud         91786 non-null  int64  
 11  isFlaggedFraud  91786 non-null  int64  
dtypes: float64(5), int64(4), object(3)
memory usage: 9.1+ MB
None


Unnamed: 0,original_index,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
1795,1795,1,PAYMENT,353.9,C1746117425,3.0,0.0,M467146800,0.0,0.0,0,0
2233,2233,1,CASH_OUT,159657.1,C1366629983,0.0,0.0,C716083600,2640580.83,2444985.19,0,0
372,372,1,TRANSFER,438437.09,C977160959,0.0,0.0,C248609774,740675.45,6453430.91,0,0
539,539,1,PAYMENT,207.75,C1288108586,82149.35,81941.6,M1089584667,0.0,0.0,0,0
2440,2440,1,PAYMENT,11380.13,C1905114489,77037.91,65657.79,M538132628,0.0,0.0,0,0


In [12]:
# Combine the fraud and sampled non-fraud rows, restore the original transaction
# order, then discard the helper column. Written as one chain (no inplace mutation)
# so the cell always rebuilds `subsample_df` from its inputs.
subsample_df = (
    pd.concat([fraud_df, nonfraud_df], ignore_index=True)
    # sort transactions by their position in the full dataset
    .sort_values(by='original_index')
    .reset_index(drop=True)
    # the original index is only needed for ordering
    .drop(columns=["original_index"])
)
# info() prints its own summary and returns None, so it is not wrapped in print().
subsample_df.info()
subsample_df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            99999 non-null  int64  
 1   type            99999 non-null  object 
 2   amount          99999 non-null  float64
 3   nameOrig        99999 non-null  object 
 4   oldbalanceOrg   99999 non-null  float64
 5   newbalanceOrig  99999 non-null  float64
 6   nameDest        99999 non-null  object 
 7   oldbalanceDest  99999 non-null  float64
 8   newbalanceDest  99999 non-null  float64
 9   isFraud         99999 non-null  int64  
 10  isFlaggedFraud  99999 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 8.4+ MB
None


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,TRANSFER,710544.77,C835773569,0.0,0.0,C1359044626,738531.5,16518.36,0,0
3,1,CASH_OUT,220691.42,C1123559518,0.0,0.0,C1590550415,6093090.62,19169204.93,0,0
4,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0


In [13]:
# Summarise the subsample: total size, class counts and resulting fraud rate.
# A single print call with a newline separator emits one line per item.
print(
    f"Subsample Dataset: {len(subsample_df)} rows",
    "_________Composed Of_________",
    f"{len(subsample_df.query('isFraud==1'))} Fraud Cases",
    f"{len(subsample_df.query('isFraud==0'))} Non-Fraud Cases",
    f"Fraud Rate: {subsample_df['isFraud'].mean():.4f}",
    sep="\n",
)

Subsample Dataset: 99999 rows
_________Composed Of_________
8213 Fraud Cases
91786 Non-Fraud Cases
Fraud Rate: 0.0821


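99,999 cases have been retrieved (rather than exactly 100,000) because the proportional subsampling applies the sampling fraction to each `step` group separately, and each group's sample size is rounded to a whole number of rows.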

In [15]:
# Save the subsample so later runs can start from Section 1d without the raw archive.
subsample_path = "data/processed/paysim_subsample.csv"
# to_csv does not create missing directories, so make sure the target folder exists
# (a no-op when it is already there).
Path(subsample_path).parent.mkdir(parents=True, exist_ok=True)
# index=False: the RangeIndex carries no information and would otherwise be
# written as an extra unnamed column.
subsample_df.to_csv(subsample_path, index=False)

## 1d. Preprocess Data

- Create temporal variable
- Split x and y

In [16]:
# Load the saved subsample; this is the entry point once the earlier
# extraction/subsampling cells have been run and the CSV exists on disk.
df = pd.read_csv("data/processed/paysim_subsample.csv")
# info() prints its own summary and returns None, so it is not wrapped in print()
# (which would emit an extra "None" line).
df.info()
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99999 entries, 0 to 99998
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   step            99999 non-null  int64  
 1   type            99999 non-null  object 
 2   amount          99999 non-null  float64
 3   nameOrig        99999 non-null  object 
 4   oldbalanceOrg   99999 non-null  float64
 5   newbalanceOrig  99999 non-null  float64
 6   nameDest        99999 non-null  object 
 7   oldbalanceDest  99999 non-null  float64
 8   newbalanceDest  99999 non-null  float64
 9   isFraud         99999 non-null  int64  
 10  isFlaggedFraud  99999 non-null  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 8.4+ MB
None


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
1,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
2,1,TRANSFER,710544.77,C835773569,0.0,0.0,C1359044626,738531.5,16518.36,0,0
3,1,CASH_OUT,220691.42,C1123559518,0.0,0.0,C1590550415,6093090.62,19169204.93,0,0
4,1,TRANSFER,2806.0,C1420196421,2806.0,0.0,C972765878,0.0,0.0,1,0


Inspect data

In [20]:
# Print the frequency table of every column, one column at a time,
# with a divider line after each table.
divider = "----------"
for _, column_values in df.items():
    print(column_values.value_counts())
    print(divider)

step
19     753
18     732
187    719
235    696
307    694
      ... 
29       4
708      4
706      4
662      2
112      2
Name: count, Length: 743, dtype: int64
----------
type
CASH_OUT    36284
PAYMENT     31223
CASH_IN     20169
TRANSFER    11716
DEBIT         607
Name: count, dtype: int64
----------
amount
10000000.00    334
0.00            16
1165187.89       4
429257.45        4
20311.17         3
              ... 
21369.14         1
1088.18          1
168474.58        1
3906.64          1
18304.66         1
Name: count, Length: 95430, dtype: int64
----------
nameOrig
C1855642435    2
C579762526     2
C449846274     2
C594995657     2
C2106291024    1
              ..
C1361540546    1
C1938696489    1
C1866573811    1
C63404792      1
C1280323807    1
Name: count, Length: 99995, dtype: int64
----------
oldbalanceOrg
0.00           30500
10000000.00      142
181.00            19
144.00            19
135.00            18
               ...  
10710.00           1
1786.71        