# Libs

In [2]:
!pip install kaggle



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import ScalarFormatter
import seaborn as sns
from sklearn.metrics import confusion_matrix
import os
import warnings

In [4]:
np.set_printoptions(suppress=True)
warnings.filterwarnings('ignore')

# Data Collection

In [5]:
os.environ['KAGGLE_CONFIG_DIR'] = "/content/.kaggle"
!mkdir -p /content/.kaggle
!cp kaggle.json /content/.kaggle/

In [6]:
!kaggle datasets download -d ealaxi/paysim1

Dataset URL: https://www.kaggle.com/datasets/ealaxi/paysim1
License(s): CC-BY-SA-4.0
Downloading paysim1.zip to /content
 70% 125M/178M [00:00<00:00, 1.31GB/s]
100% 178M/178M [00:00<00:00, 1.07GB/s]


In [7]:
!unzip /content/paysim1.zip -d /content/

Archive:  /content/paysim1.zip
  inflating: /content/PS_20174392719_1491204439457_log.csv  


In [8]:
df = pd.read_csv('/content/PS_20174392719_1491204439457_log.csv')

In [9]:
df.shape

(6362620, 11)

In [10]:
df.head()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [11]:
df.tail()

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.0,C776919290,0.0,339682.13,1,0
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.0,C1881841831,0.0,0.0,1,0
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.0,C1365125890,68488.84,6379898.11,1,0
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.0,C2080388513,0.0,0.0,1,0
6362619,743,CASH_OUT,850002.52,C1280323807,850002.52,0.0,C873221189,6510099.11,7360101.63,1,0


In [12]:
df.describe(include='all')

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
count,6362620.0,6362620,6362620.0,6362620,6362620.0,6362620.0,6362620,6362620.0,6362620.0,6362620.0,6362620.0
unique,,5,,6353307,,,2722362,,,,
top,,CASH_OUT,,C1530544995,,,C1286084959,,,,
freq,,2237500,,3,,,113,,,,
mean,243.3972,,179861.9,,833883.1,855113.7,,1100702.0,1224996.0,0.00129082,2.514687e-06
std,142.332,,603858.2,,2888243.0,2924049.0,,3399180.0,3674129.0,0.0359048,0.001585775
min,1.0,,0.0,,0.0,0.0,,0.0,0.0,0.0,0.0
25%,156.0,,13389.57,,0.0,0.0,,0.0,0.0,0.0,0.0
50%,239.0,,74871.94,,14208.0,0.0,,132705.7,214661.4,0.0,0.0
75%,335.0,,208721.5,,107315.2,144258.4,,943036.7,1111909.0,0.0,0.0


In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


step - maps a unit of time in the real world. In this case 1 step is 1 hour of time. Total steps 744 (30 days simulation).

type - CASH-IN, CASH-OUT, DEBIT, PAYMENT and TRANSFER.

amount -
amount of the transaction in local currency.

nameOrig - customer who started the transaction

oldbalanceOrg - initial balance before the transaction

newbalanceOrig - new balance after the transaction.

nameDest - customer who is the recipient of the transaction

oldbalanceDest - initial balance recipient before the transaction. Note that there is not information for customers that start with M (Merchants).

newbalanceDest - new balance recipient after the transaction. Note that there is not information for customers that start with M (Merchants).

isFraud - This is the transactions made by the fraudulent agents inside the simulation. In this specific dataset the fraudulent behavior of the agents aims to profit by taking control or customers accounts and try to empty the funds by transferring to another account and then cashing out of the system.

isFlaggedFraud - The business model aims to control massive transfers from one account to another and flags illegal attempts. An illegal attempt in this dataset is an attempt to transfer more than 200.000 in a single transaction.

# Data Preprocessing

In [15]:
df.isnull().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [16]:
df.isna().sum()

Unnamed: 0,0
step,0
type,0
amount,0
nameOrig,0
oldbalanceOrg,0
newbalanceOrig,0
nameDest,0
oldbalanceDest,0
newbalanceDest,0
isFraud,0


In [17]:
df.duplicated().sum()

np.int64(0)

There is no null, NaN values, no duplicated rows

In [18]:
df.nunique()

Unnamed: 0,0
step,743
type,5
amount,5316900
nameOrig,6353307
oldbalanceOrg,1845844
newbalanceOrig,2682586
nameDest,2722362
oldbalanceDest,3614697
newbalanceDest,3555499
isFraud,2


In [20]:
df = df.rename(columns={'oldbalanceOrg': 'oldbalanceOrig'})
df['type'] = df['type'].astype('category')
df['isFraud'] = df['isFraud'].astype(np.int8)
df['nameOrig'] = df['nameOrig'].astype('string')
df['nameDest'] = df['nameDest'].astype('string')
df['amount'] = df['amount'].astype(np.float64)
df['orgType'] = df['nameOrig'].apply(lambda x: 'Merchant' if x.startswith('M') else ('Customer' if x.startswith('C') else 'Other'))
df['destType'] = df['nameDest'].apply(lambda x: 'Merchant' if x.startswith('M') else ('Customer' if x.startswith('C') else 'Other'))

In [25]:
def if_successful_transaction(row):
    # Condition 1: Origin balance changes by the amount
    origin_ok = abs(row['oldbalanceOrig'] - row['newbalanceOrig']) == row['amount']

    # Condition 2: Destination balance changes by the amount (either added or subtracted)
    dest_ok = abs(row['oldbalanceDest'] - row['newbalanceDest']) == row['amount']

    return origin_ok or dest_ok  # True if either balance matches the amount

# Apply the function row-wise
df['successful_transaction'] = df.apply(if_successful_transaction, axis=1)

In [26]:
df

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrig,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud,orgType,destType,successful_transaction
0,1,PAYMENT,9839.64,C1231006815,170136.00,160296.36,M1979787155,0.00,0.00,0,0,Customer,Merchant,False
1,1,PAYMENT,1864.28,C1666544295,21249.00,19384.72,M2044282225,0.00,0.00,0,0,Customer,Merchant,False
2,1,TRANSFER,181.00,C1305486145,181.00,0.00,C553264065,0.00,0.00,1,0,Customer,Customer,True
3,1,CASH_OUT,181.00,C840083671,181.00,0.00,C38997010,21182.00,0.00,1,0,Customer,Customer,True
4,1,PAYMENT,11668.14,C2048537720,41554.00,29885.86,M1230701703,0.00,0.00,0,0,Customer,Merchant,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6362615,743,CASH_OUT,339682.13,C786484425,339682.13,0.00,C776919290,0.00,339682.13,1,0,Customer,Customer,True
6362616,743,TRANSFER,6311409.28,C1529008245,6311409.28,0.00,C1881841831,0.00,0.00,1,0,Customer,Customer,True
6362617,743,CASH_OUT,6311409.28,C1162922333,6311409.28,0.00,C1365125890,68488.84,6379898.11,1,0,Customer,Customer,True
6362618,743,TRANSFER,850002.52,C1685995037,850002.52,0.00,C2080388513,0.00,0.00,1,0,Customer,Customer,True


# Problem Statement
- do not include cases money transfered to merchants as no data was provided

1 - code

2 - title, labels, ...

3 - comment

In [None]:
# Create confusion matrix
cm = confusion_matrix(df['isFraud'], df['isFlaggedFraud'])

# Plot it with seaborn for better visuals
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
             xticklabels=["Not Fraud", "Fraud"],
            yticklabels=["Not Fraud", "Fraud"])
plt.xlabel('Flagged')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

In [None]:
cm_norm = cm.astype('float') / cm.sum()

# Plot the normalized confusion matrix
plt.figure(figsize=(6, 5))
sns.heatmap(cm_norm, annot=True, fmt=".2%", cmap='Blues',
            xticklabels=["Not Fraud", "Fraud"],
            yticklabels=["Not Fraud", "Fraud"])
plt.xlabel('Flagged')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

# EDA
1. What kinds of transactions were Fraud and labeled as Fraud ?
2. What kinds of transactions were Fraud but not labeled as Fraud ?
3. Any fraud transactions with merchant accounts (without information provided) ?

In [None]:
fraud_trans = df[df['isFraud']==1]

In [None]:
fraud_trans.columns

## Type

In [None]:
#   PRINT OUT VALUES IN THE RIGHT SIDE WITH EXPLANATIONS

# Extract fraud counts by type
counts = fraud_trans['type'].value_counts()
labels = counts.index
sizes = counts.values
colors = plt.cm.Paired.colors  # Use a palette with more colors if >2 types

# Plot donut chart
plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, colors=colors, startangle=90, counterclock=False,
        autopct='%1.1f%%', wedgeprops={'width': 0.4, 'edgecolor': 'white'})

plt.title('Fraudulent Transactions by Type')
plt.axis('equal')  # Keeps it circular
plt.show()


In [None]:
#   PRINT OUT VALUES IN THE RIGHT SIDE WITH EXPLANATIONS

# Extract fraud counts by type
counts = df['type'].value_counts()
labels = counts.index
sizes = counts.values
colors = plt.cm.Paired.colors  # Use a palette with more colors if >2 types

# Plot donut chart
plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, colors=colors, startangle=90, counterclock=False,
        autopct='%1.1f%%', wedgeprops={'width': 0.4, 'edgecolor': 'white'})

plt.title('Fraudulent Transactions by Type')
plt.axis('equal')  # Keeps it circular
plt.show()


In [None]:
#   PRINT OUT VALUES IN THE RIGHT SIDE WITH EXPLANATIONS

# Extract fraud counts by type
counts = df[df['isFraud']==0]['type'].value_counts()
labels = counts.index
sizes = counts.values
colors = plt.cm.Paired.colors  # Use a palette with more colors if >2 types

# Plot donut chart
plt.figure(figsize=(7, 7))
plt.pie(sizes, labels=labels, colors=colors, startangle=90, counterclock=False,
        autopct='%1.1f%%', wedgeprops={'width': 0.4, 'edgecolor': 'white'})

plt.title('Fraudulent Transactions by Type')
plt.axis('equal')  # Keeps it circular
plt.show()


In [None]:
# PUT THREE GRAPHS ABOVE TOGETHER IN DIFF SIZES to COMPARE

# Set up a figure with customized grid
fig = plt.figure(figsize=(14, 8))
gs = fig.add_gridspec(2, 3)

# 🔹 Large plot (first two columns)
ax1 = fig.add_subplot(gs[:, :2])
sns.countplot(data=df, x='type', palette='plasma', ax=ax1)
ax1.set_title('Transaction Count by Type')

# 🔹 Medium plot (top right corner)
ax2 = fig.add_subplot(gs[0, 2])
fraud_counts = df['isFraud'].value_counts()
ax2.pie(fraud_counts, labels=['Not Fraud', 'Fraud'], autopct='%1.1f%%',
        wedgeprops={'width': 0.4}, colors=['#4caf50', '#f44336'])
ax2.set_title('Fraud vs Not Fraud')

# 🔹 Small plot (bottom right)
ax3 = fig.add_subplot(gs[1, 2])
sns.boxplot(data=df, y='amount', ax=ax3)
ax3.set_title('Amount Distribution')

plt.tight_layout()
plt.show()


## Amount

In [None]:
fraud_trans['amount'].describe()

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Plot density graph for the 'amount' column
plt.figure(figsize=(8, 5))
sns.kdeplot(data=fraud_trans, x='amount', shade=True, color='red', bw_adjust=0.5)

plt.title('Density Plot of Fraud Transaction Amounts')
plt.xlabel('Amount')
plt.ylabel('Density')
plt.xlim(0, fraud_trans['amount'].quantile(0.99))  # Optional: zoom in to reduce outlier skew
plt.grid(True)
plt.ticklabel_format(axis='both', style='plain')
plt.show()


In [None]:
df[df['amount']==0]

Look at last 2 cols, transactions with amount as 0 are fraud but not labeled

Consider it later as would it cause any incon in the future or not ?

In [None]:
fraud_trans = fraud_trans[fraud_trans['amount']~0]

## Org name

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='orgType', palette='viridis', order=['Merchant', 'Customer', 'Other'])

plt.title('Number of Transactions to Merchants vs Customers')
plt.xlabel('Destination Type')
plt.ylabel('Transaction Count')
plt.grid(axis='y')
plt.show()


In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(data=fraud_trans, x='orgType', palette='viridis', order=['Merchant', 'Customer', 'Other'])

plt.title('Number of Transactions to Merchants vs Customers')
plt.xlabel('Destination Type')
plt.ylabel('Transaction Count')
plt.grid(axis='y')
plt.show()


## Dest name

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(8, 5))
sns.countplot(data=df, x='destType', palette='viridis', order=['Merchant', 'Customer', 'Other'])

plt.title('Number of Transactions to Merchants vs Customers')
plt.xlabel('Destination Type')
plt.ylabel('Transaction Count')
plt.grid(axis='y')
plt.show()


In [None]:
plt.figure(figsize=(8, 5))
sns.countplot(data=fraud_trans, x='destType', palette='viridis', order=['Merchant', 'Customer', 'Other'])

plt.title('Number of Transactions to Merchants vs Customers')
plt.xlabel('Destination Type')
plt.ylabel('Transaction Count')
plt.grid(axis='y')
plt.show()


## Balance
Check if transaction amount and new, old balance is right