In [2]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import zipfile
import os

# Define the path to the ZIP file
zip_path = '/content/drive/MyDrive/fraud_detection.zip'

# Extract the ZIP file
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall('/content/PaySim')

# Verify extraction
os.listdir('/content/PaySim')

['PS_20174392719_1491204439457_log.csv']

In [4]:
import pandas as pd

# Load the dataset (Modify based on extracted CSV file name)
csv_file_path = '/content/PaySim/PS_20174392719_1491204439457_log.csv'

# Read the dataset
df = pd.read_csv(csv_file_path)

# Display dataset info
df.info()
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6362620 entries, 0 to 6362619
Data columns (total 11 columns):
 #   Column          Dtype  
---  ------          -----  
 0   step            int64  
 1   type            object 
 2   amount          float64
 3   nameOrig        object 
 4   oldbalanceOrg   float64
 5   newbalanceOrig  float64
 6   nameDest        object 
 7   oldbalanceDest  float64
 8   newbalanceDest  float64
 9   isFraud         int64  
 10  isFlaggedFraud  int64  
dtypes: float64(5), int64(3), object(3)
memory usage: 534.0+ MB


Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [5]:
# Check for missing values
print(df.isnull().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [6]:
# Drop columns that are not useful for fraud detection
df = df.drop(columns=['nameOrig', 'nameDest'])


In [7]:
from sklearn.preprocessing import LabelEncoder

# Encode 'type' column (categorical to numerical)
le = LabelEncoder()
df['type'] = le.fit_transform(df['type'])

# Check the unique values after encoding
df['type'].unique()


array([3, 4, 1, 2, 0])

In [8]:
# Define features and target
X = df.drop(columns=['isFraud'])  # Features
y = df['isFraud']  # Target variable


In [9]:
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

# Split dataset (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Apply SMOTE
smote = SMOTE(sampling_strategy=0.5, random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [10]:
from sklearn.ensemble import RandomForestClassifier

# Initialize Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train_smote, y_train_smote)


In [11]:
from sklearn.metrics import classification_report, roc_auc_score

# Predict on test set
y_pred = model.predict(X_test)

# Print evaluation metrics
print(classification_report(y_test, y_pred))
print("ROC-AUC Score:", roc_auc_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00   1270881
           1       0.67      0.94      0.79      1643

    accuracy                           1.00   1272524
   macro avg       0.84      0.97      0.89   1272524
weighted avg       1.00      1.00      1.00   1272524

ROC-AUC Score: 0.9714014685964466


In [12]:
df['transaction_hour'] = df['step'] % 24  # Convert step into hours

In [13]:
import numpy as np

def augment_data(data, num_samples=5000):
    augmented_data = data.copy()

    for _ in range(num_samples):
        sample = data.sample(n=1, random_state=np.random.randint(10000))
        noisy_sample = sample + np.random.normal(0, 0.01, sample.shape)  # Add small noise
        augmented_data = pd.concat([augmented_data, noisy_sample], axis=0)

    return augmented_data

# Apply data augmentation on training set
X_train_augmented = augment_data(X_train_smote)
y_train_augmented = np.concatenate([y_train_smote, np.ones(len(X_train_augmented) - len(y_train_smote))])

In [None]:
# Train the model again with augmented data
model.fit(X_train_augmented, y_train_augmented)

# Predict again and evaluate
y_pred_augmented = model.predict(X_test)

print(classification_report(y_test, y_pred_augmented))
print("ROC-AUC Score (after augmentation):", roc_auc_score(y_test, y_pred_augmented))