In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)

In [2]:
df = pd.read_csv("Synthetic_Financial_datasets_log.csv")

df.head(5)

Unnamed: 0,step,type,amount,nameOrig,oldbalanceOrg,newbalanceOrig,nameDest,oldbalanceDest,newbalanceDest,isFraud,isFlaggedFraud
0,1,PAYMENT,9839.64,C1231006815,170136.0,160296.36,M1979787155,0.0,0.0,0,0
1,1,PAYMENT,1864.28,C1666544295,21249.0,19384.72,M2044282225,0.0,0.0,0,0
2,1,TRANSFER,181.0,C1305486145,181.0,0.0,C553264065,0.0,0.0,1,0
3,1,CASH_OUT,181.0,C840083671,181.0,0.0,C38997010,21182.0,0.0,1,0
4,1,PAYMENT,11668.14,C2048537720,41554.0,29885.86,M1230701703,0.0,0.0,0,0


In [3]:
df.dtypes

step                int64
type               object
amount            float64
nameOrig           object
oldbalanceOrg     float64
newbalanceOrig    float64
nameDest           object
oldbalanceDest    float64
newbalanceDest    float64
isFraud             int64
isFlaggedFraud      int64
dtype: object

In [4]:
print(df.isnull().sum())

step              0
type              0
amount            0
nameOrig          0
oldbalanceOrg     0
newbalanceOrig    0
nameDest          0
oldbalanceDest    0
newbalanceDest    0
isFraud           0
isFlaggedFraud    0
dtype: int64


In [6]:
df_sample = df.sample(n=2000000, random_state=42)

In [7]:
# New features
df_sample['balance_change_ratio'] = df_sample['oldbalanceOrg'] / (df_sample['amount'] + 1e-10)  # Adding a small value to avoid division by zero
df_sample['remaining_balance_ratio'] = df_sample['newbalanceOrig'] / (df_sample['oldbalanceOrg'] + 1e-10)

# Adding them
df_sample[['balance_change_ratio', 'remaining_balance_ratio']].head()


Unnamed: 0,balance_change_ratio,remaining_balance_ratio
3737323,0.063188,16.82567
264914,2.607521,0.616494
85647,0.700027,2.428517
5899326,0.0,0.0
2544263,37.370419,1.026759


In [8]:
from sklearn.preprocessing import LabelEncoder
import joblib

# Label Encoders for beafures that are non-numerical
le_nameOrig = LabelEncoder()
le_nameDest = LabelEncoder()

# Apply label encoding to nameOrig and nameDest on the sampled data
df_sample['nameOrig'] = le_nameOrig.fit_transform(df_sample['nameOrig'])
df_sample['nameDest'] = le_nameDest.fit_transform(df_sample['nameDest'])

joblib.dump(le_nameOrig, 'le_nameOrig.pkl')
joblib.dump(le_nameDest, 'le_nameDest.pkl')

# Define features (X) and target (y)
X_sample = df_sample.drop(columns=['isFraud'])
y_sample = df_sample['isFraud']


In [9]:
# One-hot encode categorical variables
X_sample = pd.get_dummies(X_sample, columns=['type'], drop_first=True)


In [10]:
from sklearn.model_selection import train_test_split

# Spriting training-testing
X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)


In [11]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100, random_state=42)

rf.fit(X_train, y_train)

In [12]:
from sklearn.metrics import classification_report, accuracy_score

# Predicting
y_pred = rf.predict(X_test)

# Evaluating the performance
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))


Accuracy: 0.9999916666666666
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    599218
           1       1.00      0.99      1.00       782

    accuracy                           1.00    600000
   macro avg       1.00      1.00      1.00    600000
weighted avg       1.00      1.00      1.00    600000



In [13]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rf, X_sample, y_sample, cv=5, scoring='f1')
print("Cross-validated F1 scores:", scores)

Cross-validated F1 scores: [1.         0.99803536 0.99607843 0.99606299 0.99408284]


In [14]:
import joblib

joblib.dump(rf, 'fraud_model.pkl')

feature_columns = X_train.columns.tolist()
joblib.dump(feature_columns, 'feat_columns.pkl')

['feat_columns.pkl']

## Trying a different approach!!! In this case, Gradient Boosting

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set(color_codes=True)
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

df = pd.read_csv("Synthetic_Financial_datasets_log.csv")

df_sample = df.sample(n=500000, random_state=42)
df_sample_cleaned = df_sample.drop(columns=['nameOrig', 'nameDest'])

X_sample = pd.get_dummies(df_sample_cleaned.drop(['isFraud'], axis=1), columns=['type'], drop_first=True)
y_sample = df_sample_cleaned['isFraud']

X_train, X_test, y_train, y_test = train_test_split(X_sample, y_sample, test_size=0.3, random_state=42)

smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train, y_train)


In [5]:
!pip install xgboost
import xgboost as xgb
from sklearn.model_selection import GridSearchCV

xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)

param_grid = {
    'n_estimators': [100, 200],
    'learning_rate': [0.01, 0.1, 0.3],
    'max_depth': [3, 6, 9],
    'subsample': [0.8, 1.0],
    'colsample_bytree': [0.8, 1.0]
}

grid_search_xgb = GridSearchCV(estimator=xgb_model, param_grid=param_grid, cv=3, scoring='f1', verbose=2, n_jobs=-1)

grid_search_xgb.fit(X_train_res, y_train_res)

print("Best parameters:", grid_search_xgb.best_params_)


Collecting xgboost
  Downloading xgboost-2.1.1-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-2.1.1-py3-none-win_amd64.whl (124.9 MB)
   ---------------------------------------- 0.0/124.9 MB ? eta -:--:--
   - -------------------------------------- 4.2/124.9 MB 22.9 MB/s eta 0:00:06
   --- ------------------------------------ 10.0/124.9 MB 25.9 MB/s eta 0:00:05
   ----- ---------------------------------- 17.8/124.9 MB 30.4 MB/s eta 0:00:04
   -------- ------------------------------- 26.0/124.9 MB 31.6 MB/s eta 0:00:04
   --------- ------------------------------ 29.9/124.9 MB 29.2 MB/s eta 0:00:04
   ------------ --------------------------- 38.3/124.9 MB 30.8 MB/s eta 0:00:03
   -------------- ------------------------- 45.1/124.9 MB 32.2 MB/s eta 0:00:03
   ---------------- ----------------------- 52.2/124.9 MB 31.9 MB/s eta 0:00:03
   ------------------- -------------------- 60.3/124.9 MB 32.8 MB/s eta 0:00:02
   --------------------- ------------------ 68.7/124.9 MB 33.4

Parameters: { "use_label_encoder" } are not used.



Best parameters: {'colsample_bytree': 0.8, 'learning_rate': 0.3, 'max_depth': 9, 'n_estimators': 200, 'subsample': 0.8}


In [6]:
from sklearn.metrics import classification_report, accuracy_score

y_pred_xgb = grid_search_xgb.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print(classification_report(y_test, y_pred_xgb))


Accuracy: 0.9988866666666667
              precision    recall  f1-score   support

           0       1.00      1.00      1.00    149820
           1       0.52      0.93      0.67       180

    accuracy                           1.00    150000
   macro avg       0.76      0.97      0.83    150000
weighted avg       1.00      1.00      1.00    150000



## Now trying Autoencoders

In [9]:
df = pd.read_csv("Synthetic_Financial_datasets_log.csv")

df_sample = df.sample(n=500000, random_state=42)
df_cleaned = df_sample.drop(columns=['nameOrig', 'nameDest'])

In [10]:
df_cleaned = pd.get_dummies(df_cleaned, columns=['type'], drop_first=True)


In [11]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(df_cleaned.drop(['isFraud'], axis=1))

y = df_cleaned['isFraud']


In [12]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)


In [13]:
# Now training
# Separating non-fraudulent transactions for training the autoencoder
X_train_autoencoder = X_train[y_train == 0]

X_test_autoencoder = X_test.copy()
y_test_autoencoder = y_test.copy()


In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Autoencoder architecture
class Autoencoder(nn.Module):
    def __init__(self, input_dim):
        super(Autoencoder, self).__init__()
        # Encoder
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, 14),
            nn.ReLU(),
            nn.Linear(14, 7),
            nn.ReLU()
        )
        # Decoder
        self.decoder = nn.Sequential(
            nn.Linear(7, 14),
            nn.ReLU(),
            nn.Linear(14, input_dim),
            nn.Sigmoid()
        )
    
    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded


In [16]:
X_train_autoencoder = torch.tensor(X_train_autoencoder, dtype=torch.float32)
X_test_autoencoder = torch.tensor(X_test_autoencoder, dtype=torch.float32)

train_loader = DataLoader(X_train_autoencoder, batch_size=256, shuffle=True)


In [17]:
input_dim = X_train_autoencoder.shape[1]
autoencoder = Autoencoder(input_dim)

criterion = nn.MSELoss()
optimizer = optim.Adam(autoencoder.parameters(), lr=0.001)

# Training
num_epochs = 50
for epoch in range(num_epochs):
    for data in train_loader:
        outputs = autoencoder(data)
        loss = criterion(outputs, data)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')


Epoch [1/50], Loss: 0.0008
Epoch [2/50], Loss: 0.0011
Epoch [3/50], Loss: 0.0001
Epoch [4/50], Loss: 0.0020
Epoch [5/50], Loss: 0.0011
Epoch [6/50], Loss: 0.0020
Epoch [7/50], Loss: 0.0001
Epoch [8/50], Loss: 0.0001
Epoch [9/50], Loss: 0.0011
Epoch [10/50], Loss: 0.0001
Epoch [11/50], Loss: 0.0011
Epoch [12/50], Loss: 0.0001
Epoch [13/50], Loss: 0.0001
Epoch [14/50], Loss: 0.0001
Epoch [15/50], Loss: 0.0001
Epoch [16/50], Loss: 0.0000
Epoch [17/50], Loss: 0.0000
Epoch [18/50], Loss: 0.0000
Epoch [19/50], Loss: 0.0000
Epoch [20/50], Loss: 0.0001
Epoch [21/50], Loss: 0.0001
Epoch [22/50], Loss: 0.0000
Epoch [23/50], Loss: 0.0001
Epoch [24/50], Loss: 0.0000
Epoch [25/50], Loss: 0.0000
Epoch [26/50], Loss: 0.0000
Epoch [27/50], Loss: 0.0001
Epoch [28/50], Loss: 0.0000
Epoch [29/50], Loss: 0.0000
Epoch [30/50], Loss: 0.0000
Epoch [31/50], Loss: 0.0000
Epoch [32/50], Loss: 0.0000
Epoch [33/50], Loss: 0.0000
Epoch [34/50], Loss: 0.0001
Epoch [35/50], Loss: 0.0000
Epoch [36/50], Loss: 0.0000
E

In [19]:
import numpy as np

autoencoder.eval()

with torch.no_grad():
    X_test_pred = autoencoder(X_test_autoencoder)

mse = torch.mean((X_test_autoencoder - X_test_pred) ** 2, dim=1).numpy()

threshold = np.percentile(mse, 95)

y_pred_autoencoder = (mse > threshold).astype(int)


In [20]:
from sklearn.metrics import classification_report, accuracy_score

# Evaluate the performance of the autoencoder
print("Accuracy:", accuracy_score(y_test_autoencoder, y_pred_autoencoder))
print(classification_report(y_test_autoencoder, y_pred_autoencoder))


Accuracy: 0.9496933333333333
              precision    recall  f1-score   support

           0       1.00      0.95      0.97    149820
           1       0.01      0.37      0.02       180

    accuracy                           0.95    150000
   macro avg       0.50      0.66      0.50    150000
weighted avg       1.00      0.95      0.97    150000

