In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math
from PIL import Image
from scipy.stats import skew
from matplotlib.transforms import Bbox
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler, RobustScaler
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

In [2]:
!pip install opendatasets

Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl.metadata (9.2 kB)
Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [2]:
import opendatasets as od

# Download a dataset
dataset_url = 'https://www.kaggle.com/datasets/berkanoztas/synthetic-transaction-monitoring-dataset-aml/data'
od.download(dataset_url)

Skipping, found downloaded files in "./synthetic-transaction-monitoring-dataset-aml" (use force=True to force download)


In [2]:
df = pd.read_csv("/content/synthetic-transaction-monitoring-dataset-aml/SAML-D.csv")
df.head()

Unnamed: 0,Time,Date,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering,Laundering_type
0,10:35:19,2022-10-07,8724731955,2769355426,1459.15,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits
1,10:35:20,2022-10-07,1491989064,8401255335,6019.64,UK pounds,Dirham,UK,UAE,Cross-border,0,Normal_Fan_Out
2,10:35:20,2022-10-07,287305149,4404767002,14328.44,UK pounds,UK pounds,UK,UK,Cheque,0,Normal_Small_Fan_Out
3,10:35:21,2022-10-07,5376652437,9600420220,11895.0,UK pounds,UK pounds,UK,UK,ACH,0,Normal_Fan_In
4,10:35:21,2022-10-07,9614186178,3803336972,115.25,UK pounds,UK pounds,UK,UK,Cash Deposit,0,Normal_Cash_Deposits


In [3]:
train_start = '2022-10-01'
train_end = '2023-05-31'
test_start = '2023-06-01'
test_end = '2023-08-31'

# Split the data into training and test sets based on the 'Date' column
train_data = df[(df['Date'] >= train_start) & (df['Date'] <= train_end)]
test_data = df[(df['Date'] >= test_start) & (df['Date'] <= test_end)]

In [4]:
train_data.drop(columns=['Laundering_type', 'Time', 'Date'], inplace=True)
test_data.drop(columns=['Laundering_type', 'Time', 'Date'], inplace=True)

In [5]:
from sklearn import preprocessing

In [6]:
test_data.head()

Unnamed: 0,Sender_account,Receiver_account,Amount,Payment_currency,Received_currency,Sender_bank_location,Receiver_bank_location,Payment_type,Is_laundering
7048788,1172680616,1904681724,1279.6,UK pounds,UK pounds,UK,UK,Credit card,0
7048789,1346993016,6109194301,11302.37,UK pounds,UK pounds,UK,UK,Credit card,0
7048790,4094213814,4059926020,11228.93,UK pounds,UK pounds,UK,UK,Cheque,0
7048791,6353823692,4261858913,20891.16,UK pounds,UK pounds,UK,UK,Credit card,0
7048792,8463259273,946018491,1265.78,UK pounds,UK pounds,UK,UK,Debit card,0


In [7]:
from sklearn import preprocessing

categorical_cols = ['Sender_account', 'Receiver_account', 'Payment_currency', 'Received_currency',
                   'Sender_bank_location', 'Receiver_bank_location', 'Payment_type']

# Concatenate train and test to fit encoders
full_data = pd.concat([train_data, test_data], axis=0)

for col in categorical_cols:
    encoder = preprocessing.LabelEncoder()
    encoder.fit(full_data[col])  # Fit on combined data
    train_data[col] = encoder.transform(train_data[col])
    test_data[col] = encoder.transform(test_data[col])

# Scaling numeric columns
numerical_cols = ['Amount']
scaler = preprocessing.StandardScaler()
train_data[numerical_cols] = scaler.fit_transform(train_data[numerical_cols])
test_data[numerical_cols] = scaler.transform(test_data[numerical_cols])

In [8]:
# Check the shape of training and test data
print("Shape of train_data:", train_data.shape)
print("Shape of test_data:", test_data.shape)

Shape of train_data: (7048788, 9)
Shape of test_data: (2456064, 9)


In [9]:
# Define the target variable
target_col = 'Is_laundering'

# Separate features and target for training and test sets
X_train = train_data.drop(columns=[target_col])
y_train = train_data[target_col]
X_test = test_data.drop(columns=[target_col])
y_test = test_data[target_col]

# Display the shapes to confirm the split
print("Shape of X_train:", X_train.shape)
print("Shape of y_train:", y_train.shape)
print("Shape of X_test:", X_test.shape)
print("Shape of y_test:", y_test.shape)

Shape of X_train: (7048788, 8)
Shape of y_train: (7048788,)
Shape of X_test: (2456064, 8)
Shape of y_test: (2456064,)


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
from imblearn.over_sampling import SMOTE

# Apply SMOTE to balance the training data
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Check the shape after resampling
print("Shape of X_train after SMOTE:", X_train_resampled.shape)
print("Shape of y_train after SMOTE:", y_train_resampled.shape)

Shape of X_train after SMOTE: (14083504, 8)
Shape of y_train after SMOTE: (14083504,)


In [11]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Create the model
hgb_classifier = HistGradientBoostingClassifier(
    max_iter=100,                # Maximum number of boosting iterations
    learning_rate=0.1,           # Step size in each iteration
    max_depth=5,                 # Maximum depth of each tree
    random_state=42,
)

# Train the model
hgb_classifier.fit(X_train_resampled, y_train_resampled)

# Make predictions
y_pred = hgb_classifier.predict(X_test)

# Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.9093973121221597
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.91      0.95   2453227
           1       0.01      0.51      0.01      2837

    accuracy                           0.91   2456064
   macro avg       0.50      0.71      0.48   2456064
weighted avg       1.00      0.91      0.95   2456064

Confusion Matrix:
[[2232100  221127]
 [   1399    1438]]


In [12]:
import pickle

# Assuming your model is stored in hgb_classifier
filename = 'hgb_classifier_model.pkl'

# Save the model using pickle
with open(filename, 'wb') as file:
    pickle.dump(hgb_classifier, file)


In [13]:
# Load the saved model
with open(filename, 'rb') as file:
    loaded_model = pickle.load(file)

# Now you can use the loaded model to make predictions
y_pred = loaded_model.predict(X_test)