# **Anti-Money Laundering Detector**

## **Installing the needed libraries**

## **Loading the libraries**

In [1]:
import os
import json
import zipfile
from pyspark.sql import SparkSession
import random
from pyspark.sql.types import StructType, StructField, StringType, FloatType, IntegerType, DoubleType
from pyspark.sql import functions as F
from pyspark.sql.functions import col, sum, when, count, avg, to_timestamp, hour, dayofweek
from pyspark.ml.feature import StringIndexer, VectorAssembler, StandardScaler
from pyspark.sql import Window
import seaborn as sns
import matplotlib.pyplot as plt
from pyspark.ml.stat import Correlation
import numpy as np
import pandas as pd
from pyspark.ml import Pipeline
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator
import kaggle
from kaggle.api.kaggle_api_extended import KaggleApi

os.environ['KAGGLE_CONFIG_DIR'] = "C:/Users/Public/kaggle.json"
api = KaggleApi()
api.authenticate()

## **Loading the dataset**

According to the dataset owner, the data is divided in 6 datasets into two groups of three:

* Group HI has a relatively higher illicit ratio (more laundering).
* Group LI has a relatively lower illicit ratio (less laundering).

Both HI and LI internally have three sets of data: small, medium, and large. Also, provides two files for each of the six datasets:

* A list of transactions in CSV format
* A text file list of laundering transactions.

So, we have a larga dataset, but for our project, we will use the following files:

* HI-Medium_Trans.csv
* HI-Medium_Patterns.txt
* LI-Medium_Trans.csv
* LI-Medium_Patterns.txt

Transaction files and Pattern files, each they will be merged in a single DataFrame.

In [2]:
# Downloading the dataset from Kaggle
from kaggle.api.kaggle_api_extended import KaggleApi
import zipfile
import os

api = KaggleApi()
api.authenticate()

# Lista de archivos y sus rutas
files = [
    ('ealtman2019/ibm-transactions-for-anti-money-laundering-aml', 'HI-Medium_Trans.csv'),
    ('ealtman2019/ibm-transactions-for-anti-money-laundering-aml', 'HI-Medium_Patterns.txt'),
    ('ealtman2019/ibm-transactions-for-anti-money-laundering-aml', 'LI-Medium_Trans.csv'),
    ('ealtman2019/ibm-transactions-for-anti-money-laundering-aml', 'LI-Medium_Patterns.txt'),
]

# Unzip the files (if zip extension)
for dataset, file_name in files:
    api.dataset_download_file(dataset, file_name=file_name, path='./dataset/')
    zip_path = f'./dataset/{file_name}.zip'
    if os.path.exists(zip_path):
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall('./dataset/')
        os.remove(zip_path)

Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml
Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml
Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml
Dataset URL: https://www.kaggle.com/datasets/ealtman2019/ibm-transactions-for-anti-money-laundering-aml


NameError: name 'files' is not defined

In [13]:
# Creating a SparkSession to manipulate the datasets
spark = SparkSession.builder.appName("AML_Spark").getOrCreate()

### **Transactions Dataset**

In [14]:
# Define schema for the transactions
schema = StructType([
    StructField("Timestamp", StringType(), True),
    StructField("From_Bank", StringType(), True),
    StructField("From_Account", StringType(), True),
    StructField("To_Bank", StringType(), True),
    StructField("To_Account", StringType(), True),
    StructField("Amount_Received", FloatType(), True),
    StructField("Receiving_Currency", StringType(), True),
    StructField("Amount_Paid", FloatType(), True),
    StructField("Payment_Currency", StringType(), True),
    StructField("Payment_Format", StringType(), True)
])

### High Ilicit - Medium

In [15]:
# Read the CSV file into a Spark DataFrame
hi_medium_df = spark.read.csv("HI-Medium_Trans.csv", schema=schema, header=True)

# Display the first few rows of the DataFrame
hi_medium_df.show()

AnalysisException: [PATH_NOT_FOUND] Path does not exist: file:/C:/Users/mgmig/Documents/Personal/Lambton College/2024F/Big Data Framework/Project/HI-Medium_Trans.csv.

### Low Ilicit - Medium

In [6]:
# Read the CSV file into a Spark DataFrame
li_medium_df = spark.read.csv("LI-Medium_Trans.csv", schema=schema, header=True)

# Display the first few rows of the DataFrame
li_medium_df.show()

+----------------+---------+------------+--------+----------+---------------+------------------+-----------+----------------+--------------+
|       Timestamp|From_Bank|From_Account| To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|
+----------------+---------+------------+--------+----------+---------------+------------------+-----------+----------------+--------------+
|2022/09/01 00:15|      020|   800104D70|     020| 800104D70|        8095.07|         US Dollar|    8095.07|       US Dollar|  Reinvestment|
|2022/09/01 00:18|    03196|   800107150|   03196| 800107150|        7739.29|         US Dollar|    7739.29|       US Dollar|  Reinvestment|
|2022/09/01 00:23|    01208|   80010E430|   01208| 80010E430|        2654.22|         US Dollar|    2654.22|       US Dollar|  Reinvestment|
|2022/09/01 00:19|    03203|   80010EA80|   03203| 80010EA80|       13284.41|         US Dollar|   13284.41|       US Dollar|  Reinvestment|
|2022/09/01 0

24/12/03 15:38:31 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 11, schema size: 10
CSV file: file:///kaggle/input/ibm-transactions-for-anti-money-laundering-aml/LI-Medium_Trans.csv


In [7]:
# Combine both datasets
trans_df = hi_medium_df.union(li_medium_df)

### **Patterns Dataset**

### High Ilicit - Medium

In [8]:
# Read the file into a Spark DataFrame
hi_patterns_df = spark.read.text("HI-Medium_Patterns.txt")

# Display the first few rows of the DataFrame
hi_patterns_df.show()

+--------------------+
|               value|
+--------------------+
|BEGIN LAUNDERING ...|
|2022/09/01 05:14,...|
|2022/09/03 13:09,...|
|2022/09/01 07:40,...|
|2022/09/01 14:19,...|
|2022/09/02 12:40,...|
|2022/09/03 06:34,...|
|END LAUNDERING AT...|
|                    |
|BEGIN LAUNDERING ...|
|2022/09/01 00:19,...|
|2022/09/01 19:35,...|
|2022/09/02 02:58,...|
|2022/09/02 18:02,...|
|2022/09/03 07:16,...|
|2022/09/03 11:39,...|
|2022/09/03 12:04,...|
|2022/09/04 07:27,...|
|2022/09/04 08:38,...|
|2022/09/05 13:23,...|
+--------------------+
only showing top 20 rows



### Low Ilicit - Medium

In [9]:
# Read the file into a Spark DataFrame
li_patterns_df = spark.read.text("LI-Medium_Patterns.txt")

# Display the first few rows of the DataFrame
li_patterns_df.show()

+--------------------+
|               value|
+--------------------+
|BEGIN LAUNDERING ...|
|2022/09/01 00:29,...|
|2022/09/04 12:49,...|
|2022/09/01 12:28,...|
|2022/09/04 13:39,...|
|2022/09/01 14:26,...|
|2022/09/04 15:34,...|
|2022/09/02 15:52,...|
|2022/09/04 16:27,...|
|2022/09/02 17:41,...|
|2022/09/05 08:36,...|
|2022/09/03 08:04,...|
|2022/09/05 13:36,...|
|2022/09/03 15:18,...|
|2022/09/05 15:45,...|
|2022/09/03 16:50,...|
|2022/09/05 16:00,...|
|2022/09/03 22:43,...|
|2022/09/05 17:50,...|
|2022/09/03 23:22,...|
+--------------------+
only showing top 20 rows



In [10]:
# Combine both datasets
patterns_df = hi_patterns_df.union(li_patterns_df)

## **PreProcessing**

### Identify Laundering Patterns:
Each laundering attempt begins with BEGIN LAUNDERING ATTEMPT - [PATTERN] and ends with END LAUNDERING ATTEMPT.

Used regex to extract pattern types and transaction details.

In [11]:
# Step 1: Extract Pattern_Type where there is "BEGIN LAUNDERING ATTEMPT"
patterns_df = patterns_df.withColumn(
    "Pattern_Type",
    F.when(F.col("value").rlike("BEGIN LAUNDERING ATTEMPT - (.+)"),
           F.regexp_extract(F.col("value"), "BEGIN LAUNDERING ATTEMPT - (.+)", 1))
     .otherwise(None)
)

# Step 2: Forward fill the Pattern_Type to propagate it down until "END LAUNDERING ATTEMPT"
window_spec = Window.orderBy(F.monotonically_increasing_id()).rowsBetween(Window.unboundedPreceding, 0)
patterns_df = patterns_df.withColumn(
    "Pattern_Type",
    F.last("Pattern_Type", True).over(window_spec)
)

# Step 3: Filter out rows with "END LAUNDERING ATTEMPT" as they only mark the end of an attempt
patterns_df = patterns_df.filter(~F.col("value").contains("END LAUNDERING ATTEMPT"))

In [12]:
# Show the DataFrame without truncating long strings
patterns_df.show(truncate=False)

24/12/03 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:32 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------------------------------------------------------------------------------------------------+-------------------+
|value                                                                                           |Pattern_Type       |
+------------------------------------------------------------------------------------------------+-------------------+
|BEGIN LAUNDERING ATTEMPT - STACK                                                                |STACK              |
|2022/09/01 05:14,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1    |STACK              |
|2022/09/03 13:09,0111632,8062C56E0,008456,81363F620,5602.59,US Dollar,5602.59,US Dollar,ACH,1   |STACK              |
|2022/09/01 07:40,0118693,823D5EB90,013729,801CF2E60,1400.54,US Dollar,1400.54,US Dollar,ACH,1   |STACK              |
|2022/09/01 14:19,013729,801CF2E60,0123621,81A7090F0,1467.94,US Dollar,1467.94,US Dollar,ACH,1   |STACK              |
|2022/09/02 12:40,0024750,81363F410,0213834,8087

                                                                                

In [13]:
from pyspark.sql import functions as F

# Remove any text after the colon in Pattern_Type if it exists
patterns_df = patterns_df.withColumn(
    "Pattern_Type",
    F.regexp_replace(F.col("Pattern_Type"), ":.*", "")
)

# Display the DataFrame (use .show() in local PySpark)
patterns_df.show(truncate=False)

24/12/03 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+------------------------------------------------------------------------------------------------+------------+
|value                                                                                           |Pattern_Type|
+------------------------------------------------------------------------------------------------+------------+
|BEGIN LAUNDERING ATTEMPT - STACK                                                                |STACK       |
|2022/09/01 05:14,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1    |STACK       |
|2022/09/03 13:09,0111632,8062C56E0,008456,81363F620,5602.59,US Dollar,5602.59,US Dollar,ACH,1   |STACK       |
|2022/09/01 07:40,0118693,823D5EB90,013729,801CF2E60,1400.54,US Dollar,1400.54,US Dollar,ACH,1   |STACK       |
|2022/09/01 14:19,013729,801CF2E60,0123621,81A7090F0,1467.94,US Dollar,1467.94,US Dollar,ACH,1   |STACK       |
|2022/09/02 12:40,0024750,81363F410,0213834,808757B00,16898.29,US Dollar,16898.29,US Dollar,ACH,1|STACK 

In [14]:
# Filter to get only transaction lines and ignore start/end laundering attempt lines
laundering_transactions = patterns_df.filter(patterns_df.value.rlike(r'\d{4}/\d{2}/\d{2}'))

# Unpersist patterns_df to free memory
patterns_df.unpersist()

# Cache laundering_transactions for reuse
laundering_transactions.cache()

# Display the filtered DataFrame (use .show() instead of display())
laundering_transactions.show(truncate=False)

24/12/03 15:38:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
24/12/03 15:38:35 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
[Stage 12:>                                                         (0 + 1) / 1]

+------------------------------------------------------------------------------------------------+------------+
|value                                                                                           |Pattern_Type|
+------------------------------------------------------------------------------------------------+------------+
|2022/09/01 05:14,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1    |STACK       |
|2022/09/03 13:09,0111632,8062C56E0,008456,81363F620,5602.59,US Dollar,5602.59,US Dollar,ACH,1   |STACK       |
|2022/09/01 07:40,0118693,823D5EB90,013729,801CF2E60,1400.54,US Dollar,1400.54,US Dollar,ACH,1   |STACK       |
|2022/09/01 14:19,013729,801CF2E60,0123621,81A7090F0,1467.94,US Dollar,1467.94,US Dollar,ACH,1   |STACK       |
|2022/09/02 12:40,0024750,81363F410,0213834,808757B00,16898.29,US Dollar,16898.29,US Dollar,ACH,1|STACK       |
|2022/09/03 06:34,0213834,808757B00,000,800073EF0,17607.19,US Dollar,17607.19,US Dollar,ACH,1    |STACK 

                                                                                

In [15]:
# Updated Code for Better Readability
columns = [
    "Timestamp", "From_Bank", "From_Account", "To_Bank", "To_Account",
    "Amount_Received", "Receiving_currency", "Amount_paid",
    "Payment_currency", "Payment_Format", "isLaundering"
]

for idx, col_name in enumerate(columns):
    laundering_transactions = laundering_transactions.withColumn(col_name, F.split(F.col("value"), ",").getItem(idx))

In [16]:
# Display the results
laundering_transactions.show(truncate=False)

+------------------------------------------------------------------------------------------------+------------+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+
|value                                                                                           |Pattern_Type|Timestamp       |From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_currency|Amount_paid|Payment_currency|Payment_Format|isLaundering|
+------------------------------------------------------------------------------------------------+------------+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+
|2022/09/01 05:14,00952,8139F54E0,0111632,8062C56E0,5331.44,US Dollar,5331.44,US Dollar,ACH,1    |STACK       |2022/09/01 05:14|00952    |8139F54E0   |0111632|8062C56E0 |5331.44        |US Dollar         |5331.4

In [17]:
laundering_transactions = laundering_transactions.select("Timestamp", "From_Bank", "Pattern_Type", "isLaundering")

#  Display the results
laundering_transactions.show(truncate=False)

+----------------+---------+------------+------------+
|Timestamp       |From_Bank|Pattern_Type|isLaundering|
+----------------+---------+------------+------------+
|2022/09/01 05:14|00952    |STACK       |1           |
|2022/09/03 13:09|0111632  |STACK       |1           |
|2022/09/01 07:40|0118693  |STACK       |1           |
|2022/09/01 14:19|013729   |STACK       |1           |
|2022/09/02 12:40|0024750  |STACK       |1           |
|2022/09/03 06:34|0213834  |STACK       |1           |
|2022/09/01 00:19|0134266  |CYCLE       |1           |
|2022/09/01 19:35|0036925  |CYCLE       |1           |
|2022/09/02 02:58|0119211  |CYCLE       |1           |
|2022/09/02 18:02|0132965  |CYCLE       |1           |
|2022/09/03 07:16|0137089  |CYCLE       |1           |
|2022/09/03 11:39|0216618  |CYCLE       |1           |
|2022/09/03 12:04|0024083  |CYCLE       |1           |
|2022/09/04 07:27|0038110  |CYCLE       |1           |
|2022/09/04 08:38|0225015  |CYCLE       |1           |
|2022/09/0

In [18]:
# Count empty strings in each column
empty_string_counts = laundering_transactions.select(
    [sum(when(col(c) == "", 1).otherwise(0)).alias(c) for c in laundering_transactions.columns])

# Show the result
empty_string_counts.show(truncate=False)

+---------+---------+------------+------------+
|Timestamp|From_Bank|Pattern_Type|isLaundering|
+---------+---------+------------+------------+
|0        |0        |0           |0           |
+---------+---------+------------+------------+



In [19]:
laundering_transactions.createOrReplaceTempView("combined")

In [20]:
# Query using spark.sql()
result_df = spark.sql("""
    SELECT Pattern_Type, COUNT(Pattern_Type) AS count
    FROM combined
    GROUP BY Pattern_Type
""")

# Show the result
result_df.show(truncate=False)

+--------------+-----+
|Pattern_Type  |count|
+--------------+-----+
|STACK         |4601 |
|CYCLE         |2518 |
|FAN-IN        |2644 |
|GATHER-SCATTER|4830 |
|BIPARTITE     |2623 |
|FAN-OUT       |2617 |
|SCATTER-GATHER|4874 |
|RANDOM        |1945 |
+--------------+-----+



In [21]:
laundering_transactions.cache().groupBy("isLaundering").count().show()

+------------+-----+
|isLaundering|count|
+------------+-----+
|           1|26652|
+------------+-----+



### **Laundering Transactions Labeling**

Joined the laundering pattern DataFrame (laundering_transactions) with the combined transaction DataFrame (trans_df) on transaction identifiers.

In [22]:
# Join the DataFrames on Timestamp, From_Bank, and To_Bank
joined_df = trans_df.join(
    laundering_transactions,
    on=["Timestamp", "From_Bank"],
    how="left"
)

# Freeing from cache
trans_df.unpersist()
laundering_transactions.unpersist()

DataFrame[Timestamp: string, From_Bank: string, Pattern_Type: string, isLaundering: string]

In [23]:
# Fill null values in the `isLaundering` column with 0
joined_df = joined_df.withColumn(
    "isLaundering",
    F.when(F.col("isLaundering").isNull(), F.lit(0)).otherwise(F.col("isLaundering"))
)

In [24]:
# Cache the joined DataFrame
joined_df.cache()

# Display the cached DataFrame (use .show() instead of .display())
joined_df.show(truncate=False)

24/12/03 15:38:41 WARN CSVHeaderChecker: Number of column in CSV header is not equal to number of fields in the schema:
 Header length: 11, schema size: 10
CSV file: file:///kaggle/input/ibm-transactions-for-anti-money-laundering-aml/HI-Medium_Trans.csv
24/12/03 15:40:37 WARN MemoryStore: Not enough space to cache rdd_107_4 in memory! (computed 44.3 MiB so far)
24/12/03 15:40:37 WARN BlockManager: Persisting block rdd_107_4 to disk instead.
24/12/03 15:40:50 WARN MemoryStore: Not enough space to cache rdd_107_4 in memory! (computed 6.7 MiB so far)
24/12/03 15:41:00 WARN MemoryStore: Not enough space to cache rdd_107_10 in memory! (computed 6.4 MiB so far)
24/12/03 15:41:00 WARN BlockManager: Persisting block rdd_107_10 to disk instead.
24/12/03 15:41:00 WARN MemoryStore: Not enough space to cache rdd_107_11 in memory! (computed 6.4 MiB so far)
24/12/03 15:41:00 WARN BlockManager: Persisting block rdd_107_11 to disk instead.
24/12/03 15:41:05 WARN MemoryStore: Not enough space to cache 

+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+
|Timestamp       |From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|Pattern_Type|isLaundering|
+----------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+
|2022/09/01 00:17|020      |800104D70   |020    |800104D70 |6794.63        |US Dollar         |6794.63    |US Dollar       |Reinvestment  |NULL        |0           |
|2022/09/01 00:02|03196    |800107150   |03196  |800107150 |7739.29        |US Dollar         |7739.29    |US Dollar       |Reinvestment  |NULL        |0           |
|2022/09/01 00:17|01208    |80010E430   |01208  |80010E430 |1880.23        |US Dollar         |1880.23    |US Dollar       |Reinvestment  |NULL        |0           |
|202

                                                                                

In [25]:
# Convert 'isLaundering' from string to integer
joined_df = joined_df.withColumn("isLaundering", col("isLaundering").cast("integer"))


# Verify the schema change
joined_df.printSchema()

root
 |-- Timestamp: string (nullable = true)
 |-- From_Bank: string (nullable = true)
 |-- From_Account: string (nullable = true)
 |-- To_Bank: string (nullable = true)
 |-- To_Account: string (nullable = true)
 |-- Amount_Received: float (nullable = true)
 |-- Receiving_Currency: string (nullable = true)
 |-- Amount_Paid: float (nullable = true)
 |-- Payment_Currency: string (nullable = true)
 |-- Payment_Format: string (nullable = true)
 |-- Pattern_Type: string (nullable = true)
 |-- isLaundering: integer (nullable = true)



In [26]:
joined_df.groupBy("isLaundering").count().show()

24/12/03 15:51:07 WARN MemoryStore: Not enough space to cache rdd_107_5 in memory! (computed 26.2 MiB so far)
24/12/03 15:51:07 WARN MemoryStore: Not enough space to cache rdd_107_4 in memory! (computed 44.3 MiB so far)
24/12/03 15:51:08 WARN MemoryStore: Not enough space to cache rdd_107_10 in memory! (computed 12.4 MiB so far)
24/12/03 15:51:08 WARN MemoryStore: Not enough space to cache rdd_107_11 in memory! (computed 6.4 MiB so far)
24/12/03 15:51:08 WARN MemoryStore: Not enough space to cache rdd_107_9 in memory! (computed 49.5 MiB so far)
24/12/03 15:51:09 WARN MemoryStore: Not enough space to cache rdd_107_12 in memory! (computed 26.7 MiB so far)
24/12/03 15:51:09 WARN MemoryStore: Not enough space to cache rdd_107_14 in memory! (computed 26.0 MiB so far)
24/12/03 15:51:09 WARN MemoryStore: Not enough space to cache rdd_107_13 in memory! (computed 45.1 MiB so far)
24/12/03 15:51:10 WARN MemoryStore: Not enough space to cache rdd_107_17 in memory! (computed 24.4 MiB so far)
24/12

+------------+--------+
|isLaundering|   count|
+------------+--------+
|           1|   84928|
|           0|63065007|
+------------+--------+



                                                                                

The dataset is imbalanced.

### Balance the data

In [27]:
# Register joined_df as a temporary view to use SQL
joined_df.createOrReplaceTempView("joined_table")

# Step 1: Identify rows to drop
# Here we assign a random value to each row that meets the condition, then select 50% of those rows
query = """
SELECT *
FROM joined_table
WHERE NOT ((Pattern_Type IS NULL) AND (isLaundering = 0) AND (rand() < 0.5))
"""

# Execute the SQL query
balanced_df = spark.sql(query)

In [28]:
joined_df.unpersist()

DataFrame[Timestamp: string, From_Bank: string, From_Account: string, To_Bank: string, To_Account: string, Amount_Received: float, Receiving_Currency: string, Amount_Paid: float, Payment_Currency: string, Payment_Format: string, Pattern_Type: string, isLaundering: int]

In [29]:
balanced_df.cache().groupBy("isLaundering").count().show()

24/12/03 15:51:17 WARN MemoryStore: Not enough space to cache rdd_107_2 in memory! (computed 14.3 MiB so far)
24/12/03 15:51:17 WARN MemoryStore: Not enough space to cache rdd_107_1 in memory! (computed 7.5 MiB so far)
24/12/03 15:51:17 WARN MemoryStore: Not enough space to cache rdd_107_3 in memory! (computed 7.4 MiB so far)
24/12/03 15:51:17 WARN MemoryStore: Not enough space to cache rdd_107_0 in memory! (computed 8.0 MiB so far)
24/12/03 15:51:40 WARN MemoryStore: Not enough space to cache rdd_107_11 in memory! (computed 42.5 MiB so far)
24/12/03 15:51:50 WARN MemoryStore: Not enough space to cache rdd_107_13 in memory! (computed 45.1 MiB so far)
24/12/03 15:51:51 WARN MemoryStore: Not enough space to cache rdd_107_15 in memory! (computed 43.5 MiB so far)
24/12/03 15:52:26 WARN MemoryStore: Not enough space to cache rdd_107_28 in memory! (computed 51.9 MiB so far)
24/12/03 15:52:40 WARN MemoryStore: Not enough space to cache rdd_107_34 in memory! (computed 42.4 MiB so far)
24/12/03

+------------+--------+
|isLaundering|   count|
+------------+--------+
|           1|   84928|
|           0|31540149|
+------------+--------+



                                                                                

### Data Cleaning and Exploration

In [30]:
# Count NULL values in each column
null_counts = balanced_df.select([sum(col(c).isNull().cast("int")).alias(c) for c in balanced_df.columns])

# Show the result
null_counts.show()

24/12/03 15:53:17 WARN MemoryStore: Not enough space to cache rdd_126_2 in memory! (computed 8.4 MiB so far)
24/12/03 15:53:17 WARN MemoryStore: Not enough space to cache rdd_126_3 in memory! (computed 16.0 MiB so far)
24/12/03 15:53:17 WARN MemoryStore: Not enough space to cache rdd_126_1 in memory! (computed 16.3 MiB so far)
24/12/03 15:53:17 WARN MemoryStore: Not enough space to cache rdd_126_0 in memory! (computed 15.9 MiB so far)
24/12/03 15:53:19 WARN MemoryStore: Not enough space to cache rdd_126_4 in memory! (computed 30.5 MiB so far)
24/12/03 15:53:20 WARN MemoryStore: Not enough space to cache rdd_126_5 in memory! (computed 15.7 MiB so far)
24/12/03 15:53:21 WARN MemoryStore: Not enough space to cache rdd_126_8 in memory! (computed 30.2 MiB so far)
24/12/03 15:53:21 WARN MemoryStore: Not enough space to cache rdd_126_11 in memory! (computed 7.9 MiB so far)
24/12/03 15:53:21 WARN MemoryStore: Not enough space to cache rdd_126_10 in memory! (computed 15.3 MiB so far)
24/12/03 1

+---------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+
|Timestamp|From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|Pattern_Type|isLaundering|
+---------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+
|        0|        0|           0|      0|         0|              0|                 0|          0|               0|             0|    31540149|           0|
+---------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+



                                                                                

In [31]:
balanced_df = balanced_df.na.fill({
    "Pattern_Type": "Unknown"
})

In [32]:
balanced_df = balanced_df.withColumn("Timestamp", to_timestamp("Timestamp", "yyyy/MM/dd HH:mm")) \
                         .withColumn("Hour", hour("Timestamp")) \
                         .withColumn("DayOfWeek", dayofweek("Timestamp"))

In [33]:
# Show the final DataFrame
balanced_df.show(truncate=False)

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+
|Timestamp          |From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|Pattern_Type|isLaundering|Hour|DayOfWeek|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+
|2022-09-01 00:17:00|020      |800104D70   |020    |800104D70 |6794.63        |US Dollar         |6794.63    |US Dollar       |Reinvestment  |Unknown     |0           |0   |5        |
|2022-09-01 00:02:00|03196    |800107150   |03196  |800107150 |7739.29        |US Dollar         |7739.29    |US Dollar       |Reinvestment  |Unknown     |0           |0   |5        |
|2022-09-01 00:17:00|01208    |80010E430   |01208  |80010E430 |1880.23        |U

24/12/03 15:53:31 WARN MemoryStore: Not enough space to cache rdd_126_0 in memory! (computed 31.0 MiB so far)


## **Feature Engineering**
Calculate FanOut, FanIn, and AvgAmountSent.

-FanOut:
how many different transactions it initiates.

-FanIn:
how many different transactions it receives.

-AvgAmountSent:
the typical transaction size for each account as a sender.

In [34]:
# Window specifications
sender_window = Window.partitionBy("From_Account")
receiver_window = Window.partitionBy("To_Account")

# Calculate fan-out, fan-in, and average amount sent
featured_df = balanced_df.withColumn("FanOut", count("To_Account").over(sender_window)) \
                         .withColumn("FanIn", count("From_Account").over(receiver_window)) \
                         .withColumn("AvgAmountSent", avg("Amount_Paid").over(sender_window))

In [35]:
balanced_df.unpersist()

# Show the resulting DataFrame
featured_df.show(truncate=False)

24/12/03 15:53:32 WARN MemoryStore: Not enough space to cache rdd_126_3 in memory! (computed 8.4 MiB so far)
24/12/03 15:53:32 WARN MemoryStore: Not enough space to cache rdd_126_1 in memory! (computed 16.3 MiB so far)
24/12/03 15:53:32 WARN MemoryStore: Not enough space to cache rdd_126_0 in memory! (computed 15.9 MiB so far)
24/12/03 15:53:32 WARN MemoryStore: Not enough space to cache rdd_126_2 in memory! (computed 16.3 MiB so far)
24/12/03 15:53:38 WARN MemoryStore: Not enough space to cache rdd_126_4 in memory! (computed 30.5 MiB so far)
24/12/03 15:53:38 WARN MemoryStore: Not enough space to cache rdd_126_5 in memory! (computed 15.7 MiB so far)
24/12/03 15:53:42 WARN MemoryStore: Not enough space to cache rdd_126_8 in memory! (computed 30.2 MiB so far)
24/12/03 15:53:42 WARN MemoryStore: Not enough space to cache rdd_126_10 in memory! (computed 15.3 MiB so far)
24/12/03 15:53:42 WARN MemoryStore: Not enough space to cache rdd_126_11 in memory! (computed 7.9 MiB so far)
24/12/03 1

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+------+-----+------------------+
|Timestamp          |From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|Pattern_Type|isLaundering|Hour|DayOfWeek|FanOut|FanIn|AvgAmountSent     |
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+------+-----+------------------+
|2022-09-01 05:21:00|004      |800060A20   |004    |800060A20 |1834.72        |Rupee             |1834.72    |Rupee           |Reinvestment  |Unknown     |0           |5   |5        |228   |53   |157083.68798252573|
|2022-09-08 07:24:00|004      |800060A20   |025    |8129D4D00 |922259.2       |Rupee             |922259.2   |Rupee           |ACH      

                                                                                

### Encode Categorical Variables

Converted categorical columns into numerical indices using StringIndexer.

In [36]:
currency_index = StringIndexer(inputCol="Receiving_Currency", outputCol="CurrencyIndex")
payment_format_index = StringIndexer(inputCol="Payment_Format", outputCol="PaymentFormatIndex")
pattern_type_index = StringIndexer(inputCol="Pattern_Type", outputCol="PatternTypeIndex")

featured_df = currency_index.fit(featured_df).transform(featured_df)
featured_df = payment_format_index.fit(featured_df).transform(featured_df)
featured_df = pattern_type_index.fit(featured_df).transform(featured_df)

24/12/03 15:56:18 WARN MemoryStore: Not enough space to cache rdd_126_5 in memory! (computed 31.1 MiB so far)
24/12/03 15:56:18 WARN MemoryStore: Not enough space to cache rdd_126_6 in memory! (computed 31.0 MiB so far)
24/12/03 15:56:19 WARN MemoryStore: Not enough space to cache rdd_126_7 in memory! (computed 30.2 MiB so far)
24/12/03 15:56:19 WARN MemoryStore: Not enough space to cache rdd_126_8 in memory! (computed 30.2 MiB so far)
24/12/03 15:56:19 WARN MemoryStore: Not enough space to cache rdd_126_10 in memory! (computed 15.3 MiB so far)
24/12/03 15:56:19 WARN MemoryStore: Not enough space to cache rdd_126_11 in memory! (computed 7.9 MiB so far)
24/12/03 15:56:20 WARN MemoryStore: Not enough space to cache rdd_126_13 in memory! (computed 16.3 MiB so far)
24/12/03 15:56:20 WARN MemoryStore: Not enough space to cache rdd_126_14 in memory! (computed 8.2 MiB so far)
24/12/03 15:56:20 WARN MemoryStore: Not enough space to cache rdd_126_12 in memory! (computed 31.9 MiB so far)
24/12/0

In [37]:
# Show the resulting DataFrame
featured_df.show(truncate=False)

24/12/03 15:56:44 WARN MemoryStore: Not enough space to cache rdd_126_5 in memory! (computed 31.1 MiB so far)
24/12/03 15:56:44 WARN MemoryStore: Not enough space to cache rdd_126_6 in memory! (computed 15.9 MiB so far)
24/12/03 15:56:44 WARN MemoryStore: Not enough space to cache rdd_126_7 in memory! (computed 7.9 MiB so far)
24/12/03 15:56:48 WARN MemoryStore: Not enough space to cache rdd_126_8 in memory! (computed 30.2 MiB so far)
24/12/03 15:56:49 WARN MemoryStore: Not enough space to cache rdd_126_10 in memory! (computed 15.3 MiB so far)
24/12/03 15:56:49 WARN MemoryStore: Not enough space to cache rdd_126_11 in memory! (computed 7.9 MiB so far)
24/12/03 15:56:53 WARN MemoryStore: Not enough space to cache rdd_126_12 in memory! (computed 31.9 MiB so far)
24/12/03 15:56:53 WARN MemoryStore: Not enough space to cache rdd_126_13 in memory! (computed 16.3 MiB so far)
24/12/03 15:56:53 WARN MemoryStore: Not enough space to cache rdd_126_14 in memory! (computed 8.2 MiB so far)
24/12/03

+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+------+-----+------------------+-------------+------------------+----------------+
|Timestamp          |From_Bank|From_Account|To_Bank|To_Account|Amount_Received|Receiving_Currency|Amount_Paid|Payment_Currency|Payment_Format|Pattern_Type|isLaundering|Hour|DayOfWeek|FanOut|FanIn|AvgAmountSent     |CurrencyIndex|PaymentFormatIndex|PatternTypeIndex|
+-------------------+---------+------------+-------+----------+---------------+------------------+-----------+----------------+--------------+------------+------------+----+---------+------+-----+------------------+-------------+------------------+----------------+
|2022-09-01 05:21:00|004      |800060A20   |004    |800060A20 |1834.72        |Rupee             |1834.72    |Rupee           |Reinvestment  |Unknown     |0           |5   |5        |228   |53   |157083

                                                                                

### Generate synthetic data for minority, using SMOTE

In [38]:
# Step 1: Select only the required columns (excluding 'isLaundering')
feature_columns = ["Amount_Received", "FanOut", "FanIn", "AvgAmountSent",
                   "Hour", "DayOfWeek", "CurrencyIndex",
                   "PaymentFormatIndex", "PatternTypeIndex"]

# Select features for minority class (isLaundering = 1)
minority_df = featured_df.filter(F.col("isLaundering") == 1).select(*feature_columns)
majority_df = featured_df.filter(F.col("isLaundering") == 0).select(*feature_columns, "isLaundering")

featured_df.unpersist()

# Step 2: Define a function to generate synthetic samples (excluding isLaundering)
def generate_synthetic_samples(minority_data, num_samples=10):
    synthetic_samples = []

    for row in minority_data:
        base_vector = np.array([row[col] for col in feature_columns])

        # Find random neighbors within the minority class
        neighbors = random.sample(minority_data, k=num_samples)
        for neighbor in neighbors:
            neighbor_vector = np.array([neighbor[col] for col in feature_columns])

            # Interpolate to create a synthetic sample
            gap = np.random.rand()
            synthetic_vector = base_vector + gap * (neighbor_vector - base_vector)

            # Append the synthetic sample without the 'isLaundering' column
            synthetic_samples.append(tuple(synthetic_vector.tolist()))

    return synthetic_samples

# Step 3: Collect minority samples and generate synthetic samples
minority_data = minority_df.collect()
synthetic_samples = generate_synthetic_samples(minority_data, num_samples=50)

# Step 4: Define schema for synthetic samples without 'isLaundering'
schema = StructType([StructField(col, DoubleType(), True) for col in feature_columns])

# Create synthetic DataFrame from synthetic samples
synthetic_df = spark.createDataFrame(synthetic_samples, schema=schema)

# Step 5: Add 'isLaundering' column with value 1 to synthetic samples
synthetic_df = synthetic_df.withColumn("isLaundering", F.lit(1))

# Step 6: Combine the majority and synthetic DataFrames
balanced_featured_df = majority_df.union(synthetic_df)

# Display counts to confirm balancing
balanced_featured_df.cache().groupBy("isLaundering").count().show()

24/12/03 15:59:31 WARN MemoryStore: Not enough space to cache rdd_126_6 in memory! (computed 31.0 MiB so far)
24/12/03 15:59:31 WARN MemoryStore: Not enough space to cache rdd_126_7 in memory! (computed 15.3 MiB so far)
24/12/03 15:59:34 WARN MemoryStore: Not enough space to cache rdd_126_8 in memory! (computed 30.2 MiB so far)
24/12/03 15:59:35 WARN MemoryStore: Not enough space to cache rdd_126_9 in memory! (computed 7.9 MiB so far)
24/12/03 15:59:35 WARN MemoryStore: Not enough space to cache rdd_126_11 in memory! (computed 7.9 MiB so far)
24/12/03 15:59:35 WARN MemoryStore: Not enough space to cache rdd_126_10 in memory! (computed 15.3 MiB so far)
24/12/03 15:59:38 WARN MemoryStore: Not enough space to cache rdd_126_12 in memory! (computed 31.9 MiB so far)
24/12/03 15:59:39 WARN MemoryStore: Not enough space to cache rdd_126_14 in memory! (computed 8.2 MiB so far)
24/12/03 15:59:39 WARN MemoryStore: Not enough space to cache rdd_126_13 in memory! (computed 16.3 MiB so far)
24/12/03

Py4JJavaError: An error occurred while calling o493.showString.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 12 in stage 81.0 failed 1 times, most recent failure: Lost task 12.0 in stage 81.0 (TID 706) (5043e306571c executor driver): java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.unsafe.types.UTF8String.fromAddress(UTF8String.java:132)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.getUTF8String(UnsafeRow.java:382)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:168)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2483/0x0000000840fcf040.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
Caused by: java.lang.OutOfMemoryError: Java heap space
	at org.apache.spark.unsafe.types.UTF8String.fromAddress(UTF8String.java:132)
	at org.apache.spark.sql.catalyst.expressions.UnsafeRow.getUTF8String(UnsafeRow.java:382)
	at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage1.processNext(Unknown Source)
	at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
	at org.apache.spark.sql.execution.WholeStageCodegenEvaluatorFactory$WholeStageCodegenPartitionEvaluator$$anon$1.hasNext(WholeStageCodegenEvaluatorFactory.scala:43)
	at scala.collection.Iterator$$anon$10.hasNext(Iterator.scala:460)
	at org.apache.spark.shuffle.sort.BypassMergeSortShuffleWriter.write(BypassMergeSortShuffleWriter.java:168)
	at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:104)
	at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:54)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.executor.Executor$TaskRunner$$Lambda$2483/0x0000000840fcf040.apply(Unknown Source)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
	at java.base/java.lang.Thread.run(Thread.java:829)
