In [1]:
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline

# Create (or reuse) the notebook's SparkSession. Serialization is switched to
# Kryo, and driver/executor memory and the executor GC are set explicitly.
spark = (
    SparkSession.builder
    .appName("MoneyLaunderingDetectionRF")
    # Kryo serializer; class registration is not enforced, buffer capped at 1024m.
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.kryo.registrationRequired", "false")
    .config("spark.kryoserializer.buffer.max", "1024m")
    # Memory sizing for the driver and each executor (heap plus overhead).
    .config("spark.driver.memory", "10g")
    .config("spark.executor.memory", "10g")
    .config("spark.executor.memoryOverhead", "1g")
    # Run executors with the G1 garbage collector.
    .config("spark.executor.extraJavaOptions", "-XX:+UseG1GC")
    .getOrCreate()
)


In [2]:
# Load the cleaned transactions CSV; the first row is the header and column
# types are inferred by Spark from the file contents.
data = spark.read.options(header=True, inferSchema=True).csv(
    "/common/users/shared/cs543_group4/clean_data/part-00000-7c14ae91-860c-498f-8c30-891819d8e7e0-c000.csv"
)
# Preview the first rows to sanity-check the schema.
data.show()


+----------+---------+------------+-------+----------+----------+-----------+-----------+-------------+--------------+-------------+-----+----+--------------------+------------------+--------------------+------------------+--------------------+--------------------+
| timestamp|bank_from|account_from|bank_to|account_to| amount_to|currency_to|amount_from|currency_from|payment_format|is_laundering|month|year|conversion_rate_from|   amount_from_usd|  conversion_rate_to|     amount_to_usd|      unique_id_from|        unique_id_to|
+----------+---------+------------+-------+----------+----------+-----------+-----------+-------------+--------------+-------------+-----+----+--------------------+------------------+--------------------+------------------+--------------------+--------------------+
|2022-08-01|    34377|   801BEB6D0|  34377| 801BEB6D0|  99492.58|        Yen|   99492.58|          Yen|  Reinvestment|            0|    8|2022|0.009474283158489959| 942.6208750887149|0.00947428315848995

In [3]:
from pyspark.sql.functions import col

# Turn the transaction date into Unix epoch seconds so it can be used as a
# numeric model feature.
#
# The preview of `data` shows date-only values such as 2022-08-01. Casting a
# date (or a "yyyy-MM-dd" string) directly to long does not produce epoch
# seconds: Spark returns NULL (or rejects the cast under ANSI mode). Casting to
# timestamp first yields the epoch-seconds value for date, string and timestamp
# inputs alike, and re-running this cell leaves the values unchanged.
# NOTE(review): the resulting seconds are interpreted in the Spark session time
# zone -- confirm that is acceptable for this dataset.
data = data.withColumn(
    "timestamp",
    col("timestamp").cast("timestamp").cast("long"),
)


In [4]:
# Encode the string-valued columns as numeric label indices; each source
# column `c` gets a companion column `c_index`.
index_source_cols = ['account_from', 'account_to', 'payment_format']

# This cell overwrites `data`, so on a second execution the *_index columns
# would already exist and StringIndexer would refuse to write them. Dropping
# them first keeps the cell re-runnable (drop ignores names that are absent).
data = data.drop(*[column + "_index" for column in index_source_cols])

# Fit one StringIndexer per column; `indexers` holds the fitted models.
indexers = [
    StringIndexer(inputCol=column, outputCol=column + "_index").fit(data)
    for column in index_source_cols
]

# The stages are already fitted, so the pipeline only chains their transforms.
pipeline = Pipeline(stages=indexers)
data = pipeline.fit(data).transform(data)

In [5]:
# Columns carried forward for modelling: the numeric inputs followed by the
# is_laundering label.
selected_cols = [
    'timestamp',
    'bank_from',
    'account_from_index',
    'bank_to',
    'account_to_index',
    'payment_format_index',
    'month',
    'year',
    'amount_from_usd',
    'amount_to_usd',
    'is_laundering',
]
reduced_df = data.select(selected_cols)

In [6]:
# Pack the ten input columns into a single vector column named "features";
# the label column is_laundering is deliberately not part of the vector.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=[
        'timestamp',
        'bank_from',
        'account_from_index',
        'bank_to',
        'account_to_index',
        'payment_format_index',
        'month',
        'year',
        'amount_from_usd',
        'amount_to_usd',
    ],
)

reduced_data = assembler.transform(reduced_df)


In [7]:
# Random 70/30 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = reduced_data.randomSplit(weights=[0.7, 0.3], seed=42)

In [None]:
# Random forest of 30 trees predicting is_laundering from the assembled
# feature vector. The seed is pinned (same value as the train/test split) so
# that repeated runs train the same forest.
# NOTE(review): the *_index features are produced by StringIndexer and are
# presumably treated as categorical by the tree learner; Spark requires
# maxBins (default 32) to be at least the number of categories of every
# categorical feature, so the high-cardinality account indices may make fit()
# fail -- confirm, and consider a different encoding for the account columns.
rf = RandomForestClassifier(
    labelCol="is_laundering",
    featuresCol="features",
    numTrees=30,
    seed=42,
)

model = rf.fit(train_data)


In [None]:
# BinaryClassificationEvaluator is not among the notebook's earlier imports,
# so it is imported here; without it this cell raises NameError.
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Score the held-out split with the trained forest.
predictions = model.transform(test_data)

# Area under the ROC curve, computed from the model's rawPrediction column
# against the is_laundering label.
evaluator = BinaryClassificationEvaluator(
    labelCol="is_laundering", rawPredictionCol="rawPrediction", metricName="areaUnderROC"
)
auc = evaluator.evaluate(predictions)

print(f"Area under ROC: {auc}")


In [None]:
# Print the forest's importance score for each position of the feature vector.
importances = model.featureImportances
for i in range(len(importances)):
    importance = importances[i]
    print(f"Feature {i}: {importance}")


In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [None]:
# Apply the trained model to the held-out split to get per-row predictions.
predictions = model.transform(dataset=test_data)

In [None]:
# Accuracy: compares the "prediction" column with the is_laundering label
# over the test-set predictions and reports it as a percentage.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="is_laundering",
)
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")