In [1]:
from pyspark.sql import SparkSession
# Start the Spark session used by every later cell, or reuse one that is
# already running under this application name.
spark = (
    SparkSession.builder
    .appName("MLPPySpark")
    .getOrCreate()
)

In [2]:
# Load the transactions CSV, taking column names from its header row.
# No schema is supplied or inferred here, so the columns are typed in a
# later cell.
data = (
    spark.read
    .format("csv")
    .option("header", "true")
    .load("./ml_model/fraudulent_transactions_anonymous_ia.csv")
)

In [3]:
# Inspect the schema as loaded; in the recorded output every column is a string.
data.printSchema()

root
 |-- transaction-id: string (nullable = true)
 |-- customer-id: string (nullable = true)
 |-- account-id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- customer_id_avrge_amount_1day: string (nullable = true)
 |-- customer_id_avrge_amount_1week: string (nullable = true)
 |-- customer_id_avrge_amount_1month: string (nullable = true)
 |-- customer_id_avrge_amount_3month: string (nullable = true)
 |-- customer_id_count_1day: string (nullable = true)
 |-- customer_id_count_1week: string (nullable = true)
 |-- customer_id_count_1month: string (nullable = true)
 |-- customer_id_count_3month: string (nullable = true)
 |-- account_id_avrge_amount_1day: string (nullable = true)
 |-- account_id_avrge_amount_1week: string (nullable = true)
 |-- account_id_avrge_amount_1month: string (nullable = true)
 |-- account_id_avrge_amount_3month: string (nullable = true)
 |-- account_id_count_1day: string (nullable = true)
 |-- account_id_co

In [4]:
from pyspark.sql.functions import col

# Convert the numeric columns from string to their proper Spark types.
#
# The target type of each column is declared once and applied in a single
# projection, rather than one withColumn call per column.

# Monetary amounts and rolling average amounts.
DOUBLE_COLUMNS = [
    "amount",
    "customer_id_avrge_amount_1day",
    "customer_id_avrge_amount_1week",
    "customer_id_avrge_amount_1month",
    "customer_id_avrge_amount_3month",
    "account_id_avrge_amount_1day",
    "account_id_avrge_amount_1week",
    "account_id_avrge_amount_1month",
    "account_id_avrge_amount_3month",
]

# Rolling transaction counts, the two 0/1 flags and the label.
INTEGER_COLUMNS = [
    "customer_id_count_1day",
    "customer_id_count_1week",
    "customer_id_count_1month",
    "customer_id_count_3month",
    "account_id_count_1day",
    "account_id_count_1week",
    "account_id_count_1month",
    "account_id_count_3month",
    "transaction_in_weekend",
    "transaction_at_night",
    "is_fraud",
]

cast_types = {
    **dict.fromkeys(DOUBLE_COLUMNS, "double"),
    **dict.fromkeys(INTEGER_COLUMNS, "integer"),
}

# Fail loudly on a missing or misspelt column; the projection below would
# otherwise skip it without any error.
missing_columns = [name for name in cast_types if name not in data.columns]
if missing_columns:
    raise ValueError("Columns to cast are missing from the data: {}".format(missing_columns))


def _quoted(name):
    """Return `name` as a back-quoted column reference so it is taken literally."""
    return col("`" + name.replace("`", "``") + "`")


# One projection over all columns: the original column order is kept, listed
# columns are cast in place (alias restores the name), the rest pass through.
data = data.select(
    [
        _quoted(name).cast(cast_types[name]).alias(name) if name in cast_types else _quoted(name)
        for name in data.columns
    ]
)

In [5]:
# Print the schema again to confirm the numeric columns are now double/integer.
data.printSchema()

root
 |-- transaction-id: string (nullable = true)
 |-- customer-id: string (nullable = true)
 |-- account-id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- customer_id_avrge_amount_1day: double (nullable = true)
 |-- customer_id_avrge_amount_1week: double (nullable = true)
 |-- customer_id_avrge_amount_1month: double (nullable = true)
 |-- customer_id_avrge_amount_3month: double (nullable = true)
 |-- customer_id_count_1day: integer (nullable = true)
 |-- customer_id_count_1week: integer (nullable = true)
 |-- customer_id_count_1month: integer (nullable = true)
 |-- customer_id_count_3month: integer (nullable = true)
 |-- account_id_avrge_amount_1day: double (nullable = true)
 |-- account_id_avrge_amount_1week: double (nullable = true)
 |-- account_id_avrge_amount_1month: double (nullable = true)
 |-- account_id_avrge_amount_3month: double (nullable = true)
 |-- account_id_count_1day: integer (nullable = true)
 |-- account_

In [6]:
%%time
from pyspark.ml.feature import VectorAssembler


# Names of the columns used as model inputs: the transaction amount, the
# rolling per-customer and per-account averages and counts, and the two
# weekend/night flags.
feature_cols = [
    'amount',
    'customer_id_avrge_amount_1day', 'customer_id_avrge_amount_1week', 'customer_id_avrge_amount_1month', 'customer_id_avrge_amount_3month',
    'customer_id_count_1day', 'customer_id_count_1week', 'customer_id_count_1month', 'customer_id_count_3month',
    'account_id_avrge_amount_1day', 'account_id_avrge_amount_1week', 'account_id_avrge_amount_1month', 'account_id_avrge_amount_3month',
    'account_id_count_1day', 'account_id_count_1week', 'account_id_count_1month', 'account_id_count_3month',
    'transaction_in_weekend', 'transaction_at_night'
]

# Name of the label column the classifier is trained to predict.
target_col = "is_fraud"

# Pack the individual feature columns into the single vector column that
# Spark ML estimators take as input.
# NOTE(review): the features are assembled unscaled (amounts, counts and 0/1
# flags side by side). Neural networks are usually sensitive to feature
# scale; consider a StandardScaler fitted on the training split. TODO confirm.
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
df_data = assembler.transform(data)

# Keep only what training needs. The label is referenced through target_col
# so its name is defined in exactly one place.
model_df = df_data.select("features", target_col)

# 70/30 train/test split; the fixed seed keeps the split repeatable.
train_data, test_data = model_df.randomSplit([0.7, 0.3], seed=123)

CPU times: user 1.52 s, sys: 359 ms, total: 1.88 s
Wall time: 19.5 s


In [7]:
%%time
from pyspark.ml.classification import MultilayerPerceptronClassifier

# Network shape: one input unit per feature column, two hidden layers of
# 32 and 16 units, and 2 output units.
layers = [len(feature_cols), 32, 16, 2]

# Configure the multilayer perceptron; it is seeded for repeatable runs.
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol=target_col,
    layers=layers,
    seed=123,
)

# Fit the network on the training split.
model = mlp.fit(train_data)

CPU times: user 55.3 ms, sys: 25.8 ms, total: 81.1 ms
Wall time: 2min 52s


In [8]:
%%time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Score the held-out split with the fitted model.
predictions = model.transform(test_data)

# Accuracy: the share of test rows whose predicted class matches the label.
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    labelCol=target_col,
    predictionCol="prediction",
)
accuracy = evaluator.evaluate(predictions)
print("Accuracy: {:.2f}%".format(accuracy * 100))

Accuracy: 85.67%
CPU times: user 14.1 ms, sys: 7.24 ms, total: 21.4 ms
Wall time: 9.01 s


In [9]:
%%time
# Preview the scored test rows (show() prints the first 20 rows by default).
predictions.show()

+--------------------+--------+--------------------+--------------------+----------+
|            features|is_fraud|       rawPrediction|         probability|prediction|
+--------------------+--------+--------------------+--------------------+----------+
|[3.194,3.194,3.19...|       0|[0.22650386846030...|[0.94820592832277...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22650386846030...|[0.94820592832277...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22650386846030...|[0.94820592832277...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22650386846030...|[0.94820592832277...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22650386846030...|[0.94820592832277...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22548870469081...|[0.94748659161814...|       0.0|
|[3.194,3.194,3.19...|       0|[0.22091792801325...|[0.94812993212255...|       0.0|
|[3.194,3.194,3.19...|       0|[1.01547732116015...|[0.98951976524052...|       0.0|
|[3.194,3.194,3.19...|       0|[1.01541787454185...|[0.9895190018

In [10]:
# Save the fitted model to the "mlp_ml" directory.
# A plain model.save(path) raises when the path already exists, so the
# notebook would fail on a second run. Going through the writer with
# overwrite() replaces the previously saved model instead.
model.write().overwrite().save("mlp_ml")

In [11]:
%%time
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator serves every metric below; the metric is chosen per call by
# overriding metricName in the params passed to evaluate().
evaluator = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction")

# Weighted precision: per-class precision averaged by class frequency.
# This is NOT the precision of the fraud class alone.
precision = evaluator.evaluate(predictions, {evaluator.metricName: "weightedPrecision"})
print("Precision:", precision)

# Weighted recall: per-class recall averaged by class frequency.
recall = evaluator.evaluate(predictions, {evaluator.metricName: "weightedRecall"})
print("Recall:", recall)

# Overall accuracy.
accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
print("Accuracy:", accuracy)

# Weighted F1 score.
f1_score = evaluator.evaluate(predictions, {evaluator.metricName: "f1"})

print("F1 Score:", f1_score)

# Metrics for the fraud class on its own. With an imbalanced label the
# weighted figures above are dominated by the majority class and can look
# healthy even when no fraud is ever detected, so report the positive class
# separately.
# Assumes is_fraud == 1 marks a fraudulent transaction -- TODO confirm.
FRAUD_LABEL = 1.0

fraud_precision = evaluator.evaluate(
    predictions,
    {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: FRAUD_LABEL},
)
print("Fraud-class precision:", fraud_precision)

fraud_recall = evaluator.evaluate(
    predictions,
    {evaluator.metricName: "recallByLabel", evaluator.metricLabel: FRAUD_LABEL},
)
print("Fraud-class recall:", fraud_recall)

Precision: 0.7339534142771261
Recall: 0.8567108113460027
Accuracy: 0.8567108113460027
F1 Score: 0.7905952933457144
CPU times: user 44.3 ms, sys: 11 ms, total: 55.3 ms
Wall time: 23.2 s
