In [28]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler
from pyspark.sql import SparkSession

In [29]:
# Fitting the pipeline with the default JVM heap aborted with
# "java.lang.OutOfMemoryError: Java heap space" on the local driver executor.
# In local mode the tasks run inside the driver JVM, so the driver heap is the
# one that has to be enlarged (Spark's documented default is 1g).
# NOTE: this option only takes effect when this call is the one that launches
# the JVM, i.e. on a fresh kernel; getOrCreate() returns an already running
# session unchanged.
DRIVER_MEMORY = "4g"  # tune to the RAM available on this machine

spark = (
    SparkSession.builder
    .appName("MLP with MLlib in PySpark")
    .config("spark.driver.memory", DRIVER_MEMORY)
    .getOrCreate()
)

In [30]:
# Load the transaction CSV, taking the column names from the first row.
# No schema is supplied or inferred, so every column is read as a string.
data = spark.read.option("header", "true").csv(
    "./ml_model/fraudulent_transactions_anonymous_ia.csv"
)

In [31]:
# Raw input columns used as model features. The list is defined once so the
# per-column indexers and the assembler inputs can never drift apart.
raw_feature_columns = [
    'amount',
    'customer_id_avrge_amount_1day',
    'customer_id_avrge_amount_1week',
    'customer_id_avrge_amount_1month',
    'customer_id_avrge_amount_3month',
    'customer_id_count_1day',
    'customer_id_count_1week',
    'customer_id_count_1month',
    'customer_id_count_3month',
    'account_id_avrge_amount_1day',
    'account_id_avrge_amount_1week',
    'account_id_avrge_amount_1month',
    'account_id_avrge_amount_3month',
    'account_id_count_1day',
    'account_id_count_1week',
    'account_id_count_1month',
    'account_id_count_3month',
    'transaction_in_weekend',
    'transaction_at_night',
]

# Convert the target column to a numeric "label" column using StringIndexer
label_indexer = StringIndexer(inputCol="is_fraud", outputCol="label")

# One StringIndexer per feature column, writing to "<column>_index".
# handleInvalid="keep" gives values unseen during fitting an extra index
# instead of raising an error at transform time.
# NOTE(review): StringIndexer assigns indices by label frequency, so for
# columns that look continuous (amounts, counts) the index does not preserve
# numeric magnitude, and each fitted indexer has to hold every distinct value.
# Casting these columns to double may be the better encoding - confirm against
# the actual data before changing it.
indexers = [
    StringIndexer(inputCol=name, outputCol=name + "_index", handleInvalid="keep")
    for name in raw_feature_columns
]

# Names of the indexed columns, in the same order as raw_feature_columns
feature_columns = [name + "_index" for name in raw_feature_columns]

# Assemble the indexed columns into a single "features" vector column.
# The assembler is not applied here; it runs as a stage of the pipeline.
vectorAssembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features"
)

In [32]:
# Multilayer perceptron topology: one input neuron per assembled feature,
# two hidden layers (10 and 5 neurons) and 2 output neurons.
# Adjust the hidden layer sizes as needed.
input_size = len(feature_columns)
layers = [input_size, 10, 5, 2]

# Classifier reading the "features" vector and the indexed "label" column;
# the seed is fixed so weight initialisation is repeatable.
mlp = MultilayerPerceptronClassifier(
    featuresCol="features",
    labelCol="label",
    layers=layers,
    seed=42,
)

In [34]:
# Chain every stage in execution order: label indexing, per-column feature
# indexing, vector assembly and finally the classifier.
stages = [label_indexer, *indexers, vectorAssembler, mlp]
pipeline = Pipeline(stages=stages)

# 80/20 train/test split with a fixed seed
splits = data.randomSplit([0.8, 0.2], seed=42)
train_data, test_data = splits

# Fit all stages on the training split only
model = pipeline.fit(train_data)

Py4JJavaError: An error occurred while calling o2707.fit.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 5 in stage 293.0 failed 1 times, most recent failure: Lost task 5.0 in stage 293.0 (TID 686) (pyspark executor driver): java.lang.OutOfMemoryError: Java heap space

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2238)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2259)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2278)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2303)
	at org.apache.spark.rdd.RDD.count(RDD.scala:1274)
	at org.apache.spark.mllib.optimization.LBFGS$.runLBFGS(LBFGS.scala:195)
	at org.apache.spark.mllib.optimization.LBFGS.optimizeWithLossReturned(LBFGS.scala:154)
	at org.apache.spark.ml.ann.FeedForwardTrainer.train(Layer.scala:855)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.$anonfun$train$1(MultilayerPerceptronClassifier.scala:228)
	at org.apache.spark.ml.util.Instrumentation$.$anonfun$instrumented$1(Instrumentation.scala:191)
	at scala.util.Try$.apply(Try.scala:213)
	at org.apache.spark.ml.util.Instrumentation$.instrumented(Instrumentation.scala:191)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:184)
	at org.apache.spark.ml.classification.MultilayerPerceptronClassifier.train(MultilayerPerceptronClassifier.scala:93)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:151)
	at org.apache.spark.ml.Predictor.fit(Predictor.scala:115)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.OutOfMemoryError: Java heap space


In [None]:
# Run the fitted pipeline model over the held-out split
predictions = model.transform(test_data)

# Evaluator comparing the "prediction" column against "label" using accuracy
evaluator = MulticlassClassificationEvaluator(
    metricName="accuracy",
    predictionCol="prediction",
    labelCol="label",
)

In [None]:
# Compute the accuracy of the predictions and report it as a percentage
accuracy = evaluator.evaluate(predictions)
print(f"Accuracy: {accuracy * 100:.2f}%")