In [1]:
from pyspark.sql import SparkSession
from math import exp
from pyspark.rdd import RDD

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

sc = spark.sparkContext

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/04 19:09:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [2]:
# Change the path to the CSV file as needed
# Load the CSV file as a text file and filter out the header
lines = sc.textFile("../../data/creditcard.csv")
header = lines.first()
data_rdd = lines.filter(lambda line: line != header)

# Parse each line: split by comma and convert each element to float
data_rdd = data_rdd.map(lambda line: [float(x.strip("\"")) for x in line.split(",")])

                                                                                

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the oversampling method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Create an RDD where each record is a tuple (label, features)
- Splitting the dataset into train and test set.
- Oversample the minority class (Class = 1) 

In [3]:
# Create an RDD where each record is a tuple (label, features)
data_rdd = data_rdd.map(lambda x: (x[-1], x[:-1]))

# Split the data into train and test sets in a stratified fashion
train_rdd = data_rdd.sampleByKey(withReplacement=False, fractions={0.0: 0.8, 1.0: 0.8}, seed=42)
test_rdd = data_rdd.subtract(train_rdd)

# Oversample the train RDD to deal with class imbalance
# Calculate class counts in the training data
count_dict = train_rdd.countByKey()
major_count, minor_count = count_dict[0], count_dict[1]
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class
oversampled_minor_rdd = train_rdd\
    .filter(lambda x : x[0] == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the oversampled minor with the train RDD
train_rdd = train_rdd\
    .filter(lambda x : x[0] == 0)\
    .union(oversampled_minor_rdd)

                                                                                

# Implement and train the model using low-level operations

In [None]:
class MyLogisticRegressionModel:
    def __init__(self, learning_rate=0.0001, num_iterations=100, convergence_tol=1e-6):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.convergence_tol = convergence_tol
        self.weights = None
        self.num_features = None
        self.converged_at = None

    def predictProb(self, features):
        """Computes the prediction for a single feature vector."""
        s = sum(w * f for w, f in zip(self.weights, features))
        z = 1 / (1 + exp(-s))
        return z

    def updateWeights(self, train_rdd):
        """Computes the gradient and updates the weights for one iteration."""
        gradients = train_rdd\
            .map(lambda x: [(self.predictProb(x[1]) - x[0]) * f for f in x[1]])\
            .reduce(lambda a, b: [x + y for x, y in zip(a, b)])

        updated_weights = [w - self.learning_rate * grad for w, grad in zip(self.weights, gradients)]
        return updated_weights

    def fit(self, train_rdd: RDD):
        """Fits the Logistic Regression model to the training data."""
        train_rdd_with_intercept = train_rdd.map(lambda x : (x[0], [1.0] + x[1]))

        self.weights = [0.0] * len(train_rdd_with_intercept.first()[1])
        for i in range(self.num_iterations):
            new_weights = self.updateWeights(train_rdd_with_intercept)
            is_converged = all([abs(new - cur) < self.convergence_tol for new, cur in zip(new_weights, self.weights)])
            self.weights = new_weights
            if is_converged:
                self.converged_at = i
                return
        
        self.converged_at = self.num_iterations

    def predict(self, features):
        """Predicts the class label (0 or 1) for a data point in the input RDD."""
        return 0.0 if self.predictProb(features) < 0.5 else 1.0
    
# Initialize and train the Logistic Regression model
model = MyLogisticRegressionModel()
model.fit(train_rdd)

25/04/04 19:17:48 ERROR Executor: Exception in task 9.0 in stage 9.0 (TID 67)   
org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
    ~~~~~~~^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
    ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib64/python3.13/site-packages/pyspark/rdd.py", line 1919, in func
    initial = next(iterator)
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/pyth

Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.collectAndServe.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 9 in stage 9.0 failed 1 times, most recent failure: Lost task 9.0 in stage 9.0 (TID 67) (keineik-laptop executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
    ~~~~~~~^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
    ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib64/python3.13/site-packages/pyspark/rdd.py", line 1919, in func
    initial = next(iterator)
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_38282/2298582521.py", line 19, in <lambda>
  File "/tmp/ipykernel_38282/2298582521.py", line 13, in predictProb
OverflowError: math range error

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	at java.base/java.lang.Thread.run(Thread.java:1583)

Driver stacktrace:
	at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2856)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2792)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2791)
	at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
	at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
	at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2791)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1247)
	at scala.Option.foreach(Option.scala:407)
	at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1247)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:3060)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2994)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2983)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:989)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2393)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2414)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2433)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:2458)
	at org.apache.spark.rdd.RDD.$anonfun$collect$1(RDD.scala:1049)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:112)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:410)
	at org.apache.spark.rdd.RDD.collect(RDD.scala:1048)
	at org.apache.spark.api.python.PythonRDD$.collectAndServe(PythonRDD.scala:195)
	at org.apache.spark.api.python.PythonRDD.collectAndServe(PythonRDD.scala)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:75)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:52)
	at java.base/java.lang.reflect.Method.invoke(Method.java:580)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:374)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:1583)
Caused by: org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1247, in main
    process()
    ~~~~~~~^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 1239, in process
    serializer.dump_stream(out_iter, outfile)
    ~~~~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 274, in dump_stream
    vs = list(itertools.islice(iterator, batch))
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib64/python3.13/site-packages/pyspark/rdd.py", line 1919, in func
    initial = next(iterator)
  File "/home/keineik/Projects/lab03-spark-ml/venv/lib/python3.13/site-packages/pyspark/python/lib/pyspark.zip/pyspark/util.py", line 83, in wrapper
    return f(*args, **kwargs)
  File "/tmp/ipykernel_38282/2298582521.py", line 19, in <lambda>
  File "/tmp/ipykernel_38282/2298582521.py", line 13, in predictProb
OverflowError: math range error

	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.handlePythonException(PythonRunner.scala:572)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:784)
	at org.apache.spark.api.python.PythonRunner$$anon$3.read(PythonRunner.scala:766)
	at org.apache.spark.api.python.BasePythonRunner$ReaderIterator.hasNext(PythonRunner.scala:525)
	at org.apache.spark.InterruptibleIterator.hasNext(InterruptibleIterator.scala:37)
	at scala.collection.Iterator.foreach(Iterator.scala:943)
	at scala.collection.Iterator.foreach$(Iterator.scala:943)
	at org.apache.spark.InterruptibleIterator.foreach(InterruptibleIterator.scala:28)
	at scala.collection.generic.Growable.$plus$plus$eq(Growable.scala:62)
	at scala.collection.generic.Growable.$plus$plus$eq$(Growable.scala:53)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:105)
	at scala.collection.mutable.ArrayBuffer.$plus$plus$eq(ArrayBuffer.scala:49)
	at scala.collection.TraversableOnce.to(TraversableOnce.scala:366)
	at scala.collection.TraversableOnce.to$(TraversableOnce.scala:364)
	at org.apache.spark.InterruptibleIterator.to(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toBuffer(TraversableOnce.scala:358)
	at scala.collection.TraversableOnce.toBuffer$(TraversableOnce.scala:358)
	at org.apache.spark.InterruptibleIterator.toBuffer(InterruptibleIterator.scala:28)
	at scala.collection.TraversableOnce.toArray(TraversableOnce.scala:345)
	at scala.collection.TraversableOnce.toArray$(TraversableOnce.scala:339)
	at org.apache.spark.InterruptibleIterator.toArray(InterruptibleIterator.scala:28)
	at org.apache.spark.rdd.RDD.$anonfun$collect$2(RDD.scala:1049)
	at org.apache.spark.SparkContext.$anonfun$runJob$5(SparkContext.scala:2433)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
	at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
	at org.apache.spark.scheduler.Task.run(Task.scala:141)
	at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$4(Executor.scala:620)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally(SparkErrorUtils.scala:64)
	at org.apache.spark.util.SparkErrorUtils.tryWithSafeFinally$(SparkErrorUtils.scala:61)
	at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:94)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:623)
	at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1144)
	at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:642)
	... 1 more


# Evaluate on test set

In [None]:
print("Coefficients:", model.weights[1:])
print("Intercept:", model.weights[0])
print("Converged at:", model.converged_at)

broadcast_weights = sc.broadcast(model.weights)
predictionAndLabels = test_rdd.map(lambda p: (
    0 if 1 / (1 + exp(-sum(w * f for w, f in zip(broadcast_weights.value, [1.0] + p[1])))) < 0.5 else 1,
    p[0]
)).collect()
print(type(predictionAndLabels))
# summary = model.summary
# print("Accuracy:", summary.accuracy)
# print("Area under ROC:", summary.areaUnderROC)
# print("Precision:", summary.precisionByLabel)
# print("Recall:", summary.recallByLabel)