In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.feature import VectorAssembler

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/04 18:10:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [5]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)
df.show(5)

                                                                                

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the oversampling method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.
- Oversample the minority class (Class = 1) 

In [None]:
# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df)
df = df.select("features", "Class")

# Sample training data in a stratified fashion
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Oversample the train df to deal with class imbalance
# Calculate class counts in the training data
class_counts = train_df.groupBy("Class").count().orderBy("Class").collect()
major_count, minor_count = class_counts[0]["count"], class_counts[1]["count"]
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class 
oversampled_minor_df = train_df\
    .filter(col("Class") == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the minor with the train df
train_df = train_df\
    .filter(col("Class") == 0)\
    .union(oversampled_minor_df)

# Train the Logistic Regression model using spark.ml

In [16]:
# Initialize the Logistic Regression estimator
lr = LogisticRegression(
    featuresCol="features",
    labelCol="Class"
)

# Fit the model
model = lr.fit(train_df)

                                                                                

# Evaluate the obtained model

In [17]:
print("Coefficients:", model.coefficients)
print("Intercept:", model.intercept)

summary = model.summary
print("Accuracy:", summary.accuracy)
print("Area under ROC:", summary.areaUnderROC)
print("Precision:", summary.precisionByLabel)
print("Recall:", summary.recallByLabel)

Coefficients: [-1.131150420707179e-05,1.057343270993391,0.3726135010612134,0.3620435935055492,0.9924165512574527,0.8688846837840881,-0.5486702501623859,-0.9353033108022877,-0.5086026586043687,-0.9158343014053011,-1.7511583070834809,0.4169357993180848,-1.1642091836545314,-0.378806834363519,-1.5138740153526984,-0.2286640729742732,-1.027760150276896,-1.259161622903398,-0.24461334898764917,0.6517688191295719,-1.5802257522464103,0.27557562031260224,0.9167827623508933,0.49406386921421513,-0.3412548036683508,-0.1574882489344699,-0.37249827969853955,-1.4041020959078092,0.25943816815868576,0.009981771497433184]
Intercept: -3.7249811097894465


                                                                                

Accuracy: 0.950619449621752


                                                                                

Area under ROC: 0.9888892389587758
Precision: [0.9281903689336587, 0.9754790689073299]
Recall: [0.976719790372773, 0.9245630255414803]


# Evaluate on test set

In [18]:
# Evaluate the model
evaluator = MulticlassClassificationEvaluator(
    labelCol="Class",
    predictionCol="prediction"
)

# AUC-ROC and AUC-PR
binary_evaluator = BinaryClassificationEvaluator(
    labelCol="Class",
    rawPredictionCol="rawPrediction"
)

# Predict on the test set
predictions = model.transform(test_df)

accuracy = evaluator.evaluate(predictions, {evaluator.metricName: "accuracy"})
precision = [evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 0.0}),
             evaluator.evaluate(predictions, {evaluator.metricName: "precisionByLabel", evaluator.metricLabel: 1.0})]
recall = [evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 0.0}),
        evaluator.evaluate(predictions, {evaluator.metricName: "recallByLabel", evaluator.metricLabel: 1.0})]
auc_roc = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderROC"})
auc_pr = binary_evaluator.evaluate(predictions, {binary_evaluator.metricName: "areaUnderPR"})

print("Accuracy:", accuracy)
print("Area under ROC:", auc_roc)
print("Area under PR:", auc_pr)
print("Precision:", precision)
print("Recall:", recall)

                                                                                

Accuracy: 0.9769368920817255
Area under ROC: 0.9703390050646288
Area under PR: 0.7173733707071455
Precision: [0.9998541742617572, 0.06390704429920116]
Recall: [0.9770399529755437, 0.9166666666666666]
