In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when

from pyspark.ml.feature import VectorAssembler
from pyspark.mllib.linalg import DenseVector
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, LogisticRegressionWithLBFGS
from pyspark.mllib.evaluation import MulticlassMetrics, BinaryClassificationMetrics

spark = SparkSession.builder\
    .appName("credit-card-fraud-detection")\
    .master("local[*]")\
    .config("spark.log.level", "ERROR")\
    .getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/04/04 18:11:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting Spark log level to "ERROR".


# Data preparation

In [3]:
# Change the path to the CSV file as needed
# Load the dataset
df = spark.read.csv("../../data/creditcard.csv", header=True, inferSchema=True)
df.show(5)

                                                                                

+----+------------------+-------------------+----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+--------------------+-------------------+------------------+------------------+------------------+------------------+--------------------+-------------------+------+-----+
|Time|                V1|                 V2|              V3|                V4|                 V5|                 V6|                 V7|                V8|                V9|                V10|               V11|               V12|               V13|               V14|               V15|               V16|               V17|                V18|               V19|                V20|                 V21|                V22|     

# Data preprocessing

**Understanding the data**:
- According to the dataset description, the input variables are the result of a PCA transformation except "Time" and "Amount" so the features are previously scaled. 
- Every value in the dataset is not null so imputing is also not needed.
- The dataset is highly unbalanced, the positive class (frauds) account for 0.172% of all transactions. To deal with this problem, we have 2 methods:
    - Cost-sensitive learning: the lost function will be adjusted to favor the detection of the minority class.
    - Undersampling, oversampling technique or a combination of the two.

Because of the reasons above and the fact that I will choose the cost-sensitive learning method to deal with the highly unbalanced nature of the dataset, this data processing step will include:
- Adding a weight column of value 0.99828 whenever the label is 1 (minority) and 0.00172 when the label is 0 (majority) 
- Using the VectorAssembler class to assemble feature columns into a single vector column
- Splitting the dataset into train and test set.

In [3]:
# Use all columns as features exclude the target column "Class"
input_cols = df.columns[:-1]

# Assemble the features into a single vector column
assembler = VectorAssembler(inputCols=input_cols, outputCol="features")
df = assembler.transform(df).select("features", "Class")

# Sample training data in a stratified fashion
train_df = df.sampleBy("Class", {1: 0.8, 0: 0.8}, seed=42)

# Get test data as the remaining set
test_df = df.subtract(train_df)

# Oversample the train df to deal with class imbalance
# Calculate class counts in the training data
class_counts = train_df.groupBy("Class").count().collect()
major_count = next((row['count'] for row in class_counts if row['Class'] == 0), 0)
minor_count = next((row['count'] for row in class_counts if row['Class'] == 1), 0)
# Calculate the desired oversampling ratio
ratio = float(major_count) / minor_count
# Filter out and oversample the minor class 
oversampled_minor_df = train_df\
    .filter(col("Class") == 1)\
    .sample(withReplacement=True, fraction=ratio, seed=42)
# Combine the minor into the train df
train_df = train_df\
    .filter(col("Class") == 0)\
    .union(oversampled_minor_df)

# Train the Logistic Regression model using spark.mllib

In [14]:
# Convert the DataFrame into an RDD of LabeledPoint objects
train_rdd = train_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))

# Train the logistic regression model
model = LogisticRegressionWithLBFGS.train(train_rdd, intercept=True)

                                                                                

# Evaluate on test set

In [None]:
print("Coefficients: ", model.weights)
print("Intercept: ", model.intercept)

test_rdd = test_df.rdd.map(lambda row: LabeledPoint(row.Class, DenseVector(row.features.values)))
predictionAndLabels = test_rdd.map(lambda p: (float(model.predict(p.features)), p.label))
multiMetrics = MulticlassMetrics(predictionAndLabels)

# Overall accuracy
accuracy = multiMetrics.accuracy

# Precision and recall for each label
labels = predictionAndLabels.map(lambda x: x[1]).distinct().collect()
precision_by_label = {label: multiMetrics.precision(label) for label in labels}
recall_by_label = {label: multiMetrics.recall(label) for label in labels}

print("Accuracy: {}".format(accuracy))
print("Precision:", precision_by_label)
print("Recall:", recall_by_label)

# Calculate the area under the ROC curve
binaryMetrics = BinaryClassificationMetrics(predictionAndLabels)
roc_auc = binaryMetrics.areaUnderROC
print("Area under ROC: {:.4f}".format(roc_auc))


Coefficients:  [-1.1301222137139996e-05,1.0140537832385095,0.333050459631398,0.3343750167040232,0.9893938495441409,0.8300514129056135,-0.5279541236002336,-0.881900487720863,-0.5063825244715748,-0.8930615126383927,-1.7094928320742324,0.4033388749816862,-1.136705692059036,-0.3757678199445609,-1.4835055998944386,-0.22548697721935382,-0.9963148350091529,-1.2125049509150228,-0.22651769134432215,0.6323593306962435,-1.5160773153815292,0.2801917728569362,0.8997841827896961,0.448158197226758,-0.3352580099842403,-0.1605665927611555,-0.3755132181271749,-1.3638526068793428,0.22746495942303743,0.00945518194920525]
Intercept:  -3.6508136814470724


                                                                                

Accuracy: 0.9769013283069865
Precision: {0.0: 0.9998541689452769, 1.0: 0.06381435823060189}
Recall: {0.0: 0.977004328387453, 1.0: 0.9166666666666666}




Area under ROC: 0.9468


                                                                                