In [1]:
from pyspark.sql import SparkSession
#Initialize a PySpark SparkSession
spark = SparkSession.builder.appName("svm_PySpark").getOrCreate()

In [2]:
data = spark.read.format("csv").option("header", "true").load("../fraudulent_transactions_anonymous_ia.csv")

In [3]:
data.printSchema()

root
 |-- transaction-id: string (nullable = true)
 |-- customer-id: string (nullable = true)
 |-- account-id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- amount: string (nullable = true)
 |-- customer_id_avrge_amount_1day: string (nullable = true)
 |-- customer_id_avrge_amount_1week: string (nullable = true)
 |-- customer_id_avrge_amount_1month: string (nullable = true)
 |-- customer_id_avrge_amount_3month: string (nullable = true)
 |-- customer_id_count_1day: string (nullable = true)
 |-- customer_id_count_1week: string (nullable = true)
 |-- customer_id_count_1month: string (nullable = true)
 |-- customer_id_count_3month: string (nullable = true)
 |-- account_id_avrge_amount_1day: string (nullable = true)
 |-- account_id_avrge_amount_1week: string (nullable = true)
 |-- account_id_avrge_amount_1month: string (nullable = true)
 |-- account_id_avrge_amount_3month: string (nullable = true)
 |-- account_id_count_1day: string (nullable = true)
 |-- account_id_co

In [4]:
from pyspark.sql.functions import col

# Convert  column from String to Double
data = data.withColumn("amount", col("amount").cast("double"))

data = data.withColumn("customer_id_avrge_amount_1day", col("customer_id_avrge_amount_1day").cast("double"))
data = data.withColumn("customer_id_avrge_amount_1week", col("customer_id_avrge_amount_1week").cast("double"))
data = data.withColumn("customer_id_avrge_amount_1month", col("customer_id_avrge_amount_1month").cast("double"))
data = data.withColumn("customer_id_avrge_amount_3month", col("customer_id_avrge_amount_3month").cast("double"))

data = data.withColumn("customer_id_count_1day", col("customer_id_count_1day").cast("integer"))
data = data.withColumn("customer_id_count_1week", col("customer_id_count_1week").cast("integer"))
data = data.withColumn("customer_id_count_1month", col("customer_id_count_1month").cast("integer"))
data = data.withColumn("customer_id_count_3month", col("customer_id_count_3month").cast("integer"))

data = data.withColumn("account_id_avrge_amount_1day", col("account_id_avrge_amount_1day").cast("double"))
data = data.withColumn("account_id_avrge_amount_1week", col("account_id_avrge_amount_1week").cast("double"))
data = data.withColumn("account_id_avrge_amount_1month", col("account_id_avrge_amount_1month").cast("double"))
data = data.withColumn("account_id_avrge_amount_3month", col("account_id_avrge_amount_3month").cast("double"))

data = data.withColumn("account_id_count_1day", col("account_id_count_1day").cast("integer"))
data = data.withColumn("account_id_count_1week", col("account_id_count_1week").cast("integer"))
data = data.withColumn("account_id_count_1month", col("account_id_count_1month").cast("integer"))
data = data.withColumn("account_id_count_3month", col("account_id_count_3month").cast("integer"))

data = data.withColumn("transaction_in_weekend", col("transaction_in_weekend").cast("integer"))

data = data.withColumn("transaction_at_night", col("transaction_at_night").cast("integer"))

data = data.withColumn("is_fraud", col("is_fraud").cast("integer"))

In [5]:
data.printSchema()

root
 |-- transaction-id: string (nullable = true)
 |-- customer-id: string (nullable = true)
 |-- account-id: string (nullable = true)
 |-- datetime: string (nullable = true)
 |-- amount: double (nullable = true)
 |-- customer_id_avrge_amount_1day: double (nullable = true)
 |-- customer_id_avrge_amount_1week: double (nullable = true)
 |-- customer_id_avrge_amount_1month: double (nullable = true)
 |-- customer_id_avrge_amount_3month: double (nullable = true)
 |-- customer_id_count_1day: integer (nullable = true)
 |-- customer_id_count_1week: integer (nullable = true)
 |-- customer_id_count_1month: integer (nullable = true)
 |-- customer_id_count_3month: integer (nullable = true)
 |-- account_id_avrge_amount_1day: double (nullable = true)
 |-- account_id_avrge_amount_1week: double (nullable = true)
 |-- account_id_avrge_amount_1month: double (nullable = true)
 |-- account_id_avrge_amount_3month: double (nullable = true)
 |-- account_id_count_1day: integer (nullable = true)
 |-- account_

In [6]:
%%time

from pyspark.ml.feature import VectorAssembler

# Define the input and output features
output_feature = "is_fraud"
input_features = [
    'amount',
    'customer_id_avrge_amount_1day', 'customer_id_avrge_amount_1week', 'customer_id_avrge_amount_1month', 'customer_id_avrge_amount_3month',
    'customer_id_count_1day', 'customer_id_count_1week', 'customer_id_count_1month', 'customer_id_count_3month',
    'account_id_avrge_amount_1day', 'account_id_avrge_amount_1week', 'account_id_avrge_amount_1month', 'account_id_avrge_amount_3month',
    'account_id_count_1day', 'account_id_count_1week', 'account_id_count_1month', 'account_id_count_3month',
    'transaction_in_weekend', 'transaction_at_night'
]

# Create a VectorAssembler for combining input features into a feature vector
assembler = VectorAssembler(inputCols=input_features, outputCol="features")

df_data=assembler.transform(data)

model_df=df_data.select(['features','is_fraud'])

# Split the data into training and test sets
(trainingData, testData) = model_df.randomSplit([0.7, 0.3])

CPU times: user 564 ms, sys: 132 ms, total: 695 ms
Wall time: 3.95 s


In [7]:
testData.show()

+--------------------+--------+
|            features|is_fraud|
+--------------------+--------+
|[2.08,2.08,2.08,2...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
|[3.194,3.194,3.19...|       0|
+--------------------+--------+
only showing top 20 rows



In [11]:
%%time
from pyspark.ml.classification import LinearSVC

lsvc = LinearSVC(maxIter=10, regParam=0.1, labelCol=output_feature)

# Fit the model
lsvcModel = lsvc.fit(trainingData)

# training results
train_results=lsvcModel.evaluate(trainingData).predictions

CPU times: user 17.6 ms, sys: 4.79 ms, total: 22.4 ms
Wall time: 11.5 s


In [12]:
%%time

from pyspark.sql.functions import expr

# Counting the total number of instances
total_count = train_results.count()

# Counting the number of instances where prediction matches is_fraud
matching_count = train_results.filter(expr("prediction = is_fraud")).count()

# Calculating the percentage
matching_percentage = (matching_count / total_count) * 100

print("Percentage of instances where prediction matches is_fraud:", matching_percentage)

Percentage of instances where prediction matches is_fraud: 93.37806098429986
CPU times: user 6.31 ms, sys: 3.18 ms, total: 9.49 ms
Wall time: 15.1 s


In [13]:
%%time

test_results=lsvcModel.evaluate(testData).predictions

CPU times: user 2.58 ms, sys: 858 µs, total: 3.44 ms
Wall time: 204 ms


In [14]:
%%time

from pyspark.sql.functions import expr

# Counting the total number of instances
total_count = test_results.count()

# Counting the number of instances where prediction matches is_fraud
matching_count = test_results.filter(expr("prediction = is_fraud")).count()

# Calculating the percentage
matching_percentage = (matching_count / total_count) * 100

print("Percentage of instances where prediction matches is_fraud:", matching_percentage)

Percentage of instances where prediction matches is_fraud: 93.38590440607062
CPU times: user 4.81 ms, sys: 1.25 ms, total: 6.05 ms
Wall time: 11.6 s


In [15]:
%%time

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Creating a MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="is_fraud", predictionCol="prediction")

# Calculating precision
precision = evaluator.evaluate(test_results, {evaluator.metricName: "weightedPrecision"})
print("Precision:", precision)

# Calculating recall
recall = evaluator.evaluate(test_results, {evaluator.metricName: "weightedRecall"})
print("Recall:", recall)

# Calculating accuracy
accuracy = evaluator.evaluate(test_results, {evaluator.metricName: "accuracy"})
print("Accuracy:", accuracy)

# Calculating the F1 score
f1_score = evaluator.evaluate(test_results, {evaluator.metricName: "f1"})

print("F1 Score:", f1_score)

Precision: 0.9329716000577126
Recall: 0.9338590440607062
Accuracy: 0.9338590440607062
F1 Score: 0.92767833500895
CPU times: user 59 ms, sys: 28.1 ms, total: 87 ms
Wall time: 21.1 s


In [17]:
# Save the model
modelPath = "svm_ml"
lsvcModel.save(modelPath)