In [1]:
import findspark
findspark.init()
import pyspark

In [2]:
from pyspark.sql.functions import col, substring

In [3]:
from pyspark.sql import SparkSession

# Create SparkSession
spark = SparkSession.builder\
        .appName('fraud_detection')\
        .config("spark.jars", "E:\spark-3.5.0-bin-hadoop3\jars\mysql-connector-j-8.1.0.jar")\
        .getOrCreate()

In [4]:
traindf = spark.read \
    .format("jdbc") \
    .option("driver","com.mysql.cj.jdbc.Driver") \
    .option("url", "jdbc:mysql://localhost:3306/creditcard") \
    .option("dbtable", "transaction") \
    .option("user", "root") \
    .option("password", "khang") \
    .load()
traindf.show(5)

+--------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+-------------------+-----------------+------------------+-------------------+-------------------+-------------------+------------------+------------------+------------------+-------------------+-------------------+------+-----+
|    Time|                V1|                V2|                V3|                V4|                V5|                V6|                V7|                V8|                 V9|               V10|               V11|               V12|               V13|               V14|               V15|               V16|                V17|                V18|              V19|               V20|                V21|                V22|     

In [32]:
traindf.printSchema()

root
 |-- Time: double (nullable = true)
 |-- V1: double (nullable = true)
 |-- V2: double (nullable = true)
 |-- V3: double (nullable = true)
 |-- V4: double (nullable = true)
 |-- V5: double (nullable = true)
 |-- V6: double (nullable = true)
 |-- V7: double (nullable = true)
 |-- V8: double (nullable = true)
 |-- V9: double (nullable = true)
 |-- V10: double (nullable = true)
 |-- V11: double (nullable = true)
 |-- V12: double (nullable = true)
 |-- V13: double (nullable = true)
 |-- V14: double (nullable = true)
 |-- V15: double (nullable = true)
 |-- V16: double (nullable = true)
 |-- V17: double (nullable = true)
 |-- V18: double (nullable = true)
 |-- V19: double (nullable = true)
 |-- V20: double (nullable = true)
 |-- V21: double (nullable = true)
 |-- V22: double (nullable = true)
 |-- V23: double (nullable = true)
 |-- V24: double (nullable = true)
 |-- V25: double (nullable = true)
 |-- V26: double (nullable = true)
 |-- V27: double (nullable = true)
 |-- V28: double (nulla

In [5]:
testdf = spark.read.csv("test.csv", header=True, inferSchema=True)
testdf.show(5)

+--------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------+-----+
|    Time|                 V1|                 V2|                V3|                 V4|                V5|                 V6|                V7|                 V8|                V9|                V10|              V11|               V12|               V13|               V14|               V15|               V16|                V17|               V18|               V19|                V20|               V21|               V22|

In [29]:
import pandas as pd 
import numpy as np
from itertools import chain

from pyspark.ml.classification import LinearSVC, DecisionTreeClassifier

from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

from pyspark.sql.functions import isnan, when, count, col, length, expr
from pyspark.ml.feature import Imputer
from pyspark.ml.feature import StringIndexer, VectorIndexer, StringIndexerModel, IndexToString
from pyspark.sql.types import DoubleType

In [7]:
traindf.select([count(when(isnan(c) | col(c).isNull(), c)).alias(c) for c in traindf.columns]).show()

+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|Time| V1| V2| V3| V4| V5| V6| V7| V8| V9|V10|V11|V12|V13|V14|V15|V16|V17|V18|V19|V20|V21|V22|V23|V24|V25|V26|V27|V28|Amount|Class|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+
|   0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|  0|     0|    0|
+----+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+---+------+-----+



In [8]:
inputFeatures = ['Time',
                 'V1',
                 'V2',
                 'V3',
                 'V4',
                 'V5',
                 'V6',
                 'V7',
                 'V8',
                 'V9',
                 'V10',
                 'V11',
                 'V12',
                 'V13',
                 'V14',
                 'V15',
                 'V16',
                 'V17',
                 'V18',
                 'V19',
                 'V20',
                 'V21',
                 'V22',
                 'V23',
                 'V24',
                 'V25',
                 'V26',
                 'V27',
                 'V28',
                 'Amount']
assembler = VectorAssembler(inputCols=inputFeatures,outputCol="features")
df1 = assembler.transform(traindf)
df1.show(2)

+-------+------------------+-----------------+------------------+-----------------+------------------+------------------+----------------+------------------+------------------+------------------+-----------------+-----------------+-----------------+------------------+------------------+------------------+------------------+------------------+-----------------+------------------+-------------------+-------------------+------------------+-----------------+------------------+------------------+------------------+-------------------+------+-----+--------------------+
|   Time|                V1|               V2|                V3|               V4|                V5|                V6|              V7|                V8|                V9|               V10|              V11|              V12|              V13|               V14|               V15|               V16|               V17|               V18|              V19|               V20|                V21|                V22|         

In [9]:
df2 = df1.select(['features', 'Class'])
df2.show(2)

+--------------------+-----+
|            features|Class|
+--------------------+-----+
|[63290.0,-0.20345...|    0|
|[93310.0,-1.33272...|    0|
+--------------------+-----+
only showing top 2 rows



In [10]:
y_collect = traindf.select("Class").groupBy("Class").count().collect()
unique_y = [x["Class"] for x in y_collect]
total_y = sum([x["count"] for x in y_collect])
unique_y_count = len(y_collect)
bin_count = [x["count"] for x in y_collect]

class_weights_spark = {i: ii for i, ii in zip(unique_y, total_y / (unique_y_count * np.array(bin_count)))}
print(class_weights_spark)


{1: 289.960407239819, 0: 0.5008636759769113}


In [11]:
mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_spark.items())])

df2 = df2.withColumn("weight", mapping_expr.getItem(F.col("Class")))



In [12]:
df2.show(5)

+--------------------+-----+------------------+
|            features|Class|            weight|
+--------------------+-----+------------------+
|[63290.0,-0.20345...|    0|0.5008636759769113|
|[93310.0,-1.33272...|    0|0.5008636759769113|
|[155516.0,1.92915...|    0|0.5008636759769113|
|[97804.0,-0.40829...|    0|0.5008636759769113|
|[35833.0,-1.96331...|    0|0.5008636759769113|
+--------------------+-----+------------------+
only showing top 5 rows



# Model Training & Testing

SVM using class weight

In [13]:
svm = LinearSVC(featuresCol="features", labelCol="Class", maxIter=1000, regParam=0.3, weightCol="weight")
svm_model = svm.fit(df2)

In [14]:
pipeline = Pipeline(stages = [assembler, svm_model])

In [15]:
testSolution = pipeline.fit(testdf).transform(testdf)

In [16]:
testSolution.show(5)

+--------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------+-----+--------------------+--------------------+----------+
|    Time|                 V1|                 V2|                V3|                 V4|                V5|                 V6|                V7|                 V8|                V9|                V10|              V11|               V12|               V13|               V14|               V15|               V16|                V17|               V18|               V19|     

Decision Tree using class weight

In [17]:
dt = DecisionTreeClassifier(featuresCol="features", labelCol="Class", maxBins=12473, weightCol="weight")
dt_model = dt.fit(df2)

In [18]:
pipeline1 = Pipeline(stages = [assembler, dt_model])

In [19]:
testSolution_dt = pipeline1.fit(testdf).transform(testdf)

In [20]:
testSolution_dt.show(5)

+--------+-------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+------------------+-------------------+-----------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+------------------+-------------------+------------------+------------------+------------------+------------------+------------------+-------------------+------------------+-------------------+------+-----+--------------------+--------------------+--------------------+----------+
|    Time|                 V1|                 V2|                V3|                 V4|                V5|                 V6|                V7|                 V8|                V9|                V10|              V11|               V12|               V13|               V14|               V15|               V16|                V17|               V18|   

# Model Testing

In [21]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="prediction", metricName="areaUnderPR")
auc_pr = evaluator.evaluate(testSolution)
auc_pr

0.5552282143107928

In [22]:
evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="prediction", metricName="areaUnderROC")
auc = evaluator.evaluate(testSolution)
auc

0.8696306978052898

In [23]:
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(testSolution)
accuracy

0.998806263605084

In [24]:
evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="prediction", metricName="areaUnderPR")
auc_pr = evaluator.evaluate(testSolution_dt)
auc_pr

0.0577755494698406

In [25]:
evaluator = BinaryClassificationEvaluator(labelCol="Class", rawPredictionCol="prediction", metricName="areaUnderROC")
auc = evaluator.evaluate(testSolution_dt)
auc

0.889712296004502

In [26]:
evaluator = MulticlassClassificationEvaluator(labelCol="Class", predictionCol="prediction", metricName="accuracy")
accuracy = evaluator.evaluate(testSolution_dt)
accuracy

0.9791096130889685

# Consumer

In [27]:
import pandas as pd
def json_to_dataframe(json_data):
    pd_df = pd.DataFrame([json.loads(json_data)])
    spark_df = spark.createDataFrame(pd_df)
    return spark_df

In [34]:
from kafka import KafkaConsumer
import json

KAFKA_TOPIC_NAME = 'Fraud-detection'
KAFKA_BOOTSTRAP_SERVERS = 'localhost:9092'

consumer = KafkaConsumer(KAFKA_TOPIC_NAME, bootstrap_servers=KAFKA_BOOTSTRAP_SERVERS)
print("Kafka Consumer Application Started ... ")
for msg in consumer:
    try:
        spark_df = json_to_dataframe(msg.value.decode('utf-8'))
        for column in spark_df.columns:
            spark_df = spark_df.withColumn(column, spark_df[column].cast(DoubleType()))
        result_df = pipeline.fit(spark_df).transform(spark_df)
        result_df = result_df.drop(*['features', 'rawPrediction', 'probability'])
        print(result_df.toPandas().to_string(index=False, header=False))

        result_df.write \
            .format("jdbc") \
            .option("driver","com.mysql.jdbc.Driver") \
            .option("url", "jdbc:mysql://localhost:3306/creditcard") \
            .option("dbtable", "new_transaction") \
            .option("user", "root") \
            .option("password", "khang") \
            .mode("append") \
            .save()
    except KeyboardInterrupt:
        print('break')
        break
print("Kafka Consumer Application Completed. ")



Kafka Consumer Application Started ... 
171780.0 -0.008102 0.706326 0.125172 -0.78531 0.641731 -0.510636 0.844686 0.027664 -0.181697 -0.233965 0.356436 0.375607 -0.645078 0.419454 -0.955765 0.191474 -0.711481 -0.130407 0.287176 -0.042161 -0.230903 -0.56304 0.041141 -0.491279 -0.499226 0.152658 0.238847 0.082221 8.99 0.0
165611.0 1.988377 -0.39373 -0.486108 0.125929 -0.525295 -0.274312 -0.632676 0.136883 1.27089 -0.022093 0.358176 0.428478 -1.056867 0.468677 0.524302 0.609698 -1.015177 0.802051 0.329759 -0.293461 -0.111382 -0.269051 0.296201 -0.583236 -0.372361 -0.910227 0.047529 -0.044373 1.0 0.0
161914.0 -0.10354 -0.087521 1.011111 -1.608574 -1.15238 -0.085708 0.364862 -0.073341 -1.022815 0.312905 0.910662 0.036998 0.319201 -0.377528 -0.721682 1.227555 -0.069962 -0.844791 1.20987 0.093558 0.160583 0.416128 0.252354 0.065506 -1.259634 -0.58106 0.16317 0.191938 135.0 0.0
170292.0 -1.537999 -0.588936 -1.13059 2.094498 -11.240843 7.08115 7.444866 -0.942407 -0.170443 -0.450758 -1.827901 -0

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "E:\spark-3.5.0-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "E:\spark-3.5.0-bin-hadoop3\python\lib\py4j-0.10.9.7-src.zip\py4j\clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "e:\IDE Python\lib\socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


break
Kafka Consumer Application Completed. 
