In [None]:
# Print the installed PySpark version.
# NOTE(review): the Kafka connector requested via PYSPARK_SUBMIT_ARGS in the
# next cell is pinned to 3.3.2; presumably it should match the version printed
# here — confirm before upgrading either side.
import pyspark
print(pyspark.__version__)

In [1]:
import os
# Request the Spark SQL Kafka connector (Scala 2.12 build, version 3.3.2) as an
# extra package for the PySpark shell.
# NOTE(review): this is set before the SparkSession is created in the next
# cell; presumably it has no effect once the JVM is already running, so
# restart the kernel after changing it — confirm.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.2 pyspark-shell'

In [2]:
from pyspark.sql import SparkSession

# Build the notebook-wide session named "kafkaConsumer"; getOrCreate() hands
# back an already-running session instead of starting a second one.
spark = (
    SparkSession.builder
    .appName("kafkaConsumer")
    .getOrCreate()
)

In [3]:
# Connection settings shared by every Kafka source in this notebook.
KAFKA_BOOTSTRAP_SERVERS = "kafka:9092"
TRANSACTION_TOPIC = "dbserver1.fineract_default.m_savings_account_transaction"
ACCOUNT_TOPIC = "dbserver1.fineract_default.m_savings_account"


def read_kafka_topic(topic, starting_offsets="latest"):
    """Open a Structured Streaming source on a single Kafka topic.

    Args:
        topic: Name of the Kafka topic to subscribe to.
        starting_offsets: Value for the Kafka source's ``startingOffsets``
            option. ``"latest"`` is the source's own default for streaming
            reads, so omitting the argument behaves like not setting the
            option at all.

    Returns:
        A streaming DataFrame with the Kafka source columns (``key``,
        ``value``, ``topic``, ...).
    """
    return (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP_SERVERS)
        .option("subscribe", topic)
        .option("startingOffsets", starting_offsets)
        .load()
    )


# Stream of savings-account transaction change events.
df_transaction_stream = read_kafka_topic(TRANSACTION_TOPIC)

# Stream of savings-account change events.
# NOTE(review): with "latest", accounts published before this notebook starts
# are never seen; consider starting_offsets="earliest" here if transactions on
# pre-existing accounts must be resolvable — confirm against topic retention.
df_account_stream = read_kafka_topic(ACCOUNT_TOPIC)

In [None]:
# select the value column from the Kafka stream
#value_df = df.selectExpr("CAST(value AS STRING)")

# print the value column to the console
#query = value_df \
#    .writeStream \
#    .outputMode("append") \
#    .format("console") \
#    .start()

# wait for the query to terminate
#query.awaitTermination()

In [5]:
from pyspark.sql.types import StructType, StructField, LongType, DoubleType, StringType, TimestampType

# Column layout of one enriched transaction row; every column is nullable.
schema_transaction = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("@timestamp", StringType()),
        ("account-id", LongType()),
        ("amount", DoubleType()),
        ("customer-id", LongType()),
        ("datetime", StringType()),
        ("is_fraud", StringType()),
        ("transaction-id", LongType()),
    )
])

# Column layout of the account -> customer lookup table; both columns nullable.
schema_account = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("customer-id", LongType()),
        ("account-id", LongType()),
    )
])

# Both tables start out empty.
account_df = spark.createDataFrame([], schema=schema_account)
transaction_df = spark.createDataFrame([], schema=schema_transaction)

In [6]:
from pyspark.sql.functions import from_unixtime, date_format
from pyspark.sql.functions import col

import json
import datetime
import base64
import decimal
from time import sleep

def getDecimalFromKafka(encoded, scale=6):
    """Decode a Base64-encoded decimal taken from a Kafka change event.

    The decoded bytes are read as a big-endian two's-complement integer (the
    unscaled value) and then shifted right by ``scale`` decimal places.
    Reading the bytes as signed is what makes negative amounts decode
    correctly; an unsigned read turns them into huge positive numbers.

    Args:
        encoded: Base64 text (``str`` or ``bytes``) of the unscaled value.
        scale: Number of fractional digits of the encoded column. Defaults to
            6, i.e. the unscaled value is divided by 1,000,000.

    Returns:
        The value as a ``decimal.Decimal``.

    Raises:
        binascii.Error: If ``encoded`` is not valid Base64.
    """
    raw_bytes = base64.b64decode(encoded)
    unscaled = int.from_bytes(raw_bytes, byteorder='big', signed=True)

    # Division is carried out in the current decimal context (28 significant
    # digits by default), which is exact for any realistic monetary value.
    return decimal.Decimal(unscaled) / (decimal.Decimal(10) ** scale)

def _lookup_customer_row(account_id):
    """Return the first account_df row for ``account_id`` (only "customer-id"), or None."""
    # account_df is read as a global on every call so that rows added by the
    # account stream since the previous poll are picked up.
    return (
        account_df
        .filter(col("account-id") == account_id)
        .select("customer-id")
        .first()
    )


def write_to_es_transaction(df_t, epoch_id):
    """foreachBatch handler: enrich and display every transaction event in a micro-batch.

    Each row of ``df_t`` carries a JSON change event in its ``value`` column
    (presumably a Debezium envelope — the code reads ``payload.after``). For
    every event with an ``after`` image, the owning customer is looked up in
    the module-level ``account_df`` and one row matching
    ``schema_transaction`` is produced. All rows of the batch are shown in a
    single table.

    Records with a null value and events whose ``after`` image is null are
    skipped instead of raising. The module-level ``transaction_df`` is not
    modified and stays empty.

    Args:
        df_t: Micro-batch DataFrame with a string ``value`` column.
        epoch_id: Micro-batch id supplied by Spark (unused).
    """
    enriched_rows = []

    # The whole batch is brought to the driver because the enrichment below
    # is plain Python; taking only the first row would drop the rest.
    for record in df_t.collect():
        if record.value is None:
            continue

        after = json.loads(record.value)['payload']['after']
        if after is None:
            continue

        # created_date is divided by 1000 before conversion, i.e. it is
        # treated as epoch milliseconds; the result is in local time.
        created_at = datetime.datetime.fromtimestamp(after['created_date'] / 1000)
        formatted_dt = created_at.strftime("%Y-%m-%d %H:%M:%S")

        account_id = after['savings_account_id']

        # Wait until the account stream has registered this account, polling
        # with a single Spark job per attempt.
        # NOTE(review): there is no timeout — an account that never arrives
        # blocks this query forever.
        customer_row = _lookup_customer_row(account_id)
        while customer_row is None:
            sleep(0.1)
            customer_row = _lookup_customer_row(account_id)
        customer_id = customer_row[0]

        enriched_rows.append((
            formatted_dt,
            account_id,
            float(getDecimalFromKafka(after['amount'])),
            customer_id,
            formatted_dt,
            'valid',
            after['id'],
        ))

    if enriched_rows:
        spark.createDataFrame(enriched_rows, schema=schema_transaction).show()
        
        
def write_to_es_account(df_a, epoch_id):
    """foreachBatch handler: add every new (customer, account) pair of a micro-batch to account_df.

    Each row of ``df_a`` carries a JSON change event in its ``value`` column;
    ``payload.after.client_id`` and ``payload.after.id`` are read from it.
    Pairs that are already present in the module-level ``account_df`` (or that
    repeat within the batch) are ignored, so the table holds each pair once.

    Records with a null value and events whose ``after`` image is null are
    skipped instead of raising.

    Args:
        df_a: Micro-batch DataFrame with a string ``value`` column.
        epoch_id: Micro-batch id supplied by Spark (unused).
    """
    global account_df

    # Gather the (customer-id, account-id) pairs announced by this batch.
    candidate_pairs = []
    for record in df_a.collect():
        if record.value is None:
            continue
        after = json.loads(record.value)['payload']['after']
        if after is None:
            continue
        candidate_pairs.append((after['client_id'], after['id']))

    if not candidate_pairs:
        return

    # One scan of the lookup table per batch; Python tuple equality also
    # treats a null customer id as equal to itself.
    known_pairs = {
        (row["customer-id"], row["account-id"]) for row in account_df.collect()
    }

    new_pairs = []
    for pair in candidate_pairs:
        if pair not in known_pairs:
            known_pairs.add(pair)
            new_pairs.append(pair)

    if new_pairs:
        account_df = account_df.union(
            spark.createDataFrame(new_pairs, schema=schema_account)
        )


In [None]:
# Forget queries that terminated earlier in this session, so the wait at the
# bottom of the cell only reacts to the two queries started here.
spark.streams.resetTerminated()

# Feed every micro-batch of account events to the lookup-table handler.
value_df_account = df_account_stream.selectExpr("CAST(value AS STRING)")
query_account = value_df_account.writeStream.foreachBatch(write_to_es_account).start()

# Feed every micro-batch of transaction events to the enrichment handler.
value_df_transaction = df_transaction_stream.selectExpr("CAST(value AS STRING)")
query_transaction = value_df_transaction.writeStream.foreachBatch(write_to_es_transaction).start()

# Block until either query stops. Waiting on the transaction query alone would
# hide a failure of the account query, after which the transaction handler
# keeps polling for accounts that can no longer arrive.
spark.streams.awaitAnyTermination()

+-------------------+----------+------+-----------+-------------------+--------+--------------+
|         @timestamp|account-id|amount|customer-id|           datetime|is_fraud|transaction-id|
+-------------------+----------+------+-----------+-------------------+--------+--------------+
|2023-05-05 15:24:51|         2| 300.0|          2|2023-05-05 15:24:51|   valid|             7|
+-------------------+----------+------+-----------+-------------------+--------+--------------+

+-------------------+----------+------+-----------+-------------------+--------+--------------+
|         @timestamp|account-id|amount|customer-id|           datetime|is_fraud|transaction-id|
+-------------------+----------+------+-----------+-------------------+--------+--------------+
|2023-05-05 15:28:18|         2| 300.0|          2|2023-05-05 15:28:18|   valid|             8|
+-------------------+----------+------+-----------+-------------------+--------+--------------+

+-------------------+----------+------