In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, from_json, split
from pyspark.sql.types import ArrayType, MapType, StringType, StructField, StructType



In [2]:
# Get (or create) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("ReadFromKafkaToBronzeLayer")
    .getOrCreate()
)



:: loading settings :: url = jar:file:/usr/local/spark-3.1.2-bin-hadoop3.2/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
io.delta#delta-core_2.12 added as a dependency
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.postgresql#postgresql added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-548c93f8-13dc-45be-9360-b1ea433ecb99;1.0
	confs: [default]
	found io.delta#delta-core_2.12;1.0.0 in central
	found org.antlr#antlr4;4.7 in central
	found org.antlr#antlr4-runtime;4.7 in central
	found org.antlr#antlr-runtime;3.5.2 in central
	found org.antlr#ST4;4.0.8 in central
	found org.abego.treelayout#org.abego.treelayout.core;1.0.3 in central
	found org.glassfish#javax.json;1.0.4 in central
	found com.ibm.icu#icu4j;58.2 in central
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.1.2 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.1.2 in central
	found org.apache.kafka#kafka-clients;2.6.0 in central
	found com.github.lub

## Kafka Properties

In [3]:
# Kafka connection settings. Each value can be overridden through an
# environment variable so the notebook can target another broker or topic
# without editing code; the defaults are the original hard-coded values.
bootstrap_server = os.environ.get("KAFKA_BOOTSTRAP_SERVERS", "kafka:29092")
topic = os.environ.get("KAFKA_TOPIC", "finance.broker.transactions.customers")

## Read data from Kafka topic

In [4]:
# Open the Kafka topic as a streaming source, reading from the earliest
# available offsets and including the record headers in the output.
kafka_source_options = {
    "kafka.bootstrap.servers": bootstrap_server,
    "subscribe": topic,
    "startingOffsets": "earliest",
    "includeHeaders": "true",
}
customer_stream_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

## Create columns to define data partitions

In [5]:
# Derive the year/month/day partition columns directly from the Kafka record
# timestamp. date_format returns zero-padded strings (e.g. "2021", "07", "05"),
# matching what the previous string splitting produced, without relying on an
# implicit timestamp-to-string cast or on temporary "date"/"hour" columns.
record_ts = col("timestamp")
customer_stream_df = (
    customer_stream_df
    .withColumn("year", date_format(record_ts, "yyyy"))
    .withColumn("month", date_format(record_ts, "MM"))
    .withColumn("day", date_format(record_ts, "dd"))
)

## View dataframe schema

In [6]:
# Print the streaming DataFrame's schema to check that the year/month/day columns were added.
customer_stream_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)
 |-- headers: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- key: string (nullable = true)
 |    |    |-- value: binary (nullable = true)
 |-- year: string (nullable = true)
 |-- month: string (nullable = true)
 |-- day: string (nullable = true)



## Write to console

# Debug sink: print each micro-batch to the console.
# This query gets its own checkpoint directory. The data-lake writer
# checkpoints to /home/jovyan/work/checkpoint/{topic}; sharing that directory
# would let one query resume from offsets committed by the other and skip data.
console_query = (
    customer_stream_df.writeStream
    .format("console")
    .option("checkpointLocation", f'/home/jovyan/work/checkpoint/console/{topic}')
    .start()
)
try:
    # Blocks until the query ends or the cell is interrupted.
    console_query.awaitTermination()
finally:
    # Interrupting the cell only unblocks Python; stop the query explicitly
    # so it does not keep running after the cell returns.
    console_query.stop()

## Write into the data lake (bronze layer)

In [7]:
# Continuously append the Kafka records to the bronze Delta table,
# partitioned by the year/month/day columns.
bronze_query = (
    customer_stream_df.writeStream
    .format("delta")
    .option("path", f'/home/jovyan/work/datalake/bronze/{topic}')
    .partitionBy("year", "month", "day")
    .option("checkpointLocation", f'/home/jovyan/work/checkpoint/{topic}')
    .start()
)
try:
    # Blocks until the query ends or the cell is interrupted.
    bronze_query.awaitTermination()
finally:
    # A KeyboardInterrupt only unblocks Python; stop the query explicitly so
    # the stream does not keep writing after the cell returns.
    bronze_query.stop()

                                                                                

KeyboardInterrupt: 