## **Consumo de datos desde un tópico de Kafka y almacenamiento en HDFS utilizando Kafka Connect y Spark Structured Streaming**

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) a Hive-enabled SparkSession that requests the Kafka SQL
# connector package through `spark.jars.packages`.
# NOTE(review): the warehouse dir is '/user/local/...' while the Ivy log shows
# Spark installed under '/usr/local/spark' — confirm this path is intentional.
spark = (
    SparkSession.builder
    .config('spark.jars.packages', 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.1')
    .config('spark.sql.warehouse.dir', '/user/local/spark/warehouse')
    .enableHiveSupport()
    .appName('Python - Integracion de Kafka y Spark')
    .getOrCreate()
)

Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
:: loading settings :: url = jar:file:/usr/local/spark/jars/ivy-2.4.0.jar!/org/apache/ivy/core/settings/ivysettings.xml
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c38ec09b-f57f-4e9e-a632-ea05dc96f3b3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.0.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.0.1 in central
	found org.apache.kafka#kafka-clients;2.4.1 in central
	found com.github.luben#zstd-jni;1.4.4-3 in central
	found org.lz4#lz4-java;1.7.1 in central
	found org.xerial.snappy#snappy-java;1.1.7.5 in central
	found org.slf4j#slf4j-api;1.7.30 in central
	found org.spark-project.spark#unused;1.0.0 in central
	found org.apache.commons#commons-pool2;2.6.2 in central
downloading https://repo1.maven.org/maven2/org/apache/spark/spark-sql-kafka-0-10_2.12/

In [5]:
# Kafka broker address (host:port) passed as `kafka.bootstrap.servers` to the stream reader.
kafka_bootstrap_servers = "localhost:9092"

In [6]:
# Open a streaming DataFrame over the Kafka source, subscribed to the
# 'retaildg' topic on the configured bootstrap servers.
df = (
    spark.readStream
    .format('kafka')
    .option('kafka.bootstrap.servers', kafka_bootstrap_servers)
    .option('subscribe', 'retaildg')
    .load()
)

In [7]:
from pyspark.sql.functions import date_format, to_date, split, substring

In [8]:
# Turn each Kafka record into a text log line, derive date-based partition
# columns from it, and continuously write the result to HDFS as tab-separated
# files, triggering a micro-batch every 30 seconds.

# Keep only the record payload, cast from binary to string (column name: `value`).
log_lines = df.selectExpr("CAST(value AS STRING)")

# The 4th space-separated token of a log line looks like "[08/Oct/2024:23:05:59";
# starting the substring at position 2 drops the leading "[".
log_timestamp = substring(split('value', ' ')[3], 2, 21)

partitioned_logs = (
    log_lines
    # The pattern must NOT contain "[": the bracket is already stripped above, and
    # in Spark 3 datetime patterns "[" opens an optional section rather than
    # matching a literal bracket (it would be a non-matching literal under the
    # legacy parser policy).
    .withColumn('log_date', to_date(log_timestamp, 'dd/MMM/yyyy:HH:mm:ss'))
    .withColumn('year', date_format('log_date', 'yyyy'))
    .withColumn('month', date_format('log_date', 'MM'))
    .withColumn('dayofmonth', date_format('log_date', 'dd'))
)

# Keep a handle on the StreamingQuery so it can be inspected or stopped later
# (e.g. `query.status`, `query.stop()`).
query = (
    partitioned_logs.writeStream
    .partitionBy('year', 'month', 'dayofmonth')
    .format('csv')
    .option("checkpointLocation", '/proyecto/kafka/retail_logs/gen_logs/checkpoint')
    .option('path', '/proyecto/kafka/retail_logs/gen_logs/data')
    .option('header', True)
    .option('sep', '\t')
    .trigger(processingTime='30 seconds')
    .start()
)

# Last expression: display the query object as the cell output.
query

<pyspark.sql.streaming.StreamingQuery at 0x7fd5a405d048>

                                                                                

In [9]:
# List the top-level contents of the gen_logs directory in HDFS.
!hdfs dfs -ls /proyecto/kafka/retail_logs/gen_logs

                                                                                

Found 2 items
drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint
drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data


In [10]:
# Recursively list everything under the data directory in HDFS.
!hdfs dfs -ls -R /proyecto/kafka/retail_logs/gen_logs/data

drwxr-xr-x   - root supergroup          0 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata
-rw-r--r--   3 root supergroup          2 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/0
-rw-r--r--   3 root supergroup        839 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/1
-rw-r--r--   3 root supergroup        842 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/2
drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/year=2024
drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/year=2024/month=10
drwxr-xr-x   - root supergroup          0 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/data/year=2024/month=10/dayofmonth=08
-rw-r--r--   3 root supergroup        685 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/year=2024/month=10/dayofmonth=08/part-00000-66f42a44-f551-43

In [11]:
# List the contents of the checkpoint directory in HDFS.
!hdfs dfs -ls /proyecto/kafka/retail_logs/gen_logs/checkpoint

Found 4 items
drwxr-xr-x   - root supergroup          0 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/checkpoint/commits
-rw-r--r--   3 root supergroup         45 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/metadata
drwxr-xr-x   - root supergroup          0 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets
drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/sources


In [12]:
# Recursively list the "sources" subdirectory of the checkpoint.
!hdfs dfs -ls -R /proyecto/kafka/retail_logs/gen_logs/checkpoint/sources

drwxr-xr-x   - root supergroup          0 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/sources/0
-rw-r--r--   3 root supergroup         42 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/sources/0/0


In [14]:
# Print the contents of the checkpoint file sources/0/0.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/sources/0/0

 v1
{"retaildg":{"2":187,"1":131,"0":138}}

                                                                                

In [15]:
# Recursively list the "offsets" subdirectory of the checkpoint.
!hdfs dfs -ls -R /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets

-rw-r--r--   3 root supergroup        495 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/0
-rw-r--r--   3 root supergroup        495 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/1
-rw-r--r--   3 root supergroup        495 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/2
-rw-r--r--   3 root supergroup        495 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/3
-rw-r--r--   3 root supergroup        495 2024-10-08 23:03 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/4
-rw-r--r--   3 root supergroup        495 2024-10-08 23:03 /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/5


In [17]:
# Print the contents of checkpoint offsets file 0.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/0

v1
{"batchWatermarkMs":0,"batchTimestampMs":1728428486756,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"retaildg":{"2":187,"1":131,"0":138}}

In [18]:
# Print the contents of checkpoint offsets file 1.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/1

v1
{"batchWatermarkMs":0,"batchTimestampMs":1728428492929,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"retaildg":{"2":189,"1":134,"0":140}}

In [19]:
# Print the contents of checkpoint offsets file 2.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/2

v1
{"batchWatermarkMs":0,"batchTimestampMs":1728428520039,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"retaildg":{"2":197,"1":143,"0":150}}

                                                                                

In [20]:
# Print the contents of checkpoint offsets file 3.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/3

v1
{"batchWatermarkMs":0,"batchTimestampMs":1728428550009,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"retaildg":{"2":206,"1":153,"0":161}}

In [21]:
# Print the contents of checkpoint offsets file 4.
!hdfs dfs -cat /proyecto/kafka/retail_logs/gen_logs/checkpoint/offsets/4

v1
{"batchWatermarkMs":0,"batchTimestampMs":1728428580011,"conf":{"spark.sql.streaming.stateStore.providerClass":"org.apache.spark.sql.execution.streaming.state.HDFSBackedStateStoreProvider","spark.sql.streaming.join.stateFormatVersion":"2","spark.sql.streaming.flatMapGroupsWithState.stateFormatVersion":"2","spark.sql.streaming.multipleWatermarkPolicy":"min","spark.sql.streaming.aggregation.stateFormatVersion":"2","spark.sql.shuffle.partitions":"200"}}
{"retaildg":{"2":215,"1":162,"0":173}}

In [26]:
# Recursively list the data directory again to see what has been written so far.
!hdfs dfs -ls -R /proyecto/kafka/retail_logs/gen_logs/data

drwxr-xr-x   - root supergroup          0 2024-10-08 23:07 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata
-rw-r--r--   3 root supergroup          2 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/0
-rw-r--r--   3 root supergroup        839 2024-10-08 23:01 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/1
-rw-r--r--   3 root supergroup        842 2024-10-08 23:06 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/10
-rw-r--r--   3 root supergroup        842 2024-10-08 23:06 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/11
-rw-r--r--   3 root supergroup        842 2024-10-08 23:07 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/12
-rw-r--r--   3 root supergroup        842 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/2
-rw-r--r--   3 root supergroup        842 2024-10-08 23:02 /proyecto/kafka/retail_logs/gen_logs/data/_spark_metadata/3
-rw-r--r--   3 root supergroup        842 2024-

                                                                                

In [28]:
# Batch-read the tab-separated files (with header row) from the data directory.
# NOTE: this rebinds `df`, which earlier in the notebook referred to the Kafka
# streaming DataFrame.
df = spark.read.csv(
    path='/proyecto/kafka/retail_logs/gen_logs/data',
    header=True,
    sep='\t',
)

                                                                                

In [30]:
# Display the first 5 rows without truncating long column values.
df.show(n=5, truncate=False)

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----+-----+----------+
|value                                                                                                                                                                                                                           |log_date  |year|month|dayofmonth|
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------+----+-----+----------+
|108.123.173.171 - - [08/Oct/2024:23:05:59 -0800] "GET /add_to_cart/1250 HTTP/1.1" 200 1785 "-" "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.125 Safari/537.36"  

In [31]:
# Print the column names and types of the DataFrame read back from HDFS.
df.printSchema()

root
 |-- value: string (nullable = true)
 |-- log_date: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- dayofmonth: integer (nullable = true)

