# Explorar acesso ao Kafka

In [1]:
# One-off environment setup, kept disabled.
# NOTE(review): the `delta` module imported below ships in the `delta-spark` distribution; the PyPI
# package literally named `delta` is presumably unrelated — confirm before re-enabling.
# `%pip` (rather than `!pip`) targets the environment of the running kernel.
# %pip install --quiet azure-storage-blob delta-spark

In [2]:
import sys
import requests
import time
import os
import pyspark
import seaborn as sns
import matplotlib.pyplot as plt
import socket

from azure.storage.blob import BlobClient
from delta import *
from os import path
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, to_varchar
from pyspark.sql.types import StringType, DateType, StructType, StructField, TimestampType, DoubleType
from pyspark.sql.avro.functions import *

In [3]:
# Build (or reuse) the notebook's SparkSession with the Delta Lake SQL extension and catalog enabled.
delta_settings = {
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("ExploreKafka")
for conf_key, conf_value in delta_settings.items():
    session_builder = session_builder.config(conf_key, conf_value)

spark = session_builder.getOrCreate()

# Keep Spark's own logging out of the cell outputs unless something actually fails.
spark.sparkContext.setLogLevel("ERROR")

In [4]:
# Kafka source: broker bootstrap address and the topic explored in this notebook.
bootstrap = "demo-kafka-demo-kafka-plain-0.nemesys-stream.svc:9092"
topico = "stocks_intraday"

# Starting offset; among the visible cells only the disabled streaming ingest reads this variable.
offset = "earliest"

# Base URL of the Schema Registry; among the visible cells only the disabled schema fetch reads it.
# NOTE(review): this host is in namespace `nemesys-stream-101` while the broker above is in
# `nemesys-stream` — confirm that the difference is intentional.
SCHEMA_REGISTRY = "http://nemesys-sr-schema-registry-headless.nemesys-stream-101.svc:8081"

In [5]:
# Disabled: download the latest schema registered for the topic's `-value` subject from the
# Schema Registry and keep its text in `schema`.
# # retrieve the latest schema
# url = '{}/subjects/{}-value/versions/latest/schema'.format(SCHEMA_REGISTRY, topico)
# # print(url)
# response = requests.get(url)
# # error check
# response.raise_for_status()
# # extract the schema from the response
# schema = response.text

In [6]:
# Disabled: one-shot streaming ingest of the topic into the Delta table at "lakehouse/stocks".
# `substring(value, 6)` drops the first five bytes of every message before Avro decoding
# (presumably the Confluent wire-format header: magic byte + 4-byte schema id — confirm).
# NOTE(review): the Spark Kafka source only forwards `kafka.`-prefixed options to the consumer,
# so "security.protocol" probably has to be "kafka.security.protocol" — confirm before re-enabling.
# (spark
#     .readStream
#     .format("kafka")
#     .option("kafka.bootstrap.servers", bootstrap)
#     .option("subscribe", topico)
#     .option("startingOffsets", offset)
#     .option("security.protocol", "SSL")
#     .load()
#     .selectExpr("substring(value, 6) as avro_value")
#     .select(from_avro(col("avro_value"), schema).alias("value"))
#     .select("value.*")
#     .writeStream
#     .format('delta')
#     .outputMode('append')
#     .option('mergeSchema', 'true')
#     .option('checkpointLocation', "lakehouse/checkpoint/stocks")
#     .trigger(once=True)
#     .start("lakehouse/stocks")
#     .awaitTermination()
# )
    

In [7]:
# Disabled: batch-read the Delta table at "lakehouse/stocks" (the target path of the disabled
# streaming ingest) into `df`.
# df = (
#     spark
#     .read
#     .format("delta")
#     .load("lakehouse/stocks")
# )

In [8]:
# Ask the SQL magic for lazily evaluated results instead of eagerly executed result sets
# (later cells call `.toPandas()` on what `%sql` returns).
# NOTE(review): no `%load_ext sql` is visible in this notebook — presumably the extension is loaded
# by the kernel/IPython profile; confirm, otherwise this cell fails on a fresh kernel.
%config SqlMagic.lazy_execution = True

In [9]:
# Hand the SparkSession to the SQL magic so later `%sql` queries run through it.
%sql spark

Deploy FastAPI apps for free on Ploomber Cloud! Learn more: https://ploomber.io/s/signup


In [10]:
# Avro schema (as a JSON string) of the message values, passed to `from_avro` when decoding the topic.
#
# This cell used to assign `schema` twice: first the JSON of a Spark `StructType`, then this Avro
# string, which overwrote it immediately. Only the Avro definition was ever in effect, so the dead
# first assignment is gone. `timestamp` is carried as a string and `volume` as a long.
schema = """
    {
        "type": "record",
        "name": "Stock",
        "fields": [
            {"name": "ticker", "type": "string"},
            {"name": "timestamp", "type": "string"},
            {"name": "open", "type": "double"},
            {"name": "high", "type": "double"},
            {"name": "low", "type": "double"},
            {"name": "close", "type": "double"},
            {"name": "volume", "type": "long"}
        ]
    }
"""
# Alternative schema with nullable fields and Kafka Connect metadata (`__op`, `__collection`, `__ts_ms`),
# kept for reference — presumably the Debezium/Kafka Connect variant of the topic; confirm.
# schema = '{"type":"record","name":"stocks","namespace":"stocks.StockData","fields":[{"name":"_id","type":["null","string"],"default":null},{"name":"ticker","type":["null","string"],"default":null},{"name":"description","type":["null","string"],"default":null},{"name":"timestamp","type":["null",{"type":"long","connect.version":1,"connect.name":"org.apache.kafka.connect.data.Timestamp","logicalType":"timestamp-millis"}],"default":null},{"name":"open","type":["null","double"],"default":null},{"name":"high","type":["null","double"],"default":null},{"name":"low","type":["null","double"],"default":null},{"name":"close","type":["null","double"],"default":null},{"name":"volume","type":["null","int"],"default":null},{"name":"__op","type":["null","string"],"default":null},{"name":"__collection","type":["null","string"],"default":null},{"name":"__ts_ms","type":["null","long"],"default":null}],"connect.name":"stocks.StockData.stocks"}'

In [15]:
# Batch-read the topic from the earliest offset, decode each Avro value with `schema`, and expose
# the flattened records as the temp view `stocks`.
kafka_reader = (
    spark.read.format("kafka")
    .option("kafka.bootstrap.servers", bootstrap)
    .option("subscribe", topico)
    .option("startingOffsets", "earliest")
    # .option("security.protocol", "SSL")
)
raw_messages = kafka_reader.load()

# Must be applied first in the Debezium case:
# raw_messages = raw_messages.selectExpr("substring(value, 6) as avro_value")

# Keep the Kafka source's own `timestamp` column next to the decoded payload as `_capture_time`.
decoded_messages = raw_messages.select(
    from_avro(col("value"), schema).alias("value"),
    col("timestamp").alias("_capture_time"),
)

df2 = decoded_messages.select(col("value.*"), col("_capture_time"))
df2.createOrReplaceTempView("stocks")
df2.printSchema()

root
 |-- ticker: string (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- open: double (nullable = true)
 |-- high: double (nullable = true)
 |-- low: double (nullable = true)
 |-- close: double (nullable = true)
 |-- volume: long (nullable = true)
 |-- _capture_time: timestamp (nullable = true)



In [17]:
# Show the decoded records through the pandas-on-Spark API (the cell's last expression is displayed).
df2.pandas_api()



Unnamed: 0,ticker,timestamp,open,high,low,close,volume,_capture_time
0,AAPL,2024-06-13T13:43:00Z,216.25,216.429993,216.229996,216.345001,4738,2024-06-13 18:03:04.275
1,AAPL,2024-06-13T13:33:00Z,215.289993,216.259995,215.145004,215.940002,12762,2024-06-13 18:03:04.271
2,AAPL,2024-06-13T13:31:00Z,215.220001,215.220001,214.524994,215.009995,16138,2024-06-13 18:03:04.270
3,AAPL,2024-06-13T13:44:00Z,216.410004,216.529999,215.860001,216.110001,14067,2024-06-13 18:03:04.275
4,AAPL,2024-06-13T13:38:00Z,215.809998,216.570007,215.630005,216.520004,5224,2024-06-13 18:03:04.276
5,AAPL,2024-06-13T13:39:00Z,216.589996,216.589996,216.029999,216.229996,7232,2024-06-13 18:03:04.274
6,AAPL,2024-06-13T13:42:00Z,216.259995,216.259995,215.820007,216.25,7166,2024-06-13 18:03:04.275
7,AAPL,2024-06-13T13:45:00Z,216.160004,216.255005,215.949997,216.210007,8553,2024-06-13 18:03:04.276
8,AAPL,2024-06-13T13:30:00Z,214.720001,215.949997,214.580002,215.229996,52886,2024-06-13 18:03:04.270
9,AAPL,2024-06-13T13:46:00Z,216.25,216.360001,216.199997,216.360001,3185,2024-06-13 18:03:04.276


In [12]:
# Disabled alternative: decode the message values as JSON text instead of Avro.
# NOTE(review): by the time this cell would run, `schema` holds an Avro schema string; `from_json`
# expects a Spark schema instead, so a separate schema is needed — confirm before re-enabling.
# df2 = (spark
#     .read
#     .format("kafka")
#     .option("kafka.bootstrap.servers", bootstrap)
#     .option("subscribe", topico)
#     .option("startingOffsets", "earliest")
#     # .option("security.protocol", "SSL")
#     .load()
#     .select(from_json(col("value").cast("string"), schema).alias("json_value"))
#     .select(col("json_value.*"))
# )
# df2.createOrReplaceTempView("stocks")
# df2.printSchema()

In [13]:
%%time
df2.toPandas()

ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer from Java side is empty")
py4j.protocol.Py4JNetworkError: Answer from Java side is empty

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 539, in send_command
    raise Py4JNetworkError(
py4j.protocol.Py4JNetworkError: Error while sending or receiving
ERROR:root:Exception while sending command.
Traceback (most recent call last):
  File "/opt/conda/lib/python3.11/site-packages/py4j/clientserver.py", line 516, in send_command
    raise Py4JNetworkError("Answer 

ConnectionRefusedError: [Errno 111] Connection refused

In [14]:
%%time

# Query the `stocks` temp view for AAPL bars from the last 8 hours, ordered by time, and keep the
# result in `df`.
# The time filter is relative to the wall clock at execution time, so the result is empty whenever
# the topic holds no records from the last 8 hours.
# NOTE(review): `timestamp` is decoded as a string column, so this comparison presumably relies on
# Spark's implicit string-to-timestamp cast — confirm.
df = %sql \
    select \
        ticker, \
        timestamp, \
        open, \
        high, \
        low, \
        close, \
        volume \
    from stocks \
    where ticker in ("AAPL") \
      and timestamp >= current_timestamp - interval 8 hours \
    order by ticker, timestamp

ConnectionRefusedError: [Errno 111] Connection refused

In [15]:
# Materialise the filtered query result as a pandas DataFrame for display.
# Requires `df` from the preceding `%sql` cell to have been created successfully.
df.toPandas()

NameError: name 'df' is not defined

In [None]:
# Plot the close price over time, one line per ticker, from the `%sql` query result `df`.
sns.set_context('talk')
# sns.set_palette('Pastel2')

# Set the style before the axes are created so the grid setting applies to them.
sns.set_style("ticks", {'axes.grid': True})

# Not used by the plot below; kept so any other cell that reads `colors` still finds it.
colors = sns.color_palette('pastel')[0:5]

# `timestamp` is decoded from Avro as a string. Cast it so the x axis is a real time axis instead
# of one evenly spaced categorical tick per distinct string.
prices = df.withColumn("timestamp", col("timestamp").cast("timestamp")).toPandas()

fig, ax = plt.subplots(figsize=(26, 6))
sns.lineplot(data=prices, x="timestamp", y="close", hue="ticker", ax=ax)
ax.set(title="Intraday close price by ticker", xlabel="timestamp", ylabel="close")
ax.tick_params(axis='x', rotation=90)