In [None]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

# Maven coordinates of the Kafka integration packages requested for the session.
# NOTE(review): the Scala/Spark versions are hard-coded here — confirm they
# correspond to the Spark installation that findspark locates.
scala_version = '2.12'
spark_version = '3.5.0'
packages = [
    f'org.apache.spark:spark-sql-kafka-0-10_{scala_version}:{spark_version}',
    'org.apache.kafka:kafka-clients:3.6.0',
]

# Create (or reuse) a local-mode session, passing the package list to Spark
# as a single comma-separated 'spark.jars.packages' value.
spark = (
    SparkSession.builder
    .master('local')
    .appName('Bigmart Sales Prediction')
    .config('spark.jars.packages', ','.join(packages))
    .getOrCreate()
)

# Last expression of the cell: renders the session summary as the cell output.
spark

In [10]:
from kafka import KafkaConsumer

# Topic to read from and the broker address used to reach the cluster.
kafka_topic_name = 'turtorial15'
kafka_bootstrap_servers = 'localhost:9092'

# Subscribe to the topic; every other consumer option keeps its default.
consumer = KafkaConsumer(
    kafka_topic_name,
    bootstrap_servers=kafka_bootstrap_servers,
)

In [None]:
from datetime import datetime
from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, IntegerType, TimestampType
from pyspark.sql.functions import col, from_utc_timestamp
import json

# Schema of one sales record as decoded from a Kafka message.
# Every field is nullable, so an attribute absent from a message loads as null.
schema = StructType([
    StructField("Item_Identifier", StringType(), True),
    StructField("Item_Weight", DoubleType(), True),
    StructField("Item_Fat_Content", StringType(), True),
    StructField("Item_Visibility", DoubleType(), True),
    StructField("Item_Type", StringType(), True),
    StructField("Item_MRP", DoubleType(), True),
    StructField("Outlet_Identifier", StringType(), True),
    StructField("Outlet_Establishment_Year", IntegerType(), True),
    StructField("Outlet_Size", StringType(), True),
    StructField("Outlet_Location_Type", StringType(), True),
    StructField("Outlet_Type", StringType(), True),
    # Declared as a timestamp because the loop below parses this field into a
    # datetime before building the DataFrame; a StringType column would not
    # hold a real timestamp value.
    StructField("time_interval", TimestampType(), True)
])

# Column names in schema order; computed once instead of on every message.
field_names = schema.fieldNames()

# Turn each consumed Kafka message into a one-row Spark DataFrame and show it.
for message in consumer:
    # The payload arrives as bytes: decode it, then parse the JSON object.
    message_value = message.value.decode('utf-8')
    data_dict = json.loads(message_value)

    # Parse the textual timestamp so the value matches the TimestampType column.
    # A missing or null value stays None rather than raising.
    if data_dict.get("time_interval") is not None:
        data_dict["time_interval"] = datetime.strptime(data_dict["time_interval"], "%Y-%m-%d %H:%M:%S")

    # Build the Row by looking up each schema field by name, so the values line
    # up with the schema regardless of the key order in the JSON payload;
    # keys not in the schema are ignored and absent ones become None.
    row = Row(**{name: data_dict.get(name) for name in field_names})

    # Create a single-row DataFrame with the explicit schema.
    df = spark.createDataFrame([row], schema=schema)

    # Print the resulting DataFrame's type and contents.
    print(type(df))
    df.show()
