In [1]:
from kafka import KafkaProducer, KafkaAdminClient
from kafka.admin import NewTopic
from kafka.errors import TopicAlreadyExistsError
import json
import time
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_csv
import os

In [2]:
# Make sure the demo topic exists on the local broker before anything is
# published to it.
print("Creating Kafka topic...")

# NOTE(review): presumably a short grace period for the broker — confirm.
time.sleep(2)

admin_client = KafkaAdminClient(
    client_id='notebook_admin',
    bootstrap_servers=['localhost:9092'],
)

# Single-partition, unreplicated topic.
topic_name = 'test'
topic = NewTopic(name=topic_name, num_partitions=1, replication_factor=1)

try:
    admin_client.create_topics(new_topics=[topic], validate_only=False)
except TopicAlreadyExistsError:
    # Re-running the cell is expected; an existing topic is reported, not fatal.
    print(f"Topic '{topic_name}' already exists")
except Exception as exc:
    # Any other failure is reported and the cell carries on.
    print(f"Error: {exc}")
else:
    print(f"Topic '{topic_name}' created")
finally:
    # The admin connection is closed whatever the outcome.
    admin_client.close()

Creating Kafka topic...
Topic 'test' already exists


In [3]:
# Publish a few sample rating records to the 'test' topic as UTF-8 text.
producer = KafkaProducer(
    bootstrap_servers=['localhost:9092'],
    value_serializer=lambda v: v.encode('utf-8')
)

# One CSV record per message: UserId,MovieId,Rating,Timestamp
# (the same field order as the schema used when the stream is parsed).
rows = [
    "196,242,3.0,881250949",
    "186,302,3.0,891717742",
    "22,377,1.0,878887116",
    "244,51,2.0,880606923",
    "166,346,1.0,886397596",
]

try:
    # send() is asynchronous and returns a future per record; keep them so
    # delivery failures are not silently dropped.
    pending = [producer.send('test', value=line) for line in rows]
    producer.flush()
    # After flush() every future is resolved; get() re-raises any delivery
    # error instead of letting the success message below print regardless.
    for future in pending:
        future.get(timeout=10)
finally:
    # Release the producer's connections even if sending failed.
    producer.close()

print(f"Sent {len(rows)} messages")

Sent 5 messages


In [4]:
from pyspark.sql import SparkSession

# Kafka connector artifacts, passed to Spark as one comma-separated list
# via spark.jars.packages.
kafka_packages = (
    "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0,"
    "org.apache.spark:spark-token-provider-kafka-0-10_2.13:4.0.0"
)

# Assemble the session configuration step by step.
session_builder = SparkSession.builder.appName("kafka-structured-streaming-local")
session_builder = session_builder.config("spark.jars.packages", kafka_packages)
session_builder = session_builder.config(
    "spark.sql.streaming.forceDeleteTempCheckpointLocation", "true"
)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = session_builder.getOrCreate()

# Last expression: show the session summary as the cell output.
spark

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
25/09/29 09:30:41 WARN Utils: Your hostname, Pranavs-MacBook-Pro.local, resolves to a loopback address: 127.0.0.1; using 192.168.0.38 instead (on interface en0)
25/09/29 09:30:41 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
:: loading settings :: url = jar:file:/Users/pranavrajan/Desktop/id2221-data-intensive-traffic/.venv/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /Users/pranavrajan/.ivy2.5.2/cache
The jars for the packages stored in: /Users/pranavrajan/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
org.apache.spark#spark-token-provider-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b0b8aaca-f32e-4953-a26e-c03eff7fd7f3;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.0 in central
	f

In [5]:
from pyspark.sql.types import StructType, StructField, IntegerType, DoubleType, LongType
from pyspark.sql.functions import col, from_csv

# DDL schema describing the CSV payload carried in each Kafka record's 'value'.
schema_ddl = "UserId INT, MovieId INT, Rating DOUBLE, Timestamp LONG"

# Streaming source: subscribe to the 'test' topic on the local broker,
# starting from the earliest available offsets.
raw = (
    spark.readStream
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "localhost:9092",
        "subscribe": "test",
        "startingOffsets": "earliest",
    })
    .load()
)

In [6]:
# Turn each Kafka record into typed columns:
#   1. decode the payload to a string and keep the record timestamp,
#   2. parse the CSV string using schema_ddl into a struct aliased 'r',
#   3. flatten the struct next to KafkaTimestamp,
#   4. drop rows whose UserId is null (presumably rows that did not parse —
#      TODO confirm).
parsed = (
    raw
    .select(
        col("value").cast("string").alias("value_str"),
        col("timestamp").alias("KafkaTimestamp"),
    )
    .select(
        from_csv(col("value_str"), schema_ddl).alias("r"),
        col("KafkaTimestamp"),
    )
    .select("r.*", "KafkaTimestamp")
    .filter(col("UserId").isNotNull())
)

In [7]:
from IPython.display import clear_output, display

def show_batch(batch_df, epoch_id):
    """Show a preview of one micro-batch in the cell output.

    At most 50 rows of ``batch_df`` are converted to pandas. The previous
    cell output is cleared, then the preview is displayed. If the batch
    has no rows, a one-line notice with ``epoch_id`` is printed instead.
    """
    preview = batch_df.limit(50).toPandas()

    # wait=True defers the clear until new output arrives.
    clear_output(wait=True)

    if not preview.empty:
        display(preview)
        return

    print(f"(epoch {epoch_id}) no rows")

# Start the streaming query; every micro-batch of `parsed` is handed to
# show_batch for display.
q = (parsed.writeStream
     .foreachBatch(show_batch)
     .outputMode("append")
     .start())

# Keep it alive ~15s so you see output.
# The stop is in `finally` so that interrupting the cell, or an error raised
# while waiting, cannot leave the query running in the background.
try:
    q.awaitTermination(15)
finally:
    q.stop()

Unnamed: 0,UserId,MovieId,Rating,Timestamp,KafkaTimestamp
0,196,242,3.0,881250949,2025-09-29 09:16:37.398
1,186,302,3.0,891717742,2025-09-29 09:16:37.398
2,22,377,1.0,878887116,2025-09-29 09:16:37.398
3,244,51,2.0,880606923,2025-09-29 09:16:37.398
4,166,346,1.0,886397596,2025-09-29 09:16:37.398
5,196,242,3.0,881250949,2025-09-29 09:22:18.646
6,186,302,3.0,891717742,2025-09-29 09:22:18.647
7,22,377,1.0,878887116,2025-09-29 09:22:18.647
8,244,51,2.0,880606923,2025-09-29 09:22:18.647
9,166,346,1.0,886397596,2025-09-29 09:22:18.647


25/09/29 09:30:59 WARN DAGScheduler: Failed to cancel job group eface530-af6c-49eb-94c1-822bb7c93ce7. Cannot find active jobs for it.
25/09/29 09:30:59 WARN DAGScheduler: Failed to cancel job group eface530-af6c-49eb-94c1-822bb7c93ce7. Cannot find active jobs for it.
