In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import *

# Stop any session already bound to `spark` so the builder below starts a
# fresh one; getOrCreate() hands back a running session if one exists.
try:
    spark.stop()
except NameError:
    # Fresh kernel: `spark` has not been defined yet, so there is nothing to stop.
    pass
except Exception as exc:
    # Still best-effort (setup continues), but report the failure instead of
    # hiding it. Unlike a bare `except:`, this does not swallow
    # KeyboardInterrupt or SystemExit.
    print(f"⚠️ Could not stop existing Spark session: {exc}")

# Create NEW session with Kafka package
spark = (
    SparkSession.builder
    .appName("IoT Malware Detector - Setup")
    # Kafka source connector (Scala 2.12 build, version 3.3.0).
    # NOTE(review): presumably must match the installed Spark version — confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .config("spark.sql.streaming.kafka.useDeprecatedOffsetFetching", "false")
    .getOrCreate()
)

print("✅ Fresh Spark session with Kafka!")

✅ Fresh Spark session with Kafka!


In [2]:
# Batch-read the `network-traffic` topic from Kafka, starting at the earliest
# available offsets.
df_kafka = (
    spark.read
    .format("kafka")
    .options(
        **{
            "kafka.bootstrap.servers": "big-data-final-project-kafka-1:29092",
            "subscribe": "network-traffic",
            "startingOffsets": "earliest",
        }
    )
    .load()
)

print("✅ Connected to Kafka!")
# count() is an action: it reads the topic to report how many records it holds.
print(f"Total messages available: {df_kafka.count()}")

✅ Connected to Kafka!
Total messages available: 156103


In [3]:
# Schema for the JSON payload emitted by the producer.
# Fields sent by the producer:
#   ts, uid, id.orig_h, id.orig_p, id.resp_h, id.resp_p, proto, service,
#   duration, orig_bytes, resp_bytes, conn_state, local_orig, local_resp,
#   missed_bytes, history, orig_pkts, orig_ip_bytes, resp_pkts, resp_ip_bytes,
#   tunnel_parents, label, detailed-label
# Only the most relevant subset is declared here (same as the previous
# notebook's schema); add more fields if they become necessary.
schema = (
    StructType()
    .add("ts", DoubleType())
    .add("id.orig_h", StringType())
    .add("id.orig_p", DoubleType())
    .add("id.resp_h", StringType())
    .add("id.resp_p", DoubleType())
    .add("proto", StringType())
    # duration / byte counters are declared as strings; the sample output
    # shows "-" in these columns, so they are not always numeric.
    .add("duration", StringType())
    .add("orig_bytes", StringType())
    .add("resp_bytes", StringType())
    .add("conn_state", StringType())
    .add("label", StringType())
    .add("detailed-label", StringType())
)

# Decode the Kafka `value` bytes to text, parse the JSON against the schema,
# then flatten the resulting struct into top-level columns.
df_parsed = (
    df_kafka
    .select(col("value").cast("string").alias("json"))
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

# Preview the first few parsed rows without truncating cell contents
print("Sample parsed data:")
df_parsed.show(n=5, truncate=False)

Sample parsed data:
+-------------------+-----------+---------+--------------+---------+-----+--------+----------+----------+----------+---------+-------------------------+
|ts                 |id.orig_h  |id.orig_p|id.resp_h     |id.resp_p|proto|duration|orig_bytes|resp_bytes|conn_state|label    |detailed-label           |
+-------------------+-----------+---------+--------------+---------+-----+--------+----------+----------+----------+---------+-------------------------+
|1.5267562618665E9  |192.168.2.5|38792.0  |200.168.87.203|59353.0  |tcp  |2.998333|0         |0         |S0        |Malicious|PartOfAHorizontalPortScan|
|1.526756268874876E9|192.168.2.5|38792.0  |200.168.87.203|59353.0  |tcp  |-       |-         |-         |S0        |Malicious|PartOfAHorizontalPortScan|
|1.526756272877722E9|192.168.2.5|38793.0  |200.168.87.203|59353.0  |tcp  |2.997182|0         |0         |S0        |Malicious|PartOfAHorizontalPortScan|
|1.526756279884959E9|192.168.2.5|38793.0  |200.168.87.203|5935