In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, count, avg, stddev, min, max, desc
from pyspark.sql.types import *

# Create (or reuse) the Spark session for this notebook.
# The Kafka SQL connector coordinates are registered under spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("IoT Malware Detector - EDA")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0")
    .getOrCreate()
)

# One-off batch read (spark.read, not readStream) of the "network-traffic" topic,
# starting from the earliest offsets.
df_kafka = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "big-data-final-project-kafka-1:29092",
        "subscribe": "network-traffic",
        "startingOffsets": "earliest",
    })
    .load()
)

# Schema of the JSON payload carried in each Kafka message value.
# duration / orig_bytes / resp_bytes are read as strings here and cast to
# numeric types in the cleaning step further down.
schema = StructType([
    StructField(field_name, field_type())
    for field_name, field_type in [
        ("ts", DoubleType),
        ("id.orig_h", StringType),
        ("id.orig_p", DoubleType),
        ("id.resp_h", StringType),
        ("id.resp_p", DoubleType),
        ("proto", StringType),
        ("duration", StringType),
        ("orig_bytes", StringType),
        ("resp_bytes", StringType),
        ("conn_state", StringType),
        ("label", StringType),
        ("detailed-label", StringType),
    ]
])

# Decode the Kafka message value as a string, parse it as JSON against `schema`,
# and flatten the resulting struct into top-level columns.
df = (
    df_kafka
    .select(col("value").cast("string").alias("json"))
    .select(from_json(col("json"), schema).alias("data"))
    .select("data.*")
)

# Type casting for analysis.
# 1) Re-type the string-typed measurements in place.
df_cleaned = df
for column_name, target_type in (
    ("duration", "double"),
    ("orig_bytes", "long"),
    ("resp_bytes", "long"),
):
    df_cleaned = df_cleaned.withColumn(column_name, col(column_name).cast(target_type))

# 2) Add integer port columns; the dotted source names need backticks so that
#    they are not interpreted as struct field access.
for port_column, source_column in (
    ("orig_port", "`id.orig_p`"),
    ("resp_port", "`id.resp_p`"),
):
    df_cleaned = df_cleaned.withColumn(port_column, col(source_column).cast("int"))

# 3) Replace nulls in the three measurements with 0.
#    NOTE: downstream averages therefore count missing values as zeros.
df_cleaned = df_cleaned.fillna(0, subset=["duration", "orig_bytes", "resp_bytes"])

# Mark the cleaned frame for caching so the analysis cells below can reuse it,
# then report how many records are available.
df_cleaned.cache()
print("Total records for analysis: {}".format(df_cleaned.count()))

Total records for analysis: 156103


## 1. Label Distribution
Analyze the balance between Malicious and Benign traffic.

In [2]:
# Coarse class balance. Sorted by descending count (label as tie-breaker) so the
# row order is deterministic and consistent with the other distribution tables.
print("Label Distribution:")
df_cleaned.groupBy("label").count().orderBy(desc("count"), "label").show()

# Fine-grained labels, most frequent first; truncate=False keeps long label
# names fully visible.
print("Detailed Label Distribution:")
df_cleaned.groupBy("detailed-label").count().orderBy(desc("count")).show(truncate=False)

Label Distribution:
+---------+------+
|    label| count|
+---------+------+
|Malicious|151567|
|   Benign|  4536|
+---------+------+

Detailed Label Distribution:
+-------------------------+------+
|detailed-label           |count |
+-------------------------+------+
|PartOfAHorizontalPortScan|145597|
|Attack                   |5962  |
|-                        |4536  |
|C&C                      |8     |
+-------------------------+------+



## 2. Protocol Analysis
Which protocols are most used?

In [3]:
# Record count per transport protocol, most frequent first.
print("Protocol Distribution:")
df_cleaned.groupBy("proto").count().orderBy(desc("count")).show()

# Cross-tabulation of protocol and label. Sorting on both grouping keys makes
# the row order deterministic: sorting on "proto" alone leaves rows that share
# a protocol (e.g. tcp/Benign vs tcp/Malicious) in an unspecified order.
print("Protocol vs Label:")
df_cleaned.groupBy("proto", "label").count().orderBy("proto", "label").show()

Protocol Distribution:
+-----+------+
|proto| count|
+-----+------+
|  tcp|154158|
| icmp|  1207|
|  udp|   738|
+-----+------+

Protocol vs Label:
+-----+---------+------+
|proto|    label| count|
+-----+---------+------+
| icmp|   Benign|  1207|
|  tcp|   Benign|  2591|
|  tcp|Malicious|151567|
|  udp|   Benign|   738|
+-----+---------+------+



## 3. Connection State Analysis
Analyze connection states (e.g., S0, SF, REJ).

In [4]:
# Record count per connection state, most frequent first. conn_state is used as
# a tie-breaker so that states with equal counts appear in a stable order.
print("Connection State Distribution:")
df_cleaned.groupBy("conn_state").count().orderBy(desc("count"), "conn_state").show()

Connection State Distribution:
+----------+------+
|conn_state| count|
+----------+------+
|        S0|145610|
|        SF|  6688|
|       REJ|  2323|
|       OTH|  1216|
|      RSTO|   121|
|      RSTR|    61|
|        S1|    42|
|        S2|    32|
|        SH|     8|
|     RSTRH|     1|
|       SHR|     1|
+----------+------+



## 4. Numerical Feature Statistics
Analyze duration and bytes transferred.

In [5]:
# Per-label summary: mean and standard deviation of duration, and mean
# originator / responder byte counts. Computed on df_cleaned, where nulls in
# these columns were already replaced with 0.
print("Statistics for Duration and Bytes by Label:")
(
    df_cleaned
    .groupBy("label")
    .agg(
        avg(col("duration")).alias("avg_duration"),
        stddev(col("duration")).alias("std_duration"),
        avg(col("orig_bytes")).alias("avg_orig_bytes"),
        avg(col("resp_bytes")).alias("avg_resp_bytes"),
    )
    .show(truncate=False)
)

Statistics for Duration and Bytes by Label:
+---------+------------------+------------------+------------------+----------------+
|label    |avg_duration      |std_duration      |avg_orig_bytes    |avg_resp_bytes  |
+---------+------------------+------------------+------------------+----------------+
|Malicious|2.4333049935738016|247.1103917392797 |22.744885100318672|72.3492712793682|
|Benign   |4.463892593474415 |24.455278871211867|39.607804232804234|97.831569664903 |
+---------+------------------+------------------+------------------+----------------+

