In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, when, hour, dayofweek
from pyspark.sql.types import *
from pyspark.ml.feature import StringIndexer, OneHotEncoder, VectorAssembler, StandardScaler
from pyspark.ml import Pipeline

# Create (or reuse) the Spark session for this notebook. The Kafka SQL
# connector is requested through spark.jars.packages.
# NOTE(review): the connector coordinates (Scala 2.12 / 3.3.0) presumably have
# to match the cluster's Spark build -- confirm.
spark = (
    SparkSession.builder
    .appName("IoT Malware Detector - Feature Engineering")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.12:3.3.0",
    )
    .getOrCreate()
)

# Batch-read the "network-traffic" topic, starting from the earliest
# available offsets.
df_kafka = (
    spark.read
    .format("kafka")
    .options(**{
        "kafka.bootstrap.servers": "big-data-final-project-kafka-1:29092",
        "subscribe": "network-traffic",
        "startingOffsets": "earliest",
    })
    .load()
)

# Schema of the JSON payload carried in each Kafka message value.
# duration / orig_bytes / resp_bytes are declared as strings here and are
# cast to numeric types in the cleaning step.
schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("ts", DoubleType()),
        ("id.orig_h", StringType()),
        ("id.orig_p", DoubleType()),
        ("id.resp_h", StringType()),
        ("id.resp_p", DoubleType()),
        ("proto", StringType()),
        ("duration", StringType()),
        ("orig_bytes", StringType()),
        ("resp_bytes", StringType()),
        ("conn_state", StringType()),
        ("label", StringType()),
        ("detailed-label", StringType()),
    )
])

# Decode the Kafka message value as a string, parse it as JSON against
# `schema`, and flatten the resulting struct into top-level columns.
df = (
    df_kafka
    .select(from_json(col("value").cast("string"), schema).alias("data"))
    .select("data.*")
)

## 1. Data Cleaning & Handling Missing Values
- Convert types.
- Handle missing values in `duration`, `orig_bytes`, `resp_bytes` (replace '-' with 0).

In [None]:
# Cast the string-typed measurements to numeric types and expose the ports
# under dot-free names (the dotted source columns need backtick quoting).
# Nulls left in duration / orig_bytes / resp_bytes after the cast are
# replaced with 0.
df_cleaned = (
    df
    .withColumn("duration", col("duration").cast("double"))
    .withColumn("orig_bytes", col("orig_bytes").cast("long"))
    .withColumn("resp_bytes", col("resp_bytes").cast("long"))
    .withColumn("orig_port", col("`id.orig_p`").cast("int"))
    .withColumn("resp_port", col("`id.resp_p`").cast("int"))
    .na.fill(0, subset=["duration", "orig_bytes", "resp_bytes"])
)

# Rows without a label cannot be used downstream, so drop them.
df_cleaned = df_cleaned.where(col("label").isNotNull())

print("Data cleaned and types casted.")
df_cleaned.printSchema()

## 2. Feature Extraction
- Create time-based features from timestamp.
- Create derived features like `total_bytes`, `bytes_per_sec`.

In [None]:
# 'ts' is a double holding epoch seconds; turn it into a timestamp column so
# the calendar features can be derived from it.
from pyspark.sql.functions import from_unixtime

df_features = (
    df_cleaned
    .withColumn("timestamp", from_unixtime("ts").cast("timestamp"))
    .withColumn("hour_of_day", hour("timestamp"))
    .withColumn("day_of_week", dayofweek("timestamp"))
    .withColumn("total_bytes", col("orig_bytes") + col("resp_bytes"))
    # The 0.001 added to the duration keeps the denominator non-zero for
    # zero-length connections.
    .withColumn("bytes_per_sec", col("total_bytes") / (col("duration") + 0.001))
)

print("New features created: hour_of_day, day_of_week, total_bytes, bytes_per_sec")
df_features.select(
    "ts", "hour_of_day", "day_of_week", "total_bytes", "bytes_per_sec"
).show(5)

## 3. Categorical Encoding
- Convert `proto`, `conn_state` to numerical format using StringIndexer and OneHotEncoder.
- Convert `label` to binary (Malicious=1, Benign=0).

In [None]:
# String indexing of the categorical feature columns. handleInvalid="keep"
# puts unseen/invalid values into an extra index instead of raising.
indexer_proto = StringIndexer(inputCol="proto", outputCol="proto_index", handleInvalid="keep")
indexer_state = StringIndexer(inputCol="conn_state", outputCol="conn_state_index", handleInvalid="keep")

# Label indexing. StringIndexer's default ordering ("frequencyDesc") assigns
# index 0 to the MOST FREQUENT label, so "Malicious" would be encoded as 0
# whenever malicious traffic is the majority class. Ordering alphabetically
# pins the mapping to Benign=0, Malicious=1 regardless of class balance.
# NOTE(review): assumes `label` only takes the values "Benign" / "Malicious".
indexer_label = StringIndexer(
    inputCol="label",
    outputCol="label_index",
    stringOrderType="alphabetAsc",
)

# One-hot encode the indexed feature columns (the label stays a plain index).
encoder = OneHotEncoder(inputCols=["proto_index", "conn_state_index"],
                        outputCols=["proto_vec", "conn_state_vec"])

# Fit all encoding stages on the engineered features and apply them.
pipeline_encoding = Pipeline(stages=[indexer_proto, indexer_state, indexer_label, encoder])
model_encoding = pipeline_encoding.fit(df_features)
df_encoded = model_encoding.transform(df_features)

print("Categorical features encoded.")
df_encoded.select("proto", "proto_vec", "conn_state", "conn_state_vec", "label", "label_index").show(5)

## 4. Normalization & Vector Assembly
- Assemble all features into a single vector.
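- Standardize the assembled vector with `StandardScaler` (zero mean, unit variance). Note that this is applied to the whole vector, i.e. to the one-hot encoded columns as well as the numerical ones.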

In [None]:
# Columns fed to the model: the numeric columns first, then the one-hot
# vectors produced by the encoding step.
categorical_vecs = ["proto_vec", "conn_state_vec"]
numerical_cols = [
    "duration", "orig_bytes", "resp_bytes", "orig_port", "resp_port",
    "total_bytes", "bytes_per_sec", "hour_of_day", "day_of_week",
]
assembler_inputs = [*numerical_cols, *categorical_vecs]

# Stage 1: concatenate every input column into one vector column.
assembler = VectorAssembler(
    inputCols=assembler_inputs,
    outputCol="features_raw",
)

# Stage 2: standardize the assembled vector (centered, unit variance).
# This is applied to the whole vector, one-hot columns included.
scaler = StandardScaler(
    inputCol="features_raw",
    outputCol="features",
    withMean=True,
    withStd=True,
)

# Fit both stages on the encoded data, then apply them.
pipeline_final = Pipeline(stages=[assembler, scaler])
model_final = pipeline_final.fit(df_encoded)
df_final = model_final.transform(df_encoded)

print("Features assembled and normalized.")
df_final.select("features", "label_index").show(5, truncate=False)

## 5. Feature Selection (Optional/Preview)
Feature importance could be checked here (e.g., using correlation or a simple tree model); this notebook does not do so yet.
For now, the `features` column is prepared and ready for Phase 5 (Model Training).

In [None]:
# Nothing is persisted here: saving the processed data is optional, and for
# streaming this pipeline would normally be built into the streaming job.
# For now, just report how many rows made it through.
row_count = df_final.count()
print(f"Final dataset count: {row_count}")