In [6]:
# Import necessary libraries
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when
from pyspark.ml.feature import VectorAssembler, StandardScaler, Imputer, StringIndexer
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from random import randint

# Initialize Spark session
spark = SparkSession.builder.config("spark.driver.memory", "16g").appName('clustering_anomalies').getOrCreate()

# Load the dataset
data_without_header = spark.read.option("inferSchema", True).option("header", False).csv("kddcup.data_10_percent_corrected")
column_names = ["duration", "protocol_type", "service", "flag", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", 
                "hot", "num_failed_logins", "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", 
                "num_file_creations", "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", 
                "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate", 
                "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", "dst_host_same_srv_rate", 
                "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate", "dst_host_serror_rate", 
                "dst_host_srv_serror_rate", "dst_host_rerror_rate", "dst_host_srv_rerror_rate", "label"]
data = data_without_header.toDF(*column_names)

# Step 1: Handle Missing Values and Scale Numerical Features

# Separate numerical and categorical columns
numerical_columns = ["duration", "src_bytes", "dst_bytes", "land", "wrong_fragment", "urgent", "hot", "num_failed_logins", 
                     "logged_in", "num_compromised", "root_shell", "su_attempted", "num_root", "num_file_creations", 
                     "num_shells", "num_access_files", "num_outbound_cmds", "is_host_login", "is_guest_login", 
                     "count", "srv_count", "serror_rate", "srv_serror_rate", "rerror_rate", "srv_rerror_rate", 
                     "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate", "dst_host_count", "dst_host_srv_count", 
                     "dst_host_same_srv_rate", "dst_host_diff_srv_rate", "dst_host_same_src_port_rate", 
                     "dst_host_srv_diff_host_rate", "dst_host_serror_rate", "dst_host_srv_serror_rate", 
                     "dst_host_rerror_rate", "dst_host_srv_rerror_rate"]

categorical_columns = ["protocol_type", "service", "flag"]

# Handle missing values for numerical columns
imputer = Imputer(strategy="mean").setInputCols(numerical_columns).setOutputCols(numerical_columns)
data_imputed = imputer.fit(data).transform(data)

# Index categorical columns
indexers = [StringIndexer(inputCol=col, outputCol=col + "_indexed") for col in categorical_columns]

# Scale the numerical features
assembler = VectorAssembler(inputCols=numerical_columns + [col + "_indexed" for col in categorical_columns], outputCol="featureVector")
scaler = StandardScaler(inputCol="featureVector", outputCol="scaledFeatureVector", withStd=True, withMean=False)

# Step 2: K-Means Clustering

# Perform K-means clustering
kmeans = KMeans().setK(5).setSeed(randint(100, 100000)).setFeaturesCol("scaledFeatureVector").setPredictionCol("cluster")
pipeline = Pipeline(stages=indexers + [assembler, scaler, kmeans])
pipeline_model = pipeline.fit(data_imputed)

# Get the clustering results
data_with_clusters = pipeline_model.transform(data_imputed)

# Step 3: Label Data Points as Anomalies Based on Cluster Assignments

# Mark the smallest cluster as anomalies (or you can use a threshold for cluster sizes)
cluster_sizes = data_with_clusters.groupBy("cluster").count().orderBy("count").collect()
smallest_cluster = cluster_sizes[0]['cluster']  # Get the smallest cluster
data_with_anomalies = data_with_clusters.withColumn("anomaly", when(col("cluster") == smallest_cluster, 1).otherwise(0))

# Step 4: Evaluate the K-Means Clustering Model in Detecting Anomalies

# Calculate entropy-based evaluation for anomaly detection
def entropy(counts):
    total = sum(counts)
    probs = [count / total for count in counts]
    return -sum([p * F.log2(p) for p in probs if p > 0])

cluster_label = data_with_anomalies.select("cluster", "label").groupBy("cluster", "label").count()
window_spec = Window.partitionBy("cluster")
cluster_label_with_probs = cluster_label.withColumn("cluster_total", F.sum("count").over(window_spec))
cluster_label_with_probs = cluster_label_with_probs.withColumn("p", F.col("count") / F.col("cluster_total"))

# Calculate entropy for each cluster
cluster_entropy = cluster_label_with_probs.groupBy("cluster").agg(
    F.sum(F.col("p") * F.log2(F.col("p"))).alias("entropy")
)
total_entropy = cluster_entropy.agg(F.sum("entropy")).collect()[0][0]

# Show the results
data_with_anomalies.select("cluster", "label", "anomaly").show()
print(f"Total entropy of the model: {total_entropy}")


                                                                                

+-------+-------+-------+
|cluster|  label|anomaly|
+-------+-------+-------+
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
|      1|normal.|      0|
+-------+-------+-------+
only showing top 20 rows

Total entropy of the model: -3.0408280571150432


                                                                                