# In [None]:
import pyspark
import os
import sys
from pyspark import SparkContext

# Make PySpark use the interpreter that is running this script, both for the
# worker processes and for the driver.
os.environ.update({
    'PYSPARK_PYTHON': sys.executable,
    'PYSPARK_DRIVER_PYTHON': sys.executable,
})
from pyspark.sql import SparkSession

# Get or create the session for this chapter, with spark.driver.memory
# set to 16g.
spark = (
    SparkSession.builder
    .config("spark.driver.memory", "16g")
    .appName('chapter_5')
    .getOrCreate()
)

# Read the CSV with the header option disabled and let Spark infer the
# column types from the values.
data_without_header = (
    spark.read
    .option("inferSchema", True)
    .option("header", False)
    .csv("data/kddcup.data_10_percent_corrected")
)

# Names to attach to the columns, in positional order; "label" comes last.
column_names = [
    "duration",
    "protocol_type", "service", "flag",
    "src_bytes", "dst_bytes",
    "land", "wrong_fragment", "urgent", "hot",
    "num_failed_logins", "logged_in", "num_compromised",
    "root_shell", "su_attempted", "num_root",
    "num_file_creations", "num_shells", "num_access_files",
    "num_outbound_cmds",
    "is_host_login", "is_guest_login",
    "count", "srv_count",
    "serror_rate", "srv_serror_rate",
    "rerror_rate", "srv_rerror_rate",
    "same_srv_rate", "diff_srv_rate", "srv_diff_host_rate",
    "dst_host_count", "dst_host_srv_count",
    "dst_host_same_srv_rate", "dst_host_diff_srv_rate",
    "dst_host_same_src_port_rate", "dst_host_srv_diff_host_rate",
    "dst_host_serror_rate", "dst_host_srv_serror_rate",
    "dst_host_rerror_rate", "dst_host_srv_rerror_rate",
    "label",
]

# Same rows as the raw frame, now with the names above.
data = data_without_header.toDF(*column_names)

# Show how many rows carry each label, most frequent label first.
from pyspark.sql.functions import col
(
    data.select("label")
    .groupBy("label")
    .count()
    .orderBy(col("count").desc())
    .show(25)
)

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans, KMeansModel
from pyspark.ml import Pipeline

# Remove the three string-valued columns and cache the result, since it is
# reused by the fits below.
numeric_only = data.drop("protocol_type", "service", "flag").cache()

# Two-stage pipeline: pack every column except the last ("label") into a
# single vector column, then cluster on that vector with KMeans defaults.
assembler = VectorAssembler(
    inputCols=numeric_only.columns[:-1],
    outputCol="featureVector",
)
kmeans = KMeans(predictionCol="cluster", featuresCol="featureVector")
pipeline = Pipeline(stages=[assembler, kmeans])

pipeline_model = pipeline.fit(numeric_only)
# The fitted KMeans model is the second (and last) pipeline stage.
kmeans_model = pipeline_model.stages[-1]

# Dump the fitted cluster centres.
from pprint import pprint
pprint(kmeans_model.clusterCenters())

# Assign each row to a cluster and show how labels are distributed across
# clusters (largest label group first within each cluster).
with_cluster = pipeline_model.transform(numeric_only)
(
    with_cluster.select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy(col("cluster"), col("count").desc())
    .show(25)
)

# Define a function to calculate the clustering score (training cost) for given k
from pyspark.sql import DataFrame
from random import randint
def clustering_score(input_data, k):
    """Fit k-means with ``k`` clusters and return the model's training cost.

    The "protocol_type", "service" and "flag" columns are dropped if present.
    Every remaining column except the last one is used as a feature (the
    last column is assumed to be the label). A fresh random seed is drawn
    on every call, so repeated calls with the same ``k`` can give different
    costs.

    Args:
        input_data: Spark DataFrame whose last column is not a feature.
        k: Number of clusters.

    Returns:
        ``summary.trainingCost`` of the fitted KMeans model.
    """
    numeric_df = input_data.drop("protocol_type", "service", "flag")
    feature_cols = numeric_df.columns[:-1]

    to_vector = VectorAssembler(inputCols=feature_cols,
                                outputCol="featureVector")
    clusterer = KMeans(seed=randint(100, 100000), k=k,
                       predictionCol="cluster", featuresCol="featureVector")

    fitted = Pipeline(stages=[to_vector, clusterer]).fit(numeric_df)
    # The KMeans model is the final stage of the fitted pipeline.
    return fitted.stages[-1].summary.trainingCost

# Calculate and print training cost for different values of k
# range() is iterated directly; k takes the values 20, 40, 60 and 80.
for k in range(20, 100, 20):
    print(clustering_score(numeric_only, k))

# Define a function to calculate clustering score with more iterations and a tighter convergence tolerance
def clustering_score_1(input_data, k):
    """Fit k-means with ``k`` clusters, up to 40 iterations, tolerance 1e-5.

    The "protocol_type", "service" and "flag" columns are dropped if present.
    Every remaining column except the last one is used as a feature (the
    last column is assumed to be the label). The seed is drawn at random on
    each call.

    Args:
        input_data: Spark DataFrame whose last column is not a feature.
        k: Number of clusters.

    Returns:
        ``summary.trainingCost`` of the fitted KMeans model.
    """
    numeric_df = input_data.drop("protocol_type", "service", "flag")
    feature_cols = numeric_df.columns[:-1]

    to_vector = VectorAssembler(inputCols=feature_cols,
                                outputCol="featureVector")
    clusterer = KMeans(seed=randint(100, 100000), k=k,
                       maxIter=40, tol=1.0e-5,
                       predictionCol="cluster", featuresCol="featureVector")

    fitted = Pipeline(stages=[to_vector, clusterer]).fit(numeric_df)
    # The KMeans model is the final stage of the fitted pipeline.
    return fitted.stages[-1].summary.trainingCost

# range() is iterated directly; k takes the values 20, 40, 60, 80 and 100.
for k in range(20, 101, 20):
    print(k, clustering_score_1(numeric_only, k))

# Define a function to perform clustering with feature scaling
from pyspark.ml.feature import StandardScaler
def clustering_score_2(input_data, k):
    """Fit k-means on standard-deviation-scaled features; return training cost.

    The "protocol_type", "service" and "flag" columns are dropped if present.
    Every remaining column except the last one is assembled into a feature
    vector (the last column is assumed to be the label). The vector is
    scaled with ``withStd=True`` and ``withMean=False`` before clustering.
    KMeans runs with up to 40 iterations, tolerance 1e-5 and a random seed.

    Args:
        input_data: Spark DataFrame whose last column is not a feature.
        k: Number of clusters.

    Returns:
        ``summary.trainingCost`` of the fitted KMeans model.
    """
    numeric_df = input_data.drop("protocol_type", "service", "flag")
    feature_cols = numeric_df.columns[:-1]

    to_vector = VectorAssembler(inputCols=feature_cols,
                                outputCol="featureVector")
    rescale = StandardScaler(inputCol="featureVector",
                             outputCol="scaledFeatureVector",
                             withStd=True, withMean=False)
    clusterer = KMeans(seed=randint(100, 100000), k=k,
                       maxIter=40, tol=1.0e-5,
                       predictionCol="cluster",
                       featuresCol="scaledFeatureVector")

    fitted = Pipeline(stages=[to_vector, rescale, clusterer]).fit(numeric_df)
    # The KMeans model is the final stage of the fitted pipeline.
    return fitted.stages[-1].summary.trainingCost

# range() is iterated directly; k runs from 60 to 270 in steps of 30.
for k in range(60, 271, 30):
    print(k, clustering_score_2(numeric_only, k))

# Function to create one-hot encoding pipeline for categorical columns
from pyspark.ml.feature import OneHotEncoder, StringIndexer
def one_hot_pipeline(input_col):
    """Build a StringIndexer -> OneHotEncoder pipeline for one column.

    Args:
        input_col: Name of the column to encode.

    Returns:
        A tuple ``(pipeline, output_col)``: ``pipeline`` is the unfitted
        two-stage Pipeline and ``output_col`` is the name of the vector
        column it produces (``input_col + "_vec"``).
    """
    indexed_col = input_col + "_indexed"
    vector_col = input_col + "_vec"
    stages = [
        StringIndexer(inputCol=input_col, outputCol=indexed_col),
        OneHotEncoder(inputCol=indexed_col, outputCol=vector_col),
    ]
    return Pipeline(stages=stages), vector_col

def clustering_score_3(input_data, k):
    """Fit k-means on scaled numeric + one-hot features; return training cost.

    "protocol_type", "service" and "flag" are each indexed and one-hot
    encoded. All other columns except "label" are used as they are. The
    assembled vector is scaled with ``withStd=True`` and ``withMean=False``.
    KMeans runs with up to 40 iterations, tolerance 1e-5 and a random seed.

    Args:
        input_data: Spark DataFrame containing the three categorical
            columns; a "label" column, if present, is not used as a feature.
        k: Number of clusters.

    Returns:
        ``summary.trainingCost`` of the fitted KMeans model.
    """
    # Create one-hot encoding pipelines for categorical columns
    proto_type_pipeline, proto_type_vec_col = one_hot_pipeline("protocol_type")
    service_pipeline, service_vec_col = one_hot_pipeline("service")
    flag_pipeline, flag_vec_col = one_hot_pipeline("flag")

    # Build the feature list in DataFrame column order, followed by the
    # encoded vectors. Going through a set would make the order depend on
    # string hashing, which differs from one interpreter run to the next.
    excluded = {"label", "protocol_type", "service", "flag"}
    encoded_cols = [proto_type_vec_col, service_vec_col, flag_vec_col]
    assemble_cols = [
        c for c in input_data.columns
        if c not in excluded and c not in encoded_cols
    ] + encoded_cols

    assembler = VectorAssembler().setInputCols(assemble_cols)\
        .setOutputCol("featureVector")
    scaler = StandardScaler().setInputCol("featureVector")\
        .setOutputCol("scaledFeatureVector").setWithStd(True).setWithMean(False)
    kmeans = KMeans().setSeed(randint(100,100000)).setK(k).setMaxIter(40)\
        .setTol(1.0e-5).setPredictionCol("cluster")\
        .setFeaturesCol("scaledFeatureVector")
    pipeline = Pipeline().setStages([proto_type_pipeline, service_pipeline,
                                     flag_pipeline, assembler, scaler, kmeans])
    pipeline_model = pipeline.fit(input_data)
    # The KMeans model is the final stage of the fitted pipeline.
    kmeans_model = pipeline_model.stages[-1]
    return kmeans_model.summary.trainingCost

# range() is iterated directly; k runs from 60 to 270 in steps of 30.
for k in range(60, 271, 30):
    print(k, clustering_score_3(data, k))

# Define a function to calculate entropy
from math import log
def entropy(counts):
    """Return the entropy (natural log) of a collection of counts.

    Counts that are not strictly positive are ignored. Each remaining count
    is divided by their total to get a proportion ``p``, and the result is
    the sum of ``-p * log(p)``.

    Args:
        counts: Iterable of numeric counts.

    Returns:
        The entropy in nats; ``0`` when no count is positive.
    """
    positive = [count for count in counts if count > 0]
    total = sum(positive)
    shares = (count / total for count in positive)
    return sum(-1 * share * log(share) for share in shares)

# Score the current `pipeline_model` by the label entropy of its clusters:
# compute the base-2 entropy of the label distribution inside each cluster,
# weight it by the cluster's size, and average over all rows.
from pyspark.sql import functions as fun
from pyspark.sql import Window
cluster_label = pipeline_model.transform(data).select("cluster", "label")
# One row per (cluster, label) pair with its row count.
df = cluster_label.groupBy("cluster", "label").count().orderBy("cluster")
# p_col is the share of a cluster's rows that carry a given label.
w = Window.partitionBy("cluster")
p_col = df['count'] / fun.sum(df['count']).over(w)
with_p_col = df.withColumn("p_col", p_col)
result = with_p_col.groupBy("cluster").agg(
    (-fun.sum(col("p_col") * fun.log2(col("p_col")))).alias("entropy"),
    fun.sum(col("count")).alias("cluster_size"))
result = result.withColumn('weightedClusterEntropy', fun.col('entropy') * fun.col('cluster_size'))
weighted_cluster_entropy_avg = result.agg(fun.sum(col('weightedClusterEntropy'))).collect()
# Print the score: as a bare expression it is silently discarded when this
# file runs as a script rather than as a notebook cell.
print(weighted_cluster_entropy_avg[0][0] / data.count())

# Define a function to fit the full pipeline and calculate clustering score with entropy
def fit_pipeline_4(data, k):
    """Fit the one-hot + scaling + k-means pipeline and return the model.

    "protocol_type", "service" and "flag" are each indexed and one-hot
    encoded. All other columns except "label" are used as they are. The
    assembled vector is scaled with ``withStd=True`` and ``withMean=False``.
    KMeans runs with up to 40 iterations, tolerance 1e-5 and a random seed.

    Args:
        data: Spark DataFrame containing the three categorical columns; a
            "label" column, if present, is not used as a feature.
        k: Number of clusters.

    Returns:
        The fitted PipelineModel; its last stage is the KMeans model and its
        ``transform`` adds a "cluster" prediction column.
    """
    proto_type_pipeline, proto_type_vec_col = one_hot_pipeline("protocol_type")
    service_pipeline, service_vec_col = one_hot_pipeline("service")
    flag_pipeline, flag_vec_col = one_hot_pipeline("flag")

    # Build the feature list in DataFrame column order, followed by the
    # encoded vectors. Going through a set would make the order depend on
    # string hashing, which differs from one interpreter run to the next.
    excluded = {"label", "protocol_type", "service", "flag"}
    encoded_cols = [proto_type_vec_col, service_vec_col, flag_vec_col]
    assemble_cols = [
        c for c in data.columns
        if c not in excluded and c not in encoded_cols
    ] + encoded_cols

    assembler = VectorAssembler(inputCols=assemble_cols, outputCol="featureVector")
    scaler = StandardScaler(inputCol="featureVector", outputCol="scaledFeatureVector", withStd=True, withMean=False)
    kmeans = KMeans(seed=randint(100, 100000), k=k, predictionCol="cluster", featuresCol="scaledFeatureVector", maxIter=40, tol=1.0e-5)
    pipeline = Pipeline(stages=[proto_type_pipeline, service_pipeline, flag_pipeline, assembler, scaler, kmeans])
    return pipeline.fit(data)

def clustering_score_4(input_data, k):
    """Score a k-cluster fit by its size-weighted average label entropy.

    Fits the pipeline returned by ``fit_pipeline_4`` and assigns every row
    to a cluster. It then computes the base-2 entropy of the label
    distribution inside each cluster, weights each entropy by the cluster's
    row count, and divides the total by the number of input rows.

    Args:
        input_data: Spark DataFrame with a "label" column plus the columns
            required by ``fit_pipeline_4``.
        k: Number of clusters.

    Returns:
        The weighted average cluster entropy.
    """
    pipeline_model = fit_pipeline_4(input_data, k)
    cluster_label = pipeline_model.transform(input_data).select("cluster", "label")
    # One row per (cluster, label) pair with its row count.
    df = cluster_label.groupBy("cluster", "label").count().orderBy("cluster")
    # p_col is the share of a cluster's rows that carry a given label.
    w = Window.partitionBy("cluster")
    p_col = df['count'] / fun.sum(df['count']).over(w)
    with_p_col = df.withColumn("p_col", p_col)
    # The parentheses matter: `.alias` binds tighter than unary minus, so
    # without them the negation wraps the aliased column and the result is
    # no longer named "entropy".
    result = with_p_col.groupBy("cluster").agg(
        (-fun.sum(col("p_col") * fun.log2(col("p_col")))).alias("entropy"),
        fun.sum(col("count")).alias("cluster_size"))
    result = result.withColumn('weightedClusterEntropy', col('entropy') * col('cluster_size'))
    weighted_cluster_entropy_avg = result.agg(fun.sum(col('weightedClusterEntropy'))).collect()
    return weighted_cluster_entropy_avg[0][0] / input_data.count()

# Refit the full pipeline with k = 180 and tabulate how many rows of each
# label land in each cluster, sorted by cluster then label.
pipeline_model = fit_pipeline_4(data, 180)
count_by_cluster_label = (
    pipeline_model.transform(data)
    .select("cluster", "label")
    .groupBy("cluster", "label")
    .count()
    .orderBy("cluster", "label")
)
count_by_cluster_label.show()

# Shut the Spark session down now that the analysis is finished.
spark.stop()