### 0. Data set availability

Dataset availability: https://kdd.ics.uci.edu/databases/kddcup99/kddcup99.html

Full dataset: http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz

10% dataset:http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data_10_percent.gz

### 1. Import libraries

In [16]:
import os
import sys
import re

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.types import *
from pyspark.sql import Row
from pyspark.sql.functions import *
import pyspark.sql.functions as func
from pyspark.mllib.clustering import KMeans, KMeansModel

# Uncomment these if you would like to make any graphs using Matplotlib
# from mpl_toolkits.mplot3d import Axes3D
# import matplotlib.pyplot as plt
# import matplotlib.patches as mpatches

# plt.style.use('ggplot')
# plt.rcParams['figure.figsize'] = (20.0, 8.0)

# %matplotlib inline


### 2. Initiate Spark session & load dataset

In [17]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('app').getOrCreate()

In [18]:
input_file = r"C:\Users\douaa\OneDrive\Desktop\DSS Program\DSS 2 Big Data\kddcup.data.gz"

In [19]:
dataSchema = StructType([ \
    StructField('duration', IntegerType(), True), \
    StructField('protocol_type', StringType(), True), \
    StructField('service', StringType(), True), \
    StructField('flag', StringType(), True), \
    StructField('src_bytes', IntegerType(), True), \
    StructField('dst_bytes', IntegerType(), True), \
    StructField('land', StringType(), True), \
    StructField('wrong_fragment', IntegerType(), True), \
    StructField('urgent', IntegerType(), True), \
    StructField('hot', IntegerType(), True), \
    StructField('num_failed_logins', IntegerType(), True), \
    StructField('logged_in', StringType(), True), \
    StructField('num_compromised', IntegerType(), True), \
    StructField('root_shell', IntegerType(), True), \
    StructField('su_attempted', IntegerType(), True), \
    StructField('num_root', IntegerType(), True), \
    StructField('num_file_creations', IntegerType(), True), \
    StructField('num_shells', IntegerType(), True), \
    StructField('num_access_files', IntegerType(), True), \
    StructField('num_outbound_cmds', IntegerType(), True), \
    StructField('is_host_login', StringType(), True), \
    StructField('is_guest_login', StringType(), True), \
    StructField('count', IntegerType(), True), \
    StructField('srv_count', IntegerType(), True), \
    StructField('serror_rate', FloatType(), True), \
    StructField('srv_serror_rate', FloatType(), True), \
    StructField('rerror_rate', FloatType(), True), \
    StructField('srv_rerror_rate', FloatType(), True), \
    StructField('same_srv_rate', FloatType(), True), \
    StructField('diff_srv_rate', FloatType(), True), \
    StructField('srv_diff_host_rate', FloatType(), True), \
    StructField('dst_host_count', IntegerType(), True), \
    StructField('dst_host_srv_count', IntegerType(), True), \
    StructField('dst_host_same_srv_rate', FloatType(), True), \
    StructField('dst_host_diff_srv_rate', FloatType(), True), \
    StructField('dst_host_same_src_port_rate', FloatType(), True), \
    StructField('dst_host_srv_diff_host_rate', FloatType(), True), \
    StructField('dst_host_serror_rate', FloatType(), True), \
    StructField('dst_host_srv_serror_rate', FloatType(), True), \
    StructField('dst_host_rerror_rate', FloatType(), True), \
    StructField('dst_host_srv_rerror_rate', FloatType(), True), \
    StructField('type', StringType(), True) \
])

In [20]:
df = spark.read \
    .format('csv') \
    .options(header='True') \
    .options(delimiter=',') \
    .load(input_file, schema=dataSchema) 

In [21]:
df.show(5)

+--------+-------------+-------+----+---------+---------+----+--------------+------+---+-----------------+---------+---------------+----------+------------+--------+------------------+----------+----------------+-----------------+-------------+--------------+-----+---------+-----------+---------------+-----------+---------------+-------------+-------------+------------------+--------------+------------------+----------------------+----------------------+---------------------------+---------------------------+--------------------+------------------------+--------------------+------------------------+-------+
|duration|protocol_type|service|flag|src_bytes|dst_bytes|land|wrong_fragment|urgent|hot|num_failed_logins|logged_in|num_compromised|root_shell|su_attempted|num_root|num_file_creations|num_shells|num_access_files|num_outbound_cmds|is_host_login|is_guest_login|count|srv_count|serror_rate|srv_serror_rate|rerror_rate|srv_rerror_rate|same_srv_rate|diff_srv_rate|srv_diff_host_rate|dst_hos

### 3. Tasks

1. Using only numerical features, identify any anomalies in network connections. Remember, an anomaly is a data point, which does not fit in a 'reasonable' set of clusters for any given dataset 

2. In the above model, also include categorical features and determine any anomalies in network connections

3. Finally, make a 3D graph of data points using three dimensions to visualize anomalies

In [25]:
#convert to pandas DataFrame
# Sometimes it is more convinient to convert Spark DF to Pandas DF for exploration
import pandas as pd 
df_pd=df.toPandas()

In [23]:
#Check Missing value with proportion in Pandas DataFrame#Check Missing value with proportion in Pandas DataFrame

def missing_values_table(df_pd):
    """Input pandas dataframe and Return columns with missing value&percentage and stored as pandas dataframe"""
    
    mis_val = df_pd.isnull().sum() #count total of null in each columns in dataframe
    mis_val_percent = 100 * df_pd.isnull().sum() / len(df_pd) #count percentage of null in each columns
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)  #join to left (as column) between mis_val and mis_val_percent and create it as dataframe
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'}) #rename columns in table, mis_val to Missing Values and mis_val_percent to % of Total Values
    mis_val_table_ren_columns = mis_val_table_ren_columns[
    mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)         #sort column % of Total Values descending and round 1 after point(coma)
    print ("Your selected dataframe has " + str(df_pd.shape[1]) + " columns.\n"    #.shape[1] : just view total columns in dataframe  
    "There are " + str(mis_val_table_ren_columns.shape[0]) +              
    " columns that have missing values.") #.shape[0] : just view total rows in dataframe
    return mis_val_table_ren_columns


In [26]:
missing_values_table(df_pd)

Your selected dataframe has 42 columns.
There are 0 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values


In [31]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
from pyspark.ml.clustering import KMeans
from pyspark.ml.linalg import Vectors
from pyspark.sql.functions import col, udf
from pyspark.sql.types import FloatType


if 'features' in df.columns:
    df = df.drop('features')
    

# Define categorical columns
categoricalCols = ['protocol_type', 'service', 'flag', 'land', 'logged_in', 'is_host_login', 'is_guest_login']

# Filter out categorical columns with only one distinct value
valid_categoricalCols = [col for col in categoricalCols if df.select(col).distinct().count() > 1]

# Create indexers and encoders
indexers = [StringIndexer(inputCol=col, outputCol=col + "_index") for col in valid_categoricalCols]
encoders = [OneHotEncoder(inputCol=col + "_index", outputCol=col + "_vec") for col in valid_categoricalCols]

# Create the stages list for the Pipeline
stages = indexers + encoders

# Define numerical columns
numericalCols = ['duration', 'src_bytes', 'dst_bytes', 'wrong_fragment', 'urgent', 'hot', 'num_failed_logins', 
                 'num_compromised', 'root_shell', 'su_attempted', 'num_root', 'num_file_creations', 'num_shells', 
                 'num_access_files', 'num_outbound_cmds', 'count', 'srv_count', 'serror_rate', 'srv_serror_rate', 
                 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate', 
                 'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate', 'dst_host_diff_srv_rate', 
                 'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate', 'dst_host_serror_rate', 
                 'dst_host_srv_serror_rate', 'dst_host_rerror_rate', 'dst_host_srv_rerror_rate']

# Assemble features
assemblerInputs = [col + "_vec" for col in valid_categoricalCols] + numericalCols
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
stages.append(assembler)

# Apply the stages to the DataFrame
pipeline = Pipeline(stages=stages)
pipelineModel = pipeline.fit(df)
df = pipelineModel.transform(df)

# Apply K-means clustering
kmeans = KMeans(k=10, seed=1, featuresCol="features", predictionCol="prediction")
model = kmeans.fit(df)
transformed = model.transform(df)

# Calculate distances to cluster centers to identify outliers
def get_dist_to_center(point, centers, cluster):
    center = centers[cluster]
    return float(Vectors.squared_distance(Vectors.dense(point), Vectors.dense(center)))

centers = model.clusterCenters()
distance_udf = udf(lambda point, cluster: get_dist_to_center(point, centers, cluster), FloatType())

transformed = transformed.withColumn("distanceToCenter", distance_udf(col("features"), col("prediction")))

# Define a threshold for identifying outliers
threshold = transformed.agg({"distanceToCenter": "mean"}).collect()[0][0] * 3
outliers = transformed.filter(col("distanceToCenter") > threshold)

# Show anomalies
outliers.select("duration", "src_bytes", "dst_bytes", "distanceToCenter").show()

+--------+---------+---------+----------------+
|duration|src_bytes|dst_bytes|distanceToCenter|
+--------+---------+---------+----------------+
|       0|      300|    42747|    1.77689101E9|
|       0|      293|    38125|    1.40865894E9|
|       0|      284|    43129|    1.80926221E9|
|       0|      212|    43129|    1.80936461E9|
|       0|      284|    43129|    1.80922701E9|
|       0|      226|    74301|     5.4323031E9|
|       0|      188|    74810|     5.5076367E9|
|       0|      198|    74810|     5.5076142E9|
|       0|      256|   125015|   1.54792335E10|
|       0|      332|    61480|    3.70673382E9|
|       0|      226|    61480|    3.70687744E9|
|       0|      236|    39873|    1.54299661E9|
|       0|      231|    36340|     1.2779831E9|
|       0|      309|    81172|     6.4921482E9|
|       0|      330|    80476|     6.3804411E9|
|       0|      212|    39751|     1.5334528E9|
|       0|      203|    39751|    1.53347533E9|
|       0|      228|    44578|    1.9346

In [35]:
import pandas as pd
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Define the sample size and chunk size
sample_fraction = 0.001  # 0.1% sample
chunk_size = 500  # Adjust as needed
max_records = 10000  # Maximum number of records to collect

# Sample a smaller fraction of the data
outliers_sample = outliers.sample(False, sample_fraction, seed=42)

# Select only the needed columns
selected_columns = ["duration", "src_bytes", "dst_bytes", "distanceToCenter"]
outliers_pd_list = []

# Collect data in chunks to avoid memory issues
collected_records = 0
for start in range(0, outliers_sample.count(), chunk_size):
    try:
        chunk = outliers_sample.select(selected_columns).limit(chunk_size).collect()
        outliers_pd_list.extend(chunk)
        collected_records += len(chunk)
        if collected_records >= max_records:
            break
    except Exception as e:
        print(f"Error collecting chunk starting at {start}: {e}")
        break

# Convert collected data to Pandas DataFrame
outliers_pd = pd.DataFrame(outliers_pd_list, columns=selected_columns)

# Plotting in 3D if there are enough records
if not outliers_pd.empty:
    fig = plt.figure(figsize=(12, 10))
    ax = fig.add_subplot(111, projection='3d')

    sc = ax.scatter(outliers_pd['duration'], outliers_pd['src_bytes'], outliers_pd['dst_bytes'], c=outliers_pd['distanceToCenter'], cmap="seismic")
    plt.colorbar(sc)
    ax.set_title('Network Connection Anomalies')
    ax.set_xlabel('Duration')
    ax.set_ylabel('Source Bytes')
    ax.set_zlabel('Destination Bytes')

    plt.show()
else:
    print("Not enough data to plot.")

Error collecting chunk starting at 0: An error occurred while calling o1966.collectToPython.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 145.0 failed 1 times, most recent failure: Lost task 0.0 in stage 145.0 (TID 3328, localhost, executor driver): org.apache.spark.api.python.PythonException: Traceback (most recent call last):
  File "c:\Users\douaa\anaconda3\ANACONDA\envs\PySpark_2_4_5\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 377, in main
  File "c:\Users\douaa\anaconda3\ANACONDA\envs\PySpark_2_4_5\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\worker.py", line 372, in process
  File "c:\Users\douaa\anaconda3\ANACONDA\envs\PySpark_2_4_5\Lib\site-packages\pyspark\python\lib\pyspark.zip\pyspark\serializers.py", line 352, in dump_stream
    self.serializer.dump_stream(self._batched(iterator), stream)
  File "c:\Users\douaa\anaconda3\ANACONDA\envs\PySpark_2_4_5\Lib\site-packages\pyspark\python\lib\pyspark.