In [1]:
# Goal: investigate whether 2 or 3 hackers were involved in the security incident.
import os

import findspark

# Locate the Spark install. Prefer the SPARK_HOME environment variable so the
# notebook is not tied to a single machine; fall back to the original local
# install path when the variable is unset or empty.
findspark.init(os.environ.get('SPARK_HOME') or '/home/nick/spark-3.0.1-bin-hadoop2.7')

# The pyspark imports stay below findspark.init(), as in the original cell.
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler, VectorAssembler
from pyspark.sql.functions import count, isnan, isnull, when, countDistinct

# Create (or reuse) the session that every later cell reads from.
spark = SparkSession.builder.appName('Detecting Hacks').getOrCreate()

In [2]:
# Load the session log; the first CSV row holds the column names and Spark
# is asked to infer each column's type.
csv_path = 'Clustering/hack_data.csv'
data = spark.read.csv(csv_path, inferSchema=True, header=True)

In [3]:
# First look at the data: leading rows, inferred schema, then summary statistics.
data.show(n=5)
data.printSchema()
summary_stats = data.describe()
summary_stats.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [4]:
# Missing-data check: for every column, count the rows flagged by isnan,
# then the rows flagged by isnull. Each result is shown as a one-row table.
nan_counts = [count(when(isnan(name), name)).alias(name) for name in data.columns]
null_counts = [count(when(isnull(name), name)).alias(name) for name in data.columns]
data.select(nan_counts).show()
data.select(null_counts).show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+
|                      0|                0|              0|                0|              0|       0|               0|
+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+

+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------+----------------+
|                      0|              

In [5]:
# Look at the raw values of the location column.
location_values = data.select('location')
location_values.show()

+--------------------+
|            location|
+--------------------+
|            Slovenia|
|British Virgin Is...|
|             Tokelau|
|             Bolivia|
|                Iraq|
|    Marshall Islands|
|             Georgia|
|         Timor-Leste|
|Palestinian Terri...|
|          Bangladesh|
|Northern Mariana ...|
|            Zimbabwe|
|         Isle of Man|
|Sao Tome and Prin...|
|              Greece|
|     Solomon Islands|
|       Guinea-Bissau|
|        Burkina Faso|
|            Mongolia|
|             Nigeria|
+--------------------+
only showing top 20 rows



In [6]:
# How many different locations appear in the data?
distinct_locations = data.select(countDistinct('location'))
distinct_locations.show()
# Result: 181 distinct values - conclude that Location cannot tell us much.

+------------------------+
|count(DISTINCT location)|
+------------------------+
|                     181|
+------------------------+



In [7]:
# Columns used as clustering inputs: every column of the data set except Location.
feature_columns = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]

In [8]:
# Preview only the columns that will be fed to the model.
data.select(*feature_columns).show()

+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|            70.8|
|                   20.0|            408.5|              0|             3.57|            8.0|           71.28|
|                    1.0|           390.69|              1|             2.79|            9.0|           71.57|
|

In [9]:
# Pack the selected feature columns into a single vector column named 'features'.
assembler = VectorAssembler(outputCol='features', inputCols=feature_columns)
model_input = assembler.transform(data)
model_input.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [10]:
# Fit a StandardScaler on the assembled vectors and append the result as
# 'scaled_features'.
scaler = StandardScaler(outputCol='scaled_features', inputCol='features')
scaler_model = scaler.fit(model_input)
cluster_final_data = scaler_model.transform(model_input)
# The clustering cells below should read 'scaled_features', not 'features'.
cluster_final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaled_features: vector (nullable = true)



In [11]:
# Candidate model for the "2 hackers" hypothesis.
# K-means starts from random initial centers, so the seed is pinned to make the
# cluster assignments (and cluster label numbering) repeatable across runs.
kmeans2 = KMeans(featuresCol='scaled_features', k=2, seed=42)

In [12]:
# Candidate model for the "3 hackers" hypothesis.
# K-means starts from random initial centers, so the seed is pinned to make the
# cluster assignments (and cluster label numbering) repeatable across runs.
kmeans3 = KMeans(featuresCol='scaled_features', k=3, seed=42)

In [14]:
# Fit both candidate estimators on the same scaled data set (k=2 first, then k=3).
model_k2, model_k3 = [
    estimator.fit(cluster_final_data) for estimator in (kmeans2, kmeans3)
]

In [15]:
# Number of sessions assigned to each cluster by the 3-cluster model.
k3_assignments = model_k3.transform(cluster_final_data)
k3_assignments.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



In [16]:
# Number of sessions assigned to each cluster by the 2-cluster model.
k2_assignments = model_k2.transform(cluster_final_data)
k2_assignments.groupBy('prediction').count().show()
# Conclusion: there were 2 hackers, since the two clusters have even counts.

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+

