In [1]:
appname = "K-Means Clustering - Documentation example"

# Look into https://spark.apache.org/downloads.html for the latest version
spark_mirror = "https://mirrors.sonic.net/apache/spark"
spark_version = "3.3.1"
hadoop_version = "3"

# Install Java 8 (Spark does not work with newer Java versions)
! apt-get update
! apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Download and extract Spark binary distribution
! rm -rf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz spark-{spark_version}-bin-hadoop{hadoop_version}
! wget -q {spark_mirror}/spark-{spark_version}/spark-{spark_version}-bin-hadoop{hadoop_version}.tgz
! tar xzf spark-{spark_version}-bin-hadoop{hadoop_version}.tgz

# The only 2 environment variables needed to set up Java and Spark
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/spark-{spark_version}-bin-hadoop{hadoop_version}"

# Set up the Spark environment based on the environment variable SPARK_HOME 
! pip install -q findspark
import findspark
findspark.init()

# Get the Spark session object (basic entry point for every operation)
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName(appname).master("local[*]").getOrCreate()

Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Hit:2 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease [1,581 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Get:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:9 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-backports InRelease [83.3 kB]
Hit:11 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Get:12 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic InRelease [21.3 kB]
Get:13 http

In [2]:
#DOWNLOAD AND READ THE FILES
from google.colab import files 
  
hack = files.upload()

Saving hack_data.csv to hack_data.csv


In [3]:
df = spark.read.options(inferSchema=True, header=True).csv('hack_data.csv')
df.show()

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
|                   31.0|           356.32|              1|             3.71|            8.0|             Tokelau|           70.58|
|                    2.0|           228.08|              1|             2.48|            8.0|             Bolivia|            70.8|
|                   20.0|            408.5|              0|             3.57

In [4]:
df.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)



In [5]:
df.describe().show()

+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|summary|Session_Connection_Time| Bytes Transferred|   Kali_Trace_Used|Servers_Corrupted|   Pages_Corrupted|   Location|  WPM_Typing_Speed|
+-------+-----------------------+------------------+------------------+-----------------+------------------+-----------+------------------+
|  count|                    334|               334|               334|              334|               334|        334|               334|
|   mean|     30.008982035928145| 607.2452694610777|0.5119760479041916|5.258502994011977|10.838323353293413|       null|57.342395209580864|
| stddev|     14.088200614636158|286.33593163576757|0.5006065264451406| 2.30190693339697|  3.06352633036022|       null| 13.41106336843464|
|    min|                    1.0|              10.0|                 0|              1.0|               6.0|Afghanistan|              40.0|
|    max|           

## Create Feature Set

In [6]:
#VECTORASSEMBLER ASSEMBLES ALL THE FEATURES INTO ONE VECTOR FROM MULTIPLE COLUMNS THAT CONTAIN TYPE DOUBLE. 
features = [ 'Session_Connection_Time', 'Bytes Transferred', 'Kali_Trace_Used', 'Servers_Corrupted', 'Pages_Corrupted', 'WPM_Typing_Speed']
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=features, outputCol='features')
with_features = assembler.transform(df).select('features')
with_features.show()

+--------------------+
|            features|
+--------------------+
|[8.0,391.09,1.0,2...|
|[20.0,720.99,0.0,...|
|[31.0,356.32,1.0,...|
|[2.0,228.08,1.0,2...|
|[20.0,408.5,0.0,3...|
|[1.0,390.69,1.0,2...|
|[18.0,342.97,1.0,...|
|[22.0,101.61,1.0,...|
|[15.0,275.53,1.0,...|
|[12.0,424.83,1.0,...|
|[15.0,249.09,1.0,...|
|[32.0,242.48,0.0,...|
|[23.0,514.54,0.0,...|
|[9.0,284.77,0.0,3...|
|[27.0,779.25,1.0,...|
|[12.0,307.31,1.0,...|
|[21.0,355.94,1.0,...|
|[10.0,372.65,0.0,...|
|[20.0,347.23,1.0,...|
|[22.0,456.57,0.0,...|
+--------------------+
only showing top 20 rows



##Feature Scaling

In [7]:
from pyspark.ml.feature import StandardScaler
scaler = StandardScaler(inputCol='features', outputCol='scaled_features')
scalar_model = scaler.fit(with_features)
scaled_data = scalar_model.transform(with_features)
scaled_data.select('scaled_features').head(1)

[Row(scaled_features=DenseVector([0.5679, 1.3658, 1.9976, 1.2859, 2.2849, 5.3963]))]

##Train KMeans Model and Interpret Cluster Results

In [52]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import numpy as np

evaluator = ClusteringEvaluator(metricName='silhouette', distanceMeasure='squaredEuclidean')
# let's try several different values of k
ka = [3, 2, 4, 5]
silhouette = np.zeros(len(ka))
model = np.zeros(len(ka))
results = []
for i in range(len(ka)):
  kmeans = KMeans(featuresCol='scaled_features', k= ka[i])
  model = kmeans.fit(scaled_data)

  result = model.transform(scaled_data)
  results.append(result)
  silhouette[i] = evaluator.evaluate(result)


maxProbability = max(silhouette)
idxMaxProbability = np.argmax(silhouette)
numHackers = ka[idxMaxProbability]
print(f"idx:{np.argmax(silhouette)}, Max silhouette: {max(silhouette)}, k Max silhouette: {ka[np.argmax(silhouette)]}")


idx:1, Max silhouette: 0.6683623593283755, k Max silhouette: 2


As we can see, of all the possibilities we have covered for k (from 2 to 5), grouping in 2 clusters is the option that makes the most sense, so we deduce in the end that it was 2 hackers who carried out the crime.


##Get clustering results

In [53]:
results[idxMaxProbability].show()

+--------------------+--------------------+----------+
|            features|     scaled_features|prediction|
+--------------------+--------------------+----------+
|[8.0,391.09,1.0,2...|[0.56785108466505...|         1|
|[20.0,720.99,0.0,...|[1.41962771166263...|         1|
|[31.0,356.32,1.0,...|[2.20042295307707...|         1|
|[2.0,228.08,1.0,2...|[0.14196277116626...|         1|
|[20.0,408.5,0.0,3...|[1.41962771166263...|         1|
|[1.0,390.69,1.0,2...|[0.07098138558313...|         1|
|[18.0,342.97,1.0,...|[1.27766494049636...|         1|
|[22.0,101.61,1.0,...|[1.56159048282889...|         1|
|[15.0,275.53,1.0,...|[1.06472078374697...|         1|
|[12.0,424.83,1.0,...|[0.85177662699757...|         1|
|[15.0,249.09,1.0,...|[1.06472078374697...|         1|
|[32.0,242.48,0.0,...|[2.27140433866020...|         1|
|[23.0,514.54,0.0,...|[1.63257186841202...|         1|
|[9.0,284.77,0.0,3...|[0.63883247024818...|         1|
|[27.0,779.25,1.0,...|[1.91649741074455...|         1|
|[12.0,307