In [1]:
from pyspark.sql import SparkSession
# Obtain the Spark session (application name 'hack_find') used by every later cell.
spark = (
    SparkSession.builder
    .appName('hack_find')
    .getOrCreate()
)

In [2]:
from pyspark.ml.clustering import KMeans
# Read the raw CSV: the first row supplies column names and column types are inferred.
dataset = spark.read.csv(
    path='hack_data.csv',
    header=True,
    inferSchema=True,
)

In [3]:
# Peek at the first two rows to check the columns that were loaded.
dataset.show(n=2)

+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|Session_Connection_Time|Bytes Transferred|Kali_Trace_Used|Servers_Corrupted|Pages_Corrupted|            Location|WPM_Typing_Speed|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
|                    8.0|           391.09|              1|             2.96|            7.0|            Slovenia|           72.37|
|                   20.0|           720.99|              0|             3.04|            9.0|British Virgin Is...|           69.08|
+-----------------------+-----------------+---------------+-----------------+---------------+--------------------+----------------+
only showing top 2 rows



In [4]:
from pyspark.ml.feature import VectorAssembler
# Columns packed into the single 'features' vector column.
# Of the columns displayed by dataset.show(), only 'Location' is left out.
feature_columns = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed',
]
assembler = VectorAssembler(inputCols=feature_columns, outputCol='features')

In [5]:
# Append the assembled 'features' vector column to the loaded data.
final_data = assembler.transform(dataset)

In [6]:
from pyspark.ml.feature import StandardScaler
# Standardise 'features' into 'scaledFeatures': centre on the mean and
# scale to unit standard deviation.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withMean=True,
    withStd=True,
)

In [7]:
# Fit the scaler on the assembled data, then apply it to that same data.
scaler_model = scaler.fit(final_data)
scaled_data = scaler_model.transform(final_data)

In [8]:
# Fit K-means on the scaled features for the two candidate cluster counts
# (k=2 and k=3) so their results can be compared in later cells.
#
# K-means starts from randomly chosen centroids, so an explicit seed is set:
# without it the cluster assignments and costs can differ between runs, and
# the notebook's outputs would not be reproducible after a kernel restart.
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2, seed=1)
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3, seed=1)
model_k2 = kmeans2.fit(scaled_data)
model_k3 = kmeans3.fit(scaled_data)


In [18]:
# Within-set sum of squared errors (WSSSE) for the k=2 and k=3 models.
#
# KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
# Spark 3.0, so calling it fails on current Spark releases. The training
# summary exposes the cost on the data the model was fitted on; both models
# were fitted on scaled_data, which is the DataFrame being scored here.
# NOTE: summary.trainingCost requires Spark >= 2.4.
wssse2 = model_k2.summary.trainingCost
wssse3 = model_k3.summary.trainingCost
print(wssse2)
print(wssse3)

601.7707512676687
434.149289871582


In [19]:
# Sweep k from 2 to 8, fit K-means for each value and print the within-set
# sum of squared errors (WSSSE), so the costs can be compared across k.
#
# - An explicit seed keeps the random centroid initialisation, and therefore
#   the printed costs, repeatable between runs.
# - KMeansModel.computeCost() was deprecated in Spark 2.4 and removed in
#   Spark 3.0; summary.trainingCost (Spark >= 2.4) reports the cost on the
#   fitting data, which is the same scaled_data DataFrame scored here.
for k in range(2, 9):
    kmeans = KMeans(featuresCol='scaledFeatures', k=k, seed=1)
    model = kmeans.fit(scaled_data)
    wssse = model.summary.trainingCost
    print("With K={}".format(k))
    print("Within Set Sum of Squared Errors = " + str(wssse))
    print('--'*30)

With K=2
Within Set Sum of Squared Errors = 601.7707512676687
------------------------------------------------------------
With K=3
Within Set Sum of Squared Errors = 434.149289871582
------------------------------------------------------------
With K=4
Within Set Sum of Squared Errors = 415.73725012808256
------------------------------------------------------------
With K=5
Within Set Sum of Squared Errors = 246.920992463409
------------------------------------------------------------
With K=6
Within Set Sum of Squared Errors = 227.20366243156627
------------------------------------------------------------
With K=7
Within Set Sum of Squared Errors = 217.66575854773967
------------------------------------------------------------
With K=8
Within Set Sum of Squared Errors = 201.0281554861066
------------------------------------------------------------


In [23]:
# Show how many rows the k=2 model assigns to each cluster.
(
    model_k2.transform(scaled_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [24]:
# Show how many rows the k=3 model assigns to each cluster.
(
    model_k3.transform(scaled_data)
    .groupBy('prediction')
    .count()
    .show()
)

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         2|   84|
|         0|   83|
+----------+-----+



In [26]:
#Bingo! There were 2 hackers. With K=2 our clustering algorithm created
#two equally sized clusters (167 rows each), whereas K=3 gave an uneven
#167/84/83 split. No way that is a coincidence!