In [0]:
from pyspark.sql import SparkSession 
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import StandardScaler,VectorAssembler
# Obtain the Spark session for this notebook under the app name "cluster".
spark = (
    SparkSession.builder
    .appName("cluster")
    .getOrCreate()
)

# Read the CSV, treating the first row as the header and inferring column types.
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv("/FileStore/tables/hack_data.csv")
)

# Every column except 'Location' is used as a clustering input.
features = [column for column in df.columns if column != 'Location']

In [0]:
# Pack the selected input columns into a single vector column called 'features'.
assembler = VectorAssembler(
    inputCols=features,
    outputCol='features',
)

# Keep only the assembled vector column; the raw columns are not needed downstream.
df2 = (
    assembler
    .transform(df)
    .select('features')
)
df2.show()

+--------------------+
|            features|
+--------------------+
|[8.0,391.09,1.0,2...|
|[20.0,720.99,0.0,...|
|[31.0,356.32,1.0,...|
|[2.0,228.08,1.0,2...|
|[20.0,408.5,0.0,3...|
|[1.0,390.69,1.0,2...|
|[18.0,342.97,1.0,...|
|[22.0,101.61,1.0,...|
|[15.0,275.53,1.0,...|
|[12.0,424.83,1.0,...|
|[15.0,249.09,1.0,...|
|[32.0,242.48,0.0,...|
|[23.0,514.54,0.0,...|
|[9.0,284.77,0.0,3...|
|[27.0,779.25,1.0,...|
|[12.0,307.31,1.0,...|
|[21.0,355.94,1.0,...|
|[10.0,372.65,0.0,...|
|[20.0,347.23,1.0,...|
|[22.0,456.57,0.0,...|
+--------------------+
only showing top 20 rows



In [0]:
# Fit a StandardScaler on the assembled vectors, then append the rescaled
# vectors as 'scaled_features' alongside the original 'features' column.
scaler = StandardScaler(
    outputCol='scaled_features',
    inputCol='features',
)
scaler_model = scaler.fit(df2)
df3 = scaler_model.transform(df2)
df3.show()

+--------------------+--------------------+
|            features|     scaled_features|
+--------------------+--------------------+
|[8.0,391.09,1.0,2...|[0.56785108466505...|
|[20.0,720.99,0.0,...|[1.41962771166263...|
|[31.0,356.32,1.0,...|[2.20042295307707...|
|[2.0,228.08,1.0,2...|[0.14196277116626...|
|[20.0,408.5,0.0,3...|[1.41962771166263...|
|[1.0,390.69,1.0,2...|[0.07098138558313...|
|[18.0,342.97,1.0,...|[1.27766494049636...|
|[22.0,101.61,1.0,...|[1.56159048282889...|
|[15.0,275.53,1.0,...|[1.06472078374697...|
|[12.0,424.83,1.0,...|[0.85177662699757...|
|[15.0,249.09,1.0,...|[1.06472078374697...|
|[32.0,242.48,0.0,...|[2.27140433866020...|
|[23.0,514.54,0.0,...|[1.63257186841202...|
|[9.0,284.77,0.0,3...|[0.63883247024818...|
|[27.0,779.25,1.0,...|[1.91649741074455...|
|[12.0,307.31,1.0,...|[0.85177662699757...|
|[21.0,355.94,1.0,...|[1.49060909724576...|
|[10.0,372.65,0.0,...|[0.70981385583131...|
|[20.0,347.23,1.0,...|[1.41962771166263...|
|[22.0,456.57,0.0,...|[1.5615904

In [0]:
# Hypothesis A: two attackers -> cluster the scaled features into k=2 groups.
# KMeans initialisation is randomised; pinning `seed` makes the fitted
# centroids (and therefore the cluster assignments and counts below)
# repeatable across kernel restarts instead of depending on the session.
kmeans_two_attacker = KMeans(featuresCol='scaled_features', k=2, seed=42)
kmeans_model_two_attacker = kmeans_two_attacker.fit(df3)

# Attach each row's cluster id ('prediction') and keep it next to the raw features.
df4 = (
    kmeans_model_two_attacker
    .transform(df3)
    .select('features', 'prediction')
)

# Cluster sizes: the quantity the attacker-count hypothesis is judged on.
# NOTE(review): the cluster id numbering (0/1) is arbitrary; only the sizes matter.
df4.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



In [0]:
# Hypothesis B: three attackers -> cluster the scaled features into k=3 groups.
# KMeans initialisation is randomised; pinning `seed` makes the fitted
# centroids (and therefore the cluster assignments and counts below)
# repeatable across kernel restarts instead of depending on the session.
kmeans_three_attacker = KMeans(featuresCol='scaled_features', k=3, seed=42)
kmeans_model_three_attacker = kmeans_three_attacker.fit(df3)

# Attach each row's cluster id ('prediction') and keep it next to the raw features.
df5 = (
    kmeans_model_three_attacker
    .transform(df3)
    .select('features', 'prediction')
)

# Cluster sizes: the quantity the attacker-count hypothesis is judged on.
# NOTE(review): the cluster id numbering (0/1/2) is arbitrary; only the sizes matter.
df5.groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



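Conclusion: The hackers trade off attacks, so each hacker should account for roughly the same number of records. With k=2 the clusters are evenly sized (167 and 167), whereas with k=3 they are uneven (167, 83 and 84). The data is therefore consistent with two hackers being involved in all of the recorded attacks.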