# Spark ML KMeans Clustering Project

In [0]:
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('cluster_consult').getOrCreate()

In [0]:
data = spark.read.csv('/FileStore/tables/hack_data.csv', inferSchema=True, header=True)

In [0]:
data.head(1)

Out[3]: [Row(Session_Connection_Time=8.0, Bytes Transferred=391.09, Kali_Trace_Used=1, Servers_Corrupted=2.96, Pages_Corrupted=7.0, Location='Slovenia', WPM_Typing_Speed=72.37)]

# Process Data

In [0]:
from pyspark.ml.feature import VectorAssembler, StandardScaler

In [0]:
data.columns

Out[6]: ['Session_Connection_Time',
 'Bytes Transferred',
 'Kali_Trace_Used',
 'Servers_Corrupted',
 'Pages_Corrupted',
 'Location',
 'WPM_Typing_Speed']

In [0]:
feature_cols = [
    'Session_Connection_Time',
    'Bytes Transferred',
    'Kali_Trace_Used',
    'Servers_Corrupted',
    'Pages_Corrupted',
    'WPM_Typing_Speed'
]

In [0]:
assembler = VectorAssembler(
    inputCols = feature_cols,
    outputCol='features'
)

In [0]:
final_data = assembler.transform(data)

In [0]:
final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)



In [0]:
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures'
)

In [0]:
scaled_final_data = scaler.fit(final_data).transform(final_data)

In [0]:
scaled_final_data.printSchema()

root
 |-- Session_Connection_Time: double (nullable = true)
 |-- Bytes Transferred: double (nullable = true)
 |-- Kali_Trace_Used: integer (nullable = true)
 |-- Servers_Corrupted: double (nullable = true)
 |-- Pages_Corrupted: double (nullable = true)
 |-- Location: string (nullable = true)
 |-- WPM_Typing_Speed: double (nullable = true)
 |-- features: vector (nullable = true)
 |-- scaledFeatures: vector (nullable = true)



# Model

In [0]:
from pyspark.ml.clustering import KMeans

In [0]:
kmeans2 = KMeans(featuresCol='scaledFeatures', k=2)
kmeans3 = KMeans(featuresCol='scaledFeatures', k=3)

In [0]:
model_k2 = kmeans2.fit(scaled_final_data)
model_k3 = kmeans3.fit(scaled_final_data)

# Evaluate

In [0]:
model_k3.transform(scaled_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|   83|
|         2|   84|
|         0|  167|
+----------+-----+



In [0]:
model_k2.transform(scaled_final_data).groupBy('prediction').count().show()

+----------+-----+
|prediction|count|
+----------+-----+
|         1|  167|
|         0|  167|
+----------+-----+



# Conclusion

Only two hackers!