In [1]:
from pyspark.sql import SparkSession

In [2]:
# Obtain the active Spark session, or start one named 'cluster' if none exists.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [3]:
from pyspark.ml.clustering import KMeans

In [4]:
# Read the sample K-means data, stored in libsvm format, into a DataFrame.
dataset = (
    spark.read
    .format('libsvm')
    .load('sample_kmeans_data.txt')
)

In [5]:
# Preview the loaded rows to check the columns that were read.
dataset.show()

+-----+--------------------+
|label|            features|
+-----+--------------------+
|  0.0|           (3,[],[])|
|  1.0|(3,[0,1,2],[0.1,0...|
|  2.0|(3,[0,1,2],[0.2,0...|
|  3.0|(3,[0,1,2],[9.0,9...|
|  4.0|(3,[0,1,2],[9.1,9...|
|  5.0|(3,[0,1,2],[9.2,9...|
+-----+--------------------+



In [6]:
# Keep only the 'features' column; this is the input passed to K-means below.
final_data = dataset.select('features')

In [7]:
# Confirm that only the 'features' column remains.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



### Create our model

K-means is an unsupervised algorithm, so we only need the `features` column.

In [10]:
# K-means estimator with two clusters; the fixed seed keeps runs repeatable.
kmeans = KMeans(k=2, seed=1)

### Fit the model

In [11]:
# Fit the estimator on the feature vectors to obtain the trained model.
model = kmeans.fit(final_data)

### Within Set Sum of Squared Errors (WSSSE)

In [12]:
# Within-set sum of squared errors (WSSSE) of the fitted model on its training
# data. `KMeansModel.computeCost` was deprecated in Spark 2.4 and removed in
# Spark 3.0, so read the cost from the training summary instead; `final_data`
# is the data the model was fitted on, so this is the same metric.
# NOTE(review): `summary.trainingCost` requires Spark >= 2.4 -- confirm the
# cluster version. For data other than the training set, use
# pyspark.ml.evaluation.ClusteringEvaluator on `model.transform(...)`.
wsse = model.summary.trainingCost

In [13]:
# Print the within-set sum of squared errors for the fitted model.
print(wsse)

0.11999999999994547


### Checking the cluster centers

In [14]:
# Re-display the input vectors (same frame as shown earlier) so they can be
# compared with the cluster centers computed next.
final_data.show()

+--------------------+
|            features|
+--------------------+
|           (3,[],[])|
|(3,[0,1,2],[0.1,0...|
|(3,[0,1,2],[0.2,0...|
|(3,[0,1,2],[9.0,9...|
|(3,[0,1,2],[9.1,9...|
|(3,[0,1,2],[9.2,9...|
+--------------------+



In [15]:
# Retrieve the centers of the fitted clusters from the trained model.
centers = model.clusterCenters()

In [16]:
# Display the cluster centers as the cell output.
centers

[array([0.1, 0.1, 0.1]), array([9.1, 9.1, 9.1])]

In [18]:
# Apply the trained model to the feature vectors; the result carries the
# model's cluster assignment for every row.
results = model.transform(final_data)

In [19]:
# Show each feature vector next to its assigned cluster.
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         0|
|(3,[0,1,2],[0.1,0...|         0|
|(3,[0,1,2],[0.2,0...|         0|
|(3,[0,1,2],[9.0,9...|         1|
|(3,[0,1,2],[9.1,9...|         1|
|(3,[0,1,2],[9.2,9...|         1|
+--------------------+----------+



### Now running with a higher k value

In [20]:
# K-means estimator with three clusters; same fixed seed as the k=2 run.
# Note: this rebinds `kmeans`, replacing the earlier k=2 estimator.
kmeans = KMeans(k=3, seed=1)

In [21]:
# Refit on the same feature vectors with the current estimator.
# Note: this rebinds `model`, so later cells refer to this new fit.
model = kmeans.fit(final_data)

In [22]:
# Within-set sum of squared errors (WSSSE) of the refitted model on its
# training data. `KMeansModel.computeCost` was deprecated in Spark 2.4 and
# removed in Spark 3.0, so read the cost from the training summary instead;
# `final_data` is the data the model was fitted on, so this is the same metric.
# NOTE(review): `summary.trainingCost` requires Spark >= 2.4 -- confirm the
# cluster version.
wsse = model.summary.trainingCost

In [23]:
# Print the within-set sum of squared errors for the current model.
print(wsse)

0.07499999999994544


In [24]:
# Retrieve the cluster centers from the current model.
centers = model.clusterCenters()

In [25]:
# Display the cluster centers as the cell output.
centers

[array([9.1, 9.1, 9.1]), array([0.05, 0.05, 0.05]), array([0.2, 0.2, 0.2])]

In [26]:
# Apply the current model to the feature vectors to get per-row cluster
# assignments.
results = model.transform(final_data)

In [27]:
# Show each feature vector next to its assigned cluster.
results.show()

+--------------------+----------+
|            features|prediction|
+--------------------+----------+
|           (3,[],[])|         1|
|(3,[0,1,2],[0.1,0...|         1|
|(3,[0,1,2],[0.2,0...|         2|
|(3,[0,1,2],[9.0,9...|         0|
|(3,[0,1,2],[9.1,9...|         0|
|(3,[0,1,2],[9.2,9...|         0|
+--------------------+----------+

