In [1]:
import findspark

# Make the local Spark installation importable before any pyspark import runs.
# init() locates the Spark home itself when no path is passed, so a separate
# find() call (whose return value was being discarded) is not needed.
findspark.init()

In [2]:
from pyspark import SparkContext
from pyspark.sql import SparkSession, Row
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
# Reuse the active SparkSession if one exists; otherwise start a new one
# registered under the application name "K-means_ML".
spark = (
    SparkSession.builder
    .appName("K-means_ML")
    .getOrCreate()
)

In [4]:
# Read the sample k-means input file with the "libsvm" data source.
# NOTE(review): the path is relative, so where it resolves depends on how and
# where the Spark session is running — confirm before moving the notebook.
dataset = spark.read.load("data/mllib/sample_kmeans_data.txt", format="libsvm")

In [5]:
# Configure k-means for two clusters with a fixed seed, then fit it to the
# loaded dataset.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

In [6]:
# Make predictions: apply the fitted model to the same dataset it was trained on.
# The result is kept in `predictions` and is what the evaluator scores later.
# NOTE(review): presumably this adds a cluster-assignment column to the input
# rows (the estimator's prediction column) — confirm against the output schema.
predictions = model.transform(dataset)

In [7]:
# Evaluate clustering by computing Silhouette score.
# The evaluator is created with its default settings; no metric or distance
# measure is chosen explicitly here.
# NOTE(review): the print below labels the score as using squared euclidean
# distance — presumably that is the library default; confirm for the Spark
# version in use.
evaluator = ClusteringEvaluator()

In [8]:
# Score the clustered rows with the evaluator and report the result.
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = {}".format(str(silhouette)))

Silhouette with squared euclidean distance = 0.9997530305375207


In [9]:
# Shows the result: fetch the cluster centers from the fitted model and print
# them one per line under a heading.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    # Each center is printed with its default string representation.
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]
