# Clustering

## K-Means

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [3]:
# Start (or reuse) a local SparkContext named "sqlContext" and wrap it in an
# SQLContext, which the following cells use to read DataFrames.
# SparkContext.getOrCreate replaces the direct constructor call so that
# re-running this cell while a context is still active reuses it instead of
# failing with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("sqlContext"))
sqc = SQLContext(sc)

In [4]:
# Directory that holds Spark's bundled MLlib sample data. The trailing slash is
# kept because the data-loading cells append the file name by plain string
# concatenation.
# The Spark install root is read from the SPARK_HOME environment variable when
# it is set (and non-empty), so the notebook is not tied to a single machine;
# otherwise the original hard-coded install directory is used.
import os

_DEFAULT_SPARK_HOME = 'D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7'
path = (os.environ.get('SPARK_HOME') or _DEFAULT_SPARK_HOME).rstrip('/\\') + '/data/mllib/'

In [5]:
# Read the sample file (libsvm format) into a Spark DataFrame, then convert it
# to pandas so the notebook renders it as a table.
dataset = sqc.read.load(path + "sample_kmeans_data.txt", format="libsvm")
dataset.toPandas()

Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0)"
1,1.0,"(0.1, 0.1, 0.1)"
2,2.0,"(0.2, 0.2, 0.2)"
3,3.0,"(9.0, 9.0, 9.0)"
4,4.0,"(9.1, 9.1, 9.1)"
5,5.0,"(9.2, 9.2, 9.2)"


In [6]:
# Fit a two-cluster k-means model; the fixed seed keeps the run repeatable.
kmeans = KMeans(k=2, seed=1)
model = kmeans.fit(dataset)

# Run the fitted model over the same data to obtain the cluster assignments.
predictions = model.transform(dataset)

# Score the clustering with a default-configured evaluator (Silhouette).
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print("Silhouette with squared euclidean distance = " + str(silhouette))

# Print the learned cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Silhouette with squared euclidean distance = 0.9997530305375207
Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


In [7]:
# Shut down the active SparkContext; the next section creates its own context.
sc.stop()

## Gaussian Mixture Model

In [10]:
from pyspark.ml.clustering import GaussianMixture

In [13]:
# Start (or reuse) a local SparkContext named "sqlContext" for this section and
# wrap it in an SQLContext for DataFrame reads.
# SparkContext.getOrCreate replaces the direct constructor call so that
# re-running this cell while a context is still active reuses it instead of
# failing with "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(SparkConf().setMaster("local").setAppName("sqlContext"))
sqc = SQLContext(sc)

In [14]:
# Read the sample file (libsvm format) into a Spark DataFrame, then convert it
# to pandas so the notebook renders it as a table.
dataset = sqc.read.load(path + "sample_kmeans_data.txt", format="libsvm")
dataset.toPandas()

Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0)"
1,1.0,"(0.1, 0.1, 0.1)"
2,2.0,"(0.2, 0.2, 0.2)"
3,3.0,"(9.0, 9.0, 9.0)"
4,4.0,"(9.1, 9.1, 9.1)"
5,5.0,"(9.2, 9.2, 9.2)"


In [15]:
# Fit a two-component Gaussian mixture; the fixed seed keeps the run repeatable.
gmm = GaussianMixture(k=2, seed=538009335)
model = gmm.fit(dataset)

# Show the fitted components (mean and covariance columns) without truncating
# the cell contents.
print("Gaussians shown as a DataFrame: ")
model.gaussiansDF.show(truncate=False)

Gaussians shown as a DataFrame: 
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean                                                         |cov                                                                                                                                                                                                     |
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.10000000000001552,0.10000000000001552,0.10000000000001552]|0.006666666666806454  0.006666666666806454  0.006666666666806454  
0.006666666666806454  0.006666666666806454

## Credits & Links

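- K-Means: http://spark.apache.org/docs/latest/ml-clustering.html#k-means
- Gaussian Mixture Model: http://spark.apache.org/docs/latest/ml-clustering.html#gaussian-mixture-model-gmm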