# Clustering

## K-Means

In [2]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

In [17]:
# Reuse an already-running SparkContext when this cell is executed again;
# constructing SparkContext(...) a second time in the same kernel raises
# "Cannot run multiple SparkContexts at once".
conf = SparkConf().setMaster("local").setAppName("sqlContext")
spark = SparkContext.getOrCreate(conf)

# SQLContext supplies the DataFrame reader (`sql.read`) used by the later cells.
sql = SQLContext(spark)

In [18]:
# Directory holding the MLlib sample data files; the trailing slash matters
# because file names are appended to it with plain string concatenation.
path: str = "D:/ProgramFiles/Spark/spark-3.0.0-bin-hadoop2.7/data/mllib/"

In [10]:
# Read the K-Means sample file with the "libsvm" data source.
kmeans_data_file = path + "sample_kmeans_data.txt"
libsvm_reader = sql.read.format("libsvm")
df = libsvm_reader.load(kmeans_data_file)

# Convert to pandas so the notebook renders the rows as a table.
df.toPandas()

Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0)"
1,1.0,"(0.1, 0.1, 0.1)"
2,2.0,"(0.2, 0.2, 0.2)"
3,3.0,"(9.0, 9.0, 9.0)"
4,4.0,"(9.1, 9.1, 9.1)"
5,5.0,"(9.2, 9.2, 9.2)"


#### Build the model

In [11]:
# Configure K-Means for two clusters with a fixed seed.
kmeans = KMeans().setK(2).setSeed(1)

# Train on `df`, the DataFrame loaded in the cell above. The previous argument,
# `dataset`, is never defined in this notebook and fails with NameError on a
# fresh kernel.
model = kmeans.fit(df)

# Print the learned cluster centers, one per line.
centers = model.clusterCenters()
print("Cluster Centers: ")
for center in centers:
    print(center)

Cluster Centers: 
[9.1 9.1 9.1]
[0.1 0.1 0.1]


#### Test

In [13]:
# Run the fitted model over the same DataFrame; the result carries an extra
# `prediction` column with each row's cluster index.
pred = model.transform(dataset=df)

# Render the predictions as a pandas table.
pred.toPandas()

Unnamed: 0,label,features,prediction
0,0.0,"(0.0, 0.0, 0.0)",1
1,1.0,"(0.1, 0.1, 0.1)",1
2,2.0,"(0.2, 0.2, 0.2)",1
3,3.0,"(9.0, 9.0, 9.0)",0
4,4.0,"(9.1, 9.1, 9.1)",0
5,5.0,"(9.2, 9.2, 9.2)",0


#### Validate

In [14]:
# Score the clustering with ClusteringEvaluator using its default settings.
# The evaluator is named `evaluator` rather than `eval` so that Python's
# built-in eval() is not shadowed for the rest of the kernel session.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(pred)

# print() already stringifies its arguments, so no explicit str() is needed;
# the emitted text is unchanged.
print('Silhouette = ', silhouette)

Silhouette =  0.9997530305375207


## Gaussian Mixture Model

In [16]:
from pyspark.ml.clustering import GaussianMixture

In [19]:
# Load the same sample file again with the "libsvm" data source for the
# Gaussian mixture section.
kmeans_data_file = path + "sample_kmeans_data.txt"
libsvm_reader = sql.read.format("libsvm")
df = libsvm_reader.load(kmeans_data_file)

# Convert to pandas so the notebook renders the rows as a table.
df.toPandas()

Unnamed: 0,label,features
0,0.0,"(0.0, 0.0, 0.0)"
1,1.0,"(0.1, 0.1, 0.1)"
2,2.0,"(0.2, 0.2, 0.2)"
3,3.0,"(9.0, 9.0, 9.0)"
4,4.0,"(9.1, 9.1, 9.1)"
5,5.0,"(9.2, 9.2, 9.2)"


In [20]:
# Two-component Gaussian mixture with a fixed seed.
gmm = GaussianMixture(k=2, seed=538009335)
model = gmm.fit(df)

# Show the fitted components (the `mean` and `cov` columns) without truncating
# the wide cell contents.
print("Gaussians shown as a DataFrame: ")
gaussians = model.gaussiansDF
gaussians.show(truncate=False)

Gaussians shown as a DataFrame: 
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|mean                                                         |cov                                                                                                                                                                                                     |
+-------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|[0.10000000000001552,0.10000000000001552,0.10000000000001552]|0.006666666666806454  0.006666666666806454  0.006666666666806454  
0.006666666666806454  0.006666666666806454

In [15]:
# Shut down the SparkContext created at the top of the notebook.
spark.stop()

## Credits & Links

http://spark.apache.org/docs/latest/ml-clustering.html#k-means