# Ref: [A sample ML Pipeline for Clustering in Spark](https://blog.knoldus.com/2016/02/09/a-sample-ml-pipeline-for-clustering-in-spark/)

In [None]:
from pyspark.ml import Pipeline
from pyspark.ml.feature import  StringIndexer,  VectorIndexer,VectorAssembler
from pyspark.ml.clustering import KMeans
import random
import matplotlib.pyplot as plt
%matplotlib inline

# 產生sample data

In [None]:
def _uniform_samples(low, high, count=30):
    """Return a list of `count` floats drawn with random.uniform(low, high)."""
    return [random.uniform(low, high) for _ in range(count)]


# Coordinates for three groups of 30 points each, sampled in the same
# order as before (x then y, group by group).
x1 = _uniform_samples(-3, -1)       # left group: x in [-3, -1]
y1 = _uniform_samples(-0.5, 0.5)    #             y in [-0.5, 0.5]
x2 = _uniform_samples(-1, 1)        # middle group: x in [-1, 1]
y2 = _uniform_samples(0.5, 1.5)     #               y in [0.5, 1.5]
x3 = _uniform_samples(1, 3)         # right group: x in [1, 3]
y3 = _uniform_samples(-0.5, 0.5)    #              y in [-0.5, 0.5]

# Distribute every coordinate list as its own RDD.
p1x, p1y = sc.parallelize(x1), sc.parallelize(y1)
p2x, p2y = sc.parallelize(x2), sc.parallelize(y2)
p3x, p3y = sc.parallelize(x3), sc.parallelize(y3)

# Pair the x and y RDDs element by element into RDDs of (x, y) tuples.
p1 = p1x.zip(p1y)
p2 = p2x.zip(p2y)
p3 = p3x.zip(p3y)

# All 90 points in a single RDD, marked for caching.
data = p1.union(p2).union(p3).cache()


# 建立dataframe

In [None]:
# Turn the RDD of (x, y) tuples into a DataFrame whose columns are named
# "x" and "y" (column types are inferred from the data).
pointDF = data.toDF(["x", "y"])

# Run a full count over the DataFrame; the returned number is not kept.
pointDF.count()

# Print the first five rows.
pointDF.show(5)

# 建立kmeans pipeline

In [None]:
# Stage 1: merge the "x" and "y" columns into one vector column named
# "features", which is the column the KMeans stage is told to read.
vectorAssembler = VectorAssembler(inputCols=["x", "y"], outputCol="features")

# Stage 2: KMeans with three clusters and a fixed seed, writing the
# assigned cluster index to "prediction".
# The seed is a plain int: the former `1L` long literal exists only in
# Python 2 and is a SyntaxError on Python 3, while `1` works on both.
kmeans = (
    KMeans()
    .setK(3)
    .setSeed(1)
    .setFeaturesCol("features")
    .setPredictionCol("prediction")
)

# Chain both stages so a single fit()/transform() runs them in order.
km_pipeline = Pipeline(stages=[vectorAssembler, kmeans])

# training

In [None]:
# Fit the pipeline (vector assembly followed by KMeans) on the sample points.
km_pipelineModel = km_pipeline.fit(pointDF)

# prediction

In [None]:
# Apply the fitted pipeline to the same points it was trained on; the KMeans
# stage was configured to write its cluster index to the "prediction" column.
predictionResult = km_pipelineModel.transform(pointDF)

In [None]:
# Print the first 10 rows of the prediction result.
predictionResult.show(10)

# 轉成Pandas dataframe呈現

In [None]:
# Convert the Spark DataFrame into a pandas DataFrame for local plotting.
# NOTE(review): toPandas() loads every row into driver memory, which is fine
# for this small sample but should be confirmed before reusing on larger data.
pdf = predictionResult.toPandas()
# Bare expression on the cell's last line so the notebook renders the frame.
pdf

In [None]:
# Draw each predicted cluster in its own color on one shared set of axes.
ax = None  # the first scatter call creates the axes; later calls reuse them
for cluster_id, cluster_color in [(0, 'Green'), (1, 'Red'), (2, 'Blue')]:
    ax = pdf[pdf.prediction == cluster_id].plot.scatter(
        x='x', y='y', color=cluster_color, ax=ax)