### Dataset: `seeds_dataset.csv` is available at https://github.com/farhanafayez/PySpark-K-means-Clustering-ML/blob/master/seeds_dataset.csv

In [2]:
from pyspark.sql import SparkSession

In [3]:
# Start (or reuse) a Spark session named 'cluster' for this notebook.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

# Load the CSV: the first line supplies the column names and the column
# types are inferred from the data.
df = spark.read.csv(
    'seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

# Preview the first three rows.
df.show(3)

+-----+---------+-----------+-----------------+------------------+---------------------+----------------+
| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+
|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|
|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|
|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+
only showing top 3 rows



In [4]:
# Print the column names and the types inferred while reading the CSV.
df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.clustering import KMeans

# Pack every column of `df` into a single vector column named 'features',
# which is the input format the Spark ML estimators below consume.
assembler = VectorAssembler(
    inputCols=df.columns,
    outputCol='features',
)
final_df = assembler.transform(df)

# Preview the first three rows, now including the assembled vector.
final_df.show(3)

+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+
| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+
|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|
|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|
|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+
only showing top 3 rows



In [6]:
# Print the schema of the assembled DataFrame; the 'features' vector column
# added by the VectorAssembler should be listed after the original columns.
final_df.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [7]:
from pyspark.ml.feature import StandardScaler

# Rescale the assembled feature vector so that features measured on very
# different ranges contribute comparably to the distances K-means computes.
scaler = StandardScaler(inputCol = 'features', outputCol = 'scaledFeatures')

# Make the cell idempotent: it overwrites `final_df` with its own output, so
# on a re-run the input would already carry 'scaledFeatures' and Spark would
# reject the existing output column. Dropping a column that is not present is
# a no-op, so the first run is unaffected.
assembled_df = final_df.drop('scaledFeatures')

scaler_model = scaler.fit(assembled_df)
final_df = scaler_model.transform(assembled_df)

# Preview the first three rows with both the raw and the scaled vectors.
final_df.show(3)

+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+
| area|perimeter|compactness| length_of_kernel|   width_of_kernel|asymmetry_coefficient|length_of_groove|            features|      scaledFeatures|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+--------------------+--------------------+
|15.26|    14.84|      0.871|            5.763|             3.312|                2.221|            5.22|[15.26,14.84,0.87...|[5.24452795332028...|
|14.88|    14.57|     0.8811|5.553999999999999|             3.333|                1.018|           4.956|[14.88,14.57,0.88...|[5.11393027165175...|
|14.29|    14.09|      0.905|            5.291|3.3369999999999997|                2.699|           4.825|[14.29,14.09,0.90...|[4.91116018695588...|
+-----+---------+-----------+-----------------+------------------+---------------------+----------------+-------

In [8]:
# Fetch the first row as a list of Row objects so the raw 'features' vector
# and its 'scaledFeatures' counterpart can be compared without truncation.
final_df.take(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [9]:
# Cluster the scaled feature vectors into k=3 groups.
# K-means starts from randomly chosen centroids; pinning the seed keeps the
# cluster ids, centres and WSSSE reported below repeatable across kernel
# restarts (without it they may differ from run to run).
kmeans = KMeans(featuresCol = 'scaledFeatures', k=3, seed=42)
model = kmeans.fit(final_df)

In [10]:
# Within-Set Sum of Squared Errors: the sum of squared distances from each
# point to its nearest cluster centre (lower means tighter clusters).
# `KMeansModel.computeCost` was deprecated in Spark 2.4 and removed in 3.0,
# so use it only where it still exists and otherwise read the equivalent
# value for the training data from the model's training summary.
if hasattr(model, 'computeCost'):
    wssse = model.computeCost(final_df)
else:
    wssse = model.summary.trainingCost
print('WSSSE:', wssse)

WSSSE: 428.6333432285446


In [11]:
# Retrieve the fitted cluster centres (one array per cluster).
# The model was trained on 'scaledFeatures', so these coordinates are in the
# scaled feature space, not in the original measurement units.
centers = model.clusterCenters()
print(centers)

[array([ 4.07135818, 10.14438097, 35.86461803, 11.81349589,  7.53471695,
        3.18317127, 10.39230304]), array([ 4.94114963, 10.95557919, 37.3028184 , 12.42383591,  8.60815545,
        1.80983376, 10.40657797]), array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
        2.41585013, 12.29286107])]


In [12]:
# Assign every row to its nearest cluster; the model adds a 'prediction'
# column holding the cluster index.
clustered = model.transform(final_df)

# Keep only the scaled vector and its assigned cluster, then display the
# default number of rows.
assignments = clustered.select('scaledFeatures', 'prediction')
assignments.show()

+--------------------+----------+
|      scaledFeatures|prediction|
+--------------------+----------+
|[5.24452795332028...|         1|
|[5.11393027165175...|         1|
|[4.91116018695588...|         1|
|[4.75650503761158...|         1|
|[5.54696468981581...|         1|
|[4.94209121682475...|         1|
|[5.04863143081749...|         1|
|[4.84929812721816...|         1|
|[5.71536696354628...|         2|
|[5.65006812271202...|         1|
|[5.24452795332028...|         1|
|[4.82180387844584...|         1|
|[4.77368894309428...|         1|
|[4.73588435103234...|         1|
|[4.72213722664617...|         1|
|[5.01426361985209...|         1|
|[4.80805675405968...|         1|
|[5.39230954047151...|         1|
|[5.05206821191403...|         1|
|[4.37158555479908...|         0|
+--------------------+----------+
only showing top 20 rows

