In [1]:
from pyspark.sql import SparkSession

In [2]:
# Start (or reuse) the Spark session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Kmeans")
    .getOrCreate()
)

In [3]:
# Load the seeds CSV: the first row supplies the column names and Spark
# infers each column's type from the data.
dataset = spark.read.csv(
    './data/seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [4]:
# Print the column names and inferred types to confirm the CSV parsed as expected.
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [5]:
# Peek at the first row to sanity-check the loaded values.
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [8]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [9]:
# List the column names; these are the inputs handed to the VectorAssembler below.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [12]:
# Pack the seven numeric measurement columns into a single vector column,
# 'features', which the scaler later reads as its input column.
assembler = VectorAssembler(
    inputCols=[
        'area',
        'perimeter',
        'compactness',
        'length_of_kernel',
        'width_of_kernel',
        'asymmetry_coefficient',
        'length_of_groove',
    ],
    outputCol='features',
)

In [13]:
# Append the assembled 'features' vector column to every row of the dataset.
# The transformer defined above is named `assembler`; the previous spelling
# `assember` raised a NameError on a fresh kernel.
final_data = assembler.transform(dataset)

In [14]:
from pyspark.ml.feature import StandardScaler

In [15]:
# Scaler that reads the assembled 'features' vector and writes 'scaledFeatures'.
# Only the column names are set here; every other setting keeps its default.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
)

In [16]:
# Fit the scaler on the assembled data; the fitted model is applied in the next cell.
scaler_model = scaler.fit(final_data)

In [19]:
# Add the 'scaledFeatures' column produced by the fitted scaler.
# `final_data` is reassigned from itself, so re-running this cell would
# otherwise fail because the output column already exists. Dropping any
# existing 'scaledFeatures' first makes the cell idempotent; on the first
# run the column is absent and the drop is a no-op.
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [20]:
# K-means estimator with k=3 clusters, reading the 'scaledFeatures' column.
# K-means starts from a random initialisation, so an explicit seed is set to
# make the cluster centers and label numbering reproducible across
# Restart & Run All.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=1)

In [21]:
# Fit k-means on the data; the estimator reads the 'scaledFeatures' column.
model = kmeans.fit(final_data)

In [23]:
# Retrieve the fitted cluster centers (one array per cluster). The model was
# fit on 'scaledFeatures', so these coordinates are in the scaled feature
# space, not the raw measurement units.
centers = model.clusterCenters()

In [24]:
# Display the cluster centers, one array of seven values per cluster.
print(centers)

[array([ 6.32636687, 12.38115343, 37.39222755, 13.9206997 ,  9.75485787,
        2.41428142, 12.28078861]), array([ 4.078007  , 10.15076404, 35.87686106, 11.81860981,  7.5430707 ,
        3.17727834, 10.39174095]), array([ 4.9360523 , 10.94499696, 37.33487983, 12.40173794,  8.61516278,
        1.7804233 , 10.36535821])]


In [25]:
# Assign every row to a cluster and display the predicted labels
# (show() prints only the first 20 rows).
(
    model.transform(final_data)
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

