# Clustering example

In [2]:
import findspark

In [3]:
import os
# Locate the Spark install from the SPARK_HOME environment variable so the
# notebook is not tied to one user's machine; the original local path is kept
# as the fallback when SPARK_HOME is unset.
findspark.init(os.environ.get('SPARK_HOME', '/home/oussama/spark-2.4.0-bin-hadoop2.7'))

In [4]:
from pyspark.sql import SparkSession
# Create the Spark session for this notebook (or reuse a running one),
# registered under the application name 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [5]:
# Load 'seeds_dataset.csv' into a DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
dataset = spark.read.csv(
    'seeds_dataset.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Show the column names and the types inferred while reading the CSV.
dataset.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [7]:
# Peek at the first row to sanity-check the parsed values.
dataset.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [8]:
from pyspark.ml.clustering import KMeans

In [9]:
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names; every one of them is fed to the VectorAssembler below.
dataset.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [11]:
# Assembler that packs every column of `dataset` into a single vector column
# named 'features'.
assembler = VectorAssembler(
    outputCol='features',
    inputCols=dataset.columns,
)

In [12]:
# Apply the assembler: the result keeps the original columns and gains the
# 'features' vector column.
final_data = assembler.transform(dataset)

In [13]:
# Confirm the schema now ends with the 'features' vector column.
final_data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
from pyspark.ml.feature import StandardScaler

In [17]:
# Scaler that reads 'features' and writes 'scaledFeatures'. withMean/withStd
# are not overridden, so StandardScaler's defaults apply.
scaler = StandardScaler(inputCol='features', outputCol='scaledFeatures')

In [18]:
# Fit the scaler to `final_data`, producing the model that is applied to the
# same data in the next cell.
scaler_model = scaler.fit(final_data)

In [19]:
# Add the 'scaledFeatures' column. `final_data` is rebound from itself, so a
# plain re-run of this cell would fail because the output column already
# exists; dropping it first (a no-op on the first run) keeps the cell idempotent.
final_data = scaler_model.transform(final_data.drop('scaledFeatures'))

In [20]:
# Inspect the first row: it now carries both 'features' and 'scaledFeatures'.
final_data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [21]:
# K-means estimator with k=3 clusters, trained on the scaled feature vectors.
# The seed is pinned so centroid initialisation (and therefore cluster
# numbering and the reported cost) is reproducible across kernel restarts.
kmeans = KMeans(featuresCol='scaledFeatures', k=3, seed=1)

In [32]:
# Fit k-means on `final_data` to obtain the clustering model.
model = kmeans.fit(final_data)

In [33]:
# Within Set Sum of Squared Errors of the fitted model. The cost is wanted on
# the same data the model was trained on, so it is read from the training
# summary instead of calling KMeansModel.computeCost, which is deprecated in
# Spark 2.4 and removed in Spark 3.0.
print('wssse')
print(model.summary.trainingCost)

wssse
429.07559671506715


In [34]:
# Retrieve the fitted cluster centers (one array per cluster). The model was
# trained on 'scaledFeatures', so the coordinates are in the scaled space.
centers = model.clusterCenters()

In [35]:
# Print the raw center coordinates for inspection.
print(centers)

[array([ 6.31670546, 12.37109759, 37.39491396, 13.91155062,  9.748067  ,
        2.39849968, 12.2661748 ]), array([ 4.06105916, 10.13979506, 35.80536984, 11.82133095,  7.50395937,
        3.27184732, 10.42126018]), array([ 4.87257659, 10.88120146, 37.27692543, 12.3410157 ,  8.55443412,
        1.81649011, 10.32998598])]


In [38]:
# Assign every row to a cluster and display the predicted cluster index for
# the first 20 rows.
(
    model
    .transform(final_data)
    .select('prediction')
    .show()
)

+----------+
|prediction|
+----------+
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         0|
|         0|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         2|
|         1|
+----------+
only showing top 20 rows

