In [4]:
from pyspark.sql import SparkSession
# Obtain the Spark session for this notebook: getOrCreate() reuses a session
# that is already running, otherwise it starts a new one named 'cluster'.
spark = (
    SparkSession.builder
    .appName('cluster')
    .getOrCreate()
)

In [5]:
# Load the seeds measurements from CSV into a Spark DataFrame.
data = spark.read.csv(
    'seeds_dataset.csv',
    header=True,       # first line of the file supplies the column names
    inferSchema=True,  # let Spark detect column types instead of reading strings
)

In [6]:
# Show the column names and the types Spark inferred, to confirm the CSV was parsed as expected.
data.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)



In [8]:
# Peek at the first record; head(1) returns a one-element list of Row objects.
data.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)]

In [9]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

In [10]:
# List the column names; every one of them is fed to the VectorAssembler below.
data.columns

['area',
 'perimeter',
 'compactness',
 'length_of_kernel',
 'width_of_kernel',
 'asymmetry_coefficient',
 'length_of_groove']

In [13]:
# Combine all columns of `data` into one vector column called 'features',
# which is the input format the Spark ML estimators used below expect.
assembler = VectorAssembler(
    inputCols=data.columns,
    outputCol='features',
)

# Apply the assembler and check that the new 'features' column was appended.
output = assembler.transform(data)
output.printSchema()

root
 |-- area: double (nullable = true)
 |-- perimeter: double (nullable = true)
 |-- compactness: double (nullable = true)
 |-- length_of_kernel: double (nullable = true)
 |-- width_of_kernel: double (nullable = true)
 |-- asymmetry_coefficient: double (nullable = true)
 |-- length_of_groove: double (nullable = true)
 |-- features: vector (nullable = true)



In [14]:
from pyspark.ml.feature import StandardScaler

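Scale the data set so that every feature is on a comparable range: each feature is divided by its standard deviation, so that attributes with larger magnitudes do not dominate the distance computation used by k-means.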

In [16]:
# Scaler that reads the assembled 'features' vector and writes 'scaledFeatures'.
# withStd/withMean are spelled out at their default values: scale each feature
# to unit standard deviation, without centering on the mean.
scaler = StandardScaler(
    inputCol='features',
    outputCol='scaledFeatures',
    withStd=True,
    withMean=False,
)

In [17]:
# Learn the scaling statistics from `output`, then apply them to the same
# DataFrame; the result carries the extra 'scaledFeatures' column.
mldata = (
    scaler
    .fit(output)
    .transform(output)
)

In [18]:
# Inspect one row to compare the raw 'features' vector with its 'scaledFeatures' counterpart.
mldata.head(1)

[Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [21]:
# K-means estimator over the scaled feature vectors, with k=3 clusters.
# Centroid initialisation is randomised, so an explicit seed is set to keep
# cluster ids, centers and the reported cost reproducible across kernel restarts.
kmean = KMeans(featuresCol='scaledFeatures', k=3, seed=1)

In [22]:
# Fit k-means on the scaled data; `model` holds the learned cluster centers.
model = kmean.fit(mldata)

In [23]:
# Report the within-set sum of squared errors (distance of each point to its
# nearest center). KMeansModel.computeCost() was deprecated in Spark 2.4 and
# removed in Spark 3.0, so use it only where it still exists and otherwise
# read the equivalent training cost from the model summary.
print("With in Sum of Squared Errors :")
if hasattr(model, "computeCost"):
    print(model.computeCost(mldata))
else:
    # NOTE(review): trainingCost is the cost recorded during fitting on the
    # training data (here `mldata`); confirm it matches expectations.
    print(model.summary.trainingCost)

With in Sum of Squared Errors :
428.60820118716356


In [26]:
# Print every learned cluster center, numbered from 1.
# enumerate() replaces the hand-maintained counter, and the loop variable is
# no longer called `np`, which would take over the conventional numpy alias
# in the kernel namespace for every later cell.
print("Cluster centers :")
centers = model.clusterCenters()
for idx, center in enumerate(centers, 1):
    print("Center {} : {}".format(idx, center))

Cluster centers :
Center 1 : [  6.35645488  12.40730852  37.41990178  13.93860446   9.7892399
   2.41585013  12.29286107]
Center 2 : [  4.96198582  10.97871333  37.30930808  12.44647267   8.62880781
   1.80061978  10.41913733]
Center 3 : [  4.07497225  10.14410142  35.89816849  11.80812742   7.54416916
   3.15410901  10.38031464]


In [29]:
# Assign each row to its nearest cluster and display only the resulting
# 'prediction' column (show() prints the first 20 rows).
print("Prediction by the model: ")
(
    model
    .transform(mldata)
    .select('prediction')
    .show()
)

Prediction by the model: 
+----------+
|prediction|
+----------+
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         0|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         1|
|         2|
+----------+
only showing top 20 rows

