In [1]:
from pyspark.sql import SparkSession

In [2]:
spark=SparkSession.builder.appName('clustering').getOrCreate()

In [4]:
input_file_path="file:///C:/Users/ckp43_000/Documents/seeds_dataset.csv"

In [5]:
data=spark.read.csv(input_file_path,inferSchema=True,header=True)

In [6]:
data.show(5)

+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+
| ID| area|perimeter|compactness|lengthOfKernel|widthOfKernel|asymmetryCoefficient|lengthOfKernelGroove|seedType|
+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+
|  1|15.26|    14.84|      0.871|         5.763|        3.312|               2.221|                5.22|       1|
|  2|14.88|    14.57|     0.8811|         5.554|        3.333|               1.018|               4.956|       1|
|  3|14.29|    14.09|      0.905|         5.291|        3.337|               2.699|               4.825|       1|
|  4|13.84|    13.94|     0.8955|         5.324|        3.379|               2.259|               4.805|       1|
|  5|16.14|    14.99|     0.9034|         5.658|        3.562|               1.355|               5.175|       1|
+---+-----+---------+-----------+--------------+-------------+--------------------+-----

In [8]:
from pyspark.ml.clustering import KMeans,KMeansModel

In [9]:
from pyspark.ml.feature import VectorAssembler

In [11]:
data.columns

['ID',
 'area',
 'perimeter',
 'compactness',
 'lengthOfKernel',
 'widthOfKernel',
 'asymmetryCoefficient',
 'lengthOfKernelGroove',
 'seedType']

In [13]:
assembler=VectorAssembler(inputCols=['area',
 'perimeter',
 'compactness',
 'lengthOfKernel',
 'widthOfKernel',
 'asymmetryCoefficient',
 'lengthOfKernelGroove'],outputCol='features')

In [14]:
final_data=assembler.transform(data)

In [15]:
final_data.show(5)

+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+--------------------+
| ID| area|perimeter|compactness|lengthOfKernel|widthOfKernel|asymmetryCoefficient|lengthOfKernelGroove|seedType|            features|
+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+--------------------+
|  1|15.26|    14.84|      0.871|         5.763|        3.312|               2.221|                5.22|       1|[15.26,14.84,0.87...|
|  2|14.88|    14.57|     0.8811|         5.554|        3.333|               1.018|               4.956|       1|[14.88,14.57,0.88...|
|  3|14.29|    14.09|      0.905|         5.291|        3.337|               2.699|               4.825|       1|[14.29,14.09,0.90...|
|  4|13.84|    13.94|     0.8955|         5.324|        3.379|               2.259|               4.805|       1|[13.84,13.94,0.89...|
|  5|16.14|    14.99|     0.9034|         5.658|       

In [16]:
from pyspark.ml.feature import StandardScaler

In [17]:
scaler=StandardScaler(inputCol='features',outputCol='scaledFeatures')

In [22]:
scaler_model=scaler.fit(final_data)

In [23]:
final_data=scaler_model.transform(final_data)

In [25]:
final_data.head(1)

[Row(ID=1, area=15.26, perimeter=14.84, compactness=0.871, lengthOfKernel=5.763, widthOfKernel=3.312, asymmetryCoefficient=2.221, lengthOfKernelGroove=5.22, seedType=1, features=DenseVector([15.26, 14.84, 0.871, 5.763, 3.312, 2.221, 5.22]), scaledFeatures=DenseVector([5.2445, 11.3633, 36.8608, 13.0072, 8.7685, 1.4772, 10.621]))]

In [26]:
kmeans=KMeans(featuresCol='scaledFeatures',k=3)

In [29]:
model=kmeans.fit(final_data)

In [30]:
wssse=model.computeCost(final_data)

AttributeError: 'KMeansModel' object has no attribute 'computeCost'

In [35]:
centers=model.clusterCenters()

In [36]:
centers

[array([ 4.96198582, 10.97871333, 37.30930808, 12.44647267,  8.62880781,
         1.80062386, 10.41913733]),
 array([ 6.35645488, 12.40730852, 37.41990178, 13.93860446,  9.7892399 ,
         2.41585309, 12.29286107]),
 array([ 4.07497225, 10.14410142, 35.89816849, 11.80812742,  7.54416916,
         3.15411286, 10.38031464])]

In [38]:
model.transform(final_data).show(5)

+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+--------------------+--------------------+----------+
| ID| area|perimeter|compactness|lengthOfKernel|widthOfKernel|asymmetryCoefficient|lengthOfKernelGroove|seedType|            features|      scaledFeatures|prediction|
+---+-----+---------+-----------+--------------+-------------+--------------------+--------------------+--------+--------------------+--------------------+----------+
|  1|15.26|    14.84|      0.871|         5.763|        3.312|               2.221|                5.22|       1|[15.26,14.84,0.87...|[5.24452795332028...|         0|
|  2|14.88|    14.57|     0.8811|         5.554|        3.333|               1.018|               4.956|       1|[14.88,14.57,0.88...|[5.11393027165175...|         0|
|  3|14.29|    14.09|      0.905|         5.291|        3.337|               2.699|               4.825|       1|[14.29,14.09,0.90...|[4.91116018695588...|         0

In [40]:
model.transform(final_data).select('prediction').show()

+----------+
|prediction|
+----------+
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         1|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         0|
|         2|
+----------+
only showing top 20 rows

