In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.0.tar.gz (281.3 MB)
[K     |████████████████████████████████| 281.3 MB 42 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 45.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.0-py2.py3-none-any.whl size=281764026 sha256=572963157579f1ca0c0e18a5f7ed3a4cd127a9d24c7d106a7ce21c088fcefba4
  Stored in directory: /root/.cache/pip/wheels/7a/8e/1b/f73a52650d2e5f337708d9f6a1750d451a7349a867f928b885
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.0


In [19]:
import pandas as pd

# Fixed seed so the shuffled row order written to disk is identical on every run.
SHUFFLE_SEED = 42

# Read the raw iris data (first row is the header), shuffle all rows, and
# renumber the index so the old row positions are discarded.
iris_raw = pd.read_csv('iris.csv', header=0)
ds = iris_raw.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

# Persist the shuffled copy without the index column; it is re-read with Spark below.
ds.to_csv('iris_shuffled.csv', index=False)

In [20]:
from pyspark.sql import SparkSession

# Get the active Spark session, or create one named 'clustering' if none exists.
spark = (
    SparkSession.builder
    .appName('clustering')
    .getOrCreate()
)

# Load the shuffled iris CSV: the first row holds column names and
# column types are inferred from the data.
df = (
    spark.read
    .option('header', True)
    .option('inferSchema', True)
    .csv('iris_shuffled.csv')
)

In [21]:
# Column names of the Spark DataFrame read from the CSV (quick sanity check of the header).
df.columns

['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']

In [22]:
# Print the distinct values present in the species column.
(
    df.select("species")
    .distinct()
    .show()
)

+---------------+
|        species|
+---------------+
| Iris-virginica|
|    Iris-setosa|
|Iris-versicolor|
+---------------+



In [23]:
# Swap each species name for a numeric string, restricted to the species column ...
df = df.replace(
    {
        'Iris-versicolor': '0',
        'Iris-setosa': '1',
        'Iris-virginica': '2',
    },
    subset='species',
)
# ... then cast those strings to integers so the column holds int codes.
df = df.withColumn("species", df["species"].cast('int'))
df.show()

+------------+-----------+------------+-----------+-------+
|sepal_length|sepal_width|petal_length|petal_width|species|
+------------+-----------+------------+-----------+-------+
|         5.0|        3.5|         1.3|        0.3|      1|
|         6.7|        3.3|         5.7|        2.5|      2|
|         5.2|        4.1|         1.5|        0.1|      1|
|         4.8|        3.1|         1.6|        0.2|      1|
|         4.7|        3.2|         1.3|        0.2|      1|
|         6.0|        2.9|         4.5|        1.5|      0|
|         5.1|        2.5|         3.0|        1.1|      0|
|         4.4|        2.9|         1.4|        0.2|      1|
|         4.9|        3.0|         1.4|        0.2|      1|
|         5.4|        3.4|         1.7|        0.2|      1|
|         5.7|        3.0|         4.2|        1.2|      0|
|         7.1|        3.0|         5.9|        2.1|      2|
|         6.7|        3.3|         5.7|        2.1|      2|
|         5.8|        2.7|         5.1| 

K-Means Clustering

In [24]:
from pyspark.ml.feature import VectorAssembler

# The four measurement columns that are packed into a single vector column.
features = (
    'sepal_length',
    'sepal_width',
    'petal_length',
    'petal_width',
)

# Append a "features" vector column built from the columns listed above.
assembler = VectorAssembler(
    outputCol="features",
    inputCols=features,
)
dataset = assembler.transform(df)

In [25]:
from pyspark.ml.clustering import KMeans
from sklearn.metrics import accuracy_score

# Seed the split so the same rows land in train/test on every run
# (for a given input row order and partitioning).
SPLIT_SEED = 1
trainingData, testData = dataset.randomSplit([0.8, 0.2], seed=SPLIT_SEED)

# Trains a k-means model
kmeans = KMeans().setK(3).setSeed(1)
model = kmeans.fit(trainingData)

# K-means cluster ids are arbitrary integers, so they cannot be compared with
# the species codes directly. Label each cluster with the majority species
# among the training rows assigned to it.
train_assignments = (
    model.transform(trainingData)
    .select('species', 'prediction')
    .toPandas()
)
cluster_to_species = (
    train_assignments.groupby('prediction')['species']
    .agg(lambda labels: labels.value_counts().idxmax())
    .to_dict()
)

# Make predictions
predictions = model.transform(testData)

# Kept as Spark DataFrames (raw species codes / raw cluster ids) for inspection.
true_labels=predictions.select('species')
km_predictions=predictions.select('prediction')

# Collect both columns in one pass and translate cluster ids into species codes.
# A cluster that received no training rows has no majority label; it is mapped
# to -1 so it always counts as a miss instead of producing NaN.
test_results = predictions.select('species', 'prediction').toPandas()
predicted_species = (
    test_results['prediction']
    .map(cluster_to_species)
    .fillna(-1)
    .astype(int)
)

accuracy = accuracy_score(test_results['species'], predicted_species)
# The test split is small (about 20% of the rows), so the score is sensitive to
# which rows end up in it.
print("KMeans Accuracy =",accuracy*100,"%")

KMeans Accuracy = 96.55172413793103 %


In [26]:
# Print the first rows of the test-set predictions (show() prints 20 rows by default),
# including the assembled feature vector and the cluster id assigned by the model.
predictions.show()

+------------+-----------+------------+-----------+-------+-----------------+----------+
|sepal_length|sepal_width|petal_length|petal_width|species|         features|prediction|
+------------+-----------+------------+-----------+-------+-----------------+----------+
|         4.3|        3.0|         1.1|        0.1|      1|[4.3,3.0,1.1,0.1]|         1|
|         4.5|        2.3|         1.3|        0.3|      1|[4.5,2.3,1.3,0.3]|         1|
|         4.8|        3.0|         1.4|        0.1|      1|[4.8,3.0,1.4,0.1]|         1|
|         4.8|        3.4|         1.9|        0.2|      1|[4.8,3.4,1.9,0.2]|         1|
|         5.0|        3.5|         1.6|        0.6|      1|[5.0,3.5,1.6,0.6]|         1|
|         5.1|        3.7|         1.5|        0.4|      1|[5.1,3.7,1.5,0.4]|         1|
|         5.3|        3.7|         1.5|        0.2|      1|[5.3,3.7,1.5,0.2]|         1|
|         5.5|        2.3|         4.0|        1.3|      0|[5.5,2.3,4.0,1.3]|         0|
|         5.5|       