In [1]:
!sudo apt update
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# Check this site for the latest download link
# https://www.apache.org/dyn/closer.lua/spark
!wget -q https://dlcdn.apache.org/spark/spark-3.5.2/spark-3.5.2-bin-hadoop3.tgz
!tar xf spark-3.5.2-bin-hadoop3.tgz
!pip install -q findspark
!pip install pyspark
!pip install py4j

Hit:1 http://archive.ubuntu.com/ubuntu jammy InRelease
Get:2 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [128 kB]
Get:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease [3,626 B]
Get:4 http://security.ubuntu.com/ubuntu jammy-security InRelease [129 kB]
Get:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [127 kB]
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Get:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease [6,555 B]
Hit:8 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:9 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/ubuntugis/ppa/ubuntu jammy InRelease
Get:11 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1,452 kB]
Get:12 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2,700 kB]
Get:13 https://r2u.stat.illinois.edu/ubuntu jammy/main all Packages 

In [2]:
from google.colab import drive

# Mount Google Drive so files stored there are reachable under this directory.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [20]:
from pyspark.sql import SparkSession
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
from pyspark.sql import Row
from pyspark.ml.feature import VectorAssembler

# Create the Spark session for this notebook (or reuse one that already exists).
spark = (
    SparkSession.builder
    .appName("Kmeans-Example")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Only report errors, to keep the cell output free of Spark log chatter.
spark.sparkContext.setLogLevel("ERROR")

# Load the wine data set from the mounted Drive; the first row holds the column
# names and column types are inferred from the data.
WINE_CSV_PATH = "/content/drive/MyDrive/Colab Notebooks (1)/datasets/wine-clustering.csv"
wines_Df = spark.read.csv(WINE_CSV_PATH, header=True, inferSchema=True)

In [21]:
# Preview the loaded rows (20, the default row count) without truncating values.
wines_Df.show(n=20, truncate=False)

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|Alcohol|Malic_Acid|Ash |Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity|Hue |OD280|Proline|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|14.23  |1.71      |2.43|15.6        |127      |2.8          |3.06      |0.28                |2.29           |5.64           |1.04|3.92 |1065   |
|13.2   |1.78      |2.14|11.2        |100      |2.65         |2.76      |0.26                |1.28           |4.38           |1.05|3.4  |1050   |
|13.16  |2.36      |2.67|18.6        |101      |2.8          |3.24      |0.3                 |2.81           |5.68           |1.03|3.17 |1185   |
|14.37  |1.95      |2.5 |16.8        |113      |3.85         |3.49      |0.24                |2.18           |7.8           

In [22]:
# Combine the numeric wine attributes into a single "features" vector column.
# NOTE(review): the dataset also has a "Magnesium" column that is not listed
# here — confirm whether the omission is intentional; adding it would change
# every score and cluster center printed below.
FEATURE_COLS = [
    "Alcohol", "Malic_Acid", "Ash", "Ash_Alcanity", "Total_Phenols",
    "Flavanoids", "Nonflavanoid_Phenols", "Proanthocyanins",
    "Color_Intensity", "Hue", "OD280", "Proline",
]
assembler = VectorAssembler(inputCols=FEATURE_COLS, outputCol="features")
assembled_df = assembler.transform(wines_Df)

# Cluster counts to compare. Every run uses the same seed so results are repeatable.
K_VALUES = (2, 10, 15, 20)
KMEANS_SEED = 1

# Evaluate clustering by computing the Silhouette score.
evaluator = ClusteringEvaluator()

# Estimator, fitted model and predictions for each k, keyed by the cluster count.
estimators, models, predictions = {}, {}, {}

for k in K_VALUES:
    # Fit K-Means with k clusters and assign every row to a cluster.
    estimators[k] = KMeans().setK(k).setSeed(KMEANS_SEED)
    models[k] = estimators[k].fit(assembled_df)
    predictions[k] = models[k].transform(assembled_df)

    silhouette = evaluator.evaluate(predictions[k])
    print(f"Silhouette score: {silhouette}")

    # Show the result
    print("Cluster Centers: ")
    for center in models[k].clusterCenters():
        print(center)

# Per-k names from the original version of this cell, kept so that any other
# cell referring to them keeps working. Update these if K_VALUES changes.
kmeans_2, kmeans_10, kmeans_15, kmeans_20 = (estimators[n] for n in (2, 10, 15, 20))
model_2, model_10, model_15, model_20 = (models[n] for n in (2, 10, 15, 20))
predictions_2, predictions_10, predictions_15, predictions_20 = (
    predictions[n] for n in (2, 10, 15, 20)
)

# Stop the Spark session.
# NOTE: after this runs, the session-creation cell must be re-run before any
# Spark cell (including this one) can be executed again.
spark.stop()

Silhouette score: 0.8223063951375856
Cluster Centers: 
[1.27028455e+01 2.54455285e+00 2.33910569e+00 2.04081301e+01
 2.06211382e+00 1.64146341e+00 3.92682927e-01 1.45406504e+00
 4.85138211e+00 9.08617886e-01 2.40821138e+00 5.65869919e+02]
[1.36665455e+01 1.87072727e+00 2.42781818e+00 1.74527273e+01
 2.81618182e+00 2.89654545e+00 2.92909091e-01 1.89690909e+00
 5.52036364e+00 1.06665455e+00 3.06672727e+00 1.15172727e+03]
Silhouette score: 0.7315300014998459
Cluster Centers: 
[1.29443333e+01 2.67166667e+00 2.36400000e+00 1.94300000e+01
 2.09400000e+00 1.49666667e+00 4.17333333e-01 1.53100000e+00
 5.91200000e+00 8.82333333e-01 2.24433333e+00 7.01566667e+02]
[1.37623529e+01 1.78058824e+00 2.54058824e+00 1.73588235e+01
 2.83294118e+00 2.97588235e+00 3.08235294e-01 1.82352941e+00
 5.91647059e+00 1.09529412e+00 3.03823529e+00 1.27088235e+03]
[1.23835714e+01 1.87928571e+00 2.26000000e+00 1.99500000e+01
 2.30785714e+00 1.94071429e+00 3.65714286e-01 1.54285714e+00
 3.61714286e+00 1.10428571e+00 2