# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Lab 13 - Clustering with K-Means** </center>
---
## <center> **Big Data** </center>
---
### <center> **Spring 2025** </center>
---
### <center> **05/11/2025** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Team**: Foraneos

**Students**: Eddie, Konrad 

In [2]:
import findspark
findspark.init()

#### Spark Session creation

In [3]:
from pyspark.sql import SparkSession

konrad_port = "0638c7435d1d"
eddie_port = "8776010e8f6a"

spark = SparkSession.builder \
    .appName("MLSpark-K-means") \
    .master("spark://{}:7077".format(eddie_port)) \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/12 00:27:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Data preparation

In [4]:
from foraneos.spark_utils import SparkUtils as SpU

wine_schema = SpU.generate_schema([
    ("alcohol", "float", True),                      # Alcohol content
    ("malic_acid", "float", True),                   # Malic acid content
    ("ash", "float", True),                          # Ash content
    ("alcalinity_of_ash", "float", True),            # Alcalinity of ash
    ("magnesium", "int", True),                      # Magnesium content
    ("total_phenols", "float", True),                # Total phenols
    ("flavanoids", "float", True),                   # Flavanoids content
    ("nonflavanoid_phenols", "float", True),         # Nonflavanoid phenols
    ("proanthocyanins", "float", True),              # Proanthocyanins
    ("color_intensity", "float", True),              # Color intensity
    ("hue", "float", True),                          # Hue
    ("od280_od315_of_diluted_wines", "float", True), # OD280/OD315 of diluted wines
    ("proline", "int", True),                        # Proline content
    ("class", "int", True)                           # Class/cultivar identifier (1, 2, or 3)
])

df = spark.read \
                .schema(wine_schema) \
                .option("header", "true") \
                .csv("/home/jovyan/notebooks/data/wine-clustering.csv")

df.printSchema()

df.show(10, truncate=False)

root
 |-- alcohol: float (nullable = true)
 |-- malic_acid: float (nullable = true)
 |-- ash: float (nullable = true)
 |-- alcalinity_of_ash: float (nullable = true)
 |-- magnesium: integer (nullable = true)
 |-- total_phenols: float (nullable = true)
 |-- flavanoids: float (nullable = true)
 |-- nonflavanoid_phenols: float (nullable = true)
 |-- proanthocyanins: float (nullable = true)
 |-- color_intensity: float (nullable = true)
 |-- hue: float (nullable = true)
 |-- od280_od315_of_diluted_wines: float (nullable = true)
 |-- proline: integer (nullable = true)
 |-- class: integer (nullable = true)



                                                                                

+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+
|alcohol|malic_acid|ash |alcalinity_of_ash|magnesium|total_phenols|flavanoids|nonflavanoid_phenols|proanthocyanins|color_intensity|hue |od280_od315_of_diluted_wines|proline|class|
+-------+----------+----+-----------------+---------+-------------+----------+--------------------+---------------+---------------+----+----------------------------+-------+-----+
|14.23  |1.71      |2.43|15.6             |127      |2.8          |3.06      |0.28                |2.29           |5.64           |1.04|3.92                        |1065   |NULL |
|13.2   |1.78      |2.14|11.2             |100      |2.65         |2.76      |0.26                |1.28           |4.38           |1.05|3.4                         |1050   |NULL |
|13.16  |2.36      |2.67|18.6             |101      |2.8          |3.24      |0.3                 |2

### Assemble the features into a single vector column

In [11]:
from pyspark.ml.feature import VectorAssembler

feature_cols = [col for col in df.columns if col != "class"]

# Create a VectorAssembler with the feature columns
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
assembled_df = assembler.transform(df)
assembled_df.head()

                                                                                

Row(alcohol=14.229999542236328, malic_acid=1.7100000381469727, ash=2.430000066757202, alcalinity_of_ash=15.600000381469727, magnesium=127, total_phenols=2.799999952316284, flavanoids=3.059999942779541, nonflavanoid_phenols=0.2800000011920929, proanthocyanins=2.2899999618530273, color_intensity=5.639999866485596, hue=1.0399999618530273, od280_od315_of_diluted_wines=3.9200000762939453, proline=1065, class=None, features=DenseVector([14.23, 1.71, 2.43, 15.6, 127.0, 2.8, 3.06, 0.28, 2.29, 5.64, 1.04, 3.92, 1065.0]))

# Initialize KMeans

In [13]:
from pyspark.ml.clustering import KMeans

kmeans_2 = KMeans().setK(2).setSeed(19)
kmeans_10 = KMeans().setK(10).setSeed(19)
kmeans_15 = KMeans().setK(15).setSeed(19)
kmeans_20 = KMeans().setK(20).setSeed(19)

### Training

In [14]:
model_2 = kmeans_2.fit(assembled_df)
model_10 = kmeans_10.fit(assembled_df)
model_15 = kmeans_15.fit(assembled_df)
model_20 = kmeans_20.fit(assembled_df)

25/05/12 00:34:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/12 00:34:33 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS


### Predictions

In [15]:
predictions_2 = model_2.transform(assembled_df)
predictions_10 = model_10.transform(assembled_df)
predictions_15 = model_15.transform(assembled_df)
predictions_20 = model_20.transform(assembled_df)

### Silhouettes

In [16]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()

silhouette_2 = evaluator.evaluate(predictions_2)
silhouette_10 = evaluator.evaluate(predictions_10)
silhouette_15 = evaluator.evaluate(predictions_15)
silhouette_20 = evaluator.evaluate(predictions_20)

print("Silhouette with K=2: " + str(silhouette_2))
print("Silhouette with K=10: " + str(silhouette_10))
print("Silhouette with K=15: " + str(silhouette_15))
print("Silhouette with K=20: " + str(silhouette_20))

Silhouette with K=2: 0.821360351333219
Silhouette with K=10: 0.6099911274077665
Silhouette with K=15: 0.6663309347635443
Silhouette with K=20: 0.6341822740902026


### Cluster centers

In [17]:
# Print cluster centers for K=2
print("Cluster Centers for K=2:")
for center in model_2.clusterCenters():
    print(center)
print("\n")

# Print cluster centers for K=10
print("Cluster Centers for K=10:")
for center in model_10.clusterCenters():
    print(center)
print("\n")

# Print cluster centers for K=15
print("Cluster Centers for K=15:")
for center in model_15.clusterCenters():
    print(center)
print("\n")

# Print cluster centers for K=20
print("Cluster Centers for K=20:")
for center in model_20.clusterCenters():
    print(center)

Cluster Centers for K=2:
[1.27028455e+01 2.54455285e+00 2.33910569e+00 2.04081301e+01
 9.68130081e+01 2.06211382e+00 1.64146342e+00 3.92682924e-01
 1.45406503e+00 4.85138211e+00 9.08617886e-01 2.40821138e+00
 5.65869919e+02]
[1.36665455e+01 1.87072727e+00 2.42781818e+00 1.74527272e+01
 1.06290909e+02 2.81618182e+00 2.89654548e+00 2.92909090e-01
 1.89690911e+00 5.52036361e+00 1.06665455e+00 3.06672727e+00
 1.15172727e+03]


Cluster Centers for K=10:
[1.39289474e+01 1.78157895e+00 2.48684212e+00 1.70789474e+01
 1.05315789e+02 2.90578946e+00 3.09947371e+00 2.84736841e-01
 1.93789475e+00 6.37105263e+00 1.11000000e+00 3.00894736e+00
 1.36747368e+03]
[1.26437931e+01 2.80896552e+00 2.26724137e+00 2.07241379e+01
 9.27586207e+01 1.80034481e+00 1.35379312e+00 4.06206897e-01
 1.34310344e+00 4.53482758e+00 8.73793103e-01 2.26344827e+00
 5.29827586e+02]
[1.32000002e+01 2.87375000e+00 2.45999998e+00 1.94875001e+01
 1.13375000e+02 2.14125003e+00 1.78874999e+00 3.67499998e-01
 1.60374998e+00 6.2550000

In [18]:
sc.stop()