# <center> <img src="img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Carrera: Ingeniería en Sistemas Computacionales** </center>
---
### <center> **Primavera 2025** </center>
---

**Lab 13**: Clustering with k-means

**Fecha**: 02/05/25

**Nombre del Estudiante**: Angel Ramirez, Roberto Osorno, Yochabel Cazares, Samuel Romero

**Profesor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-K-means") \
    .master("spark://f04d2745dc57:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/02 16:14:03 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


### Preparación de datos

In [4]:
from team_name.spark_utils import SparkUtils

wine_schema = SparkUtils.generate_schema([
    ("Alcohol", "double"),
    ("Malic_Acid", "double"),
    ("Ash", "double"),
    ("Ash_Alcanity", "double"),
    ("Magnesium", "integer"),
    ("Total_Phenols", "double"),
    ("Flavanoids", "double"),
    ("Nonflavanoid_Phenols", "double"),
    ("Proanthocyanins", "double"),
    ("Color_Intensity", "double"),
    ("Hue", "double"),
    ("OD280", "double"),
    ("Proline", "integer")
])

wine_df = spark.read \
               .schema(wine_schema) \
               .option("header", "true") \
               .csv("/home/jovyan/notebooks/data/wine-clustering.csv")

wine_df.printSchema()
wine_df.show(5, truncate=False)

root
 |-- Alcohol: double (nullable = true)
 |-- Malic_Acid: double (nullable = true)
 |-- Ash: double (nullable = true)
 |-- Ash_Alcanity: double (nullable = true)
 |-- Magnesium: integer (nullable = true)
 |-- Total_Phenols: double (nullable = true)
 |-- Flavanoids: double (nullable = true)
 |-- Nonflavanoid_Phenols: double (nullable = true)
 |-- Proanthocyanins: double (nullable = true)
 |-- Color_Intensity: double (nullable = true)
 |-- Hue: double (nullable = true)
 |-- OD280: double (nullable = true)
 |-- Proline: integer (nullable = true)



                                                                                

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|Alcohol|Malic_Acid|Ash |Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity|Hue |OD280|Proline|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|14.23  |1.71      |2.43|15.6        |127      |2.8          |3.06      |0.28                |2.29           |5.64           |1.04|3.92 |1065   |
|13.2   |1.78      |2.14|11.2        |100      |2.65         |2.76      |0.26                |1.28           |4.38           |1.05|3.4  |1050   |
|13.16  |2.36      |2.67|18.6        |101      |2.8          |3.24      |0.3                 |2.81           |5.68           |1.03|3.17 |1185   |
|14.37  |1.95      |2.5 |16.8        |113      |3.85         |3.49      |0.24                |2.18           |7.8           

### Ensamblar las características en una sola columna vectorial

In [5]:
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=
                            ["Alcohol", "Malic_Acid", "Ash", "Ash_Alcanity", "Magnesium", 
                             "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols", 
                             "Proanthocyanins", "Color_Intensity", "Hue", "OD280", "Proline"],outputCol="features")
assembled_df = assembler.transform(wine_df)
assembled_df.show(5, truncate=False)

25/05/02 16:20:35 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.
[Stage 1:>                                                          (0 + 1) / 1]

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+---------------------------------------------------------------------+
|Alcohol|Malic_Acid|Ash |Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity|Hue |OD280|Proline|features                                                             |
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+---------------------------------------------------------------------+
|14.23  |1.71      |2.43|15.6        |127      |2.8          |3.06      |0.28                |2.29           |5.64           |1.04|3.92 |1065   |[14.23,1.71,2.43,15.6,127.0,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065.0]|
|13.2   |1.78      |2.14|11.2        |100      |2.65         |2.76      |0.26                |1.28           |4.38           |1.05|3.4  

                                                                                

### Inicializar KMeans

In [16]:
from pyspark.ml.clustering import KMeans

k_group = [2, 10, 15, 20]
for k in k_group:
    kmeans = KMeans().setK(k).setSeed(19)

### Training

In [17]:
model = kmeans.fit(assembled_df)

### Predicciones

In [18]:
predictions = model.transform(assembled_df)

### Evaluar el modelo

In [19]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)

# Show the result
for k in k_group:
    print(f"Results for k = {k}:")
    print(f"Silhouette score: {silhouette}")
    print("Cluster Centers: ")
    for center in model.clusterCenters():
        print(center)

Results for k = 2:
Silhouette score: 0.6341822740457447
Cluster Centers: 
[1.38278571e+01 1.74214286e+00 2.53428571e+00 1.76000000e+01
 1.06214286e+02 2.75928571e+00 2.90500000e+00 3.05714286e-01
 1.77642857e+00 5.93214286e+00 1.11857143e+00 3.00000000e+00
 1.28821429e+03]
[1.26077778e+01 2.49222222e+00 2.24500000e+00 2.02500000e+01
 9.15000000e+01 1.76000000e+00 1.40000000e+00 4.02222222e-01
 1.29333333e+00 4.42777778e+00 8.91111111e-01 2.27555556e+00
 5.09166667e+02]
[1.40200000e+01 1.89666667e+00 2.34666667e+00 1.63666667e+01
 1.04333333e+02 3.03666667e+00 3.38333333e+00 2.43333333e-01
 2.43000000e+00 6.71666667e+00 1.18000000e+00 3.01666667e+00
 1.52400000e+03]
[1.32153846e+01 3.05692308e+00 2.40000000e+00 1.91076923e+01
 1.11076923e+02 2.08000000e+00 1.50307692e+00 3.82307692e-01
 1.58923077e+00 5.38461538e+00 8.87692308e-01 2.40923077e+00
 7.48307692e+02]
[1.27000e+01 2.18625e+00 2.37375e+00 2.01875e+01 1.04750e+02 1.93750e+00
 1.21250e+00 3.55000e-01 1.25875e+00 5.45250e+00 7.86

In [20]:
sc.stop()