# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Aprendizaje Automático (Machine Learning): K-means** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [1]:
import findspark
# Let findspark locate the Spark installation so the `pyspark` imports in the
# following cells resolve; this has to run before any of them.
findspark.init()

#### Creación de la conexión con el clúster de Spark


In [2]:
import os

from pyspark.sql import SparkSession

# Address of the Spark master. The default is the value that used to be
# hard-coded here; its host part looks like a container ID, which changes
# whenever the container is recreated, so it can be overridden through the
# SPARK_MASTER_URL environment variable without editing the notebook.
# An unset or empty variable falls back to the default.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL") or "spark://f5db43ce3d38:7077"

# Create the session for this notebook (or reuse one that already exists)
spark = (
    SparkSession.builder
    .appName("MLSpark-K-means")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)
sc = spark.sparkContext

# Override the number of partitions Spark SQL uses when shuffling data
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/04 04:21:20 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Preparación de Datos

In [7]:
from grandeInformacion.spark_utils import SparkUtils

# Column names of wine-clustering.csv, in file order.
# "Ash_Alcanity" is spelled exactly as in the CSV header (probably meant
# "Alkalinity"), so it must not be corrected here.
column_names = [
    "Alcohol", "Malic_Acid", "Ash", "Ash_Alcanity", "Magnesium",
    "Total_Phenols", "Flavanoids", "Nonflavanoid_Phenols",
    "Proanthocyanins", "Color_Intensity", "Hue", "OD280", "Proline",
]

# Only these two columns are read as integers; all the others are doubles
integer_columns = {"Magnesium", "Proline"}

# (name, type) pairs in the form expected by SparkUtils.generate_schema
columns = [
    (name, "integer" if name in integer_columns else "double")
    for name in column_names
]

# Build the schema with the project helper
# (presumably returns a Spark schema object accepted by the reader - confirm)
schema = SparkUtils.generate_schema(columns)

# Read the CSV with the explicit schema (first line is the header) and
# drop every row that contains at least one null value
data = (
    spark.read
    .schema(schema)
    .option("header", "true")
    .csv("/home/jovyan/notebooks/data/wine-clustering.csv")
    .dropna()
)

# Preview the first five rows
data.show(5)

                                                                                

+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|Alcohol|Malic_Acid| Ash|Ash_Alcanity|Magnesium|Total_Phenols|Flavanoids|Nonflavanoid_Phenols|Proanthocyanins|Color_Intensity| Hue|OD280|Proline|
+-------+----------+----+------------+---------+-------------+----------+--------------------+---------------+---------------+----+-----+-------+
|  14.23|      1.71|2.43|        15.6|      127|          2.8|      3.06|                0.28|           2.29|           5.64|1.04| 3.92|   1065|
|   13.2|      1.78|2.14|        11.2|      100|         2.65|      2.76|                0.26|           1.28|           4.38|1.05|  3.4|   1050|
|  13.16|      2.36|2.67|        18.6|      101|          2.8|      3.24|                 0.3|           2.81|           5.68|1.03| 3.17|   1185|
|  14.37|      1.95| 2.5|        16.8|      113|         3.85|      3.49|                0.24|           2.18|            7.

### Assemble the features into a single vector column

In [None]:
from pyspark.ml.feature import VectorAssembler

# Columns used as clustering features
feature_cols = [
    "Alcohol",
    "Malic_Acid",
    "Ash",
    "Ash_Alcanity",
    "Magnesium",
    "Total_Phenols",
    "Flavanoids",
    "Nonflavanoid_Phenols",
    "Proanthocyanins",
    "Color_Intensity",
    "Hue",
    "OD280",
    "Proline",
]

# Combine the feature columns into a single vector column named "features".
# NOTE(review): no scaling step is applied in this notebook, so columns with
# large magnitudes (e.g. Proline) will weigh more in distance computations -
# confirm whether this is intended for the example.
assembler = VectorAssembler(
    inputCols=feature_cols,
    outputCol="features",
)
assembled_data = assembler.transform(data)

# KMeans for k = 2

In [14]:
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Configure K-means with 2 clusters and a fixed seed, then fit it
kmeans = KMeans(k=2, seed=15)
model = kmeans.fit(assembled_data)

# Apply the fitted model to the same data it was trained on
predictions = model.transform(assembled_data)

# Score the clustering with a default-configured evaluator
# (reported below as the Silhouette score)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print one cluster center per line
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette score: 0.8213603513331723
Cluster Centers: 
[1.27028455e+01 2.54455285e+00 2.33910569e+00 2.04081301e+01
 9.68130081e+01 2.06211382e+00 1.64146341e+00 3.92682927e-01
 1.45406504e+00 4.85138211e+00 9.08617886e-01 2.40821138e+00
 5.65869919e+02]
[1.36665455e+01 1.87072727e+00 2.42781818e+00 1.74527273e+01
 1.06290909e+02 2.81618182e+00 2.89654545e+00 2.92909091e-01
 1.89690909e+00 5.52036364e+00 1.06665455e+00 3.06672727e+00
 1.15172727e+03]


# KMeans for k = 10

In [15]:
# Configure K-means with 10 clusters and a fixed seed, then fit it
kmeans = KMeans(k=10, seed=15)
model = kmeans.fit(assembled_data)

# Apply the fitted model to the same data it was trained on
predictions = model.transform(assembled_data)

# Score the clustering with a default-configured evaluator
# (reported below as the Silhouette score)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print one cluster center per line
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette score: 0.7046694551736816
Cluster Centers: 
[1.26437931e+01 2.80896552e+00 2.26724138e+00 2.07241379e+01
 9.27586207e+01 1.80034483e+00 1.35379310e+00 4.06206897e-01
 1.34310345e+00 4.53482759e+00 8.73793103e-01 2.26344828e+00
 5.29827586e+02]
[1.38278571e+01 1.74214286e+00 2.53428571e+00 1.76000000e+01
 1.06214286e+02 2.75928571e+00 2.90500000e+00 3.05714286e-01
 1.77642857e+00 5.93214286e+00 1.11857143e+00 3.00000000e+00
 1.28821429e+03]
[1.313500e+01 2.833750e+00 2.338125e+00 1.880625e+01 1.090000e+02
 2.163750e+00 1.720625e+00 3.643750e-01 1.635625e+00 5.208125e+00
 9.325000e-01 2.550000e+00 7.466875e+02]
[1.37784211e+01 2.04473684e+00 2.33210526e+00 1.71105263e+01
 1.05736842e+02 2.78368421e+00 2.87105263e+00 2.65789474e-01
 1.79789474e+00 4.96526316e+00 1.04105263e+00 3.30473684e+00
 1.03842105e+03]
[1.24722727e+01 2.13772727e+00 2.35227273e+00 2.10363636e+01
 9.30909091e+01 2.20909091e+00 1.95818182e+00 3.90454545e-01
 1.51409091e+00 4.45727268e+00 1.01045455e+00 2.44

# KMeans for k = 15

In [16]:
# Configure K-means with 15 clusters and a fixed seed, then fit it
kmeans = KMeans(k=15, seed=15)
model = kmeans.fit(assembled_data)

# Apply the fitted model to the same data it was trained on
predictions = model.transform(assembled_data)

# Score the clustering with a default-configured evaluator
# (reported below as the Silhouette score)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print one cluster center per line
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette score: 0.6673675629387169
Cluster Centers: 
[ 12.30416667   2.17416667   2.24166667  20.58333333  92.5
   2.26333333   2.145        0.3725       1.52833333   2.65416667
   1.00166667   2.8775     333.        ]
[1.4126e+01 1.8800e+00 2.3980e+00 1.6220e+01 1.0760e+02 3.2460e+00
 3.4060e+00 2.6200e-01 2.2880e+00 6.9400e+00 1.0900e+00 3.0700e+00
 1.5004e+03]
[1.29926667e+01 2.37666667e+00 2.45800000e+00 1.99666667e+01
 1.11800000e+02 2.27400000e+00 1.91066667e+00 3.53333333e-01
 1.66866667e+00 5.30066667e+00 9.09066667e-01 2.63666667e+00
 8.70200000e+02]
[1.37166667e+01 1.81333333e+00 2.59666667e+00 1.67666667e+01
 1.05333333e+02 2.81666667e+00 2.82000000e+00 3.26666667e-01
 1.65000000e+00 6.40000000e+00 1.18666667e+00 2.90333333e+00
 1.33500000e+03]
[1.26138462e+01 2.52230769e+00 2.32769231e+00 2.10769231e+01
 9.28461538e+01 1.84500000e+00 1.58115385e+00 4.11923077e-01
 1.34576923e+00 4.88923073e+00 8.77307692e-01 2.30807692e+00
 4.97730769e+02]
[1.3761875e+01 1.9450000e+00 2.2

# KMeans for k = 20

In [17]:
# Configure K-means with 20 clusters and a fixed seed, then fit it
kmeans = KMeans(k=20, seed=15)
model = kmeans.fit(assembled_data)

# Apply the fitted model to the same data it was trained on
predictions = model.transform(assembled_data)

# Score the clustering with a default-configured evaluator
# (reported below as the Silhouette score)
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print one cluster center per line
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

Silhouette score: 0.631815967191204
Cluster Centers: 
[1.26305263e+01 2.51684211e+00 2.24789474e+00 2.01315789e+01
 9.01578947e+01 1.77789474e+00 1.38473684e+00 4.21052632e-01
 1.35000000e+00 4.51105263e+00 8.92105263e-01 2.28684211e+00
 5.05000000e+02]
[1.4126e+01 1.8800e+00 2.3980e+00 1.6220e+01 1.0760e+02 3.2460e+00
 3.4060e+00 2.6200e-01 2.2880e+00 6.9400e+00 1.0900e+00 3.0700e+00
 1.5004e+03]
[1.38581818e+01 1.72272727e+00 2.51727273e+00 1.78272727e+01
 1.06454545e+02 2.74363636e+00 2.92818182e+00 3.00000000e-01
 1.81090909e+00 5.80454545e+00 1.10000000e+00 3.02636364e+00
 1.27545455e+03]
[1.3452e+01 1.6800e+00 2.3160e+00 1.9700e+01 1.2480e+02 2.9040e+00
 2.8480e+00 2.8400e-01 2.2760e+00 4.4540e+00 1.1100e+00 3.1280e+00
 9.7340e+02]
[ 12.26181818   2.06         2.26545455  21.          93.63636364
   2.32090909   2.22636364   0.36727273   1.59181818   2.58636364
   1.02909091   2.94636364 329.45454545]
[1.34566667e+01 1.96000000e+00 2.57000000e+00 1.62333333e+01
 1.01666667e+02 3.

In [18]:
# Shut down through the session rather than the bare context: stopping the
# SparkSession stops the underlying SparkContext and also lets the session do
# its own cleanup, so a later getOrCreate() starts from a clean state.
spark.stop()