# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Machine Learning: K-means** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [None]:
import findspark

# findspark.init() is called before the pyspark import; keep this order.
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session that every later cell relies on.
spark = (
    SparkSession.builder
    .appName("ML: K-means")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization: lower the number of shuffle partitions used by Spark SQL.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Example 1: Clustering with 2D points

In [None]:
from pcamarillor.spark_utils import SparkUtils

# (x, y) coordinates of the sample 2D points; each point's id is its position
# in this list.
coordinates = [
    (1.0, 1.0),
    (2.0, 1.0),
    (4.0, 5.0),
    (5.0, 5.0),
    (10.0, 10.0),
    (12.0, 11.0),
]

# Rows of the form (id, x, y).
data = [(point_id, x, y) for point_id, (x, y) in enumerate(coordinates)]

# Schema matching the (id, x, y) row layout.
# NOTE(review): SparkUtils.generate_schema is a project helper; presumably it
# maps (name, type-name) pairs to a Spark schema — confirm against its source.
schema = SparkUtils.generate_schema([("id", "int"), ("x", "float"), ("y", "float")])

# DataFrame holding the points that K-means will cluster.
random_points_df = spark.createDataFrame(data, schema)

## Assemble the features into a single vector column

In [None]:
from pyspark.ml.feature import VectorAssembler

# Combine the "x" and "y" columns into one "features" vector column.
assembler = VectorAssembler(
    inputCols=["x", "y"],
    outputCol="features",
)
assembled_df = assembler.transform(random_points_df)

## Configure K-means

In [None]:
from pyspark.ml.clustering import KMeans

# Two clusters, with a fixed seed.
kmeans = KMeans(k=2, seed=19)

## Train model

In [None]:
# Location where the fitted 2D model is stored.
kmeans_model_path = "/opt/spark/work-dir/data/mlmodels/kmeans/2D"

# Fit K-means on the assembled feature vectors.
model = kmeans.fit(assembled_df)
print("K-means model trained successfully")

# Save the model, overwriting whatever is already stored at that path.
model.write().overwrite().save(kmeans_model_path)

# Last expression of the cell: displays the class of the fitted model.
type(model)


## Get Predictions

In [None]:
from pyspark.ml.clustering import KMeansModel

# Reload the model saved at kmeans_model_path and apply it to the assembled
# points to obtain the predictions DataFrame.
k_model = KMeansModel.read().load(kmeans_model_path)
predictions = k_model.transform(assembled_df)

## Evaluate model

In [None]:
from pyspark.ml.evaluation import ClusteringEvaluator

# Score the clustering (Silhouette) using the evaluator's default settings.
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(predictions)
print(f"Silhouette score: {silhouette}")

# Print one line per cluster center of the fitted model.
print("Cluster Centers: ")
centers = model.clusterCenters()
for center in centers:
    print(center)

# Example 2: Wine dataset

In [None]:
# Download dataset from https://www.kaggle.com/datasets/harrywang/wine-dataset-for-clustering

# Column names of the wines CSV, in order; every column is declared as "float".
feature_names = [
    "Alcohol",
    "Malic_Acid",
    "Ash",
    "Ash_Alcanity",
    "Magnesium",
    "Total_Phenols",
    "Flavanoids",
    "Nonflavanoid_Phenols",
    "Proanthocyanins",
    "Color_Intensity",
    "Hue",
    "OD280",
    "Proline",
]
columns_types = [(name, "float") for name in feature_names]

# Schema for the wines DataFrame, built from the (name, type) pairs.
wines_schema = SparkUtils.generate_schema(columns_types)

# Read the wines CSV data (first line is a header) with the explicit schema.
wines_df = (
    spark.read
    .option("header", "true")
    .schema(wines_schema)
    .csv("/opt/spark/work-dir/data/ml/kmeans")
)

# Use every column as a feature, packed into a single "features" vector.
assembler = VectorAssembler(inputCols=feature_names, outputCol="features")
assembled_df = assembler.transform(wines_df)

# TODO: Find the optimal K
# TODO: Add the code here to iterate from k = 2, 4, .., 10 and get the silhouette score for each k

# Number of clusters used for this single run.
k = 2
kmeans = KMeans().setK(k).setSeed(13)

# Fit K-means on the assembled wine features.
model = kmeans.fit(assembled_df)
print(f"K-means model trained successfully for {k} clusters")

# Apply the model just fitted to the wine features. The score must be computed
# on these predictions, not on a predictions DataFrame left over from an
# earlier cell, otherwise it describes a different model and dataset.
wine_predictions = model.transform(assembled_df)

# Evaluate clustering by computing Silhouette score
evaluator = ClusteringEvaluator()
silhouette = evaluator.evaluate(wine_predictions)
print(f"Silhouette score: {silhouette}")

# Show the result
print("Cluster Centers: ")
for center in model.clusterCenters():
    print(center)

In [None]:
# Stop the SparkContext once the notebook is done with it.
sc.stop()