# <center> <img src="../../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **K-means model** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Integrantes**:
- Lorena Ruelas Gaytán
- Yael Alejandro Rodríguez Barreto
- Ximena Isaac Horta
- Alberto Renteria Camacho

In [21]:
# Bootstrap findspark before any `pyspark` import; the first `from pyspark...` import
# in this notebook comes in a later cell.
# NOTE(review): presumably this is what makes the local Spark install importable
# from the kernel — confirm against the container setup.
import findspark
findspark.init()

In [22]:
import os

# Host part of the standalone master URL (`spark://<SPARK_ID>:7077`) used when the
# session is built. The default looks like a container ID (presumably Docker — confirm),
# which changes whenever the cluster is recreated, so it can be overridden through the
# SPARK_ID environment variable without editing the notebook.
SPARK_ID = os.environ.get("SPARK_ID", "3c4c7def4de3")

#### Spark Connection

In [23]:
from pyspark.sql import SparkSession

# Build (or reuse) the session against the standalone master identified by SPARK_ID.
spark = (
    SparkSession.builder
    .appName("MLSpark-K-means")
    .master(f"spark://{SPARK_ID}:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Set the shuffle partition count to 5 for this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Handle to the underlying SparkContext; used at the end of the notebook to shut down.
sc = spark.sparkContext

#### Extraction of data

In [24]:
# Directory holding the tweets dataset in Parquet format.
data_path = "/home/jovyan/notebooks/data/parquet/"

tweets_df = spark.read.parquet(data_path)

# Quick sanity check: dimensions, a sample of rows, and the schema.
n_rows = tweets_df.count()
n_cols = len(tweets_df.columns)
print("Rows:", n_rows, "Cols:", n_cols)
tweets_df.show(10)
tweets_df.printSchema()

                                                                                

Rows: 3020 Cols: 11
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|            tweet_id|user_id|           timestamp|                text|            hashtags|            mentions|retweet_count|favorite_count|reply_count|quote_count|views|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|e9ff298e-0593-403...|      3|2024-09-18T06:19:...|Id nesciunt elige...|["quod","voluptat...|        ["zhidalgo"]|          246|           892|         23|         17| 9767|
|bd456ef7-2eed-425...|      3|2024-09-20T12:59:...|Praesentium sint ...|                  []|  ["monteroarcelia"]|          246|           610|          4|         17|16917|
|2e9e17f3-1f8b-496...|      3|2025-01-30T01:07:...|Voluptates incidu...|      ["neque","in"]|["barriosgeorgina

### Assemble the features into a single vector column

In [25]:
from pyspark.ml.feature import VectorAssembler

# Engagement counters that make up the clustering feature vector.
numeric_cols = ["retweet_count", "favorite_count", "reply_count", "quote_count", "views"]

# Pack the numeric columns into a single vector column named "features".
# NOTE(review): the columns are assembled without scaling; the cluster centers printed
# further down differ almost only in `views`, so standardising the features may be
# worth evaluating — TODO confirm.
assembler = (
    VectorAssembler()
    .setInputCols(numeric_cols)
    .setOutputCol("features")
)
assembled_df = assembler.transform(tweets_df)

---
# Train several KMeans models (one per value of k)

#### Initialize KMeans

In [26]:
from pyspark.ml.clustering import KMeans

# Candidate numbers of clusters to compare.
k_values = [2, 5, 10, 15, 20]

# One untrained KMeans estimator per candidate k, each configured with seed 19.
kmeans = [KMeans(k=n_clusters, seed=19) for n_clusters in k_values]

#### Training

In [27]:
# Fit every estimator on the assembled feature vectors; `models` keeps the order of `kmeans`.
models = [estimator.fit(assembled_df) for estimator in kmeans]

                                                                                

#### Save models

In [29]:
path_models = "/home/jovyan/notebooks/data/models/kmeans/"
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

# Persist each fitted model under "<path_models>modelk<k>".
# A plain `save()` raises when the target directory already exists, so re-running this
# cell (or Restart & Run All) would fail; `write().overwrite()` replaces the previous
# copy instead.
for m, k in zip(models, k_values):
    m.write().overwrite().save(f"{path_models}modelk{k}")

---
# See results of models

#### Read models

In [30]:
from pyspark.ml.clustering import KMeansModel

# Location of the persisted models (same directory the save cell writes to).
path_models = "/home/jovyan/notebooks/data/models/kmeans/"
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

# Reload one model per candidate k, in the order of `k_values`.
models = []
for n_clusters in k_values:
    models.append(KMeansModel.load(f"{path_models}modelk{n_clusters}"))
    print("Model", n_clusters, "ready")


Model 2 ready
Model 5 ready
Model 10 ready
Model 15 ready
Model 20 ready


#### Predictions

In [31]:
# Run every reloaded model over the assembled data; the list is index-aligned with `models`.
predictions = [fitted.transform(assembled_df) for fitted in models]

#### Evaluate models

In [32]:
from pyspark.ml.evaluation import ClusteringEvaluator

# A single evaluator with default settings is reused for every model.
evaluator = ClusteringEvaluator()

# Report one score per candidate k; `predictions` is index-aligned with `k_values`.
for idx, n_clusters in enumerate(k_values):
    print(f"\n K values = {n_clusters}")
    silhouette = evaluator.evaluate(predictions[idx])
    print(f"Silhouette score: {silhouette}")


 K values = 2


                                                                                

Silhouette score: 0.7919830010309039

 K values = 5


                                                                                

Silhouette score: 0.7152105159465849

 K values = 10


                                                                                

Silhouette score: 0.6551446939244672

 K values = 15


                                                                                

Silhouette score: 0.6043697501219919

 K values = 20




Silhouette score: 0.544809423964091


                                                                                

In [33]:
# Print the centroids of each model, one per line, grouped by k.
for idx, n_clusters in enumerate(k_values):
    print(f"\n K values = {n_clusters}")
    print("Cluster Centers: ")
    for centroid in models[idx].clusterCenters():
        print(centroid)


 K values = 2
Cluster Centers: 
[  251.79237845   497.15703022    51.04007884    25.30354796
 15213.80091984]
[ 249.49732977  499.96528705   49.90921228   24.0493992  5216.24365821]

 K values = 5
Cluster Centers: 
[  245.00497512   507.00497512    48.07960199    25.15091211
 10070.57545605]
[ 257.45719178  502.20376712   51.41267123   23.92294521 2132.06678082]
[  253.3460925    488.98405104    51.70494418    25.33492823
 18054.68740032]
[  256.2917342    499.18314425    51.17017828    24.86871961
 14154.55591572]
[ 240.92020374  495.79117148   49.98132428   24.06112054 6108.6655348 ]

 K values = 10
Cluster Centers: 
[ 251.21269841  497.68253968   47.08253968   24.14285714 9028.37142857]
[  265.01320132   514.7359736     50.98019802    25.11221122
 13345.03630363]
[  252.55709343   498.37716263    50.88235294    25.00692042
 19095.29065744]
[ 252.68070175  499.43859649   51.86315789   24.57894737 1110.9122807 ]
[ 235.66433566  476.03496503   49.81468531   23.50699301 5015.6013986 ]


In [34]:
# Shut down through the session rather than the bare context: `spark.stop()` stops the
# underlying SparkContext (`sc`) and also tears down the session itself.
spark.stop()