# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **K means model** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Integrantes**:
- Lorena Ruelas Gaytán
- Yael Alejandro Rodríguez Barreto
- Ximena Isaac Horta
- Alberto Renteria Camacho

In [15]:
import findspark
findspark.init()

In [None]:
SPARK_ID = "5af50e5e22eb"

#### Spark Conexion

In [17]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("MLSpark-K-means") \
    .master(f"spark://{SPARK_ID}:7077") \
    .config("spark.ui.port","4040") \
    .getOrCreate()
sc = spark.sparkContext
spark.conf.set("spark.sql.shuffle.partitions", "5")

#### Extraction of data

In [18]:
data_path = "/home/jovyan/notebooks/data/parquet/"

tweets_df = spark.read \
    .parquet(data_path)

print("Rows:", tweets_df.count(), "Cols:", len(tweets_df.columns))
tweets_df.show(10)
tweets_df.printSchema()

Rows: 47 Cols: 11
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|            tweet_id|user_id|           timestamp|                text|            hashtags|            mentions|retweet_count|favorite_count|reply_count|quote_count|views|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|53a34b7a-a612-442...|      1|2025-01-12T21:39:...|Voluptatum veniam...|["veritatis","nes...|        ["rafael92"]|          190|           370|         94|          5| 5056|
|74528c8a-2958-439...|      1|2024-02-06T17:35:...|Inventore non quo...|["amet","at","eli...|                  []|          107|           640|         31|         13| 4343|
|5ac61920-5040-458...|      1|2024-12-02T04:29:...|Id magnam impedit...|         ["impedit"]|["humberto33","fe..

#### Data preparation

In [5]:
from equipo.spark_utils import SparkUtils
from pyspark.sql.functions import from_json

headers = [
        ("tweet_id", "string"),
        ("user_id", "integer"),
        ("timestamp", "string"),
        ("text", "string"),
        ("hashtags", "string"),
        ("mentions", "string"),
        ("retweet_count", "integer"),
        ("favorite_count", "integer"),
        ("reply_count", "integer"),
        ("quote_count", "integer"),
        ("views", "integer")
]

schema = SparkUtils.generate_schema([(head[0], head[1]) for head in headers])

tweets_df = raw_df.select(from_json(raw_df.value_str, schema).alias("data")).select("data.*")

print("Rows:", tweets_df.count(), "Cols:", len(tweets_df.columns))
tweets_df.printSchema()
tweets_df.show(10)

                                                                                

Rows: 392 Cols: 11
root
 |-- tweet_id: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- views: integer (nullable = true)

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|            tweet_id|user_id|           timestamp|                text|            hashtags|            mentions|retweet_count|favorite_count|reply_count|quote_count|views|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+----

### Assemble the features into a single vector column

In [19]:
from pyspark.ml.feature import VectorAssembler

numeric_cols = [
    "retweet_count", "favorite_count", "reply_count", "quote_count", "views"
]

assembler = VectorAssembler(inputCols=numeric_cols, outputCol="features")
assembled_df = assembler.transform(tweets_df)

---
# KMeans at the same time

#### Initialize KMeans

In [20]:
from pyspark.ml.clustering import KMeans

k_values = [2, 5, 10, 15, 20]
kmeans = [KMeans().setK(k).setSeed(19) for k in k_values]

#### Training

In [21]:
models = [k.fit(assembled_df) for k in kmeans]

                                                                                

#### Save models

In [22]:
path_models = "/home/jovyan/notebooks/data/models/kmeans/"
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

for m, k in zip(models, k_values):
    m.save(f"{path_models}modelk{k}")

---
# See results of models

#### Read models

In [25]:
from pyspark.ml.clustering import KMeansModel

path_models = "/home/jovyan/notebooks/data/models/kmeans/"
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

models = []
for i in k_values:
    m = KMeansModel.load(f"{path_models}modelk{i}")
    models.append(m)
    print("Model", i, "ready")


Model 2 ready
Model 5 ready
Model 10 ready
Model 15 ready
Model 20 ready


#### Predictions

In [26]:
predictions = [model.transform(assembled_df) for model in models]

#### Evaluate model

In [27]:
from pyspark.ml.evaluation import ClusteringEvaluator

for i in range(len(k_values)):
    print(f"\n K values = {k_values[i]}")
    evaluator = ClusteringEvaluator()
    silhouette = evaluator.evaluate(predictions[i])
    print(f"Silhouette score: {silhouette}")


 K values = 2
Silhouette score: 0.8348885785955833

 K values = 5
Silhouette score: 0.6588631545348812

 K values = 10
Silhouette score: 0.6720180173656797

 K values = 15
Silhouette score: 0.6437146596205457

 K values = 20
Silhouette score: 0.5677783945643664


In [28]:
for i in range(len(k_values)):
    print(f"\n K values = {k_values[i]}")
    print("Cluster Centers: ")
    for center in models[i].clusterCenters():
        print(center)


 K values = 2
Cluster Centers: 
[ 235.2962963   536.59259259   46.85185185   22.88888889 4919.55555556]
[  229.2    556.35    49.4     21.1  14830.8 ]

 K values = 5
Cluster Centers: 
[ 284.85714286  517.14285714   44.42857143   23.35714286 2841.71428571]
[  248.625   589.5      63.875    22.25  17436.25 ]
[ 228.125  694.75    44.      30.25  9512.125]
[  247.           449.88888889    36.44444444    16.88888889
 13786.33333333]
[ 114.     506.5     55.      17.625 6249.5  ]

 K values = 10
Cluster Centers: 
[ 338.8  429.2   50.4   21.4 2650.6]
[  247.           449.88888889    36.44444444    16.88888889
 13786.33333333]
[ 253.5  574.5   40.5   36.  8192.5]
[  227.           221.            67.33333333    27.33333333
 18637.66666667]
[ 103.14285714  526.           49.42857143   19.42857143 6420.        ]
[  261.6   810.6    61.8    19.2 16715.4]
[  124.           787.33333333    49.66666667    30.66666667
 11016.33333333]
[262.         724.33333333  48.          22.66666667 632.666666

In [29]:
sc.stop()