# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **K-means model** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

**Integrantes**:
- Lorena Ruelas Gaytán
- Yael Alejandro Rodríguez Barreto
- Ximena Isaac Horta
- Alberto Renteria Camacho

In [None]:
# Bootstrap step that runs before the `pyspark` imports in the cells below.
# NOTE(review): findspark is expected to locate the local Spark installation
# and make `pyspark` importable — confirm it is still needed in this image.
import findspark
findspark.init()

In [3]:
import os

# Hostname of the Spark master used to build the `spark://<host>:7077` URL.
# The default is a Docker container ID, which changes whenever the container
# is recreated; set the SPARK_ID environment variable to override it without
# editing the notebook.
SPARK_ID = os.environ.get("SPARK_ID", "b37d873f80db")

#### Spark Connection

In [4]:
from pyspark.sql import SparkSession

# Build (or reuse) the session against the standalone master named by SPARK_ID.
spark = (
    SparkSession.builder
    .appName("MLSpark-K-means")
    .master(f"spark://{SPARK_ID}:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Handle to the underlying context; used at the end of the notebook to shut down.
sc = spark.sparkContext

# Use 5 shuffle partitions for SQL/DataFrame shuffles in this session.
spark.conf.set("spark.sql.shuffle.partitions", "5")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/10 00:25:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


#### Extraction of data

In [5]:
data_path = "/home/jovyan/notebooks/data/parquet/"

# Read the parquet data stored under `data_path` into a single DataFrame.
raw_df = spark.read.parquet(data_path)

# Quick sanity check: dataset size plus a preview of the first rows.
row_count = raw_df.count()
col_count = len(raw_df.columns)
print("Rows:", row_count, "Cols:", col_count)
raw_df.show(10)

                                                                                

Rows: 36 Cols: 8


                                                                                

+----+--------------------+-------+---------+------+--------------------+-------------+--------------------+
| key|               value|  topic|partition|offset|           timestamp|timestampType|           value_str|
+----+--------------------+-------+---------+------+--------------------+-------------+--------------------+
|NULL|[7B 22 74 77 65 6...|tweet-1|        0|    17|2025-05-10 00:22:...|            0|{"tweet_id": "6ff...|
|NULL|[7B 22 74 77 65 6...|tweet-2|        0|    17|2025-05-10 00:22:...|            0|{"tweet_id": "dc4...|
|NULL|[7B 22 74 77 65 6...|tweet-3|        0|    17|2025-05-10 00:22:...|            0|{"tweet_id": "209...|
|NULL|[7B 22 74 77 65 6...|tweet-4|        0|    17|2025-05-10 00:22:...|            0|{"tweet_id": "f54...|
|NULL|[7B 22 74 77 65 6...|tweet-4|        0|    11|2025-05-10 00:21:...|            0|{"tweet_id": "d55...|
|NULL|[7B 22 74 77 65 6...|tweet-4|        0|    12|2025-05-10 00:21:...|            0|{"tweet_id": "6de...|
|NULL|[7B 22 74 77 

#### Data preparation

In [6]:
from equipo.spark_utils import SparkUtils
from pyspark.sql.functions import from_json

# (field name, Spark type name) pairs describing the JSON payload in `value_str`.
headers = [
    ("tweet_id", "string"),
    ("user_id", "integer"),
    ("timestamp", "string"),
    ("text", "string"),
    ("hashtags", "string"),
    ("mentions", "string"),
    ("retweet_count", "integer"),
    ("favorite_count", "integer"),
    ("reply_count", "integer"),
    ("quote_count", "integer"),
    ("views", "integer"),
]

# NOTE(review): SparkUtils.generate_schema is a project helper; presumably it
# turns the (name, type) pairs into a Spark schema — confirm.
schema = SparkUtils.generate_schema([(name, dtype) for name, dtype in headers])

# Parse the JSON string into a struct column, then flatten the struct so each
# tweet field becomes its own top-level column.
parsed_col = from_json(raw_df.value_str, schema).alias("data")
tweets_df = raw_df.select(parsed_col).select("data.*")

row_count = tweets_df.count()
col_count = len(tweets_df.columns)
print("Rows:", row_count, "Cols:", col_count)
tweets_df.printSchema()
tweets_df.show(10)

                                                                                

Rows: 36 Cols: 11
root
 |-- tweet_id: string (nullable = true)
 |-- user_id: integer (nullable = true)
 |-- timestamp: string (nullable = true)
 |-- text: string (nullable = true)
 |-- hashtags: string (nullable = true)
 |-- mentions: string (nullable = true)
 |-- retweet_count: integer (nullable = true)
 |-- favorite_count: integer (nullable = true)
 |-- reply_count: integer (nullable = true)
 |-- quote_count: integer (nullable = true)
 |-- views: integer (nullable = true)



[Stage 8:>                                                          (0 + 1) / 1]

+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|            tweet_id|user_id|           timestamp|                text|            hashtags|            mentions|retweet_count|favorite_count|reply_count|quote_count|views|
+--------------------+-------+--------------------+--------------------+--------------------+--------------------+-------------+--------------+-----------+-----------+-----+
|6ff004dd-e045-4bd...|      1|2025-03-03T20:50:...|Dolor non magni d...|["pariatur","repe...|    ["fidelsantana"]|          235|            71|          1|         24|11548|
|dc41957c-3fb8-4df...|      2|2024-10-25T13:39:...|Quia necessitatib...|["facere","delect...|                  []|          410|           312|         13|         41|14705|
|2095e1ec-7fcf-4e9...|      3|2025-04-19T02:59:...|Fuga praesentium ...|["iure","perspici...|       ["mariano09"]|          251|  

                                                                                

### Assemble the features into a single vector column

In [7]:
from pyspark.ml.feature import VectorAssembler

# Engagement counters used as clustering features.
numeric_cols = [
    "retweet_count",
    "favorite_count",
    "reply_count",
    "quote_count",
    "views",
]

# Pack the numeric columns into a single vector column named "features".
# NOTE(review): the columns are combined unscaled; in the preview above `views`
# is orders of magnitude larger than the other counters, so it may dominate
# distance-based clustering — consider scaling the features.
assembler = (
    VectorAssembler()
    .setInputCols(numeric_cols)
    .setOutputCol("features")
)
assembled_df = assembler.transform(tweets_df)

---
# KMeans: training several models (one per value of k)

#### Initialize KMeans

In [11]:
from pyspark.ml.clustering import KMeans

# Candidate numbers of clusters to compare.
k_values = [2, 5, 10, 15, 20]

# One untrained estimator per k, all sharing the same seed so runs are comparable.
kmeans = [KMeans(k=n_clusters, seed=19) for n_clusters in k_values]

#### Training

In [22]:
# Fit every estimator on the same assembled DataFrame; the resulting list is
# in the same order as `k_values`.
models = [estimator.fit(assembled_df) for estimator in kmeans]

                                                                                

#### Save models

In [None]:
# Directory where each fitted model is persisted as `modelk<k>`.
path_models = "/home/jovyan/notebooks/data/models/kmeans/"
# Alternative location inside the project tree:
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

for m, k in zip(models, k_values):
    # A plain `m.save(path)` raises if the path already exists, which breaks
    # re-running this cell; overwrite the previous model instead.
    m.write().overwrite().save(f"{path_models}modelk{k}")

                                                                                

---
# See results of models

#### Read models

In [None]:
from pyspark.ml.clustering import KMeansModel

# Must be the same directory the "Save models" cell writes to
# (".../data/models/kmeans/"); pointing at ".../data/models/" would look for
# `modelk<k>` one level too high and fail or pick up unrelated models.
path_models = "/home/jovyan/notebooks/data/models/kmeans/"
# Alternative location inside the project tree:
#path_models = "/home/jovyan/notebooks/final_project/equipo/models/kmeans/"

# Reload one persisted model per k, keeping the same order as `k_values`.
models = []
for i in k_values:
    m = KMeansModel.load(f"{path_models}modelk{i}")
    models.append(m)
    print("Model", i, "ready")


Model 2 ready
Model 5 ready
Model 10 ready
Model 15 ready
Model 20 ready


#### Predictions

In [29]:
# Run each loaded model over the assembled DataFrame; the list is aligned with
# `models` (and therefore with `k_values`).
predictions = [fitted.transform(assembled_df) for fitted in models]

#### Evaluate model

In [30]:
from pyspark.ml.evaluation import ClusteringEvaluator

# A single default-configured evaluator is reused for every model.
evaluator = ClusteringEvaluator()

# `predictions` is aligned with `k_values`, so walk both together.
for k, clustered_df in zip(k_values, predictions):
    print(f"\n K values = {k}")
    silhouette = evaluator.evaluate(clustered_df)
    print(f"Silhouette score: {silhouette}")


 K values = 2


                                                                                

Silhouette score: 0.8489901992457758

 K values = 5


                                                                                

Silhouette score: 0.6607459863802264

 K values = 10


                                                                                

Silhouette score: 0.6619176817316299

 K values = 15


                                                                                

Silhouette score: 0.5558191669077037

 K values = 20




Silhouette score: 0.45331720143081367


                                                                                

In [31]:
# Print the centers of every model; `models` is aligned with `k_values`.
for k, fitted in zip(k_values, models):
    print(f"\n K values = {k}")
    print("Cluster Centers: ")
    for center in fitted.clusterCenters():
        print(center)


 K values = 2
Cluster Centers: 
[ 272.76923077  528.30769231   47.30769231   22.15384615 4379.76923077]
[  266.7826087    518.30434783    43.30434783    28.56521739
 15274.17391304]

 K values = 5
Cluster Centers: 
[  231.71428571   390.57142857    61.42857143    28.42857143
 10326.57142857]
[  190.75   747.75    31.25    44.25 18890.75]
[ 286.4  649.2   44.2   21.2 1458.6]
[ 291.4  507.8   33.2   24.  4767.2]
[  293.86666667   485.26666667    44.6           22.86666667
 15284.33333333]

 K values = 10
Cluster Centers: 
[ 251.    341.5    59.     21.   8044.75]
[  265.5     472.       54.625    26.875 16337.5  ]
[318.5 833.   44.   25.  411.5]
[  263.5   431.     37.5    33.  11094. ]
[ 267.5   435.5    51.75   26.25 2431.25]
[  190.75   747.75    31.25    44.25 18890.75]
[4.08000000e+02 7.43333333e+02 1.63333333e+01 7.00000000e+00
 1.36586667e+04]
[  265.     318.25    45.75    26.75 14397.25]
[  219.    394.5    63.5    35.5 12145. ]
[ 278.33333333  698.           28.           16.3

In [33]:
# Stop through the session rather than the bare context: this shuts down the
# underlying SparkContext and also clears the cached session, so a later
# `getOrCreate()` starts from a clean state.
spark.stop()