# <center> <img src="../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Modelo de machine learning** </center>

---

**Proyecto Final** - **MODELO**

**Fecha**: 13 mayo 2025

**Nombre del Equipo**: Arriba Linux

**Integrantes del Equipo**: Tirzah Peniche Barba / Ana Cristina Luna Arellano / Juan Pedro Bihouet

**Profesor**: Dr. Pablo Camarillo Ramirez

In [51]:
import findspark
# Per the findspark documentation, init() locates the Spark installation and
# adds pyspark to sys.path; it is called here before the pyspark imports below.
findspark.init()

### Creación de la conexión con el clúster de Spark


Se crea una sesión de Spark con el paquete de Kafka incluido.

In [52]:
import os

from pyspark.sql import SparkSession

# Master URL of the Spark cluster. The default is the address this notebook was
# originally written against; its host part looks like a container-generated
# hostname, so it can be overridden through the SPARK_MASTER_URL environment
# variable instead of editing the notebook when the cluster is recreated.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://28d4ad191d34:7077")

# Build (or reuse) the Spark session. spark.jars.packages requests the
# spark-sql-kafka connector so it is available to this session.
spark = (
    SparkSession.builder
    .appName("Arriba-Linux-Proyecto-Final")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext; it is stopped at the end of
# the notebook.
sc = spark.sparkContext

### Leemos los archivos parquet desde el path

In [53]:
from pyspark.sql import SparkSession

# Directory in the datalake that holds the social-log Parquet files.
parquet_data_path = "/home/jovyan/notebooks/datalake/social_logs/"

# Load the Parquet data found under that path into a DataFrame.
df = spark.read.format("parquet").load(parquet_data_path)

### Generamos la columna viral

In [54]:
from pyspark.sql.functions import when

# A post counts as viral when it has strictly more likes than this.
viral_likes_threshold = 1000

# Cast likes to an integer column before comparing it with the threshold.
df = df.withColumn("likes", df["likes"].cast("int"))

# Add the viral flag: 1 when the condition holds, 0 in every other case.
is_viral = df["likes"] > viral_likes_threshold
df = df.withColumn("viral", when(is_viral, 1).otherwise(0))


### Vector Assembler

In [55]:
from pyspark.ml.feature import VectorAssembler

# Pack the likes column into the single vector column "features".
# NOTE(review): likes is the only feature, and the viral label is itself
# computed from likes earlier in this notebook (likes > 1000), so the model is
# asked to recover a rule that is already present in its input. Consider
# building the features from other columns - TODO confirm which are available.
assembler = VectorAssembler(inputCols=["likes"], outputCol="features")

# Keep only the two columns used for training, exposing viral as "label".
data_with_features = (
    assembler.transform(df)
    .select("viral", "features")
    .withColumnRenamed("viral", "label")
)

In [56]:
# 80/20 split with a fixed seed.
splits = data_with_features.randomSplit([0.8, 0.2], seed=42)
train_df, test_df = splits

# Preview the full dataset first, then the training portion.
for title, frame in (
    ("show original", data_with_features),
    ("Trained set", train_df),
):
    print(title)
    frame.show()



show original


                                                                                

+-----+--------+
|label|features|
+-----+--------+
|    0| [247.0]|
|    1|[1671.0]|
|    1|[3485.0]|
|    1|[1907.0]|
|    1|[4692.0]|
|    1|[1849.0]|
|    1|[1707.0]|
|    1|[3438.0]|
|    1|[1120.0]|
|    1|[3673.0]|
|    0| [609.0]|
|    1|[1724.0]|
|    1|[2259.0]|
|    0| [404.0]|
|    1|[2365.0]|
|    1|[1004.0]|
|    1|[3708.0]|
|    1|[3780.0]|
|    1|[2031.0]|
|    1|[2191.0]|
+-----+--------+
only showing top 20 rows

Trained set


[Stage 2:>                                                          (0 + 1) / 1]

+-----+--------+
|label|features|
+-----+--------+
|    0|   [0.0]|
|    0|   [6.0]|
|    0|   [9.0]|
|    0|  [10.0]|
|    0|  [14.0]|
|    0|  [16.0]|
|    0|  [18.0]|
|    0|  [20.0]|
|    0|  [30.0]|
|    0|  [33.0]|
|    0|  [41.0]|
|    0|  [44.0]|
|    0|  [49.0]|
|    0|  [51.0]|
|    0|  [53.0]|
|    0|  [58.0]|
|    0|  [63.0]|
|    0|  [64.0]|
|    0|  [72.0]|
|    0|  [72.0]|
+-----+--------+
only showing top 20 rows



                                                                                

In [57]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Configure the logistic regression estimator. The column names are spelled
# out here; they are the same names the estimator uses by default.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

# Fit the model on the training split.
lr_model = lr.fit(train_df)

                                                                                

### Realizamos Predicciones

In [58]:
from pyspark.sql.functions import col

# Score the held-out split with the fitted model.
predictions = lr_model.transform(test_df)

# Show up to 200 rows, presenting the label column under the name "viral".
preview = predictions.select(
    "features",
    col("label").alias("viral"),
    "prediction",
    "probability",
)
preview.show(200)


[Stage 25:>                                                         (0 + 1) / 1]

+--------+-----+----------+--------------------+
|features|viral|prediction|         probability|
+--------+-----+----------+--------------------+
|   [8.0]|    0|       0.0|[0.89897676510910...|
|  [15.0]|    0|       0.0|[0.89748675856370...|
|  [18.0]|    0|       0.0|[0.89684224573643...|
|  [40.0]|    0|       0.0|[0.89200542140718...|
|  [56.0]|    0|       0.0|[0.88836354962434...|
|  [69.0]|    0|       0.0|[0.88532589053993...|
|  [76.0]|    0|       0.0|[0.88366060111740...|
| [100.0]|    0|       0.0|[0.87779087673536...|
| [143.0]|    0|       0.0|[0.86663689042275...|
| [157.0]|    0|       0.0|[0.86282376891432...|
| [159.0]|    0|       0.0|[0.86227161380289...|
| [163.0]|    0|       0.0|[0.86116170120235...|
| [165.0]|    0|       0.0|[0.86060393685146...|
| [184.0]|    0|       0.0|[0.8552111074523,...|
| [201.0]|    0|       0.0|[0.85024007484665...|
| [224.0]|    0|       0.0|[0.84329175770541...|
| [252.0]|    0|       0.0|[0.83448130783407...|
| [334.0]|    0|    

                                                                                

### Evaluamos

In [59]:
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# One evaluator is reused for every metric; only its metric name changes.
evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")

# NOTE: the values reported as "Precision" and "Recall" below are the
# weightedPrecision / weightedRecall metrics.
metric_names = ("accuracy", "weightedPrecision", "weightedRecall", "f1")
accuracy, precision, recall, f1 = (
    evaluator.setMetricName(metric_name).evaluate(predictions)
    for metric_name in metric_names
)

# Report each score with four decimal places.
for title, score in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{title}: {score:.4f}")




Accuracy: 0.9877
Precision: 0.9879
Recall: 0.9877
F1 Score: 0.9875


                                                                                

### Generamos el CSV para el siguiente paso de Power BI

In [60]:
from pyspark.sql.functions import col

# Columns to export. The features and probability columns are cast to string
# before being written as CSV; label is exported under the name "viral".
export_columns = [
    col("features").cast("string"),
    col("label").alias("viral"),
    col("prediction"),
    col("probability").cast("string"),
]
output_dir = "/home/jovyan/notebooks/output/predictions_single_csv"

# coalesce(1) reduces the result to a single partition before writing;
# any existing output at the destination is overwritten.
(
    predictions.select(*export_columns)
    .coalesce(1)
    .write.mode("overwrite")
    .option("header", "true")
    .csv(output_dir)
)


                                                                                

### Detenemos el contexto de Spark

In [61]:
# Stop the SparkContext (sc) obtained from the Spark session at the start of
# the notebook.
sc.stop()