In [1]:
# Bootstrap step: this runs before the first `pyspark` import in the notebook.
# NOTE(review): findspark.init() is expected to locate the local Spark
# installation and make `pyspark` importable — confirm against the image used.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession

# Address of the standalone Spark master. The default is the value this
# notebook was originally run against; it looks like a Docker container
# hostname, which changes whenever the container is recreated, so it can be
# overridden through the SPARK_MASTER_URL environment variable without
# editing the notebook.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://f04d2745dc57:7077")

# Build (or reuse) the session for the "Proyecto" application.
# The Kafka SQL connector is requested as an extra package and resolved via
# Ivy at start-up (see the resolution log printed below this cell).
spark = (
    SparkSession.builder
    .appName("Proyecto")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)

# Handle to the underlying SparkContext.
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-b064e8c8-9536-49c9-a832-718fe7ff253d;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2;2.11.1 in centr

### Preparación de datos

In [3]:
import os
from functools import reduce

from pyspark.sql import DataFrame

# Directory that holds the CSV files used as input for the model.
csvs = "/home/jovyan/notebooks/final_project/whatsapp2/data/ml/ml_input"

# DataFrames of every CSV that could be read without errors.
csvs_clean = []

# os.listdir returns entries in arbitrary order; sorting keeps the row order
# of the combined DataFrame independent of the filesystem.
for f in sorted(os.listdir(csvs)):
    if not f.endswith(".csv"):
        continue

    path = os.path.join(csvs, f)
    try:
        df_temp = spark.read.csv(path, header=True, inferSchema=True)
        # Reads are lazy: run an action so that read errors surface here,
        # while the offending file can still be skipped.
        df_temp.count()
        csvs_clean.append(df_temp)
    except Exception as exc:
        # Best effort: skip the unreadable file, but report what failed.
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # propagate so the cell can still be interrupted.
        print(f"Archivo con errores: {f} ({type(exc).__name__})")

if csvs_clean:
    # Each file has its schema inferred separately, so columns are matched by
    # name rather than by position (unionAll is a deprecated positional
    # union). A file with different column names now fails loudly instead of
    # being merged into the wrong columns.
    df = reduce(DataFrame.unionByName, csvs_clean)
    df.show(5)
else:
    # NOTE: `df` is left undefined in this case, so later cells cannot run.
    print("Archivos inválidos")


25/05/11 06:28:58 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB
25/05/11 06:28:59 WARN DAGScheduler: Broadcasting large task binary with size 3.6 MiB


+--------+-----------+---------+-------------------+-----+--------+------+
|platform|    user_id|  post_id|         event_time|likes|comments|shares|
+--------+-----------+---------+-------------------+-----+--------+------+
|Facebook|user_fb_172|post_4198|2025-05-11 00:06:59| 1748|     695|    22|
|Facebook|user_fb_438|post_3271|2025-05-10 23:51:30|  517|     631|    20|
|Facebook|user_fb_371|post_6361|2025-05-10 23:51:32| 1084|     696|    39|
|Facebook| user_fb_66|post_8041|2025-05-10 23:51:35| 2592|     191|    40|
|Facebook|user_fb_302|post_2099|2025-05-10 23:43:21|  124|     498|   197|
+--------+-----------+---------+-------------------+-----+--------+------+
only showing top 5 rows



                                                                                

In [4]:
from pyspark.sql.functions import col, when

# `df` comes from the per-file loader in the previous cell; reading the whole
# directory with a single spark.read.csv(...) call was tried and did not work.

# Force the three engagement counters to integers.
for counter in ("likes", "comments", "shares"):
    df = df.withColumn(counter, col(counter).cast("int"))

# Binary target: 1 when the post has more than 2000 likes, otherwise 0.
# NOTE(review): `likes` is also used as a model feature further down, so the
# label is a deterministic function of one of the inputs.
is_viral = when(col("likes") > 2000, 1).otherwise(0)
df = df.withColumn("viral", is_viral)

# Discard rows with a null in any counter or in the label.
df = df.dropna(subset=["likes", "comments", "shares", "viral"])

# Untransformed columns kept aside for the CSV export at the end.
original_df = df.select("likes", "comments", "shares", "platform")

### Ensamblar las características en una sola columna vectorial

In [5]:
from pyspark.ml.feature import VectorAssembler

# Pack the three engagement counters into a single vector column "features".
assembler = VectorAssembler(
    inputCols=["likes", "comments", "shares"],
    outputCol="features",
)

# Keep only what the classifier needs: the target (renamed from "viral" to
# "label") and the assembled feature vector.
assembled_df = assembler.transform(df)
labelled_df = assembled_df.withColumnRenamed("viral", "label")
data_with_features = labelled_df.select("label", "features")

### Dividir los datos en conjuntos de entrenamiento y prueba: 80 % de datos de entrenamiento y 20 % de datos de prueba.

In [6]:
# 80 % of the rows go to training and 20 % to testing; the seed is fixed at 57.
train_df, test_df = data_with_features.randomSplit(weights=[0.8, 0.2], seed=57)

### Mostrar el dataset

In [7]:
# Print a title followed by the first rows of the full dataset and then of
# the training split.
for title, frame in (
    ("Original Dataset", data_with_features),
    ("train set", train_df),
):
    print(title)
    frame.show()

Original Dataset


25/05/11 06:29:16 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/05/11 06:29:16 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/05/11 06:29:17 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB
25/05/11 06:29:18 WARN DAGScheduler: Broadcasting large task binary with size 2.8 MiB


+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0| [1748.0,695.0,22.0]|
|    0|  [517.0,631.0,20.0]|
|    0| [1084.0,696.0,39.0]|
|    1| [2592.0,191.0,40.0]|
|    0| [124.0,498.0,197.0]|
|    0|[1268.0,201.0,200.0]|
|    1| [2124.0,636.0,42.0]|
|    1|[3241.0,109.0,152.0]|
|    0| [1718.0,284.0,38.0]|
|    1| [3492.0,34.0,107.0]|
|    0| [533.0,512.0,141.0]|
|    1| [3577.0,517.0,15.0]|
|    0|[1848.0,145.0,173.0]|
|    0| [482.0,590.0,168.0]|
|    1|  [2964.0,68.0,11.0]|
|    1|[3121.0,379.0,108.0]|
|    1| [2328.0,650.0,90.0]|
|    1|[2553.0,146.0,183.0]|
|    1|[2350.0,445.0,194.0]|
|    1| [2584.0,145.0,88.0]|
+-----+--------------------+
only showing top 20 rows

train set


25/05/11 06:29:30 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/05/11 06:29:31 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/05/11 06:29:32 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB

+-----+--------------------+
|label|            features|
+-----+--------------------+
|    0| [1748.0,695.0,22.0]|
|    0|  [517.0,631.0,20.0]|
|    1| [2592.0,191.0,40.0]|
|    0| [124.0,498.0,197.0]|
|    0|[1268.0,201.0,200.0]|
|    1| [2124.0,636.0,42.0]|
|    1|[3241.0,109.0,152.0]|
|    1| [3492.0,34.0,107.0]|
|    0| [482.0,590.0,168.0]|
|    0|[1848.0,145.0,173.0]|
|    1|  [2964.0,68.0,11.0]|
|    1|[3121.0,379.0,108.0]|
|    1| [2328.0,650.0,90.0]|
|    1|[2553.0,146.0,183.0]|
|    1|[2350.0,445.0,194.0]|
|    1| [2584.0,145.0,88.0]|
|    0|  [361.0,81.0,157.0]|
|    0|[1499.0,522.0,179.0]|
|    0| [937.0,562.0,196.0]|
|    1| [3280.0,220.0,30.0]|
+-----+--------------------+
only showing top 20 rows



                                                                                

### Crear el modelo de regresión logística

In [8]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression estimator: at most 10 iterations and a regularisation
# strength of 0.01. The input column names are spelled out explicitly; they
# are the estimator's defaults and match the columns prepared above.
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.01,
)

### Entrenamiento

In [9]:
# Fit the estimator on the training split.
lr_model = lr.fit(train_df)

# One coefficient per assembled feature (likes, comments, shares).
print(f"Coefficients: {lr_model.coefficients}")

# Training summary object, kept available for inspection.
training_summary = lr_model.summary

25/05/11 06:29:54 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:31:07 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:31:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.JNIBLAS
25/05/11 06:31:12 WARN InstanceBuilder: Failed to load implementation from:dev.ludovic.netlib.blas.VectorBLAS
25/05/11 06:31:13 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:32:03 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:32:06 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:32:38 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:32:39 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:33:06 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:33:08 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 

Coefficients: [0.0018771965205362678,0.0011581923465459184,-0.004835110164455652]


### Predicciones

In [10]:
# Use the trained model to make predictions on the test data
predictions = lr_model.transform(test_df)

# Show predictions
predictions.select("features", "prediction", "probability").show()

25/05/11 06:38:34 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/05/11 06:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/05/11 06:38:35 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB
25/05/11 06:38:37 WARN DAGScheduler: Broadcasting large task binary with size 3.7 MiB

+--------------------+----------+--------------------+
|            features|prediction|         probability|
+--------------------+----------+--------------------+
| [1084.0,696.0,39.0]|       0.0|[0.76685711377311...|
| [1718.0,284.0,38.0]|       0.0|[0.61605726855596...|
| [533.0,512.0,141.0]|       0.0|[0.94937089264110...|
| [3577.0,517.0,15.0]|       1.0|[0.03236027761848...|
|    [76.0,578.0,9.0]|       0.0|[0.95582756042630...|
|[2833.0,116.0,197.0]|       1.0|[0.34144334997760...|
| [2504.0,82.0,137.0]|       1.0|[0.42800406737443...|
|[2957.0,104.0,133.0]|       1.0|[0.23412032583566...|
| [224.0,104.0,134.0]|       0.0|[0.98110947032679...|
| [1618.0,83.0,227.0]|       0.0|[0.85902296577603...|
| [1440.0,202.0,51.0]|       0.0|[0.75997370612232...|
|[2582.0,332.0,112.0]|       1.0|[0.30009482549863...|
| [248.0,408.0,180.0]|       0.0|[0.97758369251919...|
|  [662.0,118.0,41.0]|       0.0|[0.93474257323464...|
|  [76.0,405.0,186.0]|       0.0|[0.98418193668523...|
| [543.0,3

                                                                                

### Evaluar el modelo

In [11]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction")


def _score(metric_name):
    """Evaluate `predictions` with the given evaluator metric name."""
    return evaluator.evaluate(predictions, {evaluator.metricName: metric_name})


# Same four metrics, computed in the same order as before. Precision and
# recall are the evaluator's weighted variants.
accuracy = _score("accuracy")
precision = _score("weightedPrecision")
recall = _score("weightedRecall")
f1 = _score("f1")

for title, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{title}: {value}")

25/05/11 06:38:53 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:40:20 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:41:26 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB
25/05/11 06:42:45 WARN DAGScheduler: Broadcasting large task binary with size 3.5 MiB

Accuracy: 0.9676113360323887
Precision: 0.9686316756620252
Recall: 0.9676113360323886
F1 Score: 0.9675439576863988


                                                                                

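El modelo obtiene todas las métricas (accuracy, precisión, recall y F1 ponderados) por encima del 96 % en el conjunto de prueba. Hay que tener en cuenta que la etiqueta `viral` se calcula directamente a partir de `likes` (más de 2000), que también es una de las características del modelo, por lo que estas métricas reflejan sobre todo que el modelo recupera ese umbral. Aun así, esto demuestra que la aplicación es útil para predecir el potencial de viralidad de nuevas publicaciones.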


### Exportar como CSV

In [12]:
from pyspark.sql.functions import when

# `predictions` only carries label/features/model outputs, so the readable
# columns (platform, likes, comments, shares) are recovered by joining back
# to the original rows on the assembled feature vector.
#
# The feature vector is not a unique key: several source rows can share the
# same (likes, comments, shares). Without de-duplicating the lookup side,
# every prediction would be repeated once per matching source row and the
# exported file would contain more rows than there are predictions.
# Caveat: when such rows differ in `platform`, one of them is kept
# arbitrarily, because the platform cannot be recovered from the features.
original_unique = original_df.dropDuplicates(["likes", "comments", "shares"])
original_with_features = assembler.transform(original_unique).select(
    "likes", "comments", "shares", "platform", "features"
)

# Left join: every prediction is kept, matched to at most one source row.
final_df = predictions.join(original_with_features, on="features", how="left")

# Human-readable verdict: prediction 1 -> "Es viral", anything else -> "No es viral".
final_df = final_df.withColumn(
    "viral",
    when(final_df["prediction"] == 1, "Es viral").otherwise("No es viral")
)

# Original columns plus the model outputs.
export_df = final_df.select("platform", "likes", "comments", "shares", "prediction", "viral")

# The CSV is written through pandas (writing it with Spark failed for lack of
# space), so the whole result is collected on the driver.
export_df.toPandas().to_csv("/home/jovyan/notebooks/final_project/whatsapp2/data/predicciones_virales.csv", index=False)

25/05/11 06:44:01 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/11 06:45:23 WARN DAGScheduler: Broadcasting large task binary with size 6.0 MiB
                                                                                

In [13]:
# Preview the first five exported rows.
export_df.show(n=5)

25/05/11 06:48:12 WARN DAGScheduler: Broadcasting large task binary with size 3.1 MiB
25/05/11 06:49:22 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/05/11 06:49:25 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/05/11 06:49:28 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB
25/05/11 06:49:31 WARN DAGScheduler: Broadcasting large task binary with size 6.1 MiB

+--------+-----+--------+------+----------+-----------+
|platform|likes|comments|shares|prediction|      viral|
+--------+-----+--------+------+----------+-----------+
|Facebook| 1084|     696|    39|       0.0|No es viral|
|Facebook| 1718|     284|    38|       0.0|No es viral|
|Facebook|  533|     512|   141|       0.0|No es viral|
|Facebook| 3577|     517|    15|       1.0|   Es viral|
|Facebook|   76|     578|     9|       0.0|No es viral|
+--------+-----+--------+------+----------+-----------+
only showing top 5 rows



                                                                                

In [14]:
# Shut down through the session rather than the bare context:
# SparkSession.stop() stops the underlying SparkContext (`sc`) as well, so
# the session and the context are released together.
spark.stop()