In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import StructType, StringType, IntegerType

In [2]:
# Locate the local Spark installation via findspark.
# NOTE(review): pyspark has already been imported in the first cell, so this
# call runs too late to influence that import. If findspark is actually needed
# to make pyspark importable, this cell presumably has to run before any
# `pyspark` import — confirm and reorder.
import findspark
findspark.init()

In [3]:

# Build (or reuse) the Spark session for this notebook. It connects to the
# standalone master, pins the UI port, and pulls in the Kafka connector
# package for Structured Streaming.
spark = (
    SparkSession.builder
    .appName("Proyecto")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4")
    .getOrCreate()
)

# Handle to the underlying SparkContext (stopped at the end of the notebook).
sc = spark.sparkContext

:: loading settings :: url = jar:file:/opt/conda/spark-3.5.4-bin-hadoop3-scala2.13/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /root/.ivy2/cache
The jars for the packages stored in: /root/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-be1dd72b-3189-4bd7-ad35-1a872b0cb266;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;3.5.4 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;3.5.4 in central
	found org.apache.kafka#kafka-clients;3.4.1 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.5 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.0.4 in central
	found org.apache.commons#commons-pool2;2.11.1 in centr

In [4]:
# Schema shared by the non-Twitter platforms: identifiers and the event
# timestamp are kept as strings, engagement counters are integers.
schema_generic = (
    StructType()
    .add("platform", StringType())
    .add("user_id", StringType())
    .add("post_id", StringType())
    .add("event_time", StringType())
    .add("likes", IntegerType())
    .add("comments", IntegerType())
    .add("shares", IntegerType())
)

# Twitter payloads use different field names for the same concepts:
# tweet_id instead of post_id, retweets/replies instead of shares/comments.
schema_twitter = (
    StructType()
    .add("platform", StringType())
    .add("user_id", StringType())
    .add("tweet_id", StringType())
    .add("event_time", StringType())
    .add("likes", IntegerType())
    .add("retweets", IntegerType())
    .add("replies", IntegerType())
)

In [5]:
# Streaming source: subscribe to one Kafka topic per social platform.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka_cluster-kafka-1:9093")
    .option("subscribe", "facebook-topic,instagram-topic,twitter-topic,tiktok-topic")
    .load()
)

# Keep only the message payload, cast to a string column named json_str so it
# can be parsed as JSON downstream.
raw_df = kafka_df.selectExpr("CAST(value AS STRING) as json_str")

In [6]:
# Normalise every platform's payload into one common set of columns.
# Each message is parsed against both schemas; every output column then takes
# the generic field when it is non-null and otherwise falls back to the
# Twitter equivalent (tweet_id -> post_id, replies -> comments,
# retweets -> shares).
parsed_df = (
    raw_df
    .select(
        from_json(col("json_str"), schema_generic).alias("data_generic"),
        from_json(col("json_str"), schema_twitter).alias("data_twitter"),
    )
    .selectExpr(
        "COALESCE(data_generic.platform, data_twitter.platform) AS platform",
        "COALESCE(data_generic.user_id, data_twitter.user_id) AS user_id",
        "COALESCE(data_generic.post_id, data_twitter.tweet_id) AS post_id",
        "COALESCE(data_generic.event_time, data_twitter.event_time) AS event_time",
        "COALESCE(data_generic.likes, data_twitter.likes) AS likes",
        "COALESCE(data_generic.comments, data_twitter.replies) AS comments",
        "COALESCE(data_generic.shares, data_twitter.retweets) AS shares",
    )
)

In [11]:
def _start_file_sink(df, fmt, path, checkpoint, **options):
    """Start an append-mode streaming file sink that fires every 30 seconds.

    Args:
        df: Streaming DataFrame to write.
        fmt: File format of the sink (e.g. "parquet" or "csv").
        path: Output directory for the data files.
        checkpoint: Checkpoint directory for this query; each query needs
            its own.
        **options: Extra writer options for the format (e.g. header="true").

    Returns:
        The StreamingQuery handle returned by ``start()``.
    """
    # `maxFilesPerTrigger` was previously set here, but it is a file-*source*
    # read option and throttles nothing on a sink. To cap the batch size for
    # this Kafka-fed pipeline, set `maxOffsetsPerTrigger` on the readStream.
    writer = (
        df.writeStream
        .format(fmt)
        .option("path", path)
        .option("checkpointLocation", checkpoint)
        .options(**options)
        .trigger(processingTime='30 seconds')
        .outputMode("append")
    )
    return writer.start()


# Start the Parquet query
parquet_query = _start_file_sink(
    parsed_df,
    "parquet",
    "/home/jovyan/notebooks/final_project/whatsapp2/data/lake/lake",
    "/home/jovyan/notebooks/final_project/whatsapp2/data/lake/checkpoints",
)

# Start the CSV query
csv_query = _start_file_sink(
    parsed_df,
    "csv",
    "/home/jovyan/notebooks/final_project/whatsapp2/data/ml/ml_input",
    "/home/jovyan/notebooks/final_project/whatsapp2/data/ml/ml_checkpoints",
    header="true",
)

25/05/10 04:59:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/10 04:59:51 WARN StreamingQueryManager: Stopping existing streaming query [id=886b45e4-842e-48d9-9cf8-e8819b50425e, runId=577de765-aafa-4b7f-ab45-037f25d2c7b0], as a new run is being started.
25/05/10 04:59:51 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/05/10 04:59:51 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/10 04:59:51 WARN StreamingQueryManager: Stopping existing streaming query [id=19206824-79cb-40c2-8d5a-729856c044d2, runId=3f403eec-d901-4976-a314-47550bfc2a3d], as a new run is being started.


25/05/10 04:59:51 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
                                                                                

In [12]:
# Stop both streaming sinks, Parquet first and then CSV.
for active_query in (parquet_query, csv_query):
    active_query.stop()


In [None]:

# Shut down Spark through the session rather than the bare context:
# spark.stop() stops the underlying SparkContext (the same object as `sc`)
# and also clears the cached session, so a later getOrCreate() builds a
# fresh one instead of finding a stale reference.
spark.stop()