In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, expr
from pyspark.sql.types import StructType, StringType, IntegerType

In [8]:
# Initialise findspark for this kernel.
# NOTE(review): findspark.init() is normally run *before* the first
# `import pyspark`; in this notebook pyspark is already imported in the
# previous cell, so confirm whether this call is still required.
import findspark
findspark.init()

In [9]:

# Describe the session first, then create (or reuse) it.
# NOTE(review): the `_2.13` suffix of the Kafka connector presumably has to
# match the Scala build of the cluster, and `3.5.4` its Spark version - confirm.
session_builder = (
    SparkSession.builder
    .appName("Proyecto")
    .master("spark://spark-master:7077")
    .config("spark.ui.port", "4040")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4",
    )
)
spark = session_builder.getOrCreate()

# Handle on the SparkContext that backs the session.
sc = spark.sparkContext

In [10]:
# Schema shared by every platform: identifiers are strings, the engagement
# counters (likes / comments / shares) are integers, and event_time is kept
# as a plain string.
schema_generic = (
    StructType()
    .add("platform", StringType())
    .add("user_id", StringType())
    .add("post_id", StringType())
    .add("event_time", StringType())
    .add("likes", IntegerType())
    .add("comments", IntegerType())
    .add("shares", IntegerType())
)

# Twitter-specific schema: same layout, but the post identifier is called
# tweet_id and the counters are likes / retweets / replies.
schema_twitter = (
    StructType()
    .add("platform", StringType())
    .add("user_id", StringType())
    .add("tweet_id", StringType())
    .add("event_time", StringType())
    .add("likes", IntegerType())
    .add("retweets", IntegerType())
    .add("replies", IntegerType())
)

In [11]:
# Kafka source settings: the broker to connect to and the comma-separated
# list of topics to subscribe to.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka_cluster-kafka-1:9093",
    "subscribe": "facebook-topic,instagram-topic,twitter-topic,tiktok-topic",
}
kafka_df = (
    spark.readStream
    .format("kafka")
    .options(**kafka_source_options)
    .load()
)

# Keep only the message payload, cast to a string column named json_str;
# the JSON itself is parsed in a later step.
raw_df = kafka_df.selectExpr("CAST(value AS STRING) as json_str")

In [12]:
# Unified output column -> Twitter field used as a fallback when the generic
# field is null. Insertion order defines the output column order.
twitter_field_for = {
    "platform": "platform",
    "user_id": "user_id",
    "post_id": "tweet_id",
    "event_time": "event_time",
    "likes": "likes",
    "comments": "replies",
    "shares": "retweets",
}

# Parse every payload twice: once with the generic schema and once with the
# Twitter schema.
dual_parsed_df = raw_df.select(
    from_json(col("json_str"), schema_generic).alias("data_generic"),
    from_json(col("json_str"), schema_twitter).alias("data_twitter"),
)

# For each unified column take the generic value, falling back to the
# corresponding Twitter field.
parsed_df = dual_parsed_df.select(
    *[
        expr(
            f"COALESCE(data_generic.{generic_name}, data_twitter.{twitter_name}) AS {generic_name}"
        )
        for generic_name, twitter_name in twitter_field_for.items()
    ]
)

In [13]:
# Root directory under which every output and checkpoint of this pipeline lives.
OUTPUT_ROOT = "/home/jovyan/notebooks/final_project/whatsapp2/data"


def _start_file_sink(stream_df, file_format, output_path, checkpoint_path, **sink_options):
    """Start an append-mode streaming query that writes `stream_df` to files.

    Args:
        stream_df: Streaming DataFrame to persist.
        file_format: Sink format name passed to `format()` (e.g. "parquet", "csv").
        output_path: Directory that receives the data files.
        checkpoint_path: Directory used for the query's checkpoint; each
            query is given its own.
        **sink_options: Extra format-specific writer options (e.g. header="true").

    Returns:
        The StreamingQuery handle returned by `start()`.
    """
    writer = (
        # A single partition per micro-batch keeps the number of output files low.
        stream_df.repartition(1)
        .writeStream
        .format(file_format)
        .option("path", output_path)
        .option("checkpointLocation", checkpoint_path)
        .options(**sink_options)
        .trigger(processingTime="30 seconds")
        .outputMode("append")
    )
    return writer.start()


# NOTE: the former `maxFilesPerTrigger` option was dropped from both writers.
# It is a read option of the *file source* and does nothing on a sink fed
# from Kafka. To cap the amount of data per micro-batch, set
# `maxOffsetsPerTrigger` on the Kafka `readStream` instead.

# Start the Parquet query (data lake copy).
parquet_query = _start_file_sink(
    parsed_df,
    "parquet",
    f"{OUTPUT_ROOT}/lake/lake",
    f"{OUTPUT_ROOT}/lake/checkpoints",
)

# Start the CSV query (input for the ML stage), with a header row per file.
csv_query = _start_file_sink(
    parsed_df,
    "csv",
    f"{OUTPUT_ROOT}/ml/ml_input",
    f"{OUTPUT_ROOT}/ml/ml_checkpoints",
    header="true",
)

25/05/10 05:00:56 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/10 05:00:58 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/10 05:00:58 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


25/05/10 05:00:59 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.


In [17]:
# Stop both streaming queries, Parquet first and then CSV.
for running_query in (parquet_query, csv_query):
    running_query.stop()


In [None]:

# Shut down through the session rather than the bare context:
# SparkSession.stop() stops the underlying SparkContext as well, so the
# session object is not left pointing at a dead context.
spark.stop()