# In [None]:
# Import necessary libraries
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, to_timestamp, expr

# Build a small in-memory sample of social media posts, one list per column.
media_data = dict(
    post_id=[101, 102, 103, 104, 105],
    user_id=[1, 2, 3, 4, 5],
    location=["USA", "UK", "Canada", "Australia", "China"],
    confident_score=[0.85, 0.9, 0.75, 0.7, 0.95],
)

# Target paths for the two output files.
csv_path = "/dbfs/FileStore/media_data.csv"
parquet_path = "/dbfs/FileStore/media_data.parquet"

# Turn the column dict into a pandas DataFrame.
df_social_media = pd.DataFrame(data=media_data)

# Persist the sample as CSV (row index omitted).
df_social_media.to_csv(csv_path, index=False)
print("CSV file created successfully.")

# Persist the same rows as Parquet (row index omitted).
df_social_media.to_parquet(parquet_path, index=False)
print(f"Sample data saved to {csv_path} and {parquet_path}")

# Initialize SparkSession (getOrCreate reuses an existing session if one is active)
spark = (
    SparkSession.builder
    .appName("Social Media Analysis")
    .getOrCreate()
)

# Load the sample CSV back into a Spark DataFrame.
# The file was written with pandas to /dbfs/FileStore/media_data.csv; Spark
# addresses that same file through the dbfs:/ scheme, so the name and location
# here must match what was written (the previous path pointed at a different,
# never-created file).
# inferSchema keeps post_id/user_id/confident_score numeric; without it every
# CSV column is loaded as a string.
# NOTE: the variable name df_sales is kept unchanged for compatibility even
# though it holds social media rows.
df_sales = (
    spark.read.format("csv")
    .option("header", "true")
    .option("inferSchema", "true")
    .load("dbfs:/FileStore/media_data.csv")
)

# Write the data to a Delta table.
# The Spark DataFrame is written here: df_social_media is a pandas DataFrame
# and has no `.write` attribute, so using it raised AttributeError.
delta_table_path = "/delta/social_media"
df_sales.write.format("delta").mode("overwrite").save(delta_table_path)
print("Delta table created and data written successfully.")

# Define the `social_media` live table over the streaming source path.
# Make sure the path below is the correct one for the streaming source.
# NOTE(review): LIVE TABLE DDL is Delta Live Tables syntax; it is unclear
# whether it can be executed through spark.sql outside a DLT pipeline, and
# `streaming.` does not look like a registered data source - confirm before
# relying on this statement.
social_media_ddl = """
CREATE OR REPLACE LIVE TABLE social_media AS
SELECT * FROM streaming.`/mnt/streaming/social_media`;
"""
spark.sql(social_media_ddl)

# Create a user-defined function for sentiment analysis
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from pyspark.sql.functions import udf
from pyspark.sql.types import FloatType

# Initialize the sentiment analyzer once; the UDF below reuses this instance
analyzer = SentimentIntensityAnalyzer()


# Define UDF for sentiment analysis
@udf(FloatType())
def sentiment_udf(text):
    """Return the VADER compound sentiment score for ``text``.

    Args:
        text: The post text to score, or None for a null column value.

    Returns:
        The ``'compound'`` entry of ``analyzer.polarity_scores(text)`` as a
        float, or None (SQL NULL) when ``text`` is None.
    """
    # Null column values arrive as None; polarity_scores expects a string,
    # so pass the null through instead of failing the whole task.
    if text is None:
        return None
    # FloatType expects a plain Python float.
    return float(analyzer.polarity_scores(text)["compound"])

# Define the `sentiment_analysis` streaming live table.
# The query exposes confident_score under the name sentiment_score; it does
# not call sentiment_udf.
# NOTE(review): no table or view named `media_data` is registered in the
# visible code (media_data is only a Python dict here) - confirm the source
# relation. As with any LIVE TABLE DDL, also confirm it can run via spark.sql.
sentiment_analysis_ddl = """
CREATE OR REPLACE STREAMING LIVE TABLE sentiment_analysis AS
SELECT
    post_id,
    user_id,
    location,
    confident_score AS sentiment_score
FROM media_data;
"""
spark.sql(sentiment_analysis_ddl)

# Read from Kafka stream (ensure the Kafka server and topic are correct;
# the bootstrap server value below is a placeholder)
social_media_stream = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "your_kafka_server:port")
    .option("subscribe", "social_media_topic")
    .load()
)

# Cast the Kafka message value to a string column holding the raw payload
# (assumes each message value is a JSON document - TODO confirm with producer)
social_media_posts = social_media_stream.selectExpr("CAST(value AS STRING) as json_value")

# Pull the individual fields out of the JSON text.
# json_value is a plain string column, so dotted access such as
# `json_value.post_id` cannot be resolved; get_json_object extracts a field
# by JSON path instead. Every extracted column is a string (NULL when the
# field is missing) - add casts or switch to from_json with an explicit
# schema if typed columns are needed.
social_media_posts_df = social_media_posts.select(
    expr("get_json_object(json_value, '$.post_id')").alias("post_id"),
    expr("get_json_object(json_value, '$.user_id')").alias("user_id"),
    expr("get_json_object(json_value, '$.post_text')").alias("post_text"),
)

# Write the streaming DataFrame to a Delta table.
# DataStreamWriter has no `.table()` method (that one belongs to the reader);
# `.toTable()` (Spark 3.1+) starts the streaming query and returns its handle.
# NOTE(review): sentiment_udf is never applied to post_text, so the
# sentiment_analysis table receives no sentiment column - confirm intent.
query = (
    social_media_posts_df
    .writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/mnt/checkpoints/sentiment_analysis")
    .toTable("sentiment_analysis")
)

# The query is already running; block here until it stops or fails
query.awaitTermination()
