## Spark Sentiment Daily Aggregation

In [None]:
!pip install pyspark influxdb-client

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, col, to_date, avg
from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS
import os
from datetime import datetime

# 📍 InfluxDB configuration
# Every setting can be overridden with an environment variable of the same
# name; the literal is only the fallback used when that variable is unset.
INFLUX_URL = os.environ.get("INFLUX_URL", "http://influxdb:8086")
# NOTE(review): an API token is committed here in plain text. Provide it via
# the INFLUX_TOKEN environment variable, then rotate and remove this fallback.
INFLUX_TOKEN = os.environ.get(
    "INFLUX_TOKEN",
    "14iJvsBJKp37nLXjIZvE4RbAoEO2dNs1k0GvCbKuJUnF_ub4pSWWw80O739jabLPMD-XBzA72WSX9f-4FuDBQ==",
)
INFLUX_ORG = os.environ.get("INFLUX_ORG", "bdinf-org")
INFLUX_BUCKET = os.environ.get("INFLUX_BUCKET", "bdinf-bucket")

# 🧠 SparkSession with MinIO S3 compatibility
# Hadoop S3A connector settings, applied to the builder in the order listed.
_s3a_settings = {
    "spark.hadoop.fs.s3a.endpoint": "http://172.29.16.105:9000",
    "spark.hadoop.fs.s3a.access.key": "bdenggroup3",
    "spark.hadoop.fs.s3a.secret.key": "bdenggroup3",
    "spark.hadoop.fs.s3a.path.style.access": "true",
    "spark.hadoop.fs.s3a.impl": "org.apache.hadoop.fs.s3a.S3AFileSystem",
}

_builder = SparkSession.builder.appName("SentimentToInflux")
for _key, _value in _s3a_settings.items():
    _builder = _builder.config(_key, _value)
spark = _builder.getOrCreate()

# Only log warnings and above from Spark.
spark.sparkContext.setLogLevel("WARN")

# 📥 Read incoming JSON files from MinIO
# Explicit DDL schema for the streaming JSON source: one record per article,
# each carrying an array of per-ticker score structs.
_article_schema = "url STRING, articleTimestamp STRING, tickers ARRAY<STRUCT<ticker:STRING,sentiment_score:DOUBLE,relevance_score:DOUBLE>>"

df_raw = (
    spark.readStream
    .schema(_article_schema)
    .format("json")
    .load("s3a://bdenggroup3/sentiment/")
)

# 🔄 Flatten and prepare the data
# Step 1: one output row per (article, ticker entry), with the article
# timestamp reduced to a calendar date.
_exploded = df_raw.select(
    to_date(col("articleTimestamp")).alias("datum"),
    explode(col("tickers")).alias("ticker_info"),
)

# Step 2: pull the needed struct fields up into top-level columns.
df_flat = _exploded.select(
    "datum",
    col("ticker_info.ticker").alias("ticker"),
    col("ticker_info.sentiment_score").alias("sentiment_score"),
)

# 📊 Daily aggregation per ticker
# Mean sentiment score for each (ticker, date) group.
_mean_sentiment = avg(col("sentiment_score")).alias("avg_sentiment")
df_agg = df_flat.groupBy("ticker", "datum").agg(_mean_sentiment)

# ✍️ Write to InfluxDB
def write_to_influx(batch_df, batch_id):
    """Write one micro-batch of daily sentiment averages to InfluxDB.

    Intended as a ``foreachBatch`` sink. Every usable row becomes one
    ``sentiment_daily`` point tagged with its ticker, carrying the
    ``avg_sentiment`` field and timestamped at midnight (``Z``) of ``datum``.

    Args:
        batch_df: Batch DataFrame with the columns ``ticker``, ``datum`` and
            ``avg_sentiment``. It is collected to the driver in full.
        batch_id: Identifier of the micro-batch; only used in the log line.

    Returns:
        None.
    """
    rows = batch_df.collect()

    points = []
    for row in rows:
        day = row["datum"]
        mean_score = row["avg_sentiment"]
        # Either value can be NULL (e.g. a timestamp that to_date() could not
        # parse); such rows cannot form a valid point, so skip them rather
        # than crash the whole stream.
        if day is None or mean_score is None:
            continue
        points.append(
            Point("sentiment_daily")
            .tag("ticker", row["ticker"])
            .field("avg_sentiment", float(mean_score))
            .time(day.strftime("%Y-%m-%dT00:00:00Z"), WritePrecision.NS)
        )

    # Only open a connection when there is something to send.
    if points:
        client = InfluxDBClient(url=INFLUX_URL, token=INFLUX_TOKEN, org=INFLUX_ORG)
        try:
            write_api = client.write_api(write_options=SYNCHRONOUS)
            # One request for the whole batch instead of one per row.
            write_api.write(bucket=INFLUX_BUCKET, org=INFLUX_ORG, record=points)
        finally:
            # Release the HTTP connection even when the write fails.
            client.close()

    print(f"✅ {len(points)} Einträge in Influx geschrieben – Batch {batch_id}")

# 🚀 Start the stream
# Each micro-batch of the aggregated result is handed to write_to_influx;
# streaming progress is checkpointed under /tmp.
_stream_writer = (
    df_agg.writeStream
    .outputMode("complete")
    .option("checkpointLocation", "/tmp/influx_sentiment_checkpoint")
    .foreachBatch(write_to_influx)
)
query = _stream_writer.start()

# Block until the streaming query stops or fails.
query.awaitTermination()