## Spark Sentiment Daily Aggregation

In [77]:
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import explode, col, to_date, avg, sum as _sum, count
import boto3
import json
from urllib.parse import urlparse

from influxdb_client import InfluxDBClient, Point, WritePrecision
from influxdb_client.client.write_api import SYNCHRONOUS
import pandas as pd
import os
from datetime import datetime

# InfluxDB configuration.
# Connection settings are read from the environment so that credentials are
# not stored in the notebook. Non-secret values fall back to the local defaults.
INFLUXDB_URL = os.environ.get("INFLUXDB_URL", "http://localhost:10896")
# No default on purpose: export INFLUXDB_TOKEN before starting the kernel.
# NOTE(review): the token that used to be hardcoded here should be rotated.
INFLUXDB_TOKEN = os.environ.get("INFLUXDB_TOKEN")
INFLUXDB_ORG = os.environ.get("INFLUXDB_ORG", "bdinf-org")
INFLUXDB_BUCKET = os.environ.get("INFLUXDB_BUCKET", "bdinf-bucket")

# S3-compatible object storage configuration.
s3_endpoint_url = os.environ.get("S3_ENDPOINT_URL", "http://172.29.16.105:9000")
# Credentials are None when the variables are unset; they are passed as-is to
# boto3.client(), which then resolves credentials through its own lookup chain.
s3_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
s3_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
bucket_name = os.environ.get("S3_BUCKET", "bdenggroup3")
s3_prefix = "sentiment"

spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://localhost:7077")

# Surface missing secrets here rather than as an opaque authentication error
# in a later cell.
_missing_secrets = [
    env_name
    for env_name, env_value in (
        ("INFLUXDB_TOKEN", INFLUXDB_TOKEN),
        ("AWS_ACCESS_KEY_ID", s3_access_key_id),
        ("AWS_SECRET_ACCESS_KEY", s3_secret_access_key),
    )
    if not env_value
]
if _missing_secrets:
    print(f"WARNING: environment variables not set: {', '.join(_missing_secrets)}")

In [67]:
# Build (or reuse) the Spark session for this notebook. Only the application
# name and the master URL are configured here; no S3/MinIO options are set on
# the session.
spark = (
    SparkSession.builder
    .appName("Sentiment Aggregator")
    .master(spark_master_url)
    .getOrCreate()
)

# Limit Spark's log output to warnings and errors.
spark.sparkContext.setLogLevel("WARN")

In [68]:
# === Set up boto3 client ===
# S3 client bound to the configured endpoint and credentials.
s3 = boto3.client(
    "s3",
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=s3_access_key_id,
    aws_secret_access_key=s3_secret_access_key,
)

# === List all JSON files under the prefix ===
# Initial state for the paginated listing: the collected object keys and the
# token used to request the next page.
keys, continuation_token = [], None

In [69]:
# Collect the keys of all ".json" objects under the configured prefix.
#
# The list is rebuilt from scratch on every execution, so re-running this cell
# does not append duplicates to a list left over from an earlier run. The boto3
# paginator handles the continuation tokens, so a truncated response without a
# usable token can no longer restart the listing from the first page forever.
paginator = s3.get_paginator("list_objects_v2")
keys = [
    obj["Key"]
    for page in paginator.paginate(Bucket=bucket_name, Prefix=s3_prefix)
    for obj in page.get("Contents", [])  # "Contents" is absent on empty pages
    if obj["Key"].endswith(".json")
]

print(f"Found {len(keys)} JSON files")

Found 20905 JSON files


In [None]:
# === Read and parse JSON files ===
# Upper bound on the number of objects fetched in a single run.
MAX_FILES = 100000
# Print a progress line every this many objects.
PROGRESS_EVERY = 100


def extract_ticker_rows(data):
    """Turn one parsed article document into per-ticker Spark rows.

    Args:
        data: Dict parsed from one article JSON object. The keys read are
            "articleTimestamp", "url" and "tickers"; each ticker entry is read
            for "ticker", "sentiment_score" and "relevance_score".

    Returns:
        A list of Row objects, one per ticker entry that has a symbol and both
        scores. The list is empty when the article has no timestamp.
    """
    article_timestamp = data.get("articleTimestamp")
    if not article_timestamp:
        return []
    # Assumes the timestamp starts with 'YYYY-MM-DD' - TODO confirm against the producer.
    article_date = article_timestamp[:10]
    # `or ""` also covers an explicit JSON null: urlparse(None) yields a bytes
    # netloc, which would give the "domain" column mixed str/bytes values.
    domain = urlparse(data.get("url") or "").netloc.lower()

    ticker_rows = []
    for ticker in data.get("tickers") or []:
        sentiment = ticker.get("sentiment_score")
        relevance = ticker.get("relevance_score")
        symbol = ticker.get("ticker")
        if sentiment is None or relevance is None or not symbol:
            continue
        ticker_rows.append(Row(
            article_date=article_date,
            ticker=symbol,
            sentiment=sentiment,
            relevance=relevance,
            weighted_sentiment=sentiment * relevance,
            domain=domain
        ))
    return ticker_rows


key_counter = 0  # stays 0 when there are no keys, so the final message still works
rows = []

for key_counter, key in enumerate(keys, start=1):
    try:
        obj = s3.get_object(Bucket=bucket_name, Key=key)
        data = json.load(obj["Body"])
        rows.extend(extract_ticker_rows(data))
    except Exception as e:
        # Best effort: an object that cannot be fetched or parsed is reported
        # and skipped (it contributes no rows); processing continues.
        print(f"Failed to process {key}: {e}")

    # Progress and cap checks now run for every object, including articles
    # that are skipped because they have no timestamp.
    if key_counter % PROGRESS_EVERY == 0:
        print(f"Processed {key_counter} JSON files")

    if key_counter >= MAX_FILES:
        print(f"Cancelled after {key_counter} JSON files")
        break

# Optional final update
print(f"Finished processing {key_counter} JSON files total.")


In [70]:
# Create Spark DataFrame from rows
df = spark.createDataFrame(rows)

# Keep only the articles published on the target date and list the distinct
# source domains among them.
target_date = "2025-06-21"

# The comparison must be equality: the previous `!=` selected every date
# except the target one, contradicting both the intent stated above and the
# date-stamped output file name below.
domains_df = df.filter(col("article_date") == target_date).select("domain").distinct()

# Show the result
domains_df.show(truncate=False)

# Optional: save the domain list to a CSV named after the target date
domains_df.toPandas().to_csv(f"domains_{target_date}.csv", index=False)


+----------------+
|domain          |
+----------------+
|www.benzinga.com|
|www.fool.com    |
+----------------+



In [71]:
# === Aggregate sentiment per (article_date, ticker) ===
df = spark.createDataFrame(rows)

# For every date/ticker pair:
#   total_weighted_sentiment = sum of sentiment * relevance over the group's rows
#   article_count            = number of rows in the group
#   daily_sentiment          = total_weighted_sentiment / article_count
# Note that the divisor is the row count, not the summed relevance.
agg_df = (
    df.groupBy("article_date", "ticker")
    .agg(
        _sum("weighted_sentiment").alias("total_weighted_sentiment"),
        count("*").alias("article_count"),
    )
    .withColumn(
        "daily_sentiment",
        col("total_weighted_sentiment") / col("article_count"),
    )
)

# Display the aggregate ordered by date, then ticker.
agg_df.orderBy("article_date", "ticker").show(truncate=False)

+------------+------+------------------------+-------------+---------------------+
|article_date|ticker|total_weighted_sentiment|article_count|daily_sentiment      |
+------------+------+------------------------+-------------+---------------------+
|2022-10-14  |TSLA  |0.0                     |1            |0.0                  |
|2024-01-01  |AAPL  |0.07176399999999998     |4            |0.017940999999999995 |
|2024-01-01  |GOOG  |0.481741                |5            |0.0963482            |
|2024-01-01  |NVDA  |0.777533                |6            |0.12958883333333335  |
|2024-01-01  |O     |0.112407                |1            |0.112407             |
|2024-01-01  |TSLA  |0.7458720000000001      |9            |0.08287466666666668  |
|2024-01-02  |AAPL  |-1.214044               |14           |-0.08671742857142857 |
|2024-01-02  |BA    |-0.6                    |1            |-0.6                 |
|2024-01-02  |GOOG  |1.018313                |7            |0.14547328571428572  |
|202

In [72]:
# Target location of the CSV export; the directory is created on demand.
output_dir = "output"
output_path = os.path.join(output_dir, "ticker_sentiment_by_day.csv")
os.makedirs(output_dir, exist_ok=True)

# Collect the aggregated Spark DataFrame into pandas and write it out as a
# semicolon-separated file without the index column.
pandas_df = agg_df.toPandas()
pandas_df.to_csv(output_path, sep=";", index=False)

print(f"CSV written to {output_path}")

CSV written to output\ticker_sentiment_by_day.csv


In [78]:
# Number of points sent to InfluxDB per write request.
INFLUX_WRITE_BATCH_SIZE = 5000

# Initialize InfluxDB client and writer
influx_client = InfluxDBClient(
    url=INFLUXDB_URL,
    token=INFLUXDB_TOKEN,
    org=INFLUXDB_ORG
)
write_api = influx_client.write_api(write_options=SYNCHRONOUS)

try:
    # Convert Spark DataFrame to Pandas
    summary_df = agg_df.toPandas()

    # The aggregation does not produce a median; add a placeholder column so
    # the field below is always present.
    # NOTE(review): with the placeholder the field is written as 0, which is
    # not a real median - compute it upstream or drop the field.
    if "median_sentiment_score" not in summary_df.columns:
        summary_df["median_sentiment_score"] = None  # or compute separately

    # Build one point per (article_date, ticker) row, timestamped with the
    # article date at second precision.
    points = [
        Point("sentiment_data")
        .tag("ticker", row["ticker"])
        .tag("aggregation", "daily")
        .field("avg_sentiment_score", row["daily_sentiment"])
        .field("median_sentiment_score", row.get("median_sentiment_score", 0) or 0)
        .field("article_count", int(row["article_count"]))
        .time(pd.to_datetime(row["article_date"]), WritePrecision.S)
        for _, row in summary_df.iterrows()
    ]

    # Send the points in batches instead of one HTTP request per point.
    for start in range(0, len(points), INFLUX_WRITE_BATCH_SIZE):
        write_api.write(
            bucket=INFLUXDB_BUCKET,
            org=INFLUXDB_ORG,
            record=points[start:start + INFLUX_WRITE_BATCH_SIZE],
        )
finally:
    # Release the client's connections even when a write fails.
    influx_client.close()

print("✅ All points written to InfluxDB.")

✅ All points written to InfluxDB.


In [None]:

# Stop the Spark session used by this notebook.
spark.stop()