# Sentiment Analyse

## Kafka Producer

In [None]:
import json
import boto3
from kafka.admin import KafkaAdminClient, NewTopic
from kafka import KafkaProducer
from kafka.errors import TopicAlreadyExistsError

# MinIO-Verbindung
s3 = boto3.client(
    's3',
    endpoint_url='http://172.29.16.105:9000',
    aws_access_key_id='bdenggroup3',
    aws_secret_access_key='bdenggroup3'
)

bucket_name = 'bdenggroup3' # S3 bucket
prefix = 'parsed/'          # S3 folder in bucket

# Kafka-Einstellungen
kafka_broker = 'localhost:9092'
topic_name = 'artikel-sentiment'

# Kafka-Topic erstellen, falls es nicht existiert
admin_client = KafkaAdminClient(bootstrap_servers=kafka_broker)
try:
    admin_client.create_topics([NewTopic(name=topic_name, num_partitions=1, replication_factor=1)])
    print(f"✅ Kafka-Topic '{topic_name}' wurde erstellt.")
except TopicAlreadyExistsError:
    print(f"ℹ️ Kafka-Topic '{topic_name}' existiert bereits.")

# Kafka-Producer initialisieren
producer = KafkaProducer(
    bootstrap_servers=kafka_broker,
    value_serializer=lambda v: json.dumps(v).encode('utf-8')
)

# Alle Dateien im Ordner 'parsed/' aus MinIO lesen und senden
paginator = s3.get_paginator("list_objects_v2")
page_iterator = paginator.paginate(Bucket=bucket_name, Prefix=prefix)

for page in page_iterator:
    for obj in page.get("Contents", []):
        key = obj["Key"]
        if not key.endswith(".json"):
            continue

        content = s3.get_object(Bucket=bucket_name, Key=key)["Body"].read().decode("utf-8")
        parsed_article = json.loads(content)
        # Extrahiere Dateinamen, z.B. parsed_255.json
        filename = key.split("/")[-1]

        # Füge `source`-Feld hinzu
        message = {
            "source": filename,
            "url": parsed_article.get("url"),
            "articleText": parsed_article.get("articleText"),
            "articleTimestamp": parsed_article.get("articleTimestamp"),
            "scrapingTimestamp": parsed_article.get("scrapingTimestamp"),
            "parsingTimestamp": parsed_article.get("parsingTimestamp")
        }

        producer.send(topic_name, message)
        print(f"📤 Gesendet: {key}")

producer.flush()
print("✅ Alle Artikel wurden an Kafka gesendet.")

## Kafka Consumer FinBert Sentiment Analyse

In [None]:
import json
import boto3
import nltk
from kafka import KafkaConsumer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from nltk.tokenize import PunktSentenceTokenizer
import pandas as pd
from datetime import datetime

nltk.download("punkt")
sentence_tokenizer = PunktSentenceTokenizer()

# Ticker + Synonyme laden
ticker_df = pd.read_csv("ticker_synonyme.csv")
ticker_map = defaultdict(set)
for _, row in ticker_df.iterrows():
    ticker_map[row["ticker"]].add(row["synonym"].lower())

# Kafka Consumer
consumer = KafkaConsumer(
    "artikel-sentiment",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    enable_auto_commit=True,
    group_id="finbert-consumer",
    value_deserializer=lambda m: json.loads(m.decode("utf-8"))
)

# FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# MinIO Setup
s3 = boto3.client(
    's3',
    endpoint_url='http://172.29.16.105:9000',
    aws_access_key_id='bdenggroup3',
    aws_secret_access_key='bdenggroup3'
)

bucket_name = 'bdenggroup3'
output_prefix = 'sentiment/'

# Konfiguration
block_size = 3
min_blocks_per_ticker = 2
min_relevance_score = 0.1

for msg in consumer:
    article = msg.value
    text = article.get("articleText")
    if not text:
        print(f"⚠️ Kein Text vorhanden für: {article.get('source')}")
        continue

    sätze = sentence_tokenizer.tokenize(text)
    if len(sätze) == 0:
        continue

    # In Blöcke aufteilen
    blöcke = [sätze[i:i+block_size] for i in range(0, len(sätze), block_size)]
    block_sentences = [" ".join(block) for block in blöcke]
    total_blocks = len(blöcke)

    # Sentimentanalyse pro Block
    block_sentiments = classifier(block_sentences, truncation=True)

    # Ticker-Erkennung pro Block
    ticker_to_blocks = defaultdict(list)
    for idx, block in enumerate(blöcke):
        block_text = " ".join(block).lower()
        for ticker, syns in ticker_map.items():
            if any(syn in block_text for syn in syns):
                ticker_to_blocks[ticker].append(idx)

    # Nur relevante Ticker behalten
    ticker_to_blocks = {
        ticker: idxs for ticker, idxs in ticker_to_blocks.items()
        if len(idxs) >= min_blocks_per_ticker
    }

    if not ticker_to_blocks:
        print(f"ℹ️ Keine ausreichend relevanten Ticker für: {article.get('source')}")
        continue

    # Sentiment & Relevanz berechnen
    tickers_output = []
    for ticker, idx_list in ticker_to_blocks.items():
        sentiment_sum = 0
        score_sum = 0
        for idx in idx_list:
            result = block_sentiments[idx]
            label_weights = {"positive": 1, "neutral": 0, "negative": -1}
            weight = label_weights[result["label"]]
            sentiment_sum += weight * result["score"]
            score_sum += result["score"]

        weighted_sentiment = sentiment_sum / score_sum if score_sum else 0
        relevance_score = len(idx_list) / total_blocks if total_blocks else 0

        if relevance_score >= min_relevance_score:
            tickers_output.append({
                "ticker": ticker,
                "sentiment_score": round(weighted_sentiment, 3),
                "relevance_score": round(relevance_score, 3)
            })

    if not tickers_output:
        print(f"ℹ️ Keine Ticker mit ausreichender Relevanz für: {article.get('source')}")
        continue

    # Ergebnis erzeugen & speichern
    ergebnis = {
        "url": article.get("url"),
        "articleTimestamp": article.get("articleTimestamp"),
        "scrapingTimestamp": article.get("scrapingTimestamp"),
        "parsingTimestamp": article.get("parsingTimestamp"),
        "sentimentTimestamp": datetime.utcnow().isoformat(),
        "tickers": tickers_output
    }

    source_filename = article.get("source", "parsed_unknown.json")
    sentiment_filename = source_filename.replace("parsed_", "sentiment_")
    key = f"{output_prefix}{sentiment_filename}"

    s3.put_object(Bucket=bucket_name, Key=key, Body=json.dumps(ergebnis).encode("utf-8"))
    print(f"✅ Sentiment für {len(tickers_output)} Ticker gespeichert: {key}")

## Consumer mit Multithreading

In [None]:
processed = []

In [None]:
import json
import boto3
import nltk
import pandas as pd
from kafka import KafkaConsumer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from nltk.tokenize import PunktSentenceTokenizer
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime

# Setup
nltk.download("punkt")
sentence_tokenizer = PunktSentenceTokenizer()

# Ticker + Synonyme laden
ticker_df = pd.read_csv("ticker_synonyme.csv")
ticker_map = defaultdict(set)
for _, row in ticker_df.iterrows():
    ticker_map[row["ticker"]].add(row["synonym"].lower())

# FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# 🪣 MinIO
s3 = boto3.client(
    's3',
    endpoint_url='http://172.29.16.105:9000',
    aws_access_key_id='bdenggroup3',
    aws_secret_access_key='bdenggroup3'
)

bucket_name = 'bdenggroup3'
output_prefix = 'sentiment/'

# 🔧 Konfiguration
block_size = 3
min_blocks_per_ticker = 2
min_relevance_score = 0.1
max_workers = 4

# Kafka
consumer = KafkaConsumer(
    "artikel-sentiment",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    enable_auto_commit=False,
    group_id="finbert-consumer",
    value_deserializer=lambda m: json.loads(m.decode("utf-8"))
)


# Verarbeitung eines Artikels
def process_article(article):
    try:
        source_filename = article.get("source", "parsed_unknown.json")
        sentiment_filename = source_filename.replace("parsed_", "sentiment_")
        key = f"{output_prefix}{sentiment_filename}"

        # 🚫 Duplikate überspringen
        try:
            s3.head_object(Bucket=bucket_name, Key=key)
            print(f"⏭️ Bereits verarbeitet, wird übersprungen: {key}")
            return
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] != '404':
                raise

        text = article.get("articleText")
        if not text:
            print(f"⚠️ Kein Text vorhanden für: {article.get('source')}")
            return

        sätze = sentence_tokenizer.tokenize(text)
        if len(sätze) == 0:
            return

        blöcke = [sätze[i:i+block_size] for i in range(0, len(sätze), block_size)]
        block_sentences = [" ".join(block) for block in blöcke]
        total_blocks = len(blöcke)

        block_sentiments = classifier(block_sentences, truncation=True)

        ticker_to_blocks = defaultdict(list)
        for idx, block in enumerate(blöcke):
            block_text = " ".join(block).lower()
            for ticker, syns in ticker_map.items():
                if any(syn in block_text for syn in syns):
                    ticker_to_blocks[ticker].append(idx)

        ticker_to_blocks = {
            ticker: idxs for ticker, idxs in ticker_to_blocks.items()
            if len(idxs) >= min_blocks_per_ticker
        }

        if not ticker_to_blocks:
            print(f"ℹ️ Keine ausreichend relevanten Ticker für: {article.get('source')}")
            return

        tickers_output = []
        for ticker, idx_list in ticker_to_blocks.items():
            sentiment_sum = 0
            score_sum = 0
            for idx in idx_list:
                result = block_sentiments[idx]
                weight = {"positive": 1, "neutral": 0, "negative": -1}[result["label"]]
                sentiment_sum += weight * result["score"]
                score_sum += result["score"]

            weighted_sentiment = sentiment_sum / score_sum if score_sum else 0
            relevance_score = len(idx_list) / total_blocks if total_blocks else 0

            if relevance_score >= min_relevance_score:
                tickers_output.append({
                    "ticker": ticker,
                    "sentiment_score": round(weighted_sentiment, 3),
                    "relevance_score": round(relevance_score, 3)
                })

        if not tickers_output:
            print(f"ℹ️ Keine Ticker mit ausreichender Relevanz für: {article.get('source')}")
            return

        ergebnis = {
            "url": article.get("url"),
            "articleTimestamp": article.get("articleTimestamp"),
            "scrapingTimestamp": article.get("scrapingTimestamp"),
            "parsingTimestamp": article.get("parsingTimestamp"),
            "sentimentTimestamp": datetime.utcnow().isoformat(),
            "tickers": tickers_output
        }

        s3.put_object(Bucket=bucket_name, Key=key, Body=json.dumps(ergebnis).encode("utf-8"))
        print(f"✅ Sentiment für {len(tickers_output)} Ticker gespeichert: {key}")

    except Exception as e:
        print(f"❌ Fehler bei Artikelverarbeitung: {e}")


counter = 0

# Haupt-Loop mit Batching & Parallelität
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    while True:
        batch = consumer.poll(timeout_ms=1000, max_records=10)

        futures = []
        for _, messages in batch.items():
            for msg in messages:
                counter += 1
                if msg not in processed:
                    processed.append(msg)
                    print(f'Processing message Nr. {counter}')
                    futures.append(executor.submit(process_article, msg.value))
                else:
                    print(f'Message Nr. {counter} already processed, skipping')

        # Warten, bis alle fertig
        for f in futures:
            f.result()

        # Nach Erfolg Kafka-Offsets committen
        consumer.commit()

## Consumer Test

In [None]:
import json
import boto3
import nltk
from kafka import KafkaConsumer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from nltk.tokenize import PunktSentenceTokenizer
import pandas as pd
from datetime import datetime

nltk.download("punkt")
sentence_tokenizer = PunktSentenceTokenizer()

# 🔃 Ticker + Synonyme laden
ticker_df = pd.read_csv("ticker_synonyme.csv")
ticker_map = defaultdict(set)
for _, row in ticker_df.iterrows():
    ticker_map[row["ticker"]].add(row["synonym"].lower())

# Kafka Consumer
consumer = KafkaConsumer(
    "artikel-sentiment",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    enable_auto_commit=True,
    group_id="finbert-consumer-test",
    value_deserializer=lambda m: json.loads(m.decode("utf-8"))
)

# FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

# Konfiguration
max_articles = 10
block_size = 3
min_blocks_per_ticker = 2
min_relevance_score = 0.1

processed = 0

for msg in consumer:
    if processed >= max_articles:
        break

    article = msg.value
    text = article.get("articleText")
    if not text:
        print(f"⚠️ Kein Text vorhanden für: {article.get('source')}")
        continue

    sätze = sentence_tokenizer.tokenize(text)
    if len(sätze) == 0:
        continue

    # Blöcke erzeugen
    blöcke = [sätze[i:i+block_size] for i in range(0, len(sätze), block_size)]
    block_sentences = [" ".join(block) for block in blöcke]

    # Sentiment pro Block
    block_sentiments = classifier(block_sentences, truncation=True)

    # Ticker-Zuordnung
    ticker_to_blocks = defaultdict(list)
    for idx, block in enumerate(blöcke):
        block_text = " ".join(block).lower()
        for ticker, syns in ticker_map.items():
            if any(syn in block_text for syn in syns):
                ticker_to_blocks[ticker].append(idx)

    # Mindestanzahl Blöcke pro Ticker
    ticker_to_blocks = {
        ticker: idxs for ticker, idxs in ticker_to_blocks.items() if len(idxs) >= min_blocks_per_ticker
    }

    if not ticker_to_blocks:
        print(f"ℹ️ Keine ausreichend relevanten Ticker für: {article.get('source')}")
        processed += 1
        continue

    total_blocks = len(blöcke)
    tickers_output = []

    for ticker, idx_list in ticker_to_blocks.items():
        sentiment_sum = 0
        score_sum = 0
        for idx in idx_list:
            result = block_sentiments[idx]
            label_weights = {"positive": 1, "neutral": 0, "negative": -1}
            weight = label_weights[result["label"]]
            sentiment_sum += weight * result["score"]
            score_sum += result["score"]

        weighted_sentiment = sentiment_sum / score_sum if score_sum else 0
        relevance_score = len(idx_list) / total_blocks if total_blocks else 0

        if relevance_score >= min_relevance_score:
            tickers_output.append({
                "ticker": ticker,
                "sentiment_score": round(weighted_sentiment, 3),
                "relevance_score": round(relevance_score, 3)
            })

    if not tickers_output:
        print(f"ℹ️ Keine Ticker mit ausreichender Relevanz für: {article.get('source')}")
    else:
        ergebnis = {
            "url": article.get("url"),
            "articleTimestamp": article.get("articleTimestamp"),
            "scrapingTimestamp": article.get("scrapingTimestamp"),
            "parsingTimestamp": article.get("parsingTimestamp"),
            "sentimentTimestamp": datetime.utcnow().isoformat(),
            "tickers": tickers_output
        }
        print(json.dumps(ergebnis, indent=2))

    processed += 1

print(f"✅ Analyse abgeschlossen für {processed} Artikel.")

## Multithreaded on GPU

In [2]:
import torch
print(torch.cuda.is_available())  # Should return True
print(torch.cuda.get_device_name(0))


True
NVIDIA GeForce RTX 2060 SUPER


In [None]:
import json
import boto3
import nltk
import pandas as pd
from kafka import KafkaConsumer
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
from collections import defaultdict
from nltk.tokenize import PunktSentenceTokenizer
from concurrent.futures import ThreadPoolExecutor
from datetime import datetime
import torch


# Setup
nltk.download("punkt")
sentence_tokenizer = PunktSentenceTokenizer()

# Ticker + Synonyme laden
ticker_df = pd.read_csv("ticker_synonyme.csv")
ticker_map = defaultdict(set)
for _, row in ticker_df.iterrows():
    ticker_map[row["ticker"]].add(row["synonym"].lower())

# FinBERT
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSequenceClassification.from_pretrained(model_name).to(device)

classifier = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=0 if torch.cuda.is_available() else -1)

# 🪣 MinIO
s3 = boto3.client(
    's3',
    endpoint_url='http://172.29.16.105:9000',
    aws_access_key_id='bdenggroup3',
    aws_secret_access_key='bdenggroup3'
)

bucket_name = 'bdenggroup3'
output_prefix = 'sentiment/'

# 🔧 Konfiguration
block_size = 3
min_blocks_per_ticker = 2
min_relevance_score = 0.1
max_workers = 4

# Kafka
consumer = KafkaConsumer(
    "artikel-sentiment",
    bootstrap_servers="localhost:9092",
    auto_offset_reset="earliest",
    enable_auto_commit=False,
    group_id="finbert-consumer",
    value_deserializer=lambda m: json.loads(m.decode("utf-8"))
)


# Verarbeitung eines Artikels
def process_article(article):
    try:
        source_filename = article.get("source", "parsed_unknown.json")
        sentiment_filename = source_filename.replace("parsed_", "sentiment_")
        key = f"{output_prefix}{sentiment_filename}"

        # 🚫 Duplikate überspringen
        try:
            s3.head_object(Bucket=bucket_name, Key=key)
            print(f"⏭️ Bereits verarbeitet, wird übersprungen: {key}")
            return
        except s3.exceptions.ClientError as e:
            if e.response['Error']['Code'] != '404':
                raise

        text = article.get("articleText")
        if not text:
            print(f"⚠️ Kein Text vorhanden für: {article.get('source')}")
            return

        sätze = sentence_tokenizer.tokenize(text)
        if len(sätze) == 0:
            return

        blöcke = [sätze[i:i+block_size] for i in range(0, len(sätze), block_size)]
        block_sentences = [" ".join(block) for block in blöcke]
        total_blocks = len(blöcke)

        block_sentiments = classifier(block_sentences, truncation=True)

        ticker_to_blocks = defaultdict(list)
        for idx, block in enumerate(blöcke):
            block_text = " ".join(block).lower()
            for ticker, syns in ticker_map.items():
                if any(syn in block_text for syn in syns):
                    ticker_to_blocks[ticker].append(idx)

        ticker_to_blocks = {
            ticker: idxs for ticker, idxs in ticker_to_blocks.items()
            if len(idxs) >= min_blocks_per_ticker
        }

        if not ticker_to_blocks:
            print(f"ℹ️ Keine ausreichend relevanten Ticker für: {article.get('source')}")
            return

        tickers_output = []
        for ticker, idx_list in ticker_to_blocks.items():
            sentiment_sum = 0
            score_sum = 0
            for idx in idx_list:
                result = block_sentiments[idx]
                weight = {"positive": 1, "neutral": 0, "negative": -1}[result["label"]]
                sentiment_sum += weight * result["score"]
                score_sum += result["score"]

            weighted_sentiment = sentiment_sum / score_sum if score_sum else 0
            relevance_score = len(idx_list) / total_blocks if total_blocks else 0

            if relevance_score >= min_relevance_score:
                tickers_output.append({
                    "ticker": ticker,
                    "sentiment_score": round(weighted_sentiment, 3),
                    "relevance_score": round(relevance_score, 3)
                })

        if not tickers_output:
            print(f"ℹ️ Keine Ticker mit ausreichender Relevanz für: {article.get('source')}")
            return

        ergebnis = {
            "url": article.get("url"),
            "articleTimestamp": article.get("articleTimestamp"),
            "scrapingTimestamp": article.get("scrapingTimestamp"),
            "parsingTimestamp": article.get("parsingTimestamp"),
            "sentimentTimestamp": datetime.utcnow().isoformat(),
            "tickers": tickers_output
        }

        s3.put_object(Bucket=bucket_name, Key=key, Body=json.dumps(ergebnis).encode("utf-8"))
        print(f"✅ Sentiment für {len(tickers_output)} Ticker gespeichert: {key}")

    except Exception as e:
        print(f"❌ Fehler bei Artikelverarbeitung: {e}")

counter = 0

# Haupt-Loop mit Batching & Parallelität
with ThreadPoolExecutor(max_workers=max_workers) as executor:
    while True:
        batch = consumer.poll(timeout_ms=1000, max_records=200)

        futures = []
        for _, messages in batch.items():
            for msg in messages:
                counter += 1
                print(f'Processing message Nr. {counter}')
                futures.append(executor.submit(process_article, msg.value))

        # Warten, bis alle fertig
        for f in futures:
            f.result()

        # Nach Erfolg Kafka-Offsets committen
        consumer.commit()

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Michael\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Processing message Nr. 1
Processing message Nr. 2
Processing message Nr. 3
Processing message Nr. 4
Processing message Nr. 5
Processing message Nr. 6
Processing message Nr. 7
Processing message Nr. 8
Processing message Nr. 9
Processing message Nr. 10
Processing message Nr. 11
Processing message Nr. 12
Processing message Nr. 13
Processing message Nr. 14
Processing message Nr. 15
Processing message Nr. 16
Processing message Nr. 17
Processing message Nr. 18
Processing message Nr. 19
Processing message Nr. 20
Processing message Nr. 21
Processing message Nr. 22
Processing message Nr. 23
Processing message Nr. 24
Processing message Nr. 25
Processing message Nr. 26
Processing message Nr. 27
Processing message Nr. 28
Processing message Nr. 29
Processing message Nr. 30
Processing message Nr. 31
Processing message Nr. 32
Processing message Nr. 33
Processing message Nr. 34
Processing message Nr. 35
Processing message Nr. 36
Processing message Nr. 37
Processing message Nr. 38
Processing message Nr

  "sentimentTimestamp": datetime.utcnow().isoformat(),


ℹ️ Keine ausreichend relevanten Ticker für: parsed_15566.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15567.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15569.json
ℹ️ Keine ausreichend relevanten Ticker für: parsed_15568.json
ℹ️ Keine ausreichend relevanten Ticker für: parsed_15570.json
ℹ️ Keine ausreichend relevanten Ticker für: parsed_15571.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15572.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15573.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15574.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15576.json
ℹ️ Keine ausreichend relevanten Ticker für: parsed_15578.json
ℹ️ Keine Ticker mit ausreichender Relevanz für: parsed_1557.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15577.json
ℹ️ Keine ausreichend relevanten Ticker für: parsed_1558.json
✅ Sentiment für 1 Ticker gespeichert: sentiment/sentiment_15575.json
✅ Sentiment 