# FinBERT Test

In [None]:
!pip install numpy==1.26.4

In [None]:
!pip install transformers==4.38.2

In [None]:
import nltk
# Fetch the "punkt" resource through NLTK's downloader.
nltk.download(info_or_id="punkt")

In [None]:
import pandas as pd
from collections import defaultdict
from pathlib import Path
import nltk
from nltk.tokenize import PunktSentenceTokenizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline

# Make sure the "punkt" resource is available, then create the sentence splitter.
# NOTE(review): the tokenizer is constructed without training text or loaded
# parameters, so it may not actually use the downloaded "punkt" data — confirm
# whether a pretrained tokenizer was intended here.
nltk.download(info_or_id="punkt")
sentence_tokenizer = PunktSentenceTokenizer(train_text=None)

# Load the company synonyms: map each company name ("firma" column) to the set
# of its lowercased synonyms ("synonym" column).
unternehmen_df = pd.read_csv("unternehmen_erweitert.csv")
unternehmen_map = defaultdict(set)
# Rows with a missing company or synonym are dropped first: an empty CSV cell is
# read as NaN (a float), and calling .lower() on it would raise AttributeError.
for firma, synonym in (
    unternehmen_df[["firma", "synonym"]].dropna().itertuples(index=False, name=None)
):
    # str() guards against cells that pandas parsed as numbers.
    synonym = str(synonym).lower()
    # A blank synonym would be a substring of (almost) every sentence and
    # attribute the whole article to this company, so it is skipped.
    if synonym.strip():
        unternehmen_map[firma].add(synonym)

# Read the article as UTF-8 text and split it into sentences.
text = Path("textnetflix.txt").read_text(encoding="utf-8")
sätze = sentence_tokenizer.tokenize(text)

# Load the FinBERT tokenizer and classification model, then wrap both in a
# sentiment-analysis pipeline.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
    device=-1,
)

# Group the sentences by company.
# A sentence is assigned to every company whose synonym occurs in it (plain
# lowercase substring test). A sentence that mentions no company is attached to
# the most recently mentioned one, if there is one.
firma_to_text_chunks = defaultdict(list)
aktuelle_firma = None

for satz in sätze:
    lowered = satz.lower()
    matches = [
        company
        for company, synonyms in unternehmen_map.items()
        if any(synonym in lowered for synonym in synonyms)
    ]

    if not matches:
        # No explicit mention: carry the sentence over to the current company.
        if aktuelle_firma:
            firma_to_text_chunks[aktuelle_firma].append(satz)
        continue

    for company in matches:
        firma_to_text_chunks[company].append(satz)
    # The first match becomes the context for following sentences.
    aktuelle_firma = matches[0]

# Weighted sentiment per company.
# Each sentence contributes its signed label weight multiplied by the
# classifier score; the total is normalised by the sum of all scores.
label_weights = {"positive": 1, "neutral": 0, "negative": -1}
results = []

for firma, chunks in firma_to_text_chunks.items():
    output = classifier(chunks, truncation=True)
    label_counts = {"positive": 0, "neutral": 0, "negative": 0}
    sentiment_sum = 0
    score_sum = 0

    for prediction in output:
        label = prediction["label"]
        score = prediction["score"]
        sentiment_sum += label_weights[label] * score
        score_sum += score
        label_counts[label] += 1

    if score_sum:
        weighted_sentiment = sentiment_sum / score_sum
    else:
        weighted_sentiment = 0
    avg_conf = sum(prediction["score"] for prediction in output) / len(output)

    row = {
        "Firma": firma,
        "Positive": label_counts["positive"],
        "Neutral": label_counts["neutral"],
        "Negative": label_counts["negative"],
        "Avg. Confidence": round(avg_conf, 3),
        "Gewichteter Sentiment-Score": round(weighted_sentiment, 3),
        "Anzahl Sätze": len(chunks),
    }
    results.append(row)

# Show the per-company results as a table.
df = pd.DataFrame(data=results)
display(df)

## Debugging:

In [None]:
# List every sentence that was assigned to Netflix.
netflix_sentences = firma_to_text_chunks["Netflix"]
print(f"📄 Sätze für Netflix (n = {len(netflix_sentences)}):\n")
for sentence in netflix_sentences:
    print("-", sentence)

In [None]:
# Per-sentence sentiment score for Netflix: P(positive) - P(negative).
# torch and its functional module are imported here because this cell uses
# them and no other import of them is visible in the notebook.
import torch
import torch.nn.functional as F

print("🎯 Satzweise Sentimentbewertung für Netflix:\n")

# Derive the signed weight of each class index from the model's own label
# mapping instead of hard-coding an index order; a wrong order silently
# produces wrong scores.
# NOTE(review): ProsusAI/finbert is believed to order its classes as
# positive, negative, neutral — confirm via model.config.id2label.
signed_weight_by_label = {"positive": 1.0, "neutral": 0.0, "negative": -1.0}
class_weights = [
    signed_weight_by_label[model.config.id2label[index].lower()]
    for index in range(model.config.num_labels)
]

for s in firma_to_text_chunks["Netflix"]:
    inputs = tokenizer(s, return_tensors="pt", truncation=True)
    with torch.no_grad():
        logits = model(**inputs).logits
    # One sentence per call, so drop the batch dimension explicitly.
    probs = F.softmax(logits, dim=1).squeeze(0).tolist()
    score = sum(weight * prob for weight, prob in zip(class_weights, probs))
    print(f"{score:+.3f} | {s}")

In [None]:
from collections import Counter
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
from nltk.tokenize import PunktSentenceTokenizer

# Read the article text.
with open("textnetflix.txt", encoding="utf-8") as f:
    text = f.read()

# Split the text into sentences.
tokenizer_nltk = PunktSentenceTokenizer()
sätze = tokenizer_nltk.tokenize(text)

# Load the FinBERT tokenizer and model and build the classification pipeline.
model_name = "ProsusAI/finbert"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
classifier = pipeline(
    task="sentiment-analysis",
    model=model,
    tokenizer=tokenizer,
)

# Classify every sentence and count how often each label was predicted.
output = classifier(sätze, truncation=True)
label_counter = Counter(prediction["label"] for prediction in output)

print("Labelverteilung:")
for label in label_counter:
    print(f"{label}: {label_counter[label]}")