FinBert

In [2]:
import os, random, numpy as np, torch

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

print("CUDA available:", torch.cuda.is_available())
print("GPU:", torch.cuda.get_device_name(0) if torch.cuda.is_available() else None)


CUDA available: True
GPU: NVIDIA GeForce RTX 4060


In [None]:
import os
import pandas as pd

dataset_dir = "dataset\FinancialPhraseBank-v1.0\." 
files = [
    "Sentences_50Agree.txt",
    "Sentences_66Agree.txt",
    "Sentences_75Agree.txt",
    "Sentences_AllAgree.txt"
]

folder = os.path.abspath(dataset_dir)
files = [os.path.join(folder, f) for f in files]
files


['c:\\Coding\\Uni\\DLP\\Assignment3\\dataset\\FinancialPhraseBank-v1.0\\Sentences_50Agree.txt',
 'c:\\Coding\\Uni\\DLP\\Assignment3\\dataset\\FinancialPhraseBank-v1.0\\Sentences_66Agree.txt',
 'c:\\Coding\\Uni\\DLP\\Assignment3\\dataset\\FinancialPhraseBank-v1.0\\Sentences_75Agree.txt',
 'c:\\Coding\\Uni\\DLP\\Assignment3\\dataset\\FinancialPhraseBank-v1.0\\Sentences_AllAgree.txt']

In [None]:
import pandas as pd
import re
from tqdm import tqdm

texts = []
labels = []

for fp in files:
    with open(fp, "r", encoding="utf-8", errors="ignore") as f:
        lines = f.read().splitlines()   #avoiding line by line I/O
    
    for line in tqdm(lines, desc=f"Loading {os.path.basename(fp)}"):
        if not line.strip():
            continue
        
        match = re.search(r"@(positive|negative|neutral)$", line)
        if not match:
            continue

        label = match.group(1)
        text = line[:line.rfind("@")].strip()

        texts.append(text)
        labels.append(label)

df = pd.DataFrame({"text": texts, "label": labels})
df.head()


Loading Sentences_50Agree.txt: 100%|██████████| 4846/4846 [00:00<00:00, 537500.92it/s]
Loading Sentences_66Agree.txt: 100%|██████████| 4217/4217 [00:00<00:00, 587620.60it/s]
Loading Sentences_75Agree.txt: 100%|██████████| 3453/3453 [00:00<00:00, 427868.82it/s]
Loading Sentences_AllAgree.txt: 100%|██████████| 2264/2264 [00:00<00:00, 501156.02it/s]


Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",neutral
1,Technopolis plans to develop in stages an area...,neutral
2,The international electronic industry company ...,negative
3,With the new production plant the company woul...,positive
4,According to the company 's updated strategy f...,positive


In [3]:
len(df)
df['label'].value_counts()


label
neutral     8951
positive    3988
negative    1841
Name: count, dtype: int64

In [4]:
import re
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords

STOP = set(stopwords.words("english"))

def clean_text(t):
    t = str(t).lower()
    t = re.sub(r"http\S+", " ", t)
    t = re.sub(r"[^a-z0-9\s]", " ", t)
    tokens = [w for w in t.split() if w not in STOP and len(w) > 2]
    return " ".join(tokens)

df["clean_text"] = df["text"].apply(clean_text)
df.head()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Ayan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,text,label,clean_text
0,"According to Gran , the company has no plans t...",neutral,according gran company plans move production r...
1,Technopolis plans to develop in stages an area...,neutral,technopolis plans develop stages area less 100...
2,The international electronic industry company ...,negative,international electronic industry company elco...
3,With the new production plant the company woul...,positive,new production plant company would increase ca...
4,According to the company 's updated strategy f...,positive,according company updated strategy years 2009 ...


In [11]:
from gensim import corpora, models
from gensim.models import CoherenceModel

texts = [t.split() for t in df["clean_text"]]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(t) for t in texts]

results = {}

for k in [35, 38, 45]:
    ...
    lda = models.LdaModel(
        corpus=corpus,
        id2word=dictionary,
        num_topics=k,
        random_state=42,
        passes=10
    )
    coherence = CoherenceModel(
        model=lda,
        texts=texts,
        dictionary=dictionary,
        coherence="c_v"
    ).get_coherence()
    
    results[k] = coherence
    print(f"k = {k}, coherence = {coherence}")


KeyboardInterrupt: 

In [13]:
best_k = 35
lda = models.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=best_k,
    random_state=42,
    passes=10
)

topics = lda.print_topics(num_topics=best_k, num_words=10)
for topic in topics:
    print(topic)

df["topic"] = [
    max(lda.get_document_topics(bow), key=lambda x: x[1])[0]
    for bow in corpus
]


(0, '0.046*"well" + 0.046*"products" + 0.032*"finland" + 0.032*"services" + 0.026*"service" + 0.023*"research" + 0.020*"customers" + 0.017*"facilities" + 0.016*"meat" + 0.016*"network"')
(1, '0.033*"whole" + 0.028*"paid" + 0.022*"personnel" + 0.019*"negotiations" + 0.017*"cut" + 0.016*"also" + 0.015*"electricity" + 0.015*"information" + 0.014*"date" + 0.014*"content"')
(2, '0.047*"posted" + 0.041*"100" + 0.026*"aldata" + 0.021*"solution" + 0.021*"holding" + 0.019*"corporate" + 0.019*"planned" + 0.018*"cover" + 0.015*"next" + 0.015*"retail"')
(3, '0.077*"oyj" + 0.065*"finnish" + 0.061*"said" + 0.049*"today" + 0.040*"hel" + 0.038*"approximately" + 0.036*"000" + 0.033*"million" + 0.031*"usd" + 0.027*"2010"')
(4, '0.037*"company" + 0.022*"director" + 0.020*"line" + 0.018*"kemira" + 0.018*"finnish" + 0.016*"business" + 0.015*"managing" + 0.014*"finland" + 0.013*"product" + 0.013*"maintenance"')
(5, '0.054*"investment" + 0.046*"eur" + 0.039*"value" + 0.030*"company" + 0.022*"scanfil" + 0.020

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import torch

finbert_name = "ProsusAI/finbert"

tokenizer = AutoTokenizer.from_pretrained(finbert_name)
model = AutoModelForSequenceClassification.from_pretrained(
    finbert_name,
    trust_remote_code=True,
    use_safetensors=True
)

device = 0 if torch.cuda.is_available() else -1

finbert = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    top_k=1,
    device=device
)

def predict_finbert(batch):
    outputs = finbert(batch, top_k=1)

    preds = []
    for out in outputs:
        # case 1: dict
        if isinstance(out, dict):
            preds.append(out["label"].lower())
        # case 2: [{'label':..., 'score':...}]
        elif isinstance(out, list) and len(out) and isinstance(out[0], dict):
            preds.append(out[0]["label"].lower())
        # case 3: [[{'label':...}]]
        elif isinstance(out, list) and len(out) and isinstance(out[0], list):
            preds.append(out[0][0]["label"].lower())
        else:
            raise ValueError("Unexpected output format:", out)

    return preds


batch_size = 32   
preds = []

for i in range(0, len(df), batch_size):
    batch = df["text"].iloc[i:i+batch_size].tolist()
    p = predict_finbert(batch)
    preds.extend(p)

df["finbert_pred"] = preds
print("Done! Preds:", len(preds))



Device set to use cuda:0
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Done! Preds: 14780


In [23]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = df["label"]
y_pred = df["finbert_pred"]

print("FinBERT Accuracy:", accuracy_score(y_true, y_pred))
print()
print("Classification Report:\n", classification_report(y_true, y_pred))
print()
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


FinBERT Accuracy: 0.9236129905277402

Classification Report:
               precision    recall  f1-score   support

    negative       0.83      0.98      0.90      1841
     neutral       0.98      0.90      0.94      8951
    positive       0.86      0.94      0.90      3988

    accuracy                           0.92     14780
   macro avg       0.89      0.94      0.91     14780
weighted avg       0.93      0.92      0.92     14780


Confusion Matrix:
 [[1800   19   22]
 [ 288 8084  579]
 [  70  151 3767]]
Confusion Matrix:
 [[1800   19   22]
 [ 288 8084  579]
 [  70  151 3767]]


Local LLM Sentiment Analysis

In [26]:
from transformers import pipeline

llm = pipeline(
    "zero-shot-classification",
    model="cross-encoder/nli-deberta-v3-base",
    device=0
)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0


In [27]:
labels = ["negative", "neutral", "positive"]


In [28]:
def predict_llm(batch):
    outputs = llm(batch, candidate_labels=labels)
    preds = [out["labels"][0] for out in outputs]
    return preds

batch_size = 8
llm_preds = []

for i in range(0, len(df), batch_size):
    batch = df["text"].iloc[i:i+batch_size].tolist()
    p = predict_llm(batch)
    llm_preds.extend(p)

df["llm_pred"] = llm_preds
print("Done! LLM preds:", len(llm_preds))


Done! LLM preds: 14780


In [29]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = df["label"]
y_pred = df["llm_pred"]

print("LLM Accuracy:", accuracy_score(y_true, y_pred))
print()
print("Classification Report:\n", classification_report(y_true, y_pred))
print()
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


LLM Accuracy: 0.6877537212449256

Classification Report:
               precision    recall  f1-score   support

    negative       0.65      0.90      0.75      1841
     neutral       0.81      0.66      0.72      8951
    positive       0.54      0.66      0.59      3988

    accuracy                           0.69     14780
   macro avg       0.66      0.74      0.69     14780
weighted avg       0.71      0.69      0.69     14780


Confusion Matrix:
 [[1661  176    4]
 [ 795 5878 2278]
 [ 115 1247 2626]]


In [38]:
llm2 = pipeline(
    "zero-shot-classification",
    model="typeform/distilbert-base-uncased-mnli",
    device=0
)

def predict_llm2(batch):
    outputs = llm2(batch, candidate_labels=["negative","neutral","positive"])
    return [o["labels"][0] for o in outputs]

batch_size = 8
llm2_preds = []

for i in range(0, len(df), batch_size):
    batch = df["text"].iloc[i:i+batch_size].tolist()
    llm2_preds.extend(predict_llm2(batch))

df["llm2_pred"] = llm2_preds
print("Done with LLM2:", len(llm2_preds))

print(accuracy_score(df["label"], df["llm2_pred"]))



Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Device set to use cuda:0


Done with LLM2: 14780
0.3439106901217862


RAG


In [30]:
# RAG parameters and imports
import numpy as np
from sentence_transformers import SentenceTransformer
import faiss
from tqdm import tqdm

EMB_MODEL_NAME = "all-MiniLM-L6-v2"   # fast, good
EMB_BATCH_SIZE = 128
FAISS_INDEX_PATH = "faiss_index.idx"
EMBS_PATH = "embs.npy"

RAG_K = 5            # retrieval depth (try 3,5,10 experiments)
RAG_BATCH = 32       # how many queries per LLM batch
CANDIDATE_LABELS = ["negative", "neutral", "positive"]  # same labels as before


In [31]:
# encode all clean_text into embeddings and build FAISS index (CPU)
embedder = SentenceTransformer(EMB_MODEL_NAME)

print("Computing embeddings (this may take a couple minutes)...")
embs = embedder.encode(df["clean_text"].tolist(), convert_to_numpy=True, show_progress_bar=True, batch_size=EMB_BATCH_SIZE)
np.save(EMBS_PATH, embs)

# normalize for cosine similarity
faiss.normalize_L2(embs)
d = embs.shape[1]
index = faiss.IndexFlatIP(d)   # inner product on normalized vectors == cosine similarity
index.add(embs)
faiss.write_index(index, FAISS_INDEX_PATH)
print("FAISS index built. n:", index.ntotal)


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Computing embeddings (this may take a couple minutes)...


Batches: 100%|██████████| 116/116 [00:02<00:00, 40.32it/s]


FAISS index built. n: 14780


In [32]:
def retrieve_topk(sentence_idx, k=RAG_K):
    # returns indices and distances for the given sentence index in dataset
    q = embs[sentence_idx:sentence_idx+1]  # already normalized
    D, I = index.search(q, k)
    return I[0].tolist(), D[0].tolist()

# robust RAG classifier for a batch of input sentences
def rag_classify_batch(query_texts, query_indices, k=RAG_K):
    """
    query_texts: list[str] - the original (untokenized) sentences to classify
    query_indices: list[int] - indices into df that correspond to query_texts (for retrieval base)
    returns: list[str] predicted labels
    """
    prompts = []
    for q_text, q_idx in zip(query_texts, query_indices):
        # retrieve top-k neighbors for context (exclude the query itself if it's the same index)
        I, D = retrieve_topk(q_idx, k=k)
        # optionally remove the first result if it's the query itself
        context_sentences = []
        for idx in I:
            if idx == q_idx:
                continue
            context_sentences.append(df["text"].iloc[idx])
        # build context string (short)
        context = "\n".join(context_sentences[:k]) if context_sentences else ""
        # final prompt: include context then the target sentence
        full_input = f"Context:\n{context}\n\nSentence:\n{q_text}\n\nClassify the sentiment of the Sentence as NEGATIVE / NEUTRAL / POSITIVE."
        prompts.append(full_input)
    # call zero-shot classifier (batch)
    outputs = llm(prompts, candidate_labels=CANDIDATE_LABELS)
    # outputs: list of dicts; pick top label
    preds = [out["labels"][0] for out in outputs]
    return preds


In [33]:
# Build a mapping of indices to text for query (we will classify every sentence using its own index)
n = len(df)
rag_preds = []
batch_size = RAG_BATCH

print("Running RAG inference in batches. This will take several minutes.")

for i in tqdm(range(0, n, batch_size)):
    batch_indices = list(range(i, min(i+batch_size, n)))
    batch_texts = df["text"].iloc[batch_indices].tolist()
    # call rag classifier
    batch_preds = rag_classify_batch(batch_texts, batch_indices, k=RAG_K)
    rag_preds.extend(batch_preds)

# attach to dataframe
df["rag_pred"] = rag_preds
print("RAG done. preds:", len(rag_preds))


Running RAG inference in batches. This will take several minutes.


100%|██████████| 462/462 [19:06<00:00,  2.48s/it]

RAG done. preds: 14780





In [34]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_true = df["label"]
y_pred = df["rag_pred"]

print("RAG Accuracy:", accuracy_score(y_true, y_pred))
print()
print("Classification Report:\n", classification_report(y_true, y_pred))
print()
print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))


RAG Accuracy: 0.18606224627875506

Classification Report:
               precision    recall  f1-score   support

    negative       0.13      0.96      0.24      1841
     neutral       0.71      0.05      0.10      8951
    positive       0.52      0.13      0.21      3988

    accuracy                           0.19     14780
   macro avg       0.45      0.38      0.18     14780
weighted avg       0.58      0.19      0.14     14780


Confusion Matrix:
 [[1772   54   15]
 [8021  464  466]
 [3335  139  514]]


In [35]:
df.to_csv("with_rag_predictions.csv", index=False)
faiss.write_index(index, FAISS_INDEX_PATH)
np.save(EMBS_PATH, embs)
print("Saved df CSV and FAISS index and embeddings.")


Saved df CSV and FAISS index and embeddings.


In [39]:
for k in [3, 10]:
    RAG_K = k
    rag_preds_k = []

    for i in range(0, len(df), 32):
        batch = df["text"].iloc[i:i+32].tolist()
        idx_batch = list(range(i, min(i+32, len(df))))
        rag_preds_k.extend(rag_classify_batch(batch, idx_batch, k=RAG_K))

    print(f"\nRAG results with k={k}")
    print("Accuracy:", accuracy_score(df["label"], rag_preds_k))



RAG results with k=3
Accuracy: 0.1604871447902571

RAG results with k=10
Accuracy: 0.17550744248985115


In [40]:
required_cols = ["finbert_pred", "llm_pred", "llm2_pred", "rag_pred"]
missing = [c for c in required_cols if c not in df.columns]
print("Missing columns:", missing)

#Save combined results
df.to_csv("resultsWithAllModels.csv", index=False)
print("Saved resultsWithAllModels.csv with", len(df), "rows.")


Missing columns: []
Saved resultsWithAllModels.csv with 14780 rows.


In [41]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Make images look consistent
plt.style.use("default")

def save_cm(y_true, y_pred, filename, title):
    cm = confusion_matrix(y_true, y_pred, labels=["negative", "neutral", "positive"])

    plt.figure(figsize=(6, 5))
    sns.heatmap(cm,
                annot=True,
                fmt="d",
                cmap="Blues",
                xticklabels=["negative", "neutral", "positive"],
                yticklabels=["negative", "neutral", "positive"])
    
    plt.title(title)
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")

    plt.tight_layout()
    plt.savefig(filename, dpi=300)
    plt.close()
    print(f"Saved {filename}")

# === Generate all confusion matrices ===

save_cm(df["label"], df["finbert_pred"], "cm_finbert.png", "FinBERT Confusion Matrix")
save_cm(df["label"], df["llm_pred"],     "cm_llm1.png",   "LLM1 (DeBERTa NLI) Confusion Matrix")
save_cm(df["label"], df["llm2_pred"],    "cm_llm2.png",   "LLM2 (DistilBERT MNLI) Confusion Matrix")
save_cm(df["label"], df["rag_pred"],     "cm_rag.png",    "RAG-Based Sentiment Confusion Matrix")


Saved cm_finbert.png
Saved cm_llm1.png
Saved cm_llm2.png
Saved cm_rag.png
