# **Data selection & processing**


In [None]:
from datasets import load_dataset

# Load the "sentences_allagree" configuration of the Financial PhraseBank
# dataset and print the resulting DatasetDict (splits, features, row counts).
dataset = load_dataset("takala/financial_phrasebank", "sentences_allagree")
print(dataset)


You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/682k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2264 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 2264
    })
})


In [2]:
# Export the train split to a local CSV file; a later cell reads this file back with pandas.
dataset["train"].to_csv("financial_phrasebank_allagree.csv", index=False)

Creating CSV from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

287841

In [24]:
import pandas as pd
import re
import torch
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Read back the CSV exported from the Hugging Face dataset. `Dataset.to_csv`
# writes UTF-8 by default, so the file must be decoded as UTF-8; decoding it as
# latin1 turns every non-ASCII character (e.g. accented company names) into
# mojibake. The 'sentence' column is renamed to 'text', which is the column
# name the rest of the notebook expects.
data = (
    pd.read_csv("financial_phrasebank_allagree.csv", encoding="utf-8")
    .rename(columns={'sentence': 'text'})
)
data.head()

Unnamed: 0,text,label
0,"According to Gran , the company has no plans t...",1
1,"For the last quarter of 2010 , Componenta 's n...",2
2,"In the third quarter of 2010 , net sales incre...",2
3,Operating profit rose to EUR 13.1 mn from EUR ...,2
4,"Operating profit totalled EUR 21.1 mn , up fro...",2


In [12]:
def process_text(text):
    """Normalize one sentence for the classifiers.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is removed, the result is tokenized with NLTK's
    ``word_tokenize``, and English stop words are dropped.

    Args:
        text: The raw sentence (a ``str``).

    Returns:
        The surviving tokens joined by single spaces.
    """
    tokens = word_tokenize(re.sub(r'[^\w\s]', '', text.lower()))
    english_stops = set(stopwords.words('english'))
    return ' '.join(tok for tok in tokens if tok not in english_stops)

def prepare_data(data):
    """Clean the text column and split the rows into train and test sets.

    Args:
        data: DataFrame with a 'text' column of raw sentences (any other
            columns are carried through unchanged).

    Returns:
        A ``(train_data, test_data)`` tuple of DataFrames produced by an
        80/20 split with a fixed ``random_state`` of 42, so repeated calls
        on the same input give the same split.
    """
    # Build a new frame instead of assigning into `data`: this function is
    # called from several cells on the same global DataFrame, and overwriting
    # the caller's column would silently replace the raw text on first call.
    cleaned = data.assign(text=data['text'].apply(process_text))
    train_data, test_data = train_test_split(cleaned, test_size=0.2, random_state=42)
    return train_data, test_data

# **BERT-based models and prompting techniques**

In [17]:
def load_model(model_name):
    """Build a "sentiment-analysis" pipeline for a sequence-classification checkpoint.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.

    Returns:
        A ``transformers`` pipeline that runs on the first CUDA device when
        CUDA is available and on the CPU otherwise.
    """
    print(f"Loading {model_name}...")
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(model_name)

    use_gpu = torch.cuda.is_available()
    model = model.to('cuda' if use_gpu else 'cpu')

    # The pipeline API takes a device index: 0 for the first GPU, -1 for CPU.
    pipeline_device = 0 if use_gpu else -1
    return pipeline(
        "sentiment-analysis",
        model=model,
        tokenizer=tokenizer,
        device=pipeline_device,
    )

In [18]:
def create_prompt(text, method):
    """Wrap a sentence in the prompt template for the given prompting method.

    Args:
        text: The sentence to classify; it is inserted into the template
            surrounded by double quotes.
        method: One of "zero-shot", "few-shot", "chain-of-thought",
            "instruction" or "contrastive".

    Returns:
        The filled-in prompt string. For an unrecognized method the string
        "Invalid method: <method>" is returned instead of a prompt.
    """
    quoted = f'"{text}"'
    templates = {
        "zero-shot": (
            "Classify the sentiment of the following statement: \n"
            f"{quoted}\n"
            "Answer as Positive, Negative, or Neutral."
        ),
        "few-shot": (
            "Classify the sentiment of these statements:\n"
            '1. "The revenue increased significantly." -> Positive\n'
            '2. "Operational losses reduced profit." -> Negative\n'
            '3. "Earnings were stable." -> Neutral\n'
            "Now classify:\n"
            f"{quoted}"
        ),
        "chain-of-thought": (
            "Analyze step-by-step:\n"
            f"{quoted}\n"
            "Step 1: Identify key phrases.\n"
            "Step 2: Evaluate impact.\n"
            "Step 3: Classify as Positive, Negative, or Neutral."
        ),
        "instruction": (
            "You are a financial sentiment expert. Classify this statement:\n"
            f"{quoted}\n"
            "Explain reasoning."
        ),
        "contrastive": (
            "Compare with examples:\n"
            '1. "Revenue grew 20%." -> Positive\n'
            '2. "Losses led to decline." -> Negative\n'
            "Classify:\n"
            f"{quoted}"
        ),
    }
    fallback = f"Invalid method: {method}"
    return templates.get(method, fallback)

In [19]:
def predict_sentiment(classifier, texts, method, batch_size=32):
    """Classify every text after wrapping it in the chosen prompt template.

    Args:
        classifier: Callable that takes a list of prompt strings and returns
            an iterable with one prediction per prompt.
        texts: Sequence of sentences to classify.
        method: Prompting method name forwarded to ``create_prompt``.
        batch_size: Number of prompts handed to ``classifier`` per call.

    Returns:
        A flat list with the classifier's predictions in input order.
    """
    outputs = []
    for start in range(0, len(texts), batch_size):
        prompts = [
            create_prompt(sentence, method)
            for sentence in texts[start:start + batch_size]
        ]
        outputs += classifier(prompts)
    return outputs

In [20]:
def evaluate_predictions(predictions, true_labels):
    """Score classifier outputs against integer ground-truth labels.

    Each prediction's ``'label'`` string is upper-cased and mapped to an
    integer class: 'POSITIVE' or 'LABEL_2' -> 2, 'NEGATIVE' or 'LABEL_0' -> 0,
    anything else -> 1 (presumably the neutral class; confirm against the
    dataset's label encoding).

    Args:
        predictions: Sequence of dicts, each with a ``'label'`` string.
        true_labels: Sequence (list or array) of integer labels, aligned
            with ``predictions``.

    Returns:
        A dict with:
          * ``"predictions"``: the ``predictions`` argument, unchanged;
          * ``"pred_labels"``: the integer class derived from each prediction;
          * ``"accuracy"``: matches divided by ``len(true_labels)``, or 0.0
            when ``true_labels`` is empty.
    """
    pred_labels = []
    correct = 0
    for pred, true in zip(predictions, true_labels):
        raw_label = pred['label'].upper()
        if raw_label in ('POSITIVE', 'LABEL_2'):
            pred_label = 2
        elif raw_label in ('NEGATIVE', 'LABEL_0'):
            pred_label = 0
        else:
            pred_label = 1
        pred_labels.append(pred_label)
        if pred_label == true:
            correct += 1

    # len() rather than truthiness: true_labels may be a NumPy array, whose
    # truth value is ambiguous. Guarding on it avoids a ZeroDivisionError
    # when there is nothing to evaluate.
    total = len(true_labels)
    return {
        "predictions": predictions,
        # Previously computed and discarded; returned so callers can build
        # confusion matrices or per-class metrics without re-mapping labels.
        "pred_labels": pred_labels,
        "accuracy": correct / total if total else 0.0,
    }

In [21]:
def run_experiments(train_data, test_data):
    """Evaluate every (model, prompting method) pair on the test split.

    Args:
        train_data: Accepted for symmetry with ``prepare_data``; it is not
            read by this function.
        test_data: DataFrame with a 'text' column of sentences and a 'label'
            column of ground-truth labels.

    Returns:
        A dict keyed by "<model>_<method>" whose values are the dicts
        returned by ``evaluate_predictions``.
    """
    checkpoints = {
        "finbert": "ProsusAI/finbert",
        "finbert-tone": "yiyanghkust/finbert-tone",
        "financial-bert": "ahmedrachid/FinancialBERT",
        "roberta-financial": "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis",
    }
    prompt_methods = ["zero-shot", "few-shot", "chain-of-thought", "instruction", "contrastive"]

    scores = {}
    for short_name in checkpoints:
        print(f"\nEvaluating {short_name}")
        classifier = load_model(checkpoints[short_name])

        for prompt_method in prompt_methods:
            print(f"\nTesting with {prompt_method}")
            sentences = test_data['text'].tolist()
            predictions = predict_sentiment(classifier, sentences, prompt_method)

            gold_labels = test_data['label'].values
            outcome = evaluate_predictions(predictions, gold_labels)
            scores[f"{short_name}_{prompt_method}"] = outcome

            print(f"Accuracy: {outcome['accuracy']:.4f}")
    return scores


# **Evaluation and comparison**

In [25]:
if __name__ == "__main__":
    # Stage 1: clean the text column and create the train/test split.
    print("Loading and preparing data...")
    train_data, test_data = prepare_data(data)

    # Stage 2: run every model with every prompting method.
    print("\nRunning experiments...")
    results = run_experiments(train_data, test_data)

    # Stage 3: one summary line per "<model>_<method>" experiment.
    print("\nFinal Results:")
    for experiment_key in results:
        print(f"{experiment_key}: {results[experiment_key]['accuracy']:.4f}")


Loading and preparing data...

Running experiments...

Evaluating finbert
Loading ProsusAI/finbert...

Running experiments...

Evaluating finbert
Loading ProsusAI/finbert...


tokenizer_config.json:   0%|          | 0.00/252 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config.json:   0%|          | 0.00/758 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]


Testing with zero-shot


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


Accuracy: 0.7219

Testing with few-shot
Accuracy: 0.3797

Testing with chain-of-thought
Accuracy: 0.3797

Testing with chain-of-thought
Accuracy: 0.6291

Testing with instruction
Accuracy: 0.6291

Testing with instruction
Accuracy: 0.7351

Testing with contrastive
Accuracy: 0.7351

Testing with contrastive
Accuracy: 0.3642

Evaluating finbert-tone
Loading yiyanghkust/finbert-tone...
Accuracy: 0.3642

Evaluating finbert-tone
Loading yiyanghkust/finbert-tone...


config.json:   0%|          | 0.00/533 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]


Testing with zero-shot
Accuracy: 0.5011

Testing with few-shot
Accuracy: 0.5011

Testing with few-shot
Accuracy: 0.2737

Testing with chain-of-thought
Accuracy: 0.2737

Testing with chain-of-thought
Accuracy: 0.6358

Testing with instruction
Accuracy: 0.6358

Testing with instruction
Accuracy: 0.7638

Testing with contrastive
Accuracy: 0.7638

Testing with contrastive
Accuracy: 0.3157

Evaluating financial-bert
Loading ahmedrachid/FinancialBERT...
Accuracy: 0.3157

Evaluating financial-bert
Loading ahmedrachid/FinancialBERT...


tokenizer_config.json:   0%|          | 0.00/324 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/589 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/439M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ahmedrachid/FinancialBERT and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Testing with zero-shot
Accuracy: 0.1678

Testing with few-shot
Accuracy: 0.1678

Testing with few-shot
Accuracy: 0.4547

Testing with chain-of-thought
Accuracy: 0.4547

Testing with chain-of-thought
Accuracy: 0.4216

Testing with instruction
Accuracy: 0.4216

Testing with instruction
Accuracy: 0.1391

Testing with contrastive
Accuracy: 0.1391

Testing with contrastive
Accuracy: 0.6093

Evaluating roberta-financial
Loading mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis...
Accuracy: 0.6093

Evaluating roberta-financial
Loading mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis...


tokenizer_config.json:   0%|          | 0.00/333 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/933 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/328M [00:00<?, ?B/s]


Testing with zero-shot
Accuracy: 0.9139

Testing with few-shot
Accuracy: 0.9139

Testing with few-shot
Accuracy: 0.2671

Testing with chain-of-thought
Accuracy: 0.2671

Testing with chain-of-thought
Accuracy: 0.8962

Testing with instruction
Accuracy: 0.8962

Testing with instruction
Accuracy: 0.9095

Testing with contrastive
Accuracy: 0.9095

Testing with contrastive
Accuracy: 0.3245

Final Results:
finbert_zero-shot: 0.7219
finbert_few-shot: 0.3797
finbert_chain-of-thought: 0.6291
finbert_instruction: 0.7351
finbert_contrastive: 0.3642
finbert-tone_zero-shot: 0.5011
finbert-tone_few-shot: 0.2737
finbert-tone_chain-of-thought: 0.6358
finbert-tone_instruction: 0.7638
finbert-tone_contrastive: 0.3157
financial-bert_zero-shot: 0.1678
financial-bert_few-shot: 0.4547
financial-bert_chain-of-thought: 0.4216
financial-bert_instruction: 0.1391
financial-bert_contrastive: 0.6093
roberta-financial_zero-shot: 0.9139
roberta-financial_few-shot: 0.2671
roberta-financial_chain-of-thought: 0.8962
r

# **Improvements**

In [26]:
def create_instruction_prompt(text, version="base"):
    """Wrap a sentence in one of several instruction-style prompt templates.

    Args:
        text: The sentence to classify; it is inserted into the template
            surrounded by double quotes.
        version: Template name, one of "base", "detailed", "step_by_step",
            "criteria_based" or "comparative".

    Returns:
        The filled-in prompt string. An unrecognized ``version`` falls back
        to the "base" template.
    """
    quoted = f'"{text}"'

    base_prompt = "\n".join([
        "You are a financial sentiment expert. Classify this statement:",
        quoted,
        "Explain reasoning.",
    ])

    variants = {
        "base": base_prompt,
        "detailed": "\n".join([
            "You are a financial analyst with expertise in sentiment analysis of financial statements.",
            "Consider market impact, financial metrics, and industry context.",
            f"Statement: {quoted}",
            "Task: Classify as Positive/Negative/Neutral based on financial implications.",
        ]),
        "step_by_step": "\n".join([
            "As a financial sentiment expert, analyze this statement step by step:",
            f"1. Statement: {quoted}",
            "2. Identify key financial indicators",
            "3. Evaluate market impact",
            "4. Consider industry context",
            "Classification (Positive/Negative/Neutral):",
        ]),
        "criteria_based": "\n".join([
            "Expert Financial Sentiment Analysis",
            f"Statement: {quoted}",
            "Criteria to consider:",
            "- Revenue/Profit trends",
            "- Market position changes",
            "- Operational efficiency",
            "- Growth indicators",
            "Based on these criteria, classify as Positive/Negative/Neutral.",
        ]),
        "comparative": "\n".join([
            "As a financial expert, evaluate this statement in context:",
            f"Statement: {quoted}",
            "Compare with standard financial metrics:",
            "- Above expectations → Positive",
            "- Meeting expectations → Neutral",
            "- Below expectations → Negative",
            "Classification:",
        ]),
    }
    return variants.get(version, base_prompt)

In [None]:
def test_instruction_prompts(classifier, test_data):
    """Measure accuracy of one classifier under each instruction-prompt variant.

    For every variant the prompt built from the first test sentence is
    printed as an example, then each test sentence is wrapped in the variant's
    template and classified one at a time.

    Args:
        classifier: Callable that takes a single prompt string and returns a
            sequence whose first element is the prediction dict.
        test_data: DataFrame with a 'text' column of sentences and a 'label'
            column of ground-truth labels.

    Returns:
        A dict mapping each variant name to the dict returned by
        ``evaluate_predictions``.
    """
    outcomes = {}

    for version in ("base", "detailed", "step_by_step", "criteria_based", "comparative"):
        print(f"\nTesting instruction prompt version: {version}")

        # Show what the prompt looks like for the first test sentence.
        sample_prompt = create_instruction_prompt(test_data['text'].iloc[0], version)
        print("\nExample prompt:")
        print(sample_prompt)

        predictions = [
            classifier(create_instruction_prompt(sentence, version))[0]
            for sentence in test_data['text']
        ]

        evaluation = evaluate_predictions(predictions, test_data['label'].values)
        outcomes[version] = evaluation
        print(f"Accuracy: {evaluation['accuracy']:.4f}")

    return outcomes
# Driver: compare the instruction-prompt variants on the DistilRoBERTa checkpoint.
print("Loading and preparing data...")
# prepare_data splits with a fixed random_state, so the row split matches earlier calls.
train_data, test_data = prepare_data(data)
model_name = "mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis"
classifier = load_model(model_name)
# `results` maps each prompt variant name to its evaluation dict.
results = test_instruction_prompts(classifier, test_data)


Loading and preparing data...
Loading mrm8488/distilroberta-finetuned-financial-news-sentiment-analysis...

Testing instruction prompt version: base

Example prompt:
You are a financial sentiment expert. Classify this statement:
"contract value amounts eur 24 million"
Explain reasoning.
Accuracy: 0.9095

Testing instruction prompt version: detailed

Example prompt:
You are a financial analyst with expertise in sentiment analysis of financial statements. 
Consider market impact, financial metrics, and industry context.
Statement: "contract value amounts eur 24 million"
Task: Classify as Positive/Negative/Neutral based on financial implications.
Accuracy: 0.8631

Testing instruction prompt version: step_by_step

Example prompt:
As a financial sentiment expert, analyze this statement step by step:
1. Statement: "contract value amounts eur 24 million"
2. Identify key financial indicators
3. Evaluate market impact
4. Consider industry context
Classification (Positive/Negative/Neutral):
Accu

In [None]:
# Driver: repeat the instruction-prompt comparison with the ProsusAI/finbert checkpoint.
print("Loading and preparing data...")
# prepare_data splits with a fixed random_state, so the row split matches earlier calls.
train_data, test_data = prepare_data(data)
model_name = "ProsusAI/finbert"

classifier = load_model(model_name)
# Rebinds `results`, replacing the values from any previous run in this kernel.
results = test_instruction_prompts(classifier, test_data)


Loading and preparing data...
Loading ProsusAI/finbert...

Testing instruction prompt version: base

Example prompt:
You are a financial sentiment expert. Classify this statement:
"contract value amounts eur 24 million"
Explain reasoning.
Accuracy: 0.7351

Testing instruction prompt version: detailed

Example prompt:
You are a financial analyst with expertise in sentiment analysis of financial statements. 
Consider market impact, financial metrics, and industry context.
Statement: "contract value amounts eur 24 million"
Task: Classify as Positive/Negative/Neutral based on financial implications.
Accuracy: 0.7616

Testing instruction prompt version: step_by_step

Example prompt:
As a financial sentiment expert, analyze this statement step by step:
1. Statement: "contract value amounts eur 24 million"
2. Identify key financial indicators
3. Evaluate market impact
4. Consider industry context
Classification (Positive/Negative/Neutral):
Accuracy: 0.6490

Testing instruction prompt version: