# Custom Financial BERT Training (Local Jupyter Version)
This notebook includes the full environment setup for running in a local Jupyter Notebook (Python 3.12, Anaconda, VS Code).

In [None]:
# ==== ENVIRONMENT SETUP ====
!pip uninstall -y transformers accelerate tensorflow keras tf-keras
!pip install "transformers==4.36.2" "accelerate==0.24.1" --quiet
!pip install datasets evaluate scikit-learn pandas numpy --quiet

import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["HF_HUB_DISABLE_TF_WARNING"] = "1"

print("Environment setup complete. Please restart the Jupyter kernel.")

Found existing installation: transformers 4.36.2
Uninstalling transformers-4.36.2:
  Successfully uninstalled transformers-4.36.2
Found existing installation: accelerate 1.11.0
Uninstalling accelerate-1.11.0:
  Successfully uninstalled accelerate-1.11.0
Found existing installation: tensorflow 2.17.0
Uninstalling tensorflow-2.17.0:
  Successfully uninstalled tensorflow-2.17.0
Found existing installation: keras 3.6.0
Uninstalling keras-3.6.0:
  Successfully uninstalled keras-3.6.0
[0mEnvironment setup complete. Please restart the Jupyter kernel.


## Load Kaggle Dataset (Local File)

In [3]:
import pandas as pd

# Path of the Kaggle CSV, relative to the notebook folder — change it to match your file.
kaggle_file = "stock_data.csv"

# Raw "Sentiment" codes -> contiguous class ids (-1 -> 0, 0 -> 1, 1 -> 2).
label_map = {-1:0, 0:1, 1:2}

# Read the file, derive the class id from "Sentiment", and keep only the two
# columns used downstream.
df_kaggle = (
    pd.read_csv(kaggle_file)
    .assign(label=lambda frame: frame["Sentiment"].map(label_map))
    .loc[:, ["Text", "label"]]
)
df_kaggle.head()

Unnamed: 0,Text,label
0,Kickers on my watchlist XIDE TIT SOQ PNK CPW B...,2
1,user: AAP MOVIE. 55% return for the FEA/GEED i...,2
2,user I'd be afraid to short AMZN - they are lo...,2
3,MNTA Over 12.00,2
4,OI Over 21.37,2


## Load PhraseBank Dataset (Local File)

In [6]:
phrasebank_file = "Sentences_50Agree.txt"  # update if needed

# Label name -> class id (0 = negative, 1 = neutral, 2 = positive).
phrase_label_ids = {'negative': 0, 'neutral': 1, 'positive': 2}

texts = []
labels = []

# Lines are expected to look like "<sentence>@<label>".  Splitting on the LAST
# '@' reads the label from the end of the line, so a sentence that happens to
# contain e.g. "@positive" in its body is neither mislabelled nor has that
# text removed.
with open(phrasebank_file, 'r', encoding='ISO-8859-1', errors='replace') as f:
    for line in f:
        sentence, sep, tag = line.strip().rpartition('@')
        if not sep or tag not in phrase_label_ids:
            continue  # blank line or no recognised trailing label: skip it
        texts.append(sentence.strip())
        labels.append(phrase_label_ids[tag])

df_phrase = pd.DataFrame({'Text':texts, 'label':labels})
df_phrase.head()

Unnamed: 0,Text,label
0,"According to Gran , the company has no plans t...",1
1,Technopolis plans to develop in stages an area...,1
2,The international electronic industry company ...,0
3,With the new production plant the company woul...,2
4,According to the company 's updated strategy f...,2


## Combine Datasets & Encode

In [9]:
import pandas as pd
from datasets import Dataset, DatasetDict
from sklearn.model_selection import train_test_split
from transformers import BertTokenizerFast

# Merge both corpora and hold out 20% of the rows for evaluation
# (fixed random_state so the split is repeatable).
df = pd.concat([df_phrase, df_kaggle], ignore_index=True)
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# preserve_index=False: after the shuffled split each frame carries a
# non-default index, which from_pandas would otherwise store as an extra
# `__index_level_0__` column in the dataset.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# Tokenization
SEQ_LEN = 256
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')

def tok(batch):
    """Tokenize a batch of rows, truncating/padding every text to SEQ_LEN tokens."""
    return tokenizer(batch['Text'], truncation=True, padding='max_length', max_length=SEQ_LEN)

def _encode(split):
    """Tokenize one split, drop the raw text and rename 'label' to 'labels'."""
    return split.map(tok, batched=True).remove_columns(['Text']).rename_column('label','labels')

enc = DatasetDict({
    'train': _encode(train_ds),
    'test': _encode(test_ds)
})

enc.set_format('torch')

  _torch_pytree._register_pytree_node(


Map:   0%|          | 0/8509 [00:00<?, ? examples/s]

Map:   0%|          | 0/2128 [00:00<?, ? examples/s]

## Training

In [12]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import evaluate
import numpy as np

# Load the 'bert-base-uncased' checkpoint with a 3-class sequence-classification head.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3,
)

# Metric objects used by the evaluation callback.
acc = evaluate.load('accuracy')
f1 = evaluate.load('f1')

def metrics(p):
    """Compute accuracy and weighted F1 for one evaluation pass.

    `p` unpacks into (logits, labels); the predicted class of each row is the
    index of its largest logit along axis 1.
    """
    scores, references = p
    predicted = np.argmax(scores, axis=1)
    accuracy = acc.compute(predictions=predicted, references=references)['accuracy']
    weighted_f1 = f1.compute(predictions=predicted, references=references, average='weighted')['f1']
    return {'accuracy': accuracy, 'f1_weighted': weighted_f1}

# Fine-tuning configuration: evaluate and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='./custom_finbert',
    # schedule
    num_train_epochs=3,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=50,
    # optimisation
    learning_rate=3e-5,
    weight_decay=0.01,
    # batch sizes
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
)

# Wire the model, the encoded splits and the metric callback together.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=metrics,
    train_dataset=enc['train'],
    eval_dataset=enc['test'],
)

trainer.train()

  _torch_pytree._register_pytree_node(
W1119 22:51:03.710000 91553 site-packages/torch/distributed/elastic/multiprocessing/redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.
  _torch_pytree._register_pytree_node(
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1 Weighted
1,0.4833,0.431853,0.8125,0.814792
2,0.2792,0.40953,0.835996,0.834869
3,0.1117,0.619554,0.836466,0.836191




TrainOutput(global_step=1596, training_loss=0.32748962315102864, metrics={'train_runtime': 3172.5129, 'train_samples_per_second': 8.046, 'train_steps_per_second': 0.503, 'total_flos': 3358248107171328.0, 'train_loss': 0.32748962315102864, 'epoch': 3.0})

In [14]:
# Persist the fine-tuned model and its tokenizer side by side in one directory.
_export_dir = "custom_financial_bert"
trainer.save_model(_export_dir)
tokenizer.save_pretrained(_export_dir)
print("Model saved successfully!")


Model saved successfully!


In [24]:
import torch
import numpy as np

def predict_batch(text_list, max_length=256):
    """Classify a list of texts with the fine-tuned model.

    Args:
        text_list: list of strings to classify.
        max_length: longest token sequence kept per text; longer inputs are
            truncated.

    Returns:
        A list with one of "negative" / "neutral" / "positive" per input text,
        in input order. An empty input yields an empty list.
    """
    if len(text_list) == 0:
        # Nothing to classify: skip the tokenizer and the model entirely.
        return []

    inputs = tokenizer(
        text_list,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=max_length
    )

    # Ensure model and tensors are on CPU
    model_cpu = model.to("cpu")
    # Inference mode: switches dropout off so the result does not depend on
    # whatever mode the model was left in.
    model_cpu.eval()
    inputs = {k: v.to("cpu") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model_cpu(**inputs)

    # Plain Python ints keep the dictionary lookup independent of tensor/numpy
    # scalar types.
    preds = torch.argmax(outputs.logits, dim=1).tolist()

    id2label = {0: "negative", 1: "neutral", 2: "positive"}
    return [id2label[p] for p in preds]


In [26]:
# Compare the model against the ground truth on the first ten held-out rows.
sample_texts = test_df["Text"].head(10).tolist()
sample_labels = test_df["label"].head(10).tolist()

preds = predict_batch(sample_texts)

# Class id -> readable name for the true labels.
label_names = {0:"negative",1:"neutral",2:"positive"}

for text, true_label, pred in zip(sample_texts, sample_labels, preds):
    print("\nTEXT:", text)
    print("TRUE LABEL:", label_names[true_label])
    print("MODEL PRED:", pred)



TEXT: The value of the deal exceeds EUR500 ,000 , the company said .
TRUE LABEL: neutral
MODEL PRED: neutral

TEXT: Trading in the new shares , which have right to dividends and other distributions of funds , will start on the exchange in Helsinki tomorrow .
TRUE LABEL: neutral
MODEL PRED: neutral

TEXT: The deal covers Stockmann Auto Oy Ab 's sales and after-sales services concerning Volkswagen and Audi in Helsinki , Espoo and Vantaa .
TRUE LABEL: neutral
MODEL PRED: neutral

TEXT: And when it has lifted the veil on the various types of customer for which it designs its phones , the result is social stereotyping taken to a fine art .
TRUE LABEL: neutral
MODEL PRED: neutral

TEXT: The bridge will be 1.2 km long and is located between Anasmotet by the road E20 and the new traffic junction in Marieholm by the road E45 .
TRUE LABEL: neutral
MODEL PRED: neutral

TEXT: One small step to send cable TV back to the 80s. Quality content exclusive on NFX. Can't wait for the return of AD.  
TRUE

In [28]:
def predict_single(text):
    """Classify one text and return "negative", "neutral" or "positive".

    The text is tokenized (truncated to 256 tokens), run through the model on
    CPU with gradients disabled, and the class with the largest logit wins.
    """
    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=256
    )

    model_cpu = model.to("cpu")
    # Inference mode: switches dropout off so the result does not depend on
    # whatever mode the model was left in.
    model_cpu.eval()
    inputs = {k: v.to("cpu") for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model_cpu(**inputs)

    # Argmax over the class scores of the first row, not over the flattened
    # logits tensor.
    pred = outputs.logits[0].argmax().item()
    return {0:"negative",1:"neutral",2:"positive"}[pred]


In [30]:
# Spot-check predict_single on one hand-written sentence.
test_sentence = "Apple has done good sales this week."
print("Prediction:", predict_single(test_sentence))

Prediction: positive


In [32]:
# Second spot check with a different hand-written sentence.
test_sentence = "Company lost its half of the customer"
print("Prediction:", predict_single(test_sentence))

Prediction: negative


In [34]:
def sentiment():
    """Interactive loop: read a line, print its predicted sentiment, repeat.

    Stops when the user types 'exit' (case-insensitive, surrounding whitespace
    ignored) or when input ends (EOF or interrupt). Blank lines are ignored.
    """
    while True:
        try:
            text = input("\nEnter text (type 'exit' to quit): ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the cell was interrupted: leave quietly.
            break
        if text.lower() == "exit":
            break
        if not text:
            continue  # nothing typed; ask again
        print("Prediction:", predict_single(text))

# Run interactive tester (keeps prompting until 'exit' is entered)
sentiment()



Enter text (type 'exit' to quit):  According to Gran , the company has no plans to move all production to Russia , although that is where the company is growing


Prediction: neutral



Enter text (type 'exit' to quit):  Operating profit totalled EUR 21.1 mn , up from EUR 18.6 mn in 2007 , representing 9.7 % of net sales .


Prediction: positive



Enter text (type 'exit' to quit):  Investor confidence increased as the stock surged in after-hours trading.


Prediction: positive



Enter text (type 'exit' to quit):  The company is facing a lawsuit that could impact its financial outlook.


Prediction: negative



Enter text (type 'exit' to quit):  I am very happy today


Prediction: positive



Enter text (type 'exit' to quit):  The company released its annual financial report on Wednesday.


Prediction: neutral



Enter text (type 'exit' to quit):  exit


In [36]:
import torch.nn.functional as F

def predict_with_probs(text):
    """Return the softmax class probabilities for one text.

    Returns:
        A dict with float entries under "negative", "neutral" and "positive"
        (class ids 0, 1 and 2 respectively).
    """
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)

    # Keep the model and the inputs on the same device (CPU), as the other
    # prediction helpers do; .numpy() below also needs a CPU tensor.
    model_cpu = model.to("cpu")
    # Inference mode: switches dropout off so the result does not depend on
    # whatever mode the model was left in.
    model_cpu.eval()
    inputs = {k: v.to("cpu") for k, v in inputs.items()}

    with torch.no_grad():
        logits = model_cpu(**inputs).logits
    probs = F.softmax(logits, dim=1).numpy()[0]

    return {
        "negative": float(probs[0]),
        "neutral": float(probs[1]),
        "positive": float(probs[2])
    }

# Example call: print the probability dictionary for one sentence.
print(predict_with_probs("The economy is showing strong signs of recovery."))


{'negative': 0.0009611451532691717, 'neutral': 0.001899946597404778, 'positive': 0.9971389770507812}


In [38]:
import torch
import torch.nn.functional as F

# Class id -> readable label name.
id2label = {0: "negative", 1: "neutral", 2: "positive"}

def predict_with_probs(text):
    """Return class probabilities and the winning label for one text.

    This definition replaces the earlier `predict_with_probs` in the notebook
    and adds the "predicted_label" entry.

    Returns:
        A dict with float probabilities under "negative", "neutral" and
        "positive", plus "predicted_label" holding the name of the most
        probable class.
    """
    # Tokenize
    inputs = tokenizer(
        text,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256
    )

    # Ensure CPU execution
    inputs = {k: v.to("cpu") for k, v in inputs.items()}
    model_cpu = model.to("cpu")
    # Inference mode: switches dropout off so the result does not depend on
    # whatever mode the model was left in.
    model_cpu.eval()

    with torch.no_grad():
        logits = model_cpu(**inputs).logits

    # Softmax → probabilities (convert to numpy)
    probs = F.softmax(logits, dim=1).numpy()[0]

    return {
        "negative": float(probs[0]),
        "neutral": float(probs[1]),
        "positive": float(probs[2]),
        "predicted_label": id2label[int(probs.argmax())]
    }


In [None]:
def sentiment_loop():
    """Interactive loop that prints class probabilities and the prediction.

    Stops when the user types 'exit' (case-insensitive, surrounding whitespace
    ignored) or when input ends (EOF or interrupt). Blank lines are ignored.
    """
    print("Type 'exit' to stop.")
    while True:
        try:
            text = input("\nEnter text: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Input stream closed or the cell was interrupted: leave quietly.
            break
        if text.lower() == "exit":
            break
        if not text:
            continue  # nothing typed; ask again
        result = predict_with_probs(text)
        print("\nProbabilities:")
        print(f"  Negative: {result['negative']:.4f}")
        print(f"  Neutral:  {result['neutral']:.4f}")
        print(f"  Positive: {result['positive']:.4f}")
        print("Prediction:", result["predicted_label"])

# Run it (keeps prompting until 'exit' is entered)
sentiment_loop()


Type 'exit' to stop.



Enter text:  For the last quarter of 2010 , Componenta 's net sales doubled to EUR131m from EUR76m for the same period a year earlier , while it moved to a zero pre-tax profit from a pre-tax loss of EUR7m .



Probabilities:
  Negative: 0.0007
  Neutral:  0.0008
  Positive: 0.9986
Prediction: positive



Enter text:  I am Nabin



Probabilities:
  Negative: 0.1119
  Neutral:  0.0012
  Positive: 0.8870
Prediction: positive


In [42]:
# Save the model again into the same directory, this time with safe_serialization
# disabled. NOTE(review): presumably done to also get the non-safetensors weight
# file next to the earlier save — confirm it is still needed.
model.save_pretrained("custom_financial_bert", safe_serialization=False)
