In [None]:
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from datasets import load_dataset
from sklearn.metrics import accuracy_score, f1_score
import numpy as np

# Seed the PyTorch and NumPy random number generators for reproducibility.
torch.manual_seed(42)
np.random.seed(42)

# Financial PhraseBank, "sentences_75agree" configuration: 3453 sentences on
# which at least 75% of annotators agreed on the sentiment label. This strikes
# a balance between label quality (higher agreement) and dataset size.
dataset = load_dataset("financial_phrasebank", "sentences_75agree")
print("Dataset loaded:", dataset)

# Integer class id -> human-readable sentiment name.
label_map = {0: "negative", 1: "neutral", 2: "positive"}
num_labels = len(label_map)

tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
# Register the label names in the model config so that the checkpoint written
# by save_pretrained() carries "negative"/"neutral"/"positive" instead of the
# generic LABEL_0/LABEL_1/LABEL_2 placeholders.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=num_labels,
    id2label=label_map,
    label2id={name: idx for idx, name in label_map.items()},
)


def preprocess_function(examples):
    """Tokenize a batch of sentences and attach their labels.

    Args:
        examples: A batch mapping with a "sentence" entry (the texts to
            tokenize) and a "label" entry (the matching class ids).

    Returns:
        The tokenizer output for the batch, with the batch's "label" values
        copied under the additional key "labels".
    """
    # Every sequence is truncated/padded to exactly 256 tokens.
    encoded_batch = tokenizer(
        examples["sentence"],
        max_length=256,
        padding="max_length",
        truncation=True,
    )
    encoded_batch["labels"] = examples["label"]
    return encoded_batch


# Tokenize the whole dataset in batches, then expose only the model inputs
# and labels as torch tensors.
encoded_dataset = dataset.map(preprocess_function, batched=True)
encoded_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "labels"],
)

# Split into train (70%), validation (15%) and test (15%): first hold out 30%,
# then cut that held-out part in half. Both splits use a fixed seed; no
# stratification argument is passed.
train_val_split = encoded_dataset["train"].train_test_split(test_size=0.3, seed=42)
val_test_split = train_val_split["test"].train_test_split(test_size=0.5, seed=42)
train_dataset = train_val_split["train"]
val_dataset, test_dataset = val_test_split["train"], val_test_split["test"]
print(f"Train size: {len(train_dataset)}, Val size: {len(val_dataset)}, Test size: {len(test_dataset)}")


def compute_metrics(pred):
    """Compute accuracy and weighted F1 for one evaluation pass.

    Args:
        pred: Object exposing `predictions` (per-class scores, reduced with
            argmax over axis 1) and `label_ids` (the reference class ids).

    Returns:
        A dict with the keys "accuracy" and "f1".
    """
    # The highest-scoring class along axis 1 is the predicted label.
    predicted_ids = np.argmax(pred.predictions, axis=1)
    true_ids = pred.label_ids
    return {
        "accuracy": accuracy_score(true_ids, predicted_ids),
        "f1": f1_score(true_ids, predicted_ids, average="weighted"),
    }


# Fine-tuning configuration.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` -- confirm against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir="./distilbert-financial-sentiment",
    # Evaluate and checkpoint on the same 200-step cadence.
    evaluation_strategy="steps",
    save_strategy="steps",
    save_steps=200,
    eval_steps=200,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=10,
    weight_decay=0.01,
    warmup_steps=500,
    # Select the checkpoint with the highest validation F1.
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True,
    logging_dir="./logs",
    logging_steps=50,
    # Mixed precision needs a CUDA device; enabling it unconditionally makes
    # the script fail on CPU-only machines, so turn it on only when available.
    fp16=torch.cuda.is_available(),
)


# Build the trainer. The early-stopping callback is configured with a patience
# of 3 evaluations and a minimum improvement threshold of 0.01.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[
        EarlyStoppingCallback(
            early_stopping_patience=3,
            early_stopping_threshold=0.01,
        )
    ],
)

# Fine-tune on the training split.
trainer.train()

# Score the held-out test split with the same metrics used for validation.
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=test_dataset)
print("Test results:", test_results)

# Write the model and its tokenizer to the same directory.
final_model_dir = "./distilbert_financial_phasebank"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)
print(f"Model and tokenizer saved to {final_model_dir}")

#  Example inference
def predict_sentiment(text):
    """Classify the sentiment of a single text with the fine-tuned model.

    Args:
        text: The sentence to classify.

    Returns:
        The sentiment name from `label_map` for the highest-scoring class.
    """
    # Put the model in evaluation mode so dropout is disabled regardless of
    # what ran before this call (e.g. a training step).
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=256)
    # Move every input tensor to the device the model lives on.
    inputs = {key: val.to(model.device) for key, val in inputs.items()}
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return label_map[predicted_class]  # Map integer back to string for readability

# Run the classifier on one example sentence and show the result.
sample_text = "The company's stock surged after a strong earnings report."
prediction = predict_sentiment(sample_text)
print(
    f"Sample text: '{sample_text}'",
    f"Predicted sentiment: {prediction}",
    sep="\n",
)

Dataset loaded: DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 3453
    })
})


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Train size: 2417, Val size: 518, Test size: 518
Starting fine-tuning...


Step,Training Loss,Validation Loss,Accuracy,F1
200,0.7147,0.612579,0.745174,0.700229
400,0.2769,0.50767,0.814672,0.823641
600,0.2416,0.304337,0.909266,0.905866
800,0.1812,0.30882,0.897683,0.900165
1000,0.0665,0.399483,0.907336,0.909392
1200,0.1208,0.361412,0.907336,0.908856


Evaluating on test set...


Test results: {'eval_loss': 0.39173537492752075, 'eval_accuracy': 0.9285714285714286, 'eval_f1': 0.9295369400025413, 'eval_runtime': 0.3853, 'eval_samples_per_second': 1344.248, 'eval_steps_per_second': 168.68, 'epoch': 3.9603960396039604}
Model and tokenizer saved to ./distilbert_financial_phasebank
Sample text: 'The company's stock surged after a strong earnings report.'
Predicted sentiment: positive
