In [None]:
import torch
import numpy as np
import pandas as pd
import time
import random
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# Read the finance sentiment CSV; the "output" column holds the sentiment name.
filtered_df = pd.read_csv("balanced_finance_sentiment.csv")

# Lookup tables between sentiment names and integer class ids.
emotion2label = dict(negative=0, neutral=1, positive=2)
label2emotion = {class_id: name for name, class_id in emotion2label.items()}
filtered_df["label"] = filtered_df["output"].map(emotion2label)

# 80/20 split, stratified on the class id, with a fixed seed.
train_df, test_df = train_test_split(
    filtered_df,
    stratify=filtered_df["label"],
    test_size=0.2,
    random_state=42,
)

# Independent copy of the negative-class (label 0) training rows.
negative_samples = train_df.loc[train_df["label"] == 0].copy()

#  augmentation for negatives
def augment_negative_text(text):
    """Return *text* with one randomly chosen word swapped for a negative word.

    The text is split on whitespace and re-joined with single spaces. Texts
    of four words or fewer are only whitespace-normalised, never altered.
    """
    tokens = text.split()
    if len(tokens) <= 4:
        return " ".join(tokens)

    # Draw the position first, then the replacement word.
    position = random.randrange(len(tokens))
    tokens[position] = random.choice(["bad", "terrible", "awful", "worse"])
    return " ".join(tokens)

# Perturb the copied negative rows and append them to the training set, so
# every negative example appears twice (original + augmented), then shuffle.
negative_samples["input"] = negative_samples["input"].apply(augment_negative_text)

train_df = pd.concat([train_df, negative_samples]).sample(frac=1).reset_index(drop=True)

# Pick roughly 10% of the neutral rows and add relabelled duplicates of them:
# negative (0) if the text mentions "drop" or "decline", otherwise positive (2).
# NOTE(review): labels come from this keyword heuristic, not from actual model
# errors, so it injects label noise rather than mined hard negatives — confirm
# this is intended.
#
# One random draw per row is required here. A bare `np.random.rand()` returns a
# single scalar, which would select either every neutral row or none at all.
neutral_mask = (train_df["label"] == 1) & (np.random.rand(len(train_df)) > 0.9)
# .copy() so the relabelling below writes to an independent frame, not a slice.
misclassified_neutral = train_df[neutral_mask].copy()
misclassified_neutral["label"] = misclassified_neutral["input"].apply(
    lambda x: 0 if "drop" in x or "decline" in x else 2
).astype("int64")  # keep an integer label column even when no row was selected


train_df = pd.concat([train_df, misclassified_neutral]).sample(frac=1).reset_index(drop=True)


def _as_text_dataset(frame):
    """Wrap the 'input'/'label' columns of *frame* in a Dataset, renaming 'input' to 'text'."""
    selected = frame[["input", "label"]]
    return Dataset.from_pandas(selected.rename(columns={"input": "text"}))


train_dataset = _as_text_dataset(train_df)
test_dataset = _as_text_dataset(test_df)


# Checkpoint shared by the tokenizer and the classification model.
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize ``examples["text"]`` with the module-level tokenizer.

    Every sequence is padded to, and truncated at, 256 tokens.
    """
    tokenizer_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    return tokenizer(examples["text"], **tokenizer_options)

# Tokenize both splits in batches.
train_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, test_dataset)
)

# Expose only the model inputs and the label, as torch tensors.
for tensor_split in (train_dataset, test_dataset):
    tensor_split.set_format(type="torch", columns=["input_ids", "attention_mask", "label"])

num_labels = len(emotion2label)

# Dropout to reduce "neutral" overconfidence.
# DistilBERT's config names these rates `dropout` and `attention_dropout`, and
# the dropout layers are created while the model is being built. Assigning
# `hidden_dropout_prob` / `attention_probs_dropout_prob` on `model.config`
# afterwards therefore has no effect; the rates are passed at load time instead.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    dropout=0.2,
    attention_dropout=0.2,
)


# Per-class loss weights, indexed by class id (0=negative, 1=neutral, 2=positive).
class_weights = torch.tensor([3.5, 1.0, 2.5], device=model.device)

#  Focal Loss
class FocalLoss(torch.nn.Module):
    """Multi-class focal loss with optional per-class weights.

    For each sample the loss is ``alpha[y] * (1 - p_t) ** gamma * -log(p_t)``,
    where ``p_t`` is the softmax probability of the true class. The result is
    averaged over the batch.

    Args:
        alpha: Optional 1-D tensor of per-class weights, on the same device as
            the logits. ``None`` disables class weighting.
        gamma: Focusing exponent; ``0`` reduces to (weighted) cross-entropy
            averaged over the batch.
    """

    def __init__(self, alpha=None, gamma=2):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        # reduction="none" keeps one loss value per sample; the default "mean"
        # would collapse the batch before the focal factor could be applied.
        self.ce_loss = torch.nn.CrossEntropyLoss(weight=alpha, reduction="none")

    def forward(self, logits, labels):
        """Return the mean focal loss.

        Args:
            logits: Float tensor of shape ``(batch, num_classes)``.
            labels: Integer tensor of shape ``(batch,)`` with class ids.
        """
        # p_t must come from the unweighted cross-entropy: exp(-weighted CE)
        # is not the true-class probability.
        plain_ce = torch.nn.functional.cross_entropy(logits, labels, reduction="none")
        pt = torch.exp(-plain_ce)

        weighted_ce = plain_ce if self.alpha is None else self.ce_loss(logits, labels)
        focal_loss = (1 - pt) ** self.gamma * weighted_ce
        return focal_loss.mean()

#  Trainer with Focal Loss and Log Monitoring
class CustomTrainer(Trainer):
    """Trainer that replaces the model's built-in loss with class-weighted focal loss."""

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute the focal loss for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; ``"labels"`` is removed before the forward
                pass so the model does not compute its own loss.
            return_outputs: When true, also return the model outputs.
            **kwargs: Extra arguments passed by the Trainer; accepted and
                ignored.

        Returns:
            ``loss``, or ``(loss, outputs)`` when ``return_outputs`` is true.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # Use the logits' device rather than `model.device`, which a wrapped
        # model may not expose.
        device = logits.device
        loss_fct = FocalLoss(alpha=class_weights.to(device))
        loss = loss_fct(logits, labels.to(device))

        # The loss must be returned: without a return statement this method
        # yields None. The former temperature-scaled softmax was computed but
        # never used, so it has been removed.
        return (loss, outputs) if return_outputs else loss



training_args = TrainingArguments(
    # Output locations.
    output_dir="./distilbert_finetuned",
    logging_dir="./logs",
    # Evaluate and checkpoint once per epoch; log every 10 steps.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_steps=10,
    # Batching and run length.
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    # Optimisation settings.
    learning_rate=2e-5,
    weight_decay=0.01,
    lr_scheduler_type="cosine_with_restarts",
)



def compute_metrics(eval_pred):
    """Score predictions for the Trainer.

    ``eval_pred`` unpacks into ``(logits, labels)``. The predicted class is
    the argmax over the last logits axis. Returns accuracy plus
    support-weighted precision, recall and F1.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    prec, rec, fscore, _support = precision_recall_fscore_support(
        gold, predicted, average="weighted"
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prec,
        "recall": rec,
        "f1": fscore,
    }


# Wire the model, data splits and metric function into the focal-loss trainer.
trainer = CustomTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
)


# Wall-clock timing around the whole fine-tuning run.
start_time = time.time()
trainer.train()
end_time = time.time()

training_time = end_time - start_time
print(f" Training time: {training_time / 60:.2f} minutes")


# Save the model and the tokenizer into the same directory.
for artifact in (model, tokenizer):
    artifact.save_pretrained("./distilbert_finetuned_stocks")


# Final evaluation on the held-out split.
results = trainer.evaluate()
print(" DistilBERT Evaluation Results:")
print(results)


Map:   0%|          | 0/35200 [00:00<?, ? examples/s]

Map:   0%|          | 0/6600 [00:00<?, ? examples/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


⏳ Starting DistilBERT fine-tuning on finance dataset...
Step 0 - Logits: [[ 0.2366845   0.08507209 -0.08979971]
 [ 0.16016254  0.00312399  0.06130556]
 [ 0.29785255  0.07400379 -0.04873648]
 [ 0.31987348  0.07716941 -0.06853196]
 [ 0.21745856  0.15544498  0.05267399]
 [ 0.3003972  -0.00940354 -0.09039551]
 [ 0.25261986 -0.09815821 -0.05242219]
 [ 0.22560844  0.10553806 -0.06020506]
 [ 0.28265715 -0.05104182 -0.04397107]
 [ 0.39241654  0.00959758 -0.04113171]
 [ 0.08488388  0.01560432 -0.13549767]
 [ 0.19440497  0.03713063  0.01694857]
 [ 0.22617249 -0.10515435 -0.01913144]
 [ 0.33315718 -0.04153408  0.02398442]
 [ 0.23214465  0.02503229  0.01136158]
 [ 0.27799544  0.03506692 -0.00390468]
 [ 0.23350666  0.0101525  -0.00518764]
 [ 0.31644905 -0.00196436 -0.11212435]
 [ 0.23715399  0.03370024 -0.01101228]
 [ 0.24710396 -0.0717489   0.00043886]
 [ 0.15394057  0.04824856 -0.16387066]
 [ 0.33672777  0.09892555 -0.05310716]
 [ 0.19227645  0.06029699 -0.0795258 ]
 [ 0.3285637   0.08569193 -0.1

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0041,0.007648,0.914242,0.913865,0.914242,0.913607
2,0.0005,0.002464,0.932879,0.93572,0.932879,0.931562
3,0.0005,0.001923,0.948485,0.949926,0.948485,0.947861
4,0.0,0.002218,0.937273,0.941269,0.937273,0.936071
5,0.0001,0.001573,0.958788,0.959434,0.958788,0.958432
6,0.0001,0.002232,0.956212,0.957536,0.956212,0.95595
7,0.0,0.001562,0.958182,0.959023,0.958182,0.957791
8,0.0,0.001595,0.958788,0.959546,0.958788,0.958435
9,0.0,0.001624,0.961061,0.961455,0.961061,0.960791
10,0.0,0.001637,0.961212,0.961659,0.961212,0.960939


Step 100 - Logits: [[-1.6417672e-01 -2.4392507e-03  9.3043762e-01]
 [-6.3379818e-01  1.4259894e-01  9.0031534e-01]
 [ 3.0927575e+00 -1.3692920e+00 -1.2147516e+00]
 [ 2.8657532e+00 -1.1218085e+00 -1.2728751e+00]
 [ 2.4229457e+00 -1.1288965e+00 -5.3734601e-01]
 [ 9.1751762e-02 -2.3882193e-02  9.1051787e-01]
 [-3.3195210e-01 -8.5618667e-02  9.0172160e-01]
 [-5.3832209e-01  3.6237709e-02  1.2395627e+00]
 [ 1.7566772e+00 -9.0082657e-01 -7.5892404e-02]
 [-4.5935124e-01 -4.0661763e-02  1.2948143e+00]
 [-1.8673436e-01 -3.0332385e-02  1.0903361e+00]
 [ 3.0491085e+00 -1.5224074e+00 -1.0092329e+00]
 [-2.3011555e-01  9.6091896e-02  7.9042315e-01]
 [ 1.1654606e+00 -8.0132020e-01  1.1904708e-01]
 [ 1.5524048e+00 -7.8497767e-01  2.2698303e-01]
 [ 3.1872873e+00 -1.3661103e+00 -1.2609283e+00]
 [ 2.9935840e-01 -5.7037091e-01  9.5733327e-01]
 [-2.8031325e-01  5.3915635e-02  8.7982118e-01]
 [ 2.1368513e+00 -1.2921772e+00 -4.6616262e-01]
 [-6.0129660e-01 -6.0371265e-02  1.0223502e+00]
 [ 1.8273734e+00 -1.0

Step 11000 - Logits: [[-2.1494045  -1.6917092   4.252903  ]
 [-1.6600187   3.7684     -1.9259619 ]
 [-2.2453737   4.3852663  -2.1660218 ]
 [ 5.4074173  -1.9690926  -3.084487  ]
 [-2.5146453  -1.0510857   3.8681192 ]
 [ 5.335794   -2.0949857  -2.8459177 ]
 [-2.6971135   4.312337   -1.5404559 ]
 [-2.6433423   4.1068997  -1.316869  ]
 [-2.1309142   3.7366674  -1.35704   ]
 [-3.294982   -0.6542114   4.3219414 ]
 [-0.791611    3.4680932  -2.5831351 ]
 [ 4.498478   -1.5365101  -2.6394982 ]
 [ 4.185363   -0.8536337  -2.8982494 ]
 [-1.3986976   3.6229105  -2.0101435 ]
 [ 4.0720987  -1.0199182  -2.814548  ]
 [ 3.5665586  -0.59036094 -2.8777242 ]
 [-2.3009708   2.2932756   0.31806973]
 [-2.7479641  -1.1015196   4.232642  ]
 [ 5.7088165  -2.1829615  -3.13236   ]
 [-1.6541349   4.137908   -2.1911368 ]
 [ 5.634457   -2.3686297  -3.0278451 ]
 [ 3.2940314  -0.339711   -2.6973085 ]
 [-2.8109362  -1.3894826   4.6588755 ]
 [-2.7322733   4.3638864  -1.5948865 ]
 [-2.7260292  -1.3697673   4.3813953 ]
 [-3