In [1]:
!pip install -q -U transformers

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/10.2 MB[0m [31m34.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [4]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm

import torch
import torch.nn as nn

import transformers
from transformers import (AutoTokenizer, 
                          AutoModelForSequenceClassification,
                          TrainingArguments, 
                          Trainer, 
                          AutoModelForMaskedLM,AutoConfig)

from datasets import load_dataset
from datasets import Dataset

from sklearn.metrics import (accuracy_score, 
                             classification_report, 
                             confusion_matrix)
from sklearn.model_selection import train_test_split

In [5]:
print(f"transformers=={transformers.__version__}")

transformers==4.50.2


In [6]:
tokenizer = AutoTokenizer.from_pretrained("answerdotai/ModernBERT-base")

tokenizer_config.json:   0%|          | 0.00/20.8k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

In [7]:
# Define the label mapping
id2label = {0: "negative", 1: "neutral", 2: "positive"}
label2id = {"negative": 0, "neutral": 1, "positive": 2}

# Load the model AND specify the number of labels
model = AutoModelForSequenceClassification.from_pretrained(
    "answerdotai/ModernBERT-base",
    num_labels=3,
    id2label=id2label,
    label2id=label2id
)

config.json:   0%|          | 0.00/1.19k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

Some weights of ModernBertForSequenceClassification were not initialized from the model checkpoint at answerdotai/ModernBERT-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
filename = "../input/sentiment-analysis-for-financial-news/all-data.csv"

df = pd.read_csv(filename, 
                 names=["sentiment", "text"],
                 encoding="utf-8", encoding_errors="replace")

X_train = list()
X_test = list()
for sentiment in ["positive", "neutral", "negative"]:
    train, test  = train_test_split(df[df.sentiment==sentiment], 
                                    train_size=300,
                                    test_size=300, 
                                    random_state=42)
    X_train.append(train)
    X_test.append(test)

X_train = pd.concat(X_train).sample(frac=1, random_state=10)
X_test = pd.concat(X_test)

eval_idx = [idx for idx in df.index if idx not in list(train.index) + list(test.index)]
X_eval = df[df.index.isin(eval_idx)]
X_eval = (X_eval
          .groupby('sentiment', group_keys=False)
          .apply(lambda x: x.sample(n=50, random_state=10, replace=True)))
X_train = X_train.reset_index(drop=True)

In [9]:
# Also, ensure your label mapping uses the dictionary defined above for consistency
for df in [X_train, X_test, X_eval]:
    df["labels"] = df.sentiment.map(label2id) # Use the label2id dictionary

In [10]:
train_data = Dataset.from_pandas(X_train)
test_data = Dataset.from_pandas(X_test)
eval_data = Dataset.from_pandas(X_eval)

In [11]:
#define the tokenizer function
def tokenizer_function(x):
    return tokenizer(
        x["text"],
        padding="max_length",  
        truncation=True,       
        max_length=512,      ## max length can be modified
        return_tensors="pt"
    )

#tokenize training and testing data set based on above defined tokenizer function


In [12]:
train_data = train_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])
test_data = test_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])
eval_data = eval_data.map(tokenizer_function, batched=True).remove_columns(['text', 'sentiment'])

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/900 [00:00<?, ? examples/s]

Map:   0%|          | 0/150 [00:00<?, ? examples/s]

In [13]:
import numpy as np
from sklearn.metrics import f1_score, accuracy_score
 
# Metric helper method
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    score = f1_score(labels, predictions, average="weighted")
    return {"f1": score}

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    # The predictions are raw logits
    predictions = np.argmax(predictions, axis=1)

    # Calculate weighted F1 score
    f1 = f1_score(labels, predictions, average="weighted")

    # Calculate accuracy
    accuracy = accuracy_score(labels, predictions)

    # Return both metrics in a dictionary
    return {"f1": f1, "accuracy": accuracy}

In [14]:
#define training arguments 
train_batch, val_batch = (8, 8)
lr = 8e-5
betas = (0.9, 0.98)
n_epochs = 40
eps = 1e-6



training_args = TrainingArguments(
    output_dir=f"fine_tuned_modern_bert",
    learning_rate=lr,
    per_device_train_batch_size=train_batch,
    per_device_eval_batch_size=val_batch,
    num_train_epochs=n_epochs,
    lr_scheduler_type="linear",
    optim="adamw_torch",
    adam_beta1=betas[0],
    adam_beta2=betas[1],
    adam_epsilon=eps,
    logging_strategy="epoch",
    eval_strategy="epoch",        
    save_strategy="epoch",        
    load_best_model_at_end=True,
    save_total_limit=1,
    metric_for_best_model="eval_accuracy",
    greater_is_better=True,
    bf16=True,                   
    bf16_full_eval=True,          
    push_to_hub=False,
    report_to="none"            
)


In [15]:
#Create a Trainer instance
trainer = Trainer(
    model=model,                         # The pre-trained model
    args=training_args,                  # Training arguments
    train_dataset=train_data,            # Tokenized training dataset
    eval_dataset=test_data,              # Tokenized test dataset
    compute_metrics=compute_metrics,
)

In [16]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Accuracy
1,0.8258,0.680526,0.72839,0.743333
2,0.3568,0.726478,0.797681,0.794444
3,0.1335,1.302265,0.768788,0.783333
4,0.1401,0.997987,0.825,0.827778
5,0.1536,1.307202,0.80238,0.805556
6,0.0844,1.242629,0.814023,0.815556
7,0.0954,1.907079,0.823563,0.822222
8,0.0441,1.677115,0.812923,0.811111
9,0.0615,1.365289,0.840221,0.841111
10,0.0414,1.315772,0.804075,0.803333


TrainOutput(global_step=4520, training_loss=0.05333313362392704, metrics={'train_runtime': 4992.7425, 'train_samples_per_second': 7.21, 'train_steps_per_second': 0.905, 'total_flos': 1.2267367501824e+16, 'train_loss': 0.05333313362392704, 'epoch': 40.0})

In [17]:
evaluation_results = trainer.evaluate(eval_data)
print(f"Evaluation Results -  f1 score: {evaluation_results['eval_f1']:0.5f} | accuracy: {evaluation_results['eval_accuracy']:0.5f}")

Evaluation Results -  f1 score: 0.86421 | accuracy: 0.86667


In [18]:
# Save the trained model 
model.save_pretrained("./saved_model")
# Save the tokenizer
tokenizer.save_pretrained("./saved_model")

('./saved_model/tokenizer_config.json',
 './saved_model/special_tokens_map.json',
 './saved_model/tokenizer.json')