In [14]:
import sys
import os
import pandas as pd
from pathlib import Path

project_root = Path.cwd().parent  # one level up from "notebook"
sys.path.append(str(project_root))

parent_dir = os.path.abspath(os.path.join(os.getcwd(), ".."))

if parent_dir not in sys.path:
    sys.path.append(parent_dir)

train_df = pd.read_json("../data/raw/training_merged.json", orient="records", lines=True)
dev_df = pd.read_json("../data/raw/development_merged.json", orient="records", lines=True)


In [15]:
from project_name.preprocessing.BERTweet_preprocessing import BERTweetPreprocessor
preprocessor = BERTweetPreprocessor()
preprocessor.extract_features_labels(train_df, 'tweet', 'emotion')
X_train, y_train = preprocessor.preprocess_df(train_df)
X_dev, y_dev = preprocessor.preprocess_df(dev_df)

In [16]:
from transformers import AutoTokenizer
from project_name.data.tokenizer import TweetDataset
import torch
from torch.utils.data import DataLoader

tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)
train_set = TweetDataset(X_train, y_train)
dev_set = TweetDataset(X_dev, y_dev)

def collate_batch(batch):
    tweets, labels = zip(*batch)
    encoding = tokenizer(
        list(tweets),
        padding=True,
        truncation=True,
        max_length=128,
        return_token_type_ids=False,
        return_tensors="pt"
    )
    labels = torch.tensor(labels, dtype=torch.long)
    encoding["labels"] = labels
    return encoding
num_labels = len(set(y_train))
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, collate_fn=collate_batch)
dev_loader = DataLoader(dev_set, batch_size=32, collate_fn=collate_batch)


In [17]:
from transformers import AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained("vinai/bertweet-base", num_labels=num_labels)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [18]:
from torch.optim import AdamW
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=2e-5)
EPOCHS = 15
for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: train loss = {total_loss/len(train_loader):.4f}")

    # ---------- validation ----------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}        
            inputs = {k: v for k, v in batch.items() if k != 'labels'}
            logits = model(**batch).logits
            preds = logits.argmax(dim=1)
            labels = batch["labels"]
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    print(f"Epoch {epoch+1}: dev accuracy = {correct/total:.2%}")

Epoch 1/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 1: train loss = 0.9440
Epoch 1: dev accuracy = 49.11%


Epoch 2/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 2: train loss = 0.3819
Epoch 2: dev accuracy = 49.80%


Epoch 3/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 3: train loss = 0.2432
Epoch 3: dev accuracy = 50.20%


Epoch 4/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 4: train loss = 0.1872
Epoch 4: dev accuracy = 50.14%


Epoch 5/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 5: train loss = 0.1468
Epoch 5: dev accuracy = 50.14%


Epoch 6/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 6: train loss = 0.1149
Epoch 6: dev accuracy = 49.18%


Epoch 7/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 7: train loss = 0.0940
Epoch 7: dev accuracy = 47.61%


Epoch 8/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 8: train loss = 0.0825
Epoch 8: dev accuracy = 49.45%


Epoch 9/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 9: train loss = 0.0797
Epoch 9: dev accuracy = 48.22%


Epoch 10/15:   0%|          | 0/222 [00:00<?, ?it/s]

Epoch 10: train loss = 0.0725
Epoch 10: dev accuracy = 49.73%


Epoch 11/15:   0%|          | 0/222 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import torch
from transformers import AutoModelForSequenceClassification, get_linear_schedule_with_warmup
from tqdm.auto import tqdm
import copy

# ----- Setup -----
NUM_CLASSES = 4  # Change as needed!
MODEL_NAME = "vinai/bertweet-base"

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=NUM_CLASSES)
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-5)
EPOCHS = 10
patience = 5

# Total training steps
num_training_steps = EPOCHS * len(train_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * num_training_steps),  # 10% warmup
    num_training_steps=num_training_steps
)

best_accuracy = 0
epochs_no_improve = 0
best_model_state = None

for epoch in range(EPOCHS):
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        batch = {k: v.to(device) for k, v in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}: train loss = {total_loss/len(train_loader):.4f}")

    # ---------- validation ----------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(**batch).logits
            preds = logits.argmax(dim=1)
            labels = batch["labels"]
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    val_acc = correct / total
    print(f"Epoch {epoch+1}: dev accuracy = {val_acc:.2%}")

    # Early stopping check
    if val_acc > best_accuracy:
        best_accuracy = val_acc
        epochs_no_improve = 0
        best_model_state = copy.deepcopy(model.state_dict())  # Save best weights
    else:
        epochs_no_improve += 1
        if epochs_no_improve >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

# Restore the best model weights
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f"Model reverted to epoch with best dev accuracy: {best_accuracy:.2%}")

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 1: train loss = 1.3840
Epoch 1: dev accuracy = 30.50%


Epoch 2/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 2: train loss = 1.3589
Epoch 2: dev accuracy = 30.50%


Epoch 3/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 3: train loss = 1.3444
Epoch 3: dev accuracy = 30.50%


Epoch 4/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 4: train loss = 1.3467
Epoch 4: dev accuracy = 30.50%


Epoch 5/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 5: train loss = 1.3193
Epoch 5: dev accuracy = 30.50%


Epoch 6/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 6: train loss = 1.2976
Epoch 6: dev accuracy = 31.00%


Epoch 7/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 7: train loss = 1.2757
Epoch 7: dev accuracy = 30.50%


Epoch 8/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 8: train loss = 1.2545
Epoch 8: dev accuracy = 31.00%


Epoch 9/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 9: train loss = 1.2313
Epoch 9: dev accuracy = 30.50%


Epoch 10/10:   0%|          | 0/16 [00:00<?, ?it/s]

Epoch 10: train loss = 1.2267
Epoch 10: dev accuracy = 31.00%
Model reverted to epoch with best dev accuracy: 31.00%
