In [1]:
import sys
import os
import pandas as pd
from pathlib import Path

# Make the directory above the current working directory importable so that
# the `project_name.*` imports used by later cells can be resolved.
project_root = Path.cwd().parent  # one level up from "notebook"
# String form of the same directory; `sys.path` entries are plain strings.
parent_dir = str(project_root)

# Single guarded append: the path is added once, and re-running this cell
# does not keep growing `sys.path` with duplicate entries.
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

# Both files are read as JSON Lines (one record per line).
train_df = pd.read_json("../data/raw/training_merged.json", orient="records", lines=True)
dev_df = pd.read_json("../data/raw/development_merged.json", orient="records", lines=True)


In [None]:
from project_name.preprocessing.baseline_preprocessing import BaselinePreprocessor
# Split each frame into model inputs and targets with the project's baseline
# preprocessor. The same (text column, label column) pair is used for both
# the training and the development frame.
preprocessor = BaselinePreprocessor()
text_and_label_cols = ("tweet", "emotion")
X_train, y_train = preprocessor.extract_features_labels(train_df, *text_and_label_cols)
X_dev, y_dev = preprocessor.extract_features_labels(dev_df, *text_and_label_cols)

0       @xandraaa5 @amayaallyn6 shut up hashtags are c...
1       it makes me so fucking irate jesus. nobody is ...
2              Lol Adam the Bull with his fake outrage...
3       @THATSSHAWTYLO passed away early this morning ...
4       @Kristiann1125 lol wow i was gonna say really?...
                              ...                        
7097    Watch this amazing live.ly broadcast by @kana_...
7098    Watching @melissamccarthy in #Spy she's one of...
7099                              Could not be happier!! 
7100    @strictlysimilak something about English spark...
7101    and I think some of our most spiritually weigh...
Name: tweet, Length: 7102, dtype: object
0       anger
1       anger
2       anger
3       anger
4       anger
        ...  
7097      joy
7098      joy
7099      joy
7100      joy
7101      joy
Name: emotion, Length: 7102, dtype: object


In [3]:
from transformers import AutoTokenizer


from project_name.data.tokenizer import TweetDataset
# Label vocabulary: the distinct training labels in sorted order, each mapped
# to its position in that ordering.
unique_labels = sorted(set(y_train))
label2id = dict(zip(unique_labels, range(len(unique_labels))))

# The slow (pure-Python) tokenizer is requested explicitly via use_fast=False.
tokenizer = AutoTokenizer.from_pretrained(
    "vinai/bertweet-base",
    use_fast=False,
)

# Both splits share the tokenizer and the label mapping built from the
# training labels.
train_set = TweetDataset(X_train, y_train, tokenizer, label2id=label2id)
dev_set = TweetDataset(X_dev, y_dev, tokenizer, label2id=label2id)


{'input_ids': tensor([    0,  5238,   577,  1429,  1127,  3189,   211,  5238,  5516,  4580,
          638, 17538,   339,  1271,    49, 19395,    41,   501,    85,  7256,
            2,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1, 

In [4]:
from torch.utils.data import DataLoader
# Batches of 32 for both splits; only the training loader shuffles.
train_loader = DataLoader(dataset=train_set, batch_size=32, shuffle=True)
dev_loader = DataLoader(dataset=dev_set, batch_size=32)

In [5]:
from transformers import AutoModelForSequenceClassification
import torch
# One output class per entry in the label mapping.
num_labels = len(label2id)

# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Load the pretrained checkpoint with a sequence-classification head sized
# for `num_labels` classes.
model = AutoModelForSequenceClassification.from_pretrained(
    "vinai/bertweet-base",
    num_labels=num_labels,
)
model.to(device)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at vinai/bertweet-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(130, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
         

In [6]:
from torch.optim import AdamW
from tqdm.auto import tqdm

# Fine-tune the classifier for a fixed number of epochs, reporting the mean
# training loss and the dev-set accuracy after each epoch.
optimizer = AdamW(model.parameters(), lr=2e-5)
EPOCHS = 3
for epoch in range(EPOCHS):
    # ---------- training ----------
    model.train()
    total_loss = 0

    for batch in tqdm(train_loader, desc=f"Epoch {epoch+1}/{EPOCHS}"):
        # Move every tensor of the batch to the model's device.
        batch = {k: v.to(device) for k, v in batch.items()}

        # The whole batch is passed as keyword arguments; the loss is read
        # from the model output (assumes the batch carries a "labels" entry
        # so the model can compute it -- TODO confirm against TweetDataset).
        outputs = model(**batch)
        loss = outputs.loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch+1}: train loss = {total_loss/len(train_loader):.4f}")

    # ---------- validation ----------
    model.eval()
    correct, total = 0, 0
    with torch.no_grad():
        for batch in dev_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            # Take the labels out of the batch *before* the forward pass.
            # Previously they were filtered out of `batch` and then looked up
            # in it again, which raised KeyError on the first dev batch.
            labels = batch.pop("labels")
            logits = model(**batch).logits
            # Predicted class = index of the largest logit along dim 1.
            preds = logits.argmax(dim=1)
            correct += (preds == labels).sum().item()
            total += labels.size(0)

    print(f"Epoch {epoch+1}: dev accuracy = {correct/total:.2%}")

Epoch 1/3:   0%|          | 0/222 [00:00<?, ?it/s]

KeyboardInterrupt: 