In [1]:
# load dataset data/test.csv and data/train.csv

import pandas as pd

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

print(train.shape, test.shape)

train.head()

(120000, 2) (7600, 2)


Unnamed: 0,text,label
0,wall st bear claw back black reuters reuters s...,2
1,carlyle look toward commercial aerospace reute...,2
2,oil economy cloud stock outlook reuters reuter...,2
3,iraq halt oil export main southern pipeline re...,2
4,oil price soar alltime record posing new menac...,2


In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '4'

In [3]:
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification

"""
Possible model names:
- bert-base-uncased
- bert-large-uncased
- roberta-base
- roberta-large
- facebook/bart-base
- facebook/bart-large
"""

model_name = "facebook/bart-large"


tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=True)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=4)
model.cuda()

  from .autonotebook import tqdm as notebook_tqdm
tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 136kB/s]
config.json: 100%|██████████| 1.63k/1.63k [00:00<00:00, 10.3MB/s]
vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 3.34MB/s]
merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 2.33MB/s]
tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 55.6MB/s]
pytorch_model.bin: 100%|██████████| 1.02G/1.02G [00:09<00:00, 110MB/s] 
Some weights of BartForSequenceClassification were not initialized from the model checkpoint at facebook/bart-large and are newly initialized: ['classification_head.dense.bias', 'classification_head.dense.weight', 'classification_head.out_proj.bias', 'classification_head.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BartForSequenceClassification(
  (model): BartModel(
    (shared): Embedding(50265, 1024, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 1024, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 1024)
      (layers): ModuleList(
        (0-11): 12 x BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=1024, out_features=4096, bias=True)
          (fc2): Linear(in_features=4096, out_features=1024, bias=True)
          (final_layer_norm): Layer

In [4]:
# Custom dataset class
from torch.utils.data import Dataset

class AGNewsDataset(Dataset):
    def __init__(self, texts, labels):
        self.texts = texts
        self.labels = labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        return self.texts[idx], self.labels[idx]

In [5]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from transformers import AdamW, get_linear_schedule_with_warmup
from tqdm import tqdm
import wandb

LEARNING_RATE = 2e-5
EPOCHS = 3
BATCH_SIZE = 64


# Initialize Weights & Biases
wandb.init(project="text_classification", name=f"{model_name.split('/')[-1]}_AGNews_epochs={EPOCHS}_lr={LEARNING_RATE}_bs={BATCH_SIZE}")

# Create dataset (assuming train and test DataFrames are already defined)
train_texts = train['text'].tolist()
train_labels = train['label'].tolist()
test_texts = test['text'].tolist()
test_labels = test['label'].tolist()

train_dataset = AGNewsDataset(train_texts, train_labels)
test_dataset = AGNewsDataset(test_texts, test_labels)

# Create dataloader
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Define model, optimizer, scheduler, and loss function
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
epochs = EPOCHS
total_steps = len(train_loader) * epochs
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=total_steps)
loss_fn = nn.CrossEntropyLoss()

# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def evaluate_model(model, test_loader):
    model.eval()
    total_loss, total_accuracy = 0, 0

    with torch.no_grad():
        for texts, labels in test_loader:
            inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
            labels = torch.tensor(labels).to(device)

            outputs = model(**inputs, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()
            total_accuracy += (outputs.logits.argmax(dim=-1) == labels).sum().item()

    return total_loss / len(test_loader), total_accuracy / len(test_loader.dataset)

# Training loop
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    progress_bar = tqdm(enumerate(train_loader), total=len(train_loader), desc=f"Epoch {epoch+1}/{epochs}", leave=False)
    for step, (texts, labels) in progress_bar:
        inputs = tokenizer(texts, padding=True, truncation=True, return_tensors='pt').to(device)
        labels = torch.tensor(labels).to(device)

        optimizer.zero_grad()

        outputs = model(**inputs, labels=labels)
        loss = outputs.loss
        running_loss += loss.item()

        loss.backward()
        optimizer.step()
        scheduler.step()

        wandb.log({"train_loss": loss.item()})

        # Update progress bar
        progress_bar.set_postfix({"Train Loss": f"{running_loss/(step+1):.4f}"})

        if step % 100 == 0:
            # Evaluate on test set every 100 steps
            test_loss, test_accuracy = evaluate_model(model, test_loader)
            wandb.log({"test_loss": test_loss, "test_accuracy": test_accuracy})
            progress_bar.set_postfix({"Train Loss": f"{running_loss/(step+1):.4f}", "Test Loss": f"{test_loss:.4f}", "Test Acc": f"{test_accuracy:.4f}"})

    progress_bar.close()

# Final evaluation on test set
final_test_loss, final_test_accuracy = evaluate_model(model, test_loader)
wandb.log({"final_test_loss": final_test_loss, "final_test_accuracy": final_test_accuracy})

# Finish the wandb run
wandb.finish()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mleolty[0m. Use [1m`wandb login --relogin`[0m to force relogin


  labels = torch.tensor(labels).to(device)
  labels = torch.tensor(labels).to(device)


0,1
final_test_accuracy,▁
final_test_loss,▁
test_accuracy,▁███████████████████████████████████████
test_loss,█▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
train_loss,▄▄▄▅▃▃▅▃▄▄▅▂▅▅█▅▃▃▂▃▄▃▃▁▂▃▂▂▁▂▂▁▁▁▂▁▁▂▂▁

0,1
final_test_accuracy,0.93895
final_test_loss,0.19905
test_accuracy,0.93895
test_loss,0.19912
train_loss,0.0097


In [6]:
# Save model
model.save_pretrained(f'checkpoints/{model_name.split("/")[-1]}_AGNews_epochs={EPOCHS}_lr={LEARNING_RATE}_bs={BATCH_SIZE}')