In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
import wandb
from argparse import Namespace
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DataCollatorWithPadding, get_scheduler, AdamW
from torch.nn.functional import cross_entropy
from torch.utils.data import DataLoader

2022-10-11 10:22:34.647082: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Load the dataset

In [2]:
# Load the previously saved dataset from disk (the path is relative to the notebook's working directory).
ds = load_from_disk("../../Violence_data/geo_corpus.0.0.1_dataset_for_train")

In [3]:
# Show the available splits with their columns and row counts.
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2329158
    })
})

In [4]:
# Peek at one training example: the raw text and its label vector.
ds["train"][0]

{'text': 'Venezuela en crisis, y la Fiscal de shopping en Alemania (Video)',
 'labels': [1.0, 1.0, 1.0, 0.0, 0.0, 0.0]}

# Full training with native PyTorch and DataLoader

This code was inspired by the Hugging Face Transformers course (Chapter 3: A full training).

### Set up the hyperparameters and other variables for training and wrap them in a *Namespace* for easy access

In [5]:
# Every tunable value for the run lives in this dict; `args` exposes the same
# values as attributes (args.batch_size, args.init_lr, ...).
config = {
    "model_ckpt": "setu4993/LaBSE",
    "batch_size": 1024,
    "num_labels": 6,
    "init_lr": 5e-5,
    "num_epochs": 2,
    "num_warmup_steps": 0,
    "cuda_device": "cuda:3",
    "lr_scheduler_type": "cosine",  # alternative: "linear"
    "weight_decay": 0.1,
    "max_length": 32,
    "seed": 42
}

args = Namespace(**config)

# Apply the configured seed. Without this the `seed` entry is never used, so
# the DataLoader shuffle order and the freshly initialised classification head
# would differ from run to run.
# np.random.seed is called last because it returns None, which keeps the cell
# from displaying an output.
torch.manual_seed(args.seed)
np.random.seed(args.seed)

### From text to tokens

In [6]:
# Load the fast tokenizer that matches the checkpoint; its maximum sequence
# length is taken from the training configuration.
model_ckpt = args.model_ckpt
tokenizer = BertTokenizerFast.from_pretrained(
    model_ckpt,
    model_max_length=args.max_length,
)

### Tokenizing the whole dataset

In [7]:
def tokenize(batch):
    """Tokenize the `text` entries of a batch with truncation enabled.

    `batch` is a mapping with a "text" key; the result is whatever the
    notebook-level `tokenizer` returns for those texts.
    """
    texts = batch["text"]
    return tokenizer(texts, truncation=True)

In [None]:
# Tokenize the dataset in batches (batched=True); %time reports how long the pass takes.
%time tokenized_ds = ds.map(tokenize, batched=True)

  0%|          | 0/16770 [00:00<?, ?ba/s]

In [None]:
# Collator handed to the DataLoaders as `collate_fn` to assemble each batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Prepare for training

In [None]:
# Inspect the dataset after tokenization.
tokenized_ds

In [None]:
# Remove the raw `text` column and keep only the columns the model expects for training.
# The removal is guarded so this cell can be re-run: `tokenized_ds` is reassigned
# in place, and removing a column that is already gone raises an error.
if "text" in tokenized_ds["train"].column_names:
    tokenized_ds = tokenized_ds.remove_columns("text")
# Make indexing return torch tensors.
tokenized_ds.set_format("torch")
tokenized_ds["train"].column_names

In [None]:
# Look at one tokenized training example.
tokenized_ds["train"][0]

In [None]:
# Check the feature types of the training split's columns.
tokenized_ds["train"].features

### Define the dataloaders

In [None]:
# Both loaders use the same batch size and collator; only the training loader shuffles.
train_dataloader = DataLoader(
    dataset=tokenized_ds["train"],
    batch_size=args.batch_size,
    shuffle=True,
    collate_fn=data_collator,
)
eval_dataloader = DataLoader(
    dataset=tokenized_ds["validation"],
    batch_size=args.batch_size,
    collate_fn=data_collator,
)

In [None]:
# Pull a single batch from the training loader and check the tensor shapes.
batch = next(iter(train_dataloader))
{name: tensor.shape for name, tensor in batch.items()}

### Define some helper functions

In [None]:
# Split the parameters into those that receive weight decay and those that do not
# (biases and LayerNorm weights are not subject to weight decay).
def get_grouped_params(model, no_decay=("bias", "LayerNorm.weight"), weight_decay=None):
    """Build optimizer parameter groups with and without weight decay.

    Args:
        model: Object exposing `named_parameters()` that yields (name, parameter) pairs.
        no_decay: Substrings; a parameter whose name contains any of them is
            placed in the group with `weight_decay` 0.0. The default is a tuple
            so it cannot be mutated between calls.
        weight_decay: Decay applied to the first group. When None, the
            notebook-level `args.weight_decay` is used, as before.

    Returns:
        A two-element list of dicts with keys 'params' and 'weight_decay':
        first the decayed group, then the non-decayed group.
    """
    if weight_decay is None:
        # Backward-compatible fallback to the notebook configuration.
        weight_decay = args.weight_decay

    decayed, exempt = [], []
    for name, param in model.named_parameters():
        bucket = exempt if any(tag in name for tag in no_decay) else decayed
        bucket.append(param)

    return [
        {'params': decayed, 'weight_decay': weight_decay},
        {'params': exempt, 'weight_decay': 0.0},
    ]
                

### Instantiate the model, define optimizer and learning rate scheduler

In [None]:
# Load the pretrained checkpoint with a multi-label classification head sized
# to the configured number of labels.
model = BertForSequenceClassification.from_pretrained(
    args.model_ckpt,
    problem_type="multi_label_classification",
    num_labels=args.num_labels,
)

In [None]:
# A test to make sure everything works properly when we pass our batch to this model.
# This is only a smoke test: nothing is back-propagated, so run it under no_grad
# to avoid building (and keeping alive via `outputs`) an autograd graph for the
# whole batch.
with torch.no_grad():
    outputs = model(**batch)
print(outputs.loss, outputs.logits.shape)

In [None]:
# Create the optimizer over the decay / no-decay parameter groups, starting at
# the configured initial learning rate.
optimizer = AdamW(
    params=get_grouped_params(model),
    lr=args.init_lr,
)

In [None]:
# Build the learning-rate schedule over the whole run: one step per training
# batch, for every epoch.
num_epochs = args.num_epochs
num_training_steps = len(train_dataloader) * num_epochs
lr_scheduler = get_scheduler(
    args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=num_training_steps,
)
print(num_training_steps)

In [None]:
def get_lr():
    """Return the current learning rate of the optimizer's first parameter group."""
    first_group = optimizer.param_groups[0]
    return first_group['lr']

### The training loop

In [None]:
# Use the configured GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(args.cuda_device)
else:
    device = torch.device("cpu")
model.to(device)

In [None]:
# The progress bar advances once per training batch across all epochs.
progress_bar = tqdm(range(num_training_steps))

# Put the model in training mode before iterating.
model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        # Move every tensor in the batch onto the training device.
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # The loss is read straight from the model output and back-propagated.
        loss = outputs.loss
        loss.backward()
        
        # Update the weights, advance the learning-rate schedule, then clear the
        # gradients so they do not carry over into the next batch.
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)