In [2]:
import torch
import transformers
import pandas as pd
import numpy as np
import wandb
from argparse import Namespace
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import DataCollatorWithPadding, get_scheduler, AdamW
from torch.nn.functional import cross_entropy
from torch.utils.data import DataLoader

2022-10-10 20:54:17.687153: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Load the dataset

In [3]:
ds = load_from_disk("../../Violence_data/geo_corpus.0.0.1_dataset_for_train")

In [4]:
ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels'],
        num_rows: 2329158
    })
})

In [33]:
ds["train"][0]

{'text': 'Venezuela en crisis, y la Fiscal de shopping en Alemania (Video)',
 'labels': [1, 1, 1, 0, 0, 0]}

# Full training with native Pytorch and DataLoader

This code was inspired from the Transformers course available in Huggingface (Chapter 3: A full training)

### Setup the hyperparameters and other variables for training and wrap them in a *Namespace* for easy access

In [18]:
config = {
    "model_ckpt": "setu4993/smaller-LaBSE",
    "batch_size": 32,
    "num_labels" : 6,
    "init_lr": 5e-5,
    "num_epochs": 3,
    "num_warmup_steps": 0,
    "cuda_device": "cuda:2",
    "lr_scheduler_type": "cosine", # linear
    "weight_decay": 0.1,
    "max_length": 32,
    "seed": 42
}

args = Namespace(**config)

### From text to tokens

In [8]:
# Instantiate the tokenizer
model_ckpt = args.model_ckpt
tokenizer = BertTokenizerFast.from_pretrained(model_ckpt, model_max_length=args.max_length)

### Tokenizing the whole dataset

In [9]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

In [10]:
%time tokenized_ds = ds.map(tokenize, batched=True)

  0%|          | 0/16770 [00:00<?, ?ba/s]

  0%|          | 0/4193 [00:00<?, ?ba/s]

  0%|          | 0/2330 [00:00<?, ?ba/s]

CPU times: user 2h 37min 28s, sys: 5min 41s, total: 2h 43min 9s
Wall time: 30min 58s


In [11]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Prepare for training

In [12]:
tokenized_ds

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 16769932
    })
    validation: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4192483
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2329158
    })
})

In [13]:
# Remove column (text) and leave the columns the model expect for training
tokenized_ds = tokenized_ds.remove_columns('text')
tokenized_ds.set_format("torch")
tokenized_ds["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [14]:
tokenized_ds["train"][0]

{'labels': tensor([1, 1, 1, 0, 0, 0]),
 'input_ids': tensor([   101,  72921,  90930,  85944,  43099, 131735, 103297,  54882,  86754,
         121491,  90930,  46377,  43095,  73126,  43096,    102]),
 'token_type_ids': tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]),
 'attention_mask': tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])}

In [45]:
tokenized_ds["train"].features

{'labels': Sequence(feature=Value(dtype='int64', id=None), length=-1, id=None),
 'input_ids': Sequence(feature=Value(dtype='int32', id=None), length=-1, id=None),
 'token_type_ids': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None),
 'attention_mask': Sequence(feature=Value(dtype='int8', id=None), length=-1, id=None)}

### Define the dataloaders

In [15]:
train_dataloader = DataLoader(tokenized_ds["train"], shuffle=True, 
                              batch_size=args.batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(tokenized_ds["validation"],
                            batch_size=args.batch_size, collate_fn=data_collator)

In [16]:
# Inspect a batch to check if there are no mistakes
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'labels': torch.Size([32, 6]),
 'input_ids': torch.Size([32, 32]),
 'token_type_ids': torch.Size([32, 32]),
 'attention_mask': torch.Size([32, 32])}

### Define some helper functions

In [19]:
# Differentiate the parameters that should receive weight decay (Biases and LayerNorm weights
# are not subject to weight decay)
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [{'params': params_with_wd, 'weight_decay': args.weight_decay},
           {'params': params_without_wd, 'weight_decay': 0.0}]
                

In [20]:
# Calculate the loss for multilabel classification
def loss_fn(outputs, targets):
    return torch.n.n.BCEWithLogitsLoss()(outputs, targets)

### Instantiate the model, define optimizer and learning rate scheduler

In [35]:
# Instantiate the model
model = BertForSequenceClassification.from_pretrained(args.model_ckpt, 
                                                      num_labels = args.num_labels,
                                                     problem_type = "multi_label_classification")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at setu4993/smaller-LaBSE and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
# Define optimizer
optimizer = AdamW(get_grouped_params(model), lr=args.init_lr)



In [27]:
# Define the learning rate scheduler
num_epochs = args.num_epochs
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name = args.lr_scheduler_type,
    optimizer = optimizer,
    num_warmup_steps = args.num_warmup_steps,
    num_training_steps = num_training_steps
)
print(num_training_steps)

1572183


In [28]:
def get_lr():
    return optimizer.param_groups[0]['lr']

### The training loop

In [36]:
device = torch.device(args.cuda_device) if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(173347, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elemen

In [41]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        # loss = loss_fn(outputs, batch["labels"])
        loss = output.loss
        if _%100000 == 0:
            print(f'Epoch: {epoch}, Loss: {loss.item()}')
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progess_bar.update(1)

  0%|          | 0/1572183 [00:00<?, ?it/s]

RuntimeError: result type Float can't be cast to the desired output type Long