In [1]:
import torch
import transformers
import pandas as pd
import numpy as np
import wandb
import evaluate
from argparse import Namespace
from tqdm.auto import tqdm
from datasets import load_from_disk
from transformers import BertTokenizerFast, BertForSequenceClassification
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import DataCollatorWithPadding, get_scheduler, AdamW
from torch.nn.functional import cross_entropy
from torch.utils.data import DataLoader
from accelerate import Accelerator

2022-11-08 23:12:22.040354: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.11.0


### Load the dataset

In [2]:
ds = load_from_disk("../../Violence_data/geo_corpus.0.0.1_dataset_for_train")

In [None]:
ds

In [None]:
ds["train"][0]

# Full training with native Pytorch and DataLoader

This code was inspired from the Transformers course available in Huggingface (Chapter 3: A full training)

### Setup the hyperparameters and other variables for training and wrap them in a *Namespace* for easy access

In [3]:
config = {
    "model_ckpt": "setu4993/smaller-LaBSE",
    "batch_size": 1024,
    "num_labels" : 6,
    "init_lr": 5e-5,
    "num_epochs": 3,
    "num_warmup_steps": 0,
    "cuda_device": "cuda:2",
    "lr_scheduler_type": "cosine", # linear
    "weight_decay": 0.1,
    "max_length": 32,
    "seed": 42
}

args = Namespace(**config)

### From text to tokens

In [4]:
# Instantiate the tokenizer
model_ckpt = args.model_ckpt
tokenizer = AutoTokenizer.from_pretrained(model_ckpt, 
                                              model_max_length=args.max_length)

### Tokenizing the whole dataset

In [5]:
def tokenize(batch):
    return tokenizer(batch["text"], truncation=True)

In [6]:
%time tokenized_ds = ds.map(tokenize, batched=True)

Loading cached processed dataset at ../../Violence_data/geo_corpus.0.0.1_dataset_for_train/train/cache-afc9acc25b1f35be.arrow
Loading cached processed dataset at ../../Violence_data/geo_corpus.0.0.1_dataset_for_train/validation/cache-f020fa0b102594cd.arrow
Loading cached processed dataset at ../../Violence_data/geo_corpus.0.0.1_dataset_for_train/test/cache-9c9e9af5f8942984.arrow


CPU times: user 426 ms, sys: 122 ms, total: 548 ms
Wall time: 545 ms


In [7]:
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

### Prepare for training

In [None]:
tokenized_ds

In [8]:
# Remove column (text) and leave the columns the model expect for training
tokenized_ds = tokenized_ds.remove_columns('text')
tokenized_ds.set_format("torch")
tokenized_ds["train"].column_names

['labels', 'input_ids', 'token_type_ids', 'attention_mask']

In [None]:
tokenized_ds["train"][0]

In [None]:
tokenized_ds["train"].features

### Define the dataloaders

In [9]:
# Select a subsample for testing purposes
train_tokenized_ds = tokenized_ds["train"].select(range(20000))
validation_tokenized_ds = tokenized_ds["validation"].select(range(5000))

In [10]:
# We need to cast float labels to integer to compute some metrics
from datasets import Value, Sequence
validation_tokenized_ds_int = validation_tokenized_ds
new_features = validation_tokenized_ds_int.features.copy()
new_features['labels'] = Sequence(Value(dtype='int32'))
validation_tokenized_ds_int = validation_tokenized_ds.cast(new_features)

Casting the dataset:   0%|          | 0/1 [00:00<?, ?ba/s]

In [11]:
#train_dataloader = DataLoader(tokenized_ds["train"], shuffle=True, 
#                              batch_size=args.batch_size, collate_fn=data_collator)
#eval_dataloader = DataLoader(tokenized_ds["validation"],
#                             batch_size=args.batch_size, collate_fn=data_collator)
train_dataloader = DataLoader(train_tokenized_ds, shuffle=True, 
                              batch_size=args.batch_size, collate_fn=data_collator)
eval_dataloader = DataLoader(validation_tokenized_ds,
                            batch_size=args.batch_size, collate_fn=data_collator)

In [None]:
# Inspect a batch to check if there are no mistakes
for batch in train_dataloader:
    break
{k: v.shape for k, v in batch.items()}

### Define some helper functions

In [12]:
# Differentiate the parameters that should receive weight decay (Biases and LayerNorm weights
# are not subject to weight decay)
def get_grouped_params(model, no_decay=["bias", "LayerNorm.weight"]):
    params_with_wd, params_without_wd = [], []
    for n, p in model.named_parameters():
        if any(nd in n for nd in no_decay):
            params_without_wd.append(p)
        else:
            params_with_wd.append(p)
    return [{'params': params_with_wd, 'weight_decay': args.weight_decay},
           {'params': params_without_wd, 'weight_decay': 0.0}]
                

### Instantiate the model, define optimizer and learning rate scheduler

In [13]:
id2label = {0: 'post7geo10', 1: 'post7geo30', 2: 'post7geo50',
           3: 'pre7geo10', 4: 'pre7geo30', 5: 'pre7geo50'}
label2id = {'post7geo10': 0, 'post7geo30': 1, 'post7geo50': 2,
            'pre7geo10': 3, 'pre7geo30': 4, 'pre7geo50': 5}

In [None]:
# Instantiate the model
model = AutoModelForSequenceClassification.from_pretrained(args.model_ckpt, 
                                  num_labels = args.num_labels,
                                  problem_type = "multi_label_classification",
                                  id2label = id2label,
                                  label2id = label2id)

In [None]:
# A test to make sure we have everything working properly when we pass our batch to this model
outputs = model(**batch)
print(outputs.loss, outputs.logits, outputs.logits.shape)

In [None]:
predictions = torch.sigmoid(outputs.logits)
predictions

In [None]:
# Define optimizer
optimizer = AdamW(get_grouped_params(model), lr=args.init_lr)

In [None]:
# Define the learning rate scheduler
num_epochs = args.num_epochs
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_scheduler(
    name = args.lr_scheduler_type,
    optimizer = optimizer,
    num_warmup_steps = args.num_warmup_steps,
    num_training_steps = num_training_steps
)
print(num_training_steps)

In [14]:
def get_lr():
    return optimizer.param_groups[0]['lr']

### The training loop

In [None]:
accelerator = Accelerator()

In [None]:
train_dl, eval_dl, model, optimizer = accelerator.prepare(
train_dataloader, eval_dataloader, model, optimizer)

In [None]:
progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss
        print(epoch, batch, loss)
        accelerator.backward(loss)
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)

In [29]:
# Load required metrics
accuracy_metric = evaluate.load("accuracy")
roc_auc_metric = evaluate.load("roc_auc", "multilabel")
precision_micro_metric = evaluate.load("precision")
precision_weighted_metric = evaluate.load("precision")
recall_micro_metric = evaluate.load("recall")
recall_weighted_metric = evaluate.load("recall")
f1_micro_metric = evaluate.load("f1")
f1_weighted_metric = evaluate.load("f1")

In [33]:
def evaluate_fn(args, model, eval_dl, accelerator):
    model.eval()
    
    # Load required metrics
    #accuracy_metric = evaluate.load("accuracy")
    #roc_auc_metric = evaluate.load("roc_auc", "multilabel")
    #precision_micro_metric = evaluate.load("precision")
    #precision_weighted_metric = evaluate.load("precision")
    #recall_micro_metric = evaluate.load("recall")
    #recall_weighted_metric = evaluate.load("recall")
    #f1_micro_metric = evaluate.load("f1")
    #f1_weighted_metric = evaluate.load("f1")
    
    
    for batch in eval_dl:
        with torch.no_grad():
            outputs = model(**batch)
        logits = outputs.logits
        pred_prob = torch.sigmoid(logits)
        preds = (pred_prob > 0.5)*1
            
        for references, predictions in zip (batch["labels"], preds):
            accuracy_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            roc_auc_metric.add_batch(prediction_scores=accelerator.gather(pred_prob), 
                                  references=accelerator.gather(batch["labels"]))
            precision_micro_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            precision_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            recall_micro_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            recall_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            f1_micro_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
            f1_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                references=accelerator.gather(references))
   
    accuracy_res = accuracy_metric.compute()
    roc_auc_res = roc_auc_metric.compute(average="micro")
    precision_micro_res = precision_micro_metric.compute(average="micro")
    precision_weighted_res = precision_weighted_metric.compute(average="weighted")
    recall_micro_res = recall_micro_metric.compute(average="micro")
    recall_weighted_res = recall_weighted_metric.compute(average="weighted")
    f1_micro_res = f1_micro_metric.compute(average="micro")
    f1_weighted_res = f1_weighted_metric.compute(average="weighted")

    return accuracy_res, roc_auc_res, precision_micro_res, precision_weighted_res, recall_micro_res, recall_weighted_res, f1_micro_res, f1_weighted_res, logits

In [42]:
def training_function():
    accelerator = Accelerator(mixed_precision="fp16")
    
    # Instantiate the model
    model = AutoModelForSequenceClassification.from_pretrained(
        args.model_ckpt,
        num_labels = args.num_labels,
        problem_type = "multi_label_classification",
        id2label = id2label,
        label2id = label2id)
    
    # Optimizer
    optimizer = AdamW(get_grouped_params(model), lr=args.init_lr)
    
    # Dataloaders
    train_dl, eval_dl, model, optimizer = accelerator.prepare(
        train_dataloader, eval_dataloader, model, optimizer)
    
    # Define the learning rate scheduler
    num_epochs = args.num_epochs
    num_training_steps = num_epochs * len(train_dl)
    num_valid_steps = num_epochs * len(eval_dl)
    lr_scheduler = get_scheduler(
        name = args.lr_scheduler_type,
        optimizer = optimizer,
        num_warmup_steps = args.num_warmup_steps,
        num_training_steps = num_training_steps
    )
    
    # Training loop
    progress_bar = tqdm(range(num_training_steps))
    progress_bar_eval = tqdm(range(num_valid_steps))
    
    idx=0
    idx_eval=0
    for epoch in range(num_epochs):
        model.train()
        for batch in train_dl:
            idx += 1
            outputs = model(**batch)
            loss = outputs.loss
            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()
            print(epoch, idx, loss)
            progress_bar.update(1)
      
        # Evaluate the model
        model.eval()
            
        for batch in eval_dl:
            idx_eval+= 1
            with torch.no_grad():
                outputs = model(**batch)
            eval_loss = outputs.loss
            eval_logits = outputs.logits
            pred_prob = torch.sigmoid(eval_logits)
            preds = (pred_prob > 0.5)*1
            
            for references, predictions in zip (batch["labels"], preds):
                accuracy_metric.add_batch(predictions=accelerator.gather(predictions),
                                          references=accelerator.gather(references))
                roc_auc_metric.add_batch(prediction_scores=accelerator.gather(pred_prob), 
                                         references=accelerator.gather(batch["labels"]))
                precision_micro_metric.add_batch(predictions=accelerator.gather(predictions), 
                                                 references=accelerator.gather(references))
                precision_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                                                    references=accelerator.gather(references))
                recall_micro_metric.add_batch(predictions=accelerator.gather(predictions), 
                                              references=accelerator.gather(references))
                recall_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                                                 references=accelerator.gather(references))
                f1_micro_metric.add_batch(predictions=accelerator.gather(predictions),
                                          references=accelerator.gather(references))
                f1_weighted_metric.add_batch(predictions=accelerator.gather(predictions), 
                                             references=accelerator.gather(references))
            accuracy_res = accuracy_metric.compute()
            roc_auc_res = roc_auc_metric.compute(average="micro")
            precision_micro_res = precision_micro_metric.compute(average="micro")
            precision_weighted_res = precision_weighted_metric.compute(average="weighted")
            recall_micro_res = recall_micro_metric.compute(average="micro")
            recall_weighted_res = recall_weighted_metric.compute(average="weighted")
            f1_micro_res = f1_micro_metric.compute(average="micro")
            f1_weighted_res = f1_weighted_metric.compute(average="weighted")
            
            print(epoch, idx_eval, eval_loss, accuracy_res, roc_auc_res, precision_micro_res,
                  precision_weighted_res, recall_micro_res, recall_weighted_res, 
                  f1_micro_res, f1_weighted_res)
            progress_bar_eval.update(1)
    

In [43]:
from accelerate import notebook_launcher
notebook_launcher(training_function, num_processes=1)

Launching training on one GPU.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at setu4993/smaller-LaBSE and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/60 [00:00<?, ?it/s]

  0%|          | 0/15 [00:00<?, ?it/s]

0 1 tensor(0.7026, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 2 tensor(0.6879, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 3 tensor(0.6788, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 4 tensor(0.6729, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 5 tensor(0.6768, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 6 tensor(0.6822, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 7 tensor(0.6754, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 8 tensor(0.6808, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 9 tensor(0.6712, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 10 tensor(0.6716, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
0 11 tensor(0.6742, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogi

2 60 tensor(0.6715, device='cuda:0',
       grad_fn=<BinaryCrossEntropyWithLogitsBackward0>)
2 11 tensor(0.6742, device='cuda:0') {'accuracy': 0.583984375} {'roc_auc': 0.6094343674111098} {'precision': 0.583984375} {'precision': 0.5831395584243512} {'recall': 0.583984375} {'recall': 0.583984375} {'f1': 0.583984375} {'f1': 0.5748421045385971}
2 12 tensor(0.6692, device='cuda:0') {'accuracy': 0.5950520833333334} {'roc_auc': 0.614765469908891} {'precision': 0.5950520833333334} {'precision': 0.5909602975517619} {'recall': 0.5950520833333334} {'recall': 0.5950520833333334} {'f1': 0.5950520833333334} {'f1': 0.5868554086721555}
2 13 tensor(0.6637, device='cuda:0') {'accuracy': 0.6038411458333334} {'roc_auc': 0.6238788464451706} {'precision': 0.6038411458333334} {'precision': 0.5981800909601359} {'recall': 0.6038411458333334} {'recall': 0.6038411458333334} {'f1': 0.6038411458333334} {'f1': 0.5962300516948943}
2 14 tensor(0.6710, device='cuda:0') {'accuracy': 0.5940755208333334} {'roc_auc': 0.6

### Evaluation Loop

In [None]:
progress_bar = tqdm(range(len(eval_dataloader)))

accuracy_metric = evaluate.load("accuracy")
roc_auc_metric = evaluate.load("roc_auc", "multilabel")
precision_micro_metric = evaluate.load("precision")
precision_weighted_metric = evaluate.load("precision")
recall_micro_metric = evaluate.load("recall")
recall_weighted_metric = evaluate.load("recall")
f1_micro_metric = evaluate.load("f1")
f1_weighted_metric = evaluate.load("f1")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    pred_prob = torch.sigmoid(outputs.logits)
    preds = (pred_prob > 0.5)*1
        
    for references, predictions in zip (batch["labels"], preds):
        accuracy_metric.add_batch(predictions=predictions, 
                              references=references)
        roc_auc_metric.add_batch(prediction_scores=preds, 
                              references=batch["labels"])
        precision_micro_metric.add_batch(predictions=predictions, 
                              references=references)
        precision_weighted_metric.add_batch(predictions=predictions, 
                              references=references)
        recall_micro_metric.add_batch(predictions=predictions, 
                              references=references)
        recall_weighted_metric.add_batch(predictions=predictions, 
                              references=references)
        f1_micro_metric.add_batch(predictions=predictions, 
                              references=references)
        f1_weighted_metric.add_batch(predictions=predictions, 
                              references=references)
    progress_bar.update(1)
    
accuracy_res = accuracy_metric.compute()
%time roc_auc_res = roc_auc_metric.compute(average="micro")
precision_micro_res = precision_micro_metric.compute(average="micro")
precision_weighted_res = precision_weighted_metric.compute(average="weighted")
recall_micro_res = recall_micro_metric.compute(average="micro")
recall_weighted_res = recall_weighted_metric.compute(average="weighted")
f1_micro_res = f1_micro_metric.compute(average="micro")
f1_weighted_res = f1_weighted_metric.compute(average="weighted")

In [None]:
print("Accuracy: ", accuracy_res)
print("roc_auc: ", roc_auc_res)
print("Precision_micro: ", precision_micro_res)
print("Precision_weighted: ", precision_weighted_res)
print("Recall_micro: ", recall_micro_res)
print("Recall_weighted: ", recall_weighted_res)
print("F1_micro: ", f1_micro_res)
print("F1_weighted: ", f1_weighted_res)

In [None]:
progress_bar = tqdm(range(len(eval_dataloader)))

roc_auc_metric = evaluate.load("roc_auc", "multilabel")

model.eval()
for batch in eval_dataloader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
    logits = outputs.logits
    pred_prob = torch.sigmoid(outputs.logits)
    preds = (pred_prob > 0.5)*1
        
    for references, predictions in zip (batch["labels"], preds):
        roc_auc_metric.add_batch(prediction_scores=preds, 
                              references=batch["labels"])
    progress_bar.update(1)
    
roc_auc_res = roc_auc_metric.compute(average="micro")

In [None]:
roc_auc_res

In [None]:
accuracy_metric = evaluate.load(["accuracy", "f1", "roc_auc", 
                                "precision", "recall"])