# Algorithms for Speech and NLP
MVA 2021-2022

David Soto: david.soto.c17@gmail.com

Elias Masquil: eliasmasquil@gmail.com

Nicolas Violante: nviolante96@gmail.com

In this project we compare the performance of the character-based language model CANINE against the subword-based mBERT on downstream classification tasks. In both cases, we use the pre-trained models available on the Hugging Face Hub and fine-tune them for the particular downstream task. 

*References*:


*   [CANINE: Pre-training an Efficient Tokenization-Free Encoder for Language Representation](https://arxiv.org/abs/2103.06874v3)
* [BERT: Pre-training of Deep Bidirectional Transformers for Language Understanding](https://arxiv.org/abs/1810.04805)
*   [Hugging Face tutorials](https://huggingface.co/docs/transformers/training#trainer)



## Installs, imports and Colab-Drive settings

In [None]:
!pip install transformers
!pip install datasets

Collecting transformers
  Downloading transformers-4.18.0-py3-none-any.whl (4.0 MB)
[K     |████████████████████████████████| 4.0 MB 19.6 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.5.1-py3-none-any.whl (77 kB)
[K     |████████████████████████████████| 77 kB 5.4 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 49.2 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.49-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 33.8 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 33.0 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found e

In [None]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset, load_metric
import numpy as np

from google.colab import drive
import os

In [None]:
# Directory where every experiment writes its outputs (local by default).
persistent_storage = "data"
# To persist on Google Drive instead (Colab only), uncomment both lines below.
# drive.mount('Drive')
# persistent_storage = 'Drive/My Drive/nlp-models/'
os.makedirs(persistent_storage, exist_ok=True)

## Experiment 0: Understanding a pre-trained model

We need two building blocks
1. Tokenizer
2. Model

Pre-trained checkpoints can be found here: https://huggingface.co/models

In [None]:
# Hub identifier of the pre-trained checkpoint; the tokenizer and the model
# are both loaded from it so that they match each other.
checkpoint = "google/canine-s"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# AutoModel loads the bare pre-trained model (no task-specific head).
model = AutoModel.from_pretrained(checkpoint)

In [None]:
# Two sentences of different lengths, both starting with "Life".
inputs = ["Life is like a box of chocolates.", "Life they say"]
# Encode both sentences at once: pad to the longest one and return PyTorch tensors.
model_inputs = tokenizer(inputs, padding="longest", truncation=True, return_tensors="pt")

- `return_tensors="pt"`: the tokenizer returns PyTorch tensors

- `padding="longest"`: pads the shorter sequences with dummy tokens to match the longest one. The tensor `model_inputs.input_ids` has shape $(2, 35)$, where $2$ is the number of sentences and $35$ is the length of the longest sentence (the first). The second sentence is padded with the dummy id $0$.

In [None]:
print(f"Shape of encoded (tokenized) tensor: {model_inputs.input_ids.shape}")

# For CANINE, the id of each token is the Unicode code point of the character
print(model_inputs)
# Each label names the character whose code point is printed next to it,
# so the ids can be matched against the tensor printed above.
print(f"Unicode ids: L={ord('L')}, i={ord('i')}, f={ord('f')}, e={ord('e')},")

- Each token gets a feature (last_hidden_state) of shape $768$

- The feature of the "L" character of the word "Life" in the first sentence is 
different from the feature of the "L" in the second sentence: the transformer takes the context (the rest of the sentence) into account.


In [None]:
# Forward pass of the pre-trained model; the tokenizer output is unpacked
# into keyword arguments.
outputs = model(**model_inputs)

# One feature vector per token of every sentence.
print(outputs.last_hidden_state.shape)
# First five feature values of the token at position 1 in each sentence.
# NOTE(review): position 1 is treated as the "L" of "Life"; presumably
# position 0 holds a special start token -- confirm against the printed ids.
print("Features of L, first sentence: " , outputs.last_hidden_state[0][1][:5])
print("Features of L, second sentence: " ,outputs.last_hidden_state[1][1][:5])

## Fine-tuning a pre-trained model

- We import the AutoModel for the desired fine-tuning task, for example `AutoModelForSequenceClassification`. This should be the pre-trained model with one extra layer specific to the task (the pre-trained model only outputs embeddings).

- For sequence classification on sentence pairs, we take two sequences as input and output the logits from which we obtain the probability that both sentences are equivalent (1) or not equivalent (0).

- For sentence entailment, since our original pre-trained model takes only one sequence as input, we need a wrapper that merges the two sentences and provides a way to distinguish them (the tokenizer does this automatically).

To run the experiments for a different model, you just need to select the appropriate checkpoint and run all the code.

In [None]:
# Function encapsulating all the steps needed for fine-tuning and evaluating a model
def train_and_evaluate(checkpoint, tokenizer, metrics_function, dataset, training_args, model_name, num_labels=None, eval_subset="validation"):
    """Fine-tune a sequence-classification model and evaluate it.

    The model is evaluated once before training (the resulting metrics are
    printed as a baseline) and then trained with the Hugging Face `Trainer`.

    Args:
        checkpoint: Hub identifier (or path) of the pre-trained checkpoint.
        tokenizer: Tokenizer handed to the data collator and the trainer. It
            should be the one used to encode `dataset`. When `None`, the
            tokenizer of `checkpoint` is loaded.
        metrics_function: Callable passed to the trainer as `compute_metrics`.
        dataset: Mapping of split names to tokenized datasets; it must hold a
            "train" entry and an `eval_subset` entry.
        training_args: `TrainingArguments` forwarded to the trainer.
        model_name: Not used by this function; kept so existing calls that
            pass it keep working.
        num_labels: Number of classes of the classification head. When `None`
            the default of the checkpoint configuration is used.
        eval_subset: Name of the split of `dataset` used for evaluation.

    Returns:
        The `Trainer` after training, so the fine-tuned model can be
        evaluated further or saved by the caller.
    """
    # Use the caller's tokenizer instead of silently replacing it; only fall
    # back to the checkpoint's tokenizer when none was provided.
    if tokenizer is None:
        tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    # Only override the size of the classification head when explicitly asked.
    model_kwargs = {} if num_labels is None else {"num_labels": num_labels}
    model = AutoModelForSequenceClassification.from_pretrained(checkpoint, **model_kwargs)
    data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

    trainer = Trainer(
        model,
        training_args,
        train_dataset=dataset["train"],
        eval_dataset=dataset[eval_subset],
        data_collator=data_collator,
        tokenizer=tokenizer,
        compute_metrics=metrics_function,
    )

    # Training
    print("before training evaluation")
    # Show the baseline metrics instead of discarding them.
    print(trainer.evaluate())
    print("start training")
    trainer.train()
    return trainer

### Experiment 1: GLUE-MRPC

In [None]:
# Settings
# Hub checkpoint fine-tuned in this experiment.
checkpoint = "google/canine-c"
# Run name; it becomes the sub-directory of `persistent_storage` for outputs.
model_name = "canine-c-glue-mrpc"
output_path = os.path.join(persistent_storage, model_name)
# Module-level tokenizer, read by `tokenize_function` below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """
    Tokenize a batch of sentence pairs with the module-level `tokenizer`.

    The tokenizer merges "sentence1" and "sentence2" into a single sequence
    and marks them via the "token_type_ids" field: token_type_ids=0 for the
    tokens of sentence1 and token_type_ids=1 for those of sentence2.

    No padding is applied here: the DataCollatorWithPadding used at training
    time pads every batch to its own longest sequence, which avoids padding
    whole map batches to the length of their longest example.
    """
    return tokenizer(example["sentence1"],
                     example["sentence2"],
                     truncation=True)

def compute_metrics(eval_preds):
    """
    Score predictions with the GLUE/MRPC metric.

    `eval_preds` unpacks into (logits, labels); the predicted class of each
    example is the arg-max of its logits over the last axis.
    """
    glue_mrpc = load_metric("glue", "mrpc")
    raw_scores, gold_labels = eval_preds
    predicted_classes = np.argmax(raw_scores, axis=-1)
    return glue_mrpc.compute(
        predictions=predicted_classes,
        references=gold_labels,
    )

    
# Load GLUE/MRPC and tokenize every split in batches.
dataset = load_dataset("glue", "mrpc")
dataset = dataset.map(tokenize_function, batched=True)

# Evaluate after every epoch and log at every step.
training_args = TrainingArguments(
    output_dir=output_path,
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    evaluation_strategy="epoch",
    logging_steps=1,
)

# Training + evaluation
train_and_evaluate(
    checkpoint=checkpoint,
    tokenizer=tokenizer,
    metrics_function=compute_metrics,
    dataset=dataset,
    training_args=training_args,
    model_name=model_name,
)

### Experiment 2: ajgt_twitter_ar

In [None]:
# Settings
# Hub checkpoint fine-tuned in this experiment.
checkpoint = "bert-base-multilingual-cased"
# Run name; it becomes the sub-directory of `persistent_storage` for outputs.
model_name = "bert-cased-ajgt_twitter_ar"
output_path = os.path.join(persistent_storage, model_name)
# Module-level tokenizer, read by `tokenize_function` below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """
    Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Each example holds a single text, so no sentence pair is merged here.

    No padding is applied here: the DataCollatorWithPadding used at training
    time pads every batch to its own longest sequence, which avoids padding
    whole map batches to the length of their longest example.
    """
    return tokenizer(example["text"],
                     truncation=True)


def compute_metrics(eval_preds):
    """
    Score predictions with the metric registered as GLUE/MRPC.

    `eval_preds` unpacks into (logits, labels); the predicted class of each
    example is the arg-max of its logits over the last axis.
    """
    scorer = load_metric("glue", "mrpc")
    model_logits, true_labels = eval_preds
    chosen_classes = np.argmax(model_logits, axis=-1)
    return scorer.compute(
        predictions=chosen_classes,
        references=true_labels,
    )
    

raw_dataset = load_dataset("ajgt_twitter_ar").map(tokenize_function, batched=True)
# The evaluation set is carved out of the "train" split. The seed is fixed so
# that every run (and every checkpoint being compared) sees the same split.
dataset = raw_dataset["train"].train_test_split(seed=42)
training_args = TrainingArguments(output_path,
                                  num_train_epochs=5,
                                  learning_rate=5e-5,
                                  per_device_train_batch_size=16,
                                  evaluation_strategy="epoch",
                                  logging_steps=1
                                  ) 

# Training + evaluation
train_and_evaluate(checkpoint, tokenizer, compute_metrics, dataset, training_args, model_name, eval_subset="test")

### Experiment 3: fvillena/spanish_diagnostics 

In [None]:
# Settings
# Hub checkpoint fine-tuned in this experiment.
checkpoint = "bert-base-multilingual-uncased"
# Run name; it becomes the sub-directory of `persistent_storage` for outputs.
model_name = "bert-uncased-diagnostics"
output_path = os.path.join(persistent_storage, model_name)
# Module-level tokenizer, read by `tokenize_function` below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """
    Tokenize the "text" field of a batch with the module-level `tokenizer`.

    Each example holds a single text, so no sentence pair is merged here.

    No padding is applied here: the DataCollatorWithPadding used at training
    time pads every batch to its own longest sequence, which avoids padding
    whole map batches to the length of their longest example.
    """
    return tokenizer(example["text"],
                     truncation=True)
    
    
def compute_metrics(eval_preds):
    """
    Score predictions with the metric registered as GLUE/MRPC.

    `eval_preds` unpacks into (logits, labels); the predicted class of each
    example is the arg-max of its logits over the last axis.
    """
    mrpc_metric = load_metric("glue", "mrpc")
    class_logits, expected = eval_preds
    predicted = np.argmax(class_logits, axis=-1)
    return mrpc_metric.compute(
        predictions=predicted,
        references=expected,
    )

# Load the dataset and tokenize every split in batches.
dataset = load_dataset("fvillena/spanish_diagnostics")
dataset = dataset.map(tokenize_function, batched=True)

# Keep only the first 3500 training and the first 500 test examples.
dataset["train"] = dataset["train"].select(range(3500))
dataset["test"] = dataset["test"].select(range(500))

# Evaluate after every epoch and log at every step.
training_args = TrainingArguments(
    output_dir=output_path,
    num_train_epochs=5,
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    evaluation_strategy="epoch",
    logging_steps=1,
)

# Training + evaluation
train_and_evaluate(
    checkpoint=checkpoint,
    tokenizer=tokenizer,
    metrics_function=compute_metrics,
    dataset=dataset,
    training_args=training_args,
    model_name=model_name,
    eval_subset="test",
)

### Experiment 4: amazon_reviews_multi (spanish split)


In [None]:
# Settings
# Hub checkpoint fine-tuned in this experiment.
checkpoint = "google/canine-s"
# Run name; it becomes the sub-directory of `persistent_storage` for outputs.
model_name = "canine-s-amazon"
output_path = os.path.join(persistent_storage, model_name)
# Module-level tokenizer, read by `tokenize_function` below.
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

def tokenize_function(example):
    """
    Tokenize the "review_body" field of a batch with the module-level
    `tokenizer`.

    Each example holds a single text, so no sentence pair is merged here.

    No padding is applied here: the DataCollatorWithPadding used at training
    time pads every batch to its own longest sequence, which avoids padding
    whole map batches to the length of their longest example.
    """
    return tokenizer(example["review_body"],
                     truncation=True)
    
    
def compute_metrics(eval_preds):
    """
    Return the macro-averaged F1 score and the accuracy in a single dict.

    `eval_preds` unpacks into (logits, labels); the predicted class of each
    example is the arg-max of its logits over the last axis.
    """
    f1_metric = load_metric("f1")
    accuracy_metric = load_metric("accuracy")
    raw_scores, gold = eval_preds
    guesses = np.argmax(raw_scores, axis=-1)

    results = {}
    results.update(f1_metric.compute(predictions=guesses, references=gold, average="macro"))
    results.update(accuracy_metric.compute(predictions=guesses, references=gold))
    return results
    
raw_dataset = load_dataset("amazon_reviews_multi", "es")
# train set is huge, validation is already balanced! The original train split
# is never used, so drop it *before* tokenizing instead of encoding it for
# nothing.
del raw_dataset["train"]
dataset = raw_dataset.map(tokenize_function, batched=True)
dataset = dataset.rename_column("stars", "label")
# Shift every star rating down by one so the labels start at 0
# (the model below is created with num_labels=5).
dataset = dataset.map(
    lambda batch: {"label": [stars - 1 for stars in batch["label"]]},
    batched=True,
)
# Train on the (balanced) validation split and evaluate on the test split.
dataset["train"] = dataset["validation"]
training_args = TrainingArguments(output_path,
                                num_train_epochs=5,
                                learning_rate=5e-5,
                                per_device_train_batch_size=8,
                                evaluation_strategy="epoch",
                                logging_steps=1,
                                ) 

# Training + evaluation
train_and_evaluate(checkpoint, tokenizer, compute_metrics, dataset, training_args, model_name, num_labels=5, eval_subset="test")