In [1]:
import torch
torch.cuda.empty_cache()

### Prepare Dataset

In [2]:
def load_data_and_target(path, sep=' '):
    data = []
    target = []
    
    with open(path, encoding='utf-8') as f:
        for line in f:
            x, y = sep.join(line.split(sep)[1:]).rstrip('\n'), line.split(sep)[0]
            data.append(x), target.append(y)

    return data, target

In [3]:
class Dataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

In [4]:
def get_tokenized_dataset(raw_data, labels, tokenizer):
    encodings = tokenizer(raw_data, padding="max_length", truncation=True)
    return Dataset(encodings, labels)

In [5]:
full_train_data, full_train_target = load_data_and_target('data/questions-train.txt')
test_data, test_target = load_data_and_target('data/questions-test.txt')
n_labels = len(set(full_train_target))

In [6]:
valid_ratio = 0.1
valid_size = int(len(full_train_data) * valid_ratio)
train_size = len(full_train_data) - valid_size

train_data, train_target = full_train_data[:train_size], full_train_target[:train_size]
eval_data, eval_target = full_train_data[train_size:], full_train_target[train_size:]

In [7]:
categorical_to_ordinal_values = dict()

for ord, cat in enumerate(set(full_train_target)):
    categorical_to_ordinal_values[cat] = ord

categorical_to_ordinal_values

{'LOCATION': 0,
 'ENTITY': 1,
 'PERSON': 2,
 'QUANTITY': 3,
 'DEFINITION': 4,
 'ABBREVIATION': 5,
 'TEMPORAL': 6,
 'DESCRIPTION': 7,
 'ORGANIZATION': 8}

In [8]:
def ordinal_encoding(dataset):
    """Encode categorical into ordinal values"""
    dataset_encoded = []
    for cat in dataset:
        dataset_encoded.append(categorical_to_ordinal_values[cat])

    return dataset_encoded

In [9]:
# encoding targets
train_target = ordinal_encoding(train_target)
eval_target = ordinal_encoding(eval_target)
test_target = ordinal_encoding(test_target)

### Some useful functions

In [10]:
!pip install evaluate

# Evaluation metric
import numpy as np
import evaluate

metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

Collecting evaluate
  Downloading evaluate-0.4.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.4/81.4 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: evaluate
Successfully installed evaluate-0.4.0
[0m

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [11]:
from transformers import TrainingArguments, Trainer

def train_and_eval_model(model, tokenizer, outd=None):
    train_dataset = get_tokenized_dataset(train_data, train_target, tokenizer)
    eval_dataset = get_tokenized_dataset(eval_data, eval_target, tokenizer)
    test_dataset = get_tokenized_dataset(test_data, test_target, tokenizer)

    # Training hyperparameters
    training_args = TrainingArguments(
        report_to="none",
        output_dir=outd, 
        evaluation_strategy="epoch", 
        per_device_train_batch_size=16, 
        per_device_eval_batch_size=8
    )

    # Create a Trainer object with your model, training arguments, training and test datasets, and evaluation function
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        compute_metrics=compute_metrics,
    )

    trainer.train()

    print(trainer.evaluate(test_dataset))

### Q1-a Fine-tune a pretrained model (BERT, RoBERTa)

In [12]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer

#### BERT

In [13]:
bert_base_model = AutoModelForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=n_labels)
bert_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

In [14]:
train_and_eval_model(bert_base_model, bert_tokenizer, "results/Q1-a/Bert")

***** Running training *****
  Num examples = 5001
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 939


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.241924,0.945946
2,0.428500,0.271154,0.944144
3,0.428500,0.232227,0.953153


***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
Saving model checkpoint to /kaggle/working/tp3nlp/results/Q1-a/Bert/checkpoint-500
Configuration saved in /kaggle/working/tp3nlp/results/Q1-a/Bert/checkpoint-500/config.json
Model weights saved in /kaggle/working/tp3nlp/results/Q1-a/Bert/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 0.15775156021118164, 'eval_accuracy': 0.974, 'eval_runtime': 8.9353, 'eval_samples_per_second': 55.958, 'eval_steps_per_second': 7.051, 'epoch': 3.0}


#### RoBERTa

In [15]:
roberta_base_model = AutoModelForSequenceClassification.from_pretrained("roberta-base", num_labels=n_labels)
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

https://huggingface.co/roberta-base/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpf2l5pu2l


Downloading:   0%|          | 0.00/481 [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
creating metadata file for /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
loading configuration file https://huggingface.co/roberta-base/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/733bade19e5f0ce98e6531021dd5180994bb2f7b8bd7e80c7968805834ba351e.35205c6cfc956461d8515139f0f8dd5d207a2f336c0c3a83b4bc8dca3518e37b
Model config RobertaConfig {
  "_name_or_path": "roberta-base",
  "architectures": [
    "RobertaForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hid

Downloading:   0%|          | 0.00/478M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
creating metadata file for /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
loading weights file https://huggingface.co/roberta-base/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/51ba668f7ff34e7cdfa9561e8361747738113878850a7d717dbc69de8683aaad.c7efaa30a0d80b2958b876969faa180e485944a849deee4ad482332de65365a7
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.decoder.weight', 'roberta.pooler.dense.bias', 'lm_head.dense.weight', 'roberta.pooler.dense.weight', 'lm_head.dense.bias', 'lm_head.layer_norm.bias', '

Downloading:   0%|          | 0.00/878k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/vocab.json in cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
creating metadata file for /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
https://huggingface.co/roberta-base/resolve/main/merges.txt not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpv_05dnjd


Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/merges.txt in cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
creating metadata file for /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
https://huggingface.co/roberta-base/resolve/main/tokenizer.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpmc39fixq


Downloading:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

storing https://huggingface.co/roberta-base/resolve/main/tokenizer.json in cache at /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
creating metadata file for /root/.cache/huggingface/transformers/d53fc0fa09b8342651efd4073d75e19617b3e51287c2a535becda5808a8db287.fc9576039592f026ad76a1c231b89aee8668488c671dfbe6616bab2ed298d730
loading file https://huggingface.co/roberta-base/resolve/main/vocab.json from cache at /root/.cache/huggingface/transformers/d3ccdbfeb9aaa747ef20432d4976c32ee3fa69663b379deb253ccfce2bb1fdc5.d67d6b367eb24ab43b08ad55e014cf254076934f71d832bbab9ad35644a375ab
loading file https://huggingface.co/roberta-base/resolve/main/merges.txt from cache at /root/.cache/huggingface/transformers/cafdecc90fcab17011e12ac813dd574b4b3fea39da6dd817813efa010262ff3f.5d12962c5ee615a4c803841266e9c3be9a691a924f72d395d3a6c6c81157788b
loading file https://huggingface.co/roberta

In [16]:
train_and_eval_model(roberta_base_model, roberta_tokenizer, "results/Q1-a/Roberta")

PyTorch: setting up devices
***** Running training *****
  Num examples = 5001
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 939


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.257466,0.927928
2,0.496700,0.305852,0.935135
3,0.496700,0.277564,0.944144


***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
Saving model checkpoint to /kaggle/working/tp3nlp/results/Q1-a/Roberta/checkpoint-500
Configuration saved in /kaggle/working/tp3nlp/results/Q1-a/Roberta/checkpoint-500/config.json
Model weights saved in /kaggle/working/tp3nlp/results/Q1-a/Roberta/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 0.21167896687984467, 'eval_accuracy': 0.968, 'eval_runtime': 8.8848, 'eval_samples_per_second': 56.276, 'eval_steps_per_second': 7.091, 'epoch': 3.0}


BERT est un modèle de transformers pré-entraîné sur un large corpus de données anglaises de manière auto-supervisée. il a deux objectifs : 
- Masked language modeling (MLM) : Prédire les mots masqués dans une phrase. 
- Next sentence prediction (NSP) : Prédire si deux phrases se suivent ou non.

RoBERTa (Robustly Optimized BERT Pretraining Approach) s'appuie sur BERT et modifie les hyperparamètres clés, en supprimant l'objectif de préformation de la phrase suivante et en effectuant une formation avec des mini-batchs et des taux d'apprentissage beaucoup plus importants.

L'implémentation originale de BERT effectue le masquage une fois pendant le prétraitement des données, ce qui donne un seul masque statique. Pour éviter d'utiliser le mÃªme masque pour chaque instance d'entraînement à  chaque époque, les données d'entraînement ont été dupliquées 10 fois de sorte que chaque séquence soit masquée de 10 façons diffèrentes au cours des 40 époques d'entraînement. Ainsi, chaque séquence d'entraînement a été vue avec le même masque quatre fois pendant l'entraînement.

RoBERTa quant à  lui génère le motif de masquage chaque fois que nous alimentons une séquence au modèle.

Selon Yinhan Liu et al. [1], cette diffèrence de masquage devient crucial lors du pré-entraînement pour plus d'étapes ou avec des ensembles de données plus importants.

[1] = https://arxiv.org/abs/1907.11692
RoBERTa: A Robustly Optimized BERT Pretraining Approach by Yinhan Liu, Myle Ott, Naman Goyal, Jingfei Du, Mandar Joshi, Danqi Chen, Omer Levy, Mike Lewis, Luke Zettlemoyer, Veselin Stoyanov.

Nous utilisons pour nos tests les versions bert-base-uncased et roberta-base dispo sur HuggingFace. Les résultats obtenus après fine-tuning sur notre jeu d'entrainement 'questions-train.txt' sont quasi identiques dans les deux cas. Soit entre autres, une précision autour de 97% sur 'questions-test.txt'. Ce qui signifie que le fine-tuning est un succès. 

### Q1-b Bonus: Train cls head model

In [17]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_labels)

# freeze parameters
for param in model.parameters():
    param.requires_grad = False

# Replace fc layer
model.classifier = torch.nn.Linear(768, n_labels)

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8"

In [18]:
train_and_eval_model(model, bert_tokenizer, "results/Q1-b")

PyTorch: setting up devices
***** Running training *****
  Num examples = 5001
  Num Epochs = 3
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 939


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,2.028675,0.210811
2,2.012500,2.012935,0.214414
3,2.012500,2.009361,0.223423


***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
Saving model checkpoint to /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500
Configuration saved in /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500/config.json
Model weights saved in /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 2.0849123001098633, 'eval_accuracy': 0.206, 'eval_runtime': 8.9305, 'eval_samples_per_second': 55.988, 'eval_steps_per_second': 7.054, 'epoch': 3.0}


In [19]:
model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=n_labels)

# Replace fc layer
model.classifier = torch.nn.Linear(768, n_labels)

train_and_eval_model(model, bert_tokenizer, "results/Q1-b")

loading configuration file https://huggingface.co/bert-base-uncased/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/3c61d016573b14f7f008c02c4e51a366c67ab274726fe2910691e2a761acf43e.37395cee442ab11005bcd270f3c34464dc1704b715b5d7d52b1a461abe3b9e4e
Model config BertConfig {
  "_name_or_path": "bert-base-uncased",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2,
    "LABEL_3": 3,
    "LABEL_4": 4,
    "LABEL_5": 5,
    "LABEL_6": 6,
    "LABEL_7": 7,
    "LABEL_8"

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.32582,0.920721
2,0.476500,0.280942,0.945946
3,0.476500,0.282521,0.944144


***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
Saving model checkpoint to /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500
Configuration saved in /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500/config.json
Model weights saved in /kaggle/working/tp3nlp/results/Q1-b/checkpoint-500/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8
***** Running Evaluation *****
  Num examples = 555
  Batch size = 8


Training completed. Do not forget to share your model on huggingface.co/models =)


***** Running Evaluation *****
  Num examples = 500
  Batch size = 8


{'eval_loss': 0.13561519980430603, 'eval_accuracy': 0.972, 'eval_runtime': 8.9131, 'eval_samples_per_second': 56.097, 'eval_steps_per_second': 7.068, 'epoch': 3.0}


Dans le cas du transfert d'apprentissage, on remplace la tête de classification par une nouvelle couche linéaire et on effectue l'entrainement en gélant les autres paramètres. On obtient alors une précision autour de 20% en test. 

Ceci est dû au gel des paramètres des autres couches sauf celle de la  classification qui se retrouve à  ne pas être adaptés pour la tâche qu'on essaye d'effectuer. Pour mettre en évidence ces propos, nous avons rajouté une petite expérience dans laquelle on remplace la tête de classification sans geler les autres paramètres. On a alors un résultat meilleur que précédemment (94%). 