# Roberta fine-tunning in mnli (original + negative sentences)

The code is from https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/README.md



## preparation of environment

https://github.com/facebookresearch/fairseq/issues/3723 解决non-iterable的问题 

This is fixed in the master branch of hydra with the following commit but no release was made afterwards:
facebookresearch/hydra@8fa67de

It is also fixed by downgrading to hydra-core==1.0.7

In [None]:
!nvidia-smi

Sat Aug 13 15:40:08 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   42C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "/content/drive/MyDrive/memoire"

/content/drive/MyDrive/memoire


In [None]:
%%bash
pip install regex requests hydra-core omegaconf
pip install hydra-core==1.0.7
pip install fairseq

In [None]:
import torch
from fairseq.data.data_utils import collate_tokens

In [None]:
%%bash
pip install transformers
pip install datasets
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
Collecting responses<0.19
  Downloading resp

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## load the model roberta.large and test on one sentence

In [None]:
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

In [None]:
tokens = roberta.encode('You are beautiful！')
# tokens # get a tensor 
# assert tokens.tolist() == [0, 31414, 232, 328, 2] # tolist() => tensor to list
roberta.decode(tokens)  # get : 'Hello world!'

'You are beautiful！'

## test on severals examples with roberta.large.mnli

In [None]:
roberta_mnli = torch.hub.load('pytorch/fairseq', 'roberta.large.mnli')
roberta_mnli.eval()

batch_of_pairs = [
    ['Roberta is a heavily optimized version of BERT.', 'Roberta is not very optimized.'],
    ['Roberta is a heavily optimized version of BERT.', 'Roberta is based on BERT.'],
    ['potatoes are awesome.', 'I like to run.'],
    ['Mars is very far from earth.', 'Mars is very close.'],
]

batch = collate_tokens(
    [roberta_mnli.encode(pair[0], pair[1]) for pair in batch_of_pairs], pad_idx=1
)

logprobs = roberta_mnli.predict('mnli', batch)
print(logprobs.argmax(dim=1))
# tensor([0, 2, 1, 0])

Using cache found in /root/.cache/torch/hub/pytorch_fairseq_main
100%|██████████| 751652118/751652118 [00:32<00:00, 22924276.87B/s]


tensor([0, 2, 1, 0])


In [None]:
label = {0:'contradiction', 1:'neutral', 2:'entailment'}
for e in (logprobs.argmax(dim=1)).tolist():
  print(label[e])

contradiction
entailment
neutral
contradiction


## evaluation on a whole file (dev)

### dev_matched 

In [None]:
# with roberta (not fine-tunned on mnli)
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda() # gpu
roberta.eval()
with open('corpus/MNLI/dev_matched.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Accuracy:  0.3540499235863474


| Accuracy:  0.3540499235863474


In [None]:
# with roberta_mnli
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta_mnli.cuda() # gpu
roberta_mnli.eval()
with open('corpus/MNLI/dev_matched.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        tokens = roberta_mnli.encode(sent1, sent2)
        prediction = roberta_mnli.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Accuracy:  0.9059602649006623

| Accuracy:  0.9059602649006623


### dev_mismatched

In [None]:
# with roberta (not fine-tunned on mnli)
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda() # gpu
roberta.eval()
with open('corpus/MNLI/dev_mismatched.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
#  Accuracy:  0.35211554109031734

| Accuracy:  0.35211554109031734


In [None]:
# with roberta_mnli
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta_mnli.cuda() # gpu
roberta_mnli.eval()
with open('corpus/MNLI/dev_mismatched.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        tokens = roberta_mnli.encode(sent1, sent2)
        prediction = roberta_mnli.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Accuracy:  0.9012408462164361

| Accuracy:  0.9012408462164361


### mnli_completed_with_importance

In [None]:
# with roberta (not fine-tunned on mnli)
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda() # gpu
roberta.eval()
with open('data/mnli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.2895

| Accuracy:  0.2895


In [None]:
# with roberta_mnli
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
roberta_mnli.cuda() # gpu
roberta_mnli.eval()
with open('data/mnli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        tokens = roberta_mnli.encode(sent1, sent2)
        prediction = roberta_mnli.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.763

| Accuracy:  0.763


## fine-tunning

### datasets

In [None]:
data_files = './data/mnli_completed_with_importance.tsv'
neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
neg_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-74b7affe083aeed2/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-74b7affe083aeed2/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 2000
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
label_map = {'contradiction':0 , 'neutral':1 , 'entailment':2}
labels = []
for label in neg_datasets['label']:
  labels.append(label_map[label])

In [None]:
neg_datasets = neg_datasets.remove_columns('label')
neg_datasets = neg_datasets.add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 2000
})

In [None]:
neg_datasets = neg_datasets.train_test_split(test_size=0.3)

In [None]:
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1)

In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 140
    })
})

### prepare trainer evironment

In [None]:
# download the checkpoint
checkpoint = "roberta-large-mnli"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading config.json:   0%|          | 0.00/688 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
def tokenize_function(example):
    return tokenizer(example["text"], example["hypothesis"], truncation=True)

tokenized_neg_datasets = train_neg_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 140
    })
})

In [None]:
# most hyperparameters are set by default : like optimizer
training_args = TrainingArguments(
    "roberta-large-mnli-fine-tunned-neg-snli", 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "mnli")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1) # 算出axis = -1 那个列的概率最大的那个
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model,
    training_args,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, importance, hypothesis, text. If Unnamed: 0, importance, hypothesis, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1260
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 945


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.660195,0.85
2,0.592600,0.635273,0.9
3,0.592600,0.751154,0.892857


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, importance, hypothesis, text. If Unnamed: 0, importance, hypothesis, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Saving model checkpoint to roberta-large-mnli-fine-tunned-neg-snli/checkpoint-315
Configuration saved in roberta-large-mnli-fine-tunned-neg-snli/checkpoint-315/config.json
Model weights saved in roberta-large-mnli-fine-tunned-neg-snli/checkpoint-315/pytorch_model.bin
tokenizer config file saved in roberta-large-mnli-fine-tunned-neg-snli/checkpoint-315/tokenizer_config.json
Special tokens file saved in roberta-large-mnli-fine-tunned-neg-snli/checkpoint-315/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, importance, hypothesis, text. If Unnamed: 0, importance, hypothesis, text are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4
Saving model checkpoint to roberta-large-mnli-fine-tunned-neg-snli/checkpoint-630
Configuration saved in robert

TrainOutput(global_step=945, training_loss=0.4149713425409226, metrics={'train_runtime': 317.1252, 'train_samples_per_second': 11.92, 'train_steps_per_second': 2.98, 'total_flos': 317753264943768.0, 'train_loss': 0.4149713425409226, 'epoch': 3.0})

## Evaluate with the fine-tunned model

In [None]:
# download the checkpoint
checkpoint = "roberta-large-mnli-fine-tunned-neg-snli/checkpoint-945"
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# evaluate on dev_mismatched
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('corpus/MNLI/dev_mismatched.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.7278275020341741

| Accuracy:  0.8711350691619203


In [None]:
# evaluate on negation_benchmark snli
label_map = {0: 'contradiction', 1: 'neutral', 2: 'entailment'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('data/mnli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:   0.838

| Accuracy:  0.9425


### test on important and unimportante sentences

In [None]:
test_neg_datasets = neg_datasets['test']

In [None]:
test_neg_datasets_importance = test_neg_datasets.filter(lambda example: '1' in example['importance'])
test_neg_datasets_unimportance = test_neg_datasets.filter(lambda example: example['importance']=='0')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 600
})

In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.7433333333333333

| Accuracy:  0.8616666666666667


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_importance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.6574074074074074

| Accuracy:  0.8176100628930818


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_unimportance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.8189655172413793

| Accuracy:  0.9024390243902439


# Roberta fine-tunning in snli (original + negative sentences)

## preparation of environment

https://github.com/facebookresearch/fairseq/issues/3723 解决non-iterable的问题 

This is fixed in the master branch of hydra with the following commit but no release was made afterwards:
facebookresearch/hydra@8fa67de

It is also fixed by downgrading to hydra-core==1.0.7

In [None]:
!nvidia-smi

Sun Aug 14 07:44:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    25W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd "/content/drive/MyDrive/memoire"

/content/drive/MyDrive/memoire


In [None]:
%%bash
pip install regex requests hydra-core omegaconf
pip install hydra-core==1.0.7
pip install fairseq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hydra-core
  Downloading hydra_core-1.2.0-py3-none-any.whl (151 kB)
Collecting omegaconf
  Downloading omegaconf-2.2.2-py3-none-any.whl (79 kB)
Collecting antlr4-python3-runtime==4.9.*
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
Collecting PyYAML>=5.1.0
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py): started
  Building wheel for antlr4-python3-runtime (setup.py): finished with status 'done'
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144575 sha256=371bbff883232b82e4bd0bd2adf73d0d50715f8a4a6140d80ed8b600d1f2e084
  Stored in directory: /root/.cache/pip/wheels/8b/8d/53/2af8772d9aec614e3fc65e53d4a993ad73c61daa

In [None]:
import torch
from fairseq.data.data_utils import collate_tokens

In [None]:
%%bash
pip install transformers
pip install datasets
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
Collecting responses<0.19
  Downloading

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## load the model roberta.large and test on one sentence

In [None]:
# ROBERTA LARGE
roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

In [None]:
tokens = roberta.encode('You are beautiful！')
# tokens # get a tensor 
# assert tokens.tolist() == [0, 31414, 232, 328, 2] # tolist() => tensor to list
roberta.decode(tokens)  # get : 'Hello world!'

'You are beautiful！'

In [None]:
# ROBERTA LARGE SNLI
checkpoint = "boychaboy/SNLI_roberta-large"
tokenizer_snli = AutoTokenizer.from_pretrained(checkpoint)
model_snli = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

Downloading tokenizer_config.json:   0%|          | 0.00/289 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/870 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.32G [00:00<?, ?B/s]

## evaluation on a whole file (dev)

### original 

In [None]:
# with roberta large

# redefine the head
roberta.register_classification_head('snli', num_classes=3)
logprobs = roberta.predict('snli', tokens)  # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)

label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
roberta.cuda() # gpu
roberta.eval()
with open('corpus/SNLI/snli_1.0_dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[5], tokens[6], tokens[0]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict("snli",tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

In [None]:
# with roberta large snli

label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_snli.cuda() # gpu
model_snli.eval()
with open('corpus/SNLI/snli_1.0_dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[5], tokens[6], tokens[0]
        inputs = tokenizer_snli(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_snli(**inputs.to(model_snli.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        # print(prediction_label, target)
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# Accuracy:  0.3540499235863474

| Accuracy:  0.9176


### mnli_completed_with_importance

In [None]:
# with roberta (not fine-tunned on mnli)
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
roberta.cuda() # gpu
roberta.eval()
with open('data/snli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict('mnli', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

In [None]:
# evaluate on negation_benchmark
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_snli.cuda() # gpu
model_snli.eval()
with open('data/snli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_snli(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_snli(**inputs.to(model_snli.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.65


## fine-tunning on negative sentences

### load datasets

In [None]:
data_files = './data/snli_completed_with_importance.tsv'
neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
neg_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-4ddf541e06bd4d44/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-4ddf541e06bd4d44/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 2000
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
label_map = {'entailment':0 , 'neutral':1 , 'contradiction':2}
labels = []
for label in neg_datasets['label']:
  labels.append(label_map[label])

In [None]:
neg_datasets = neg_datasets.remove_columns('label')
neg_datasets = neg_datasets.add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 2000
})

In [None]:
neg_datasets = neg_datasets.train_test_split(test_size=0.3)

In [None]:
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1)

In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 140
    })
})

### prepare trainer evironment

In [None]:
# # ROBERTA LARGE SNLI
# checkpoint = "boychaboy/SNLI_roberta-large"
# tokenizer_snli = AutoTokenizer.from_pretrained(checkpoint)
# model_snli = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
def tokenize_function(example):
    return tokenizer_snli(example["text"], example["hypothesis"], truncation=True)

tokenized_neg_datasets = train_neg_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_snli)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 140
    })
})

In [None]:
# most hyperparameters are set by default : like optimizer
training_args = TrainingArguments(
    "roberta-large-snli-fine-tunned-neg-snli", 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", 'mnli')
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1) # 算出axis = -1 那个列的概率最大的那个
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model_snli,
    training_args,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer_snli,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()
# 8 min 17s

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis, text, importance, Unnamed: 0. If hypothesis, text, importance, Unnamed: 0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1260
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 945


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.601776,0.842857
2,0.615500,0.745379,0.842857
3,0.615500,0.837211,0.871429


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis, text, importance, Unnamed: 0. If hypothesis, text, importance, Unnamed: 0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Saving model checkpoint to roberta-large-snli-fine-tunned-neg-snli/checkpoint-315
Configuration saved in roberta-large-snli-fine-tunned-neg-snli/checkpoint-315/config.json
Model weights saved in roberta-large-snli-fine-tunned-neg-snli/checkpoint-315/pytorch_model.bin
tokenizer config file saved in roberta-large-snli-fine-tunned-neg-snli/checkpoint-315/tokenizer_config.json
Special tokens file saved in roberta-large-snli-fine-tunned-neg-snli/checkpoint-315/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: hypothesis, text, importance, Unnamed: 0. If hypothesis, text, importance, Unnamed: 0 are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4
Saving model checkpoint to roberta-large-snli-fine-tunned-neg-snli/checkpoint-630
Configuration saved in robert

TrainOutput(global_step=945, training_loss=0.47335208307498344, metrics={'train_runtime': 497.1022, 'train_samples_per_second': 7.604, 'train_steps_per_second': 1.901, 'total_flos': 223169192701152.0, 'train_loss': 0.47335208307498344, 'epoch': 3.0})

## Evaluate with the fine-tunned model

In [None]:
# download the checkpoint
checkpoint = "roberta-large-snli-fine-tunned-neg-snli/checkpoint-945"
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# with roberta (not fine-tunned on mnli)
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('corpus/SNLI/snli_1.0_dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[5], tokens[6], tokens[0]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        # print(prediction_label, target)
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8958


In [None]:
# evaluate on negation_benchmark snli
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('data/snli_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.93


### test on important and unimportante sentences

In [None]:
test_neg_datasets = neg_datasets['test']

In [None]:
test_neg_datasets_importance = test_neg_datasets.filter(lambda example: '1' in example['importance'])
test_neg_datasets_unimportance = test_neg_datasets.filter(lambda example: example['importance']=='0')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 600
})

In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))


| Accuracy:  0.845


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_importance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))


| Accuracy:  0.7788778877887789


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_unimportance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))


| Accuracy:  0.904


# Roberta fine-tunning in rte (original + negative sentences)

## preparation of environment

https://github.com/facebookresearch/fairseq/issues/3723 解决non-iterable的问题 

This is fixed in the master branch of hydra with the following commit but no release was made afterwards:
facebookresearch/hydra@8fa67de

It is also fixed by downgrading to hydra-core==1.0.7

In [2]:
!nvidia-smi

Sun Aug 14 09:33:06 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
%cd "/content/drive/MyDrive/memoire"

/content/drive/MyDrive/memoire


In [5]:
%%bash
pip install regex requests hydra-core omegaconf
pip install hydra-core==1.0.7
pip install fairseq

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting hydra-core
  Downloading hydra_core-1.2.0-py3-none-any.whl (151 kB)
Collecting omegaconf
  Downloading omegaconf-2.2.2-py3-none-any.whl (79 kB)
Collecting antlr4-python3-runtime==4.9.*
  Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)
Collecting PyYAML>=5.1.0
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Building wheels for collected packages: antlr4-python3-runtime
  Building wheel for antlr4-python3-runtime (setup.py): started
  Building wheel for antlr4-python3-runtime (setup.py): finished with status 'done'
  Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144575 sha256=3ef1ec15019ad17b94111b5ceb37513839941ceb6e40a3789b7f007b6ed2be43
  Stored in directory: /root/.cache/pip/wheels/8b/8d/53/2af8772d9aec614e3fc65e53d4a993ad73c61daa

In [6]:
import torch
from fairseq.data.data_utils import collate_tokens

In [7]:
%%bash
pip install transformers
pip install datasets
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.8.1 tokenizers-0.12.1 transformers-4.21.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading

In [8]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## load the model roberta.large and test on one sentence

choose 1

In [None]:
# ROBERTA LARGE

roberta = torch.hub.load('pytorch/fairseq', 'roberta.large')
roberta.eval()  # disable dropout (or leave in train mode to finetune)

In [56]:
tokens = roberta.encode('You are beautiful！')
# tokens # get a tensor 
# assert tokens.tolist() == [0, 31414, 232, 328, 2] # tolist() => tensor to list
roberta.decode(tokens)  # get : 'Hello world!'

'You are beautiful！'

In [9]:
# ROBERTA LARGE RTE

# checkpoint = "howey/roberta-large-rte"
checkpoint = 'howey/roberta-large-rte'
tokenizer_rte = AutoTokenizer.from_pretrained(checkpoint)
model_rte = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/563 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/780k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at textattack/roberta-base-RTE were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


## evaluation on a whole file (dev)

### test the label's order

In [42]:
raw_inputs = [
    "I love you",
    "I like you",
]
inputs = tokenizer_rte(raw_inputs[0], raw_inputs[1], padding=True, truncation=True, return_tensors="pt")
print(inputs)

{'input_ids': tensor([[  0, 100, 657,  47,   2,   2, 100, 101,  47,   2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [43]:
outputs = model_rte(**inputs.to(model_rte.device))

In [44]:
outputs

SequenceClassifierOutput([('logits',
                           tensor([[-0.4729,  0.7108]], grad_fn=<AddmmBackward0>))])

In [45]:
print(outputs.logits.shape)

torch.Size([1, 2])


In [46]:
label_map = model_rte.config.id2label
label_map

{0: 'entailment', 1: 'not_entailment'}

In [47]:
label_map[0]='entailment'
label_map[1]='not_entailment'


In [48]:
label_map

{0: 'entailment', 1: 'not_entailment'}

In [49]:
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
print(label_map[predictions])

not_entailment


### dev

In [57]:
roberta.register_classification_head('rte', num_classes=2)
logprobs = roberta.predict('rte', tokens)  # tensor([[-1.1050, -1.0672, -1.1245]], grad_fn=<LogSoftmaxBackward>)

In [60]:
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
roberta.cuda()
roberta.eval()
with open('corpus/RTE/dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        tokens = roberta.encode(sent1, sent2)
        prediction = roberta.predict('rte', tokens).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.5270758122743683


In [50]:
# evaluate on dev
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_rte.cuda() # gpu
model_rte.eval()
with open('corpus/RTE/dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_rte(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_rte(**inputs.to(model_rte.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.7833935018050542


In [51]:
# evaluate on negation_benchmark
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_rte.cuda() # gpu
model_rte.eval()
with open('data/rte_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_rte(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_rte(**inputs.to(model_rte.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.783


## Fine-tunning in a negative sentences dataset

### Download the datasets

In [None]:
data_files = './data/rte_completed_with_importance.tsv'
neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
neg_datasets



Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 2000
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'text': Value(dtype='string', id=None)}

In [None]:
label_map = {'not_entailment':1, 'entailment':0}
labels = []
for label in neg_datasets['label']:
  labels.append(label_map[label])

In [None]:
neg_datasets = neg_datasets.remove_columns('label')
neg_datasets = neg_datasets.add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 2000
})

In [None]:
neg_datasets[1]

{'Unnamed: 0': 1,
 'hypothesis': 'Polio is under control in the world.',
 'importance': '0',
 'labels': 1,
 'text': "The most surprising news story of the past week must not be the 'wonderful' story that the AIDS epidemic in India is under control."}

In [None]:
neg_datasets = neg_datasets.train_test_split(test_size=0.3)

In [None]:
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1)

In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 140
    })
})

### prepare trainer evironment

In [None]:
# # download the checkpoint
# checkpoint = "howey/roberta-large-rte"
# tokenizer_rte = AutoTokenizer.from_pretrained(checkpoint)
# model_rte = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/792 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/2.83G [00:00<?, ?B/s]

In [None]:
def tokenize_function(example):
    return tokenizer_rte(example["text"], example["hypothesis"], truncation=True)

tokenized_neg_datasets = train_neg_datasets.map(tokenize_function, batched=True)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_rte)

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 1260
    })
    test: Dataset({
        features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 140
    })
})

In [None]:
# most hyperparameters are set by default : like optimizer
training_args = TrainingArguments(
    "roberta-rte-fine-tunned-neg-rte", 
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
)

In [None]:
def compute_metrics(eval_preds):
    metric = evaluate.load("glue", "rte")
    logits, labels = eval_preds
    predictions = np.argmax(logits, axis=-1) # 算出axis = -1 那个列的概率最大的那个
    return metric.compute(predictions=predictions, references=labels)

In [None]:
trainer = Trainer(
    model_rte,
    training_args,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    data_collator=data_collator,
    tokenizer=tokenizer_rte,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, hypothesis, text, importance. If Unnamed: 0, hypothesis, text, importance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 1260
  Num Epochs = 3
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 945


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.470513,0.785714
2,0.644500,0.693022,0.885714
3,0.644500,0.669301,0.892857


The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, hypothesis, text, importance. If Unnamed: 0, hypothesis, text, importance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Saving model checkpoint to roberta-rte-fine-tunned-neg-rte/checkpoint-315
Configuration saved in roberta-rte-fine-tunned-neg-rte/checkpoint-315/config.json
Model weights saved in roberta-rte-fine-tunned-neg-rte/checkpoint-315/pytorch_model.bin
tokenizer config file saved in roberta-rte-fine-tunned-neg-rte/checkpoint-315/tokenizer_config.json
Special tokens file saved in roberta-rte-fine-tunned-neg-rte/checkpoint-315/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `RobertaForSequenceClassification.forward` and have been ignored: Unnamed: 0, hypothesis, text, importance. If Unnamed: 0, hypothesis, text, importance are not expected by `RobertaForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 140
  Batch size = 4
Saving model checkpoint to roberta-rte-fine-tunned-neg-rte/checkpoint-630
Configuration saved in roberta-rte-fine-tunned-neg-rte/checkpoint-630/config.

TrainOutput(global_step=945, training_loss=0.548239345651455, metrics={'train_runtime': 258.1673, 'train_samples_per_second': 14.642, 'train_steps_per_second': 3.66, 'total_flos': 377366675723856.0, 'train_loss': 0.548239345651455, 'epoch': 3.0})

## Evaluate with the fine-tunned model

In [None]:
# download the checkpoint
checkpoint = "roberta-rte-fine-tunned-neg-rte/checkpoint-945"
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# evaluate on dev_mismatched
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('corpus/RTE/dev.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.7256317689530686


In [None]:
# evaluate on negation_benchmark
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with open('data/rte_completed_with_importance.tsv') as fin:
    fin.readline()
    for index, line in enumerate(fin):
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
        outputs = model_neg(**inputs.to(model_neg.device))
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))


| Accuracy:  0.916


### test on important and unimportante sentences

In [None]:
# # reload the dataset
# data_files = './data/mnli_completed_with_importance.tsv'
# neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
# neg_datasets

In [None]:
test_neg_datasets = neg_datasets['test']

In [None]:
test_neg_datasets_importance = test_neg_datasets.filter(lambda example: '1' in example['importance'])
test_neg_datasets_unimportance = test_neg_datasets.filter(lambda example: example['importance']=='0')

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 600
})

In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8366666666666667


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_importance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8347457627118644


In [None]:
# evaluate on negation_benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
for line in test_neg_datasets_unimportance:
    sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
    inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt") 
    outputs = model_neg(**inputs.to(model_neg.device))
    prediction = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
    ncorrect += int(prediction == target)
    nsamples += 1
    
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8514851485148515
