# BERT fine-tuning on MNLI (original + negative sentences)





## preparation

In [None]:
!nvidia-smi

Wed Aug 31 14:52:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   52C    P0    36W / 250W |   5773MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd '/content/drive/MyDrive/memoire'

/content/drive/MyDrive/memoire


In [None]:
%%bash
pip install transformers
pip install datasets
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## evaluate on bert_mnli

### model + tokenizer

In [None]:
# Fetch the sequence-classification model and its matching tokenizer
# from the same checkpoint identifier.
checkpoint = "vuiseng9/bert-mnli"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

### test

In [None]:
# Smoke test: encode a single sentence pair as one two-segment input.
raw_inputs = ["I hate this.", "I like this so much!"]
inputs = tokenizer(
    raw_inputs[0],
    raw_inputs[1],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[ 101, 1045, 5223, 2023, 1012,  102, 1045, 2066, 2023, 2061, 2172,  999,
          102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Inference only: skip autograd bookkeeping so no computation graph is kept alive.
with torch.no_grad():
    outputs = model(**inputs.to(model.device))

In [None]:
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-2.4661, -2.0492,  5.1310]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
print(outputs.logits.shape)

torch.Size([1, 3])


In [None]:
# NOTE: no copy is made here, so label_map presumably refers to the config's own
# id2label mapping; item assignments on label_map would then also change
# model.config.id2label.
label_map = model.config.id2label

In [None]:
# Set the class-id -> label-name entries used for this model.
label_map.update({0: 'entailment', 1: 'neutral', 2: 'contradiction'})

In [None]:
# Softmax is monotonic, so the arg-max of the raw logits is already the
# predicted class; no need to normalise first.
predictions = outputs.logits.argmax(dim=-1).item()
print(label_map[predictions])

contradiction


### evaluate (original+ negative sentences)

In [None]:
# evaluate on dev_mismatched
# Reads the TSV row by row, predicts a label for each sentence pair and
# reports the fraction of rows whose predicted label name equals the last column.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model.cuda() # gpu
model.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with open('corpus/MNLI/dev_mismatched.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # columns 8 and 9 hold the two sentences; the last column is the reference label
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        inputs = tokenizer(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs.to(model.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8495728234336859


In [None]:
# evaluate on negation_benchmark
# Same procedure as for the dev set, applied to every row of the negation TSV.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model.cuda() # gpu
model.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with open('data/mnli_completed_with_importance_neg_only.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # columns 2 and 3 hold the two sentences; column 4 is the reference label
        sent1, sent2, target = tokens[2], tokens[3], tokens[4]
        inputs = tokenizer(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model(**inputs.to(model.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.644


## fine-tuning

### datasets

In [None]:
# Read the tab-separated negation file into a single Dataset and display it.
data_files = './data/mnli_completed_with_importance_neg_only.tsv'
neg_datasets = load_dataset(
    'csv',
    data_files=data_files,
    sep='\t',
    split="train",
)
neg_datasets



Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 1500
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'index': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None)}

In [None]:
# Translate each string label into its integer class id.
label_map = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
labels = [label_map[name] for name in neg_datasets['label']]

In [None]:
# Replace the string 'label' column with a 'labels' column holding the
# integer ids from the `labels` list.
neg_datasets = neg_datasets.remove_columns('label').add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 1500
})

In [None]:
# Hold out 30% of the rows as the final test split. A fixed seed makes the
# split (and every accuracy computed on it) reproducible across runs.
# NOTE: this rebinds neg_datasets from a Dataset to a DatasetDict with
# 'train' and 'test' splits.
neg_datasets = neg_datasets.train_test_split(test_size=0.3, seed=42)



In [None]:
# Split the training portion again: 10% of it becomes the validation set used
# during fine-tuning. A fixed seed keeps this split reproducible across runs.
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1, seed=42)



In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 105
    })
})

### prepare trainer environment

In [None]:
# download the checkpoint
# checkpoint = "vuiseng9/bert-mnli"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
def tokenize_function(example):
    """Tokenize the 'text' / 'hypothesis' fields of ``example`` as sentence pairs.

    Sequences are truncated but not padded here.
    Returns whatever the module-level ``tokenizer`` returns for these inputs.
    """
    first = example["text"]
    second = example["hypothesis"]
    return tokenizer(first, second, truncation=True)

# The collator pads each batch when it is assembled; tokenization itself is
# applied to all splits in batched mode.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_neg_datasets = train_neg_datasets.map(
    tokenize_function,
    batched=True,
)



  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 105
    })
})

In [None]:
# Only the settings below are overridden; everything else (e.g. the optimizer)
# keeps the library defaults. Evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="bert-base-fine-tunned-neg-mnli",
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def compute_metrics(eval_preds):
    """Score model predictions with the GLUE/MNLI metric.

    Args:
        eval_preds: a pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against ``labels``.
    """
    # Load the metric once and reuse it on later calls instead of reloading it
    # for every evaluation pass.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load("glue", "mnli")
        compute_metrics._metric = metric
    logits, labels = eval_preds
    # index of the highest-scoring class along the last axis
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Wire model, arguments, data and metric together. The 'test' split of the
# tokenized dataset serves as the evaluation set.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, importance, text, hypothesis, Unnamed: 0. If index, importance, text, hypothesis, Unnamed: 0 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 945
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 90


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.798909,0.666667
2,No log,0.76367,0.666667
3,No log,0.767906,0.704762


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, importance, text, hypothesis, Unnamed: 0. If index, importance, text, hypothesis, Unnamed: 0 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 105
  Batch size = 32
Saving model checkpoint to bert-base-fine-tunned-neg-mnli/checkpoint-30
Configuration saved in bert-base-fine-tunned-neg-mnli/checkpoint-30/config.json
Model weights saved in bert-base-fine-tunned-neg-mnli/checkpoint-30/pytorch_model.bin
tokenizer config file saved in bert-base-fine-tunned-neg-mnli/checkpoint-30/tokenizer_config.json
Special tokens file saved in bert-base-fine-tunned-neg-mnli/checkpoint-30/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index,

TrainOutput(global_step=90, training_loss=0.5870745764838324, metrics={'train_runtime': 50.3437, 'train_samples_per_second': 56.313, 'train_steps_per_second': 1.788, 'total_flos': 85795529915088.0, 'train_loss': 0.5870745764838324, 'epoch': 3.0})

## Evaluate with the fine-tuned model

In [None]:
# Load the model and tokenizer from the checkpoint directory named below.
checkpoint = "bert-base-fine-tunned-neg-mnli/checkpoint-90"
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)

Didn't find file bert-base-fine-tunned-neg-mnli/checkpoint-90/added_tokens.json. We won't load it.
loading file bert-base-fine-tunned-neg-mnli/checkpoint-90/vocab.txt
loading file bert-base-fine-tunned-neg-mnli/checkpoint-90/tokenizer.json
loading file None
loading file bert-base-fine-tunned-neg-mnli/checkpoint-90/special_tokens_map.json
loading file bert-base-fine-tunned-neg-mnli/checkpoint-90/tokenizer_config.json
loading configuration file bert-base-fine-tunned-neg-mnli/checkpoint-90/config.json
Model config BertConfig {
  "_name_or_path": "bert-base-fine-tunned-neg-mnli/checkpoint-90",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "mnli",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "entailment",
    "1": "neutral",
    "2": "contradiction"
  },
  "initializer_range": 0.02,
  "intermediat

In [None]:
# evaluate on dev_mismatched
# Reads the TSV row by row, predicts a label for each sentence pair with the
# fine-tuned model and reports the fraction of rows whose predicted label name
# equals the last column.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with open('corpus/MNLI/dev_mismatched.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # columns 8 and 9 hold the two sentences; the last column is the reference label
        sent1, sent2, target = tokens[8], tokens[9], tokens[-1]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.7278275020341741

| Accuracy:  0.8374694873881204


### test on important and unimportant sentences

In [None]:
# Take the 'test' split of neg_datasets as the set used for the final evaluation.
test_neg_datasets = neg_datasets['test']

In [None]:
# Partition the held-out set by the 'importance' column.
# NOTE(review): the two predicates are not complements. The first is a substring
# test ('1' anywhere in the value), the second an exact match on '0', so rows
# matching neither land in no subset. Confirm this is intended.
test_neg_datasets_importance = test_neg_datasets.filter(
    lambda row: '1' in row['importance']
)
test_neg_datasets_unimportance = test_neg_datasets.filter(
    lambda row: row['importance'] == '0'
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 450
})

In [None]:
# evaluate on the held-out split of the negation benchmark
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with torch.no_grad():
    for example in test_neg_datasets:
        # 'labels' already holds integer class ids, so compare ids directly
        sent1, sent2, target = example['text'], example['hypothesis'], example['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.7433333333333333

| Accuracy:  0.7333333333333333


In [None]:
# evaluate on the 'importance' subset of the held-out negation split
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with torch.no_grad():
    for example in test_neg_datasets_importance:
        # 'labels' already holds integer class ids, so compare ids directly
        sent1, sent2, target = example['text'], example['hypothesis'], example['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.6574074074074074

| Accuracy:  0.6798780487804879


In [None]:
# evaluate on the 'unimportance' subset of the held-out negation split
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with torch.no_grad():
    for example in test_neg_datasets_unimportance:
        # 'labels' already holds integer class ids, so compare ids directly
        sent1, sent2, target = example['text'], example['hypothesis'], example['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# | Accuracy:  0.8189655172413793

| Accuracy:  0.8571428571428571


# BERT fine-tuning on SNLI (original + negative sentences)





## preparation

In [None]:
!nvidia-smi

Sun Aug 14 05:14:23 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
%cd '/content/drive/MyDrive/memoire'

/content/drive/MyDrive/memoire


In [None]:
%%bash
pip install transformers
pip install datasets
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstalling PyYAML-3.13:
      Successfully uninstalled PyYAML-3.13
Successfully installed huggingface-hub-0.8.1 pyyaml-6.0 tokenizers-0.12.1 transformers-4.21.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/sim

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
import evaluate

## evaluate on bert_snli

### load the model and tokenizer

In [None]:
# ERROR ! 

# checkpoint = "textattack/bert-base-uncased-snli"
# tokenizer_snli = AutoTokenizer.from_pretrained(checkpoint)
# model_snli = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

### fine-tuning bert-base-uncased on SNLI
The SNLI model already fine-tuned and published on Hugging Face does not perform very well, so I fine-tune one again myself.

In [None]:
# Load the SNLI dataset (no split is selected here) and display it.
# NOTE(review): the variable `datasets` has the same name as the `datasets`
# package that load_dataset is imported from; only load_dataset was imported by
# name, so nothing is shadowed, but the name is easy to misread.
datasets = load_dataset('snli')
datasets

In [None]:
# Keep only rows whose label is not -1.
# (presumably -1 marks examples without a gold label; confirm against the dataset card)
datasets = datasets.filter(lambda row: row['label'] != -1)



In [None]:
print(set(datasets['train']['label']))

{0, 1, 2}


In [None]:
# Load the base checkpoint with a 3-label classification head, plus its
# tokenizer, then display the model.
checkpoint = "bert-base-uncased"
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model

In [None]:
def tokenize_function(example):
    """Tokenize the 'premise' / 'hypothesis' fields of ``example`` as sentence pairs.

    Sequences are truncated but not padded here.
    Returns whatever the module-level ``tokenizer`` returns for these inputs.
    """
    first = example["premise"]
    second = example["hypothesis"]
    return tokenizer(first, second, truncation=True)

# The collator pads each batch when it is assembled; tokenization itself is
# applied to all splits in batched mode.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
tokenized_datasets = datasets.map(
    tokenize_function,
    batched=True,
)



  0%|          | 0/10 [00:00<?, ?ba/s]

In [None]:
tokenized_datasets

DatasetDict({
    test: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9824
    })
    train: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 549367
    })
    validation: Dataset({
        features: ['premise', 'hypothesis', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 9842
    })
})

In [None]:
# Only the settings below are overridden; everything else (e.g. the optimizer)
# keeps the library defaults. One epoch, evaluated and checkpointed at its end.
training_args = TrainingArguments(
    output_dir="bert-based-uncased-snli",
    num_train_epochs=1,
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
def compute_metrics(eval_preds):
    """Score model predictions with the GLUE/MNLI metric.

    Args:
        eval_preds: a pair that unpacks into ``(logits, labels)``.

    Returns:
        The result of ``metric.compute`` for the arg-max predictions
        against ``labels``.
    """
    # Load the metric once and reuse it on later calls instead of reloading it
    # for every evaluation pass.
    metric = getattr(compute_metrics, "_metric", None)
    if metric is None:
        metric = evaluate.load('glue', 'mnli')
        compute_metrics._metric = metric
    logits, labels = eval_preds
    # index of the highest-scoring class along the last axis
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Wire model, arguments, data and metric together; the 'validation' split is
# used for the per-epoch evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hypothesis, premise. If hypothesis, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 549367
  Num Epochs = 1
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 137342


Epoch,Training Loss,Validation Loss,Accuracy
1,0.4475,0.430992,0.904186


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: hypothesis, premise. If hypothesis, premise are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 9842
  Batch size = 4


Downloading builder script:   0%|          | 0.00/5.75k [00:00<?, ?B/s]

Saving model checkpoint to bert-based-uncased-snli/checkpoint-137342
Configuration saved in bert-based-uncased-snli/checkpoint-137342/config.json
Model weights saved in bert-based-uncased-snli/checkpoint-137342/pytorch_model.bin
tokenizer config file saved in bert-based-uncased-snli/checkpoint-137342/tokenizer_config.json
Special tokens file saved in bert-based-uncased-snli/checkpoint-137342/special_tokens_map.json


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=137342, training_loss=0.4103601852901169, metrics={'train_runtime': 7671.9615, 'train_samples_per_second': 71.607, 'train_steps_per_second': 17.902, 'total_flos': 9734021181743922.0, 'train_loss': 0.4103601852901169, 'epoch': 1.0})

### reload the fine-tuned model : model_snli

In [None]:
# Load the model and tokenizer from the checkpoint directory named below.
checkpoint = "bert-based-uncased-snli/checkpoint-137342"
model_snli = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)
tokenizer_snli = AutoTokenizer.from_pretrained(checkpoint)

### test the label's order

In [None]:
# Encode one sentence pair as a single two-segment input, used to check which
# output index corresponds to which label.
raw_inputs = [
    "Two women are embracing while holding to go packages.",
    "Two woman are holding packages.",
]
inputs = tokenizer_snli(
    raw_inputs[0],
    raw_inputs[1],
    padding=True,
    truncation=True,
    return_tensors="pt",
)
print(inputs)

{'input_ids': tensor([[  101,  2048,  2308,  2024, 23581,  2096,  3173,  2000,  2175, 14555,
          1012,   102,  2048,  2450,  2024,  3173, 14555,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}


In [None]:
# Inference only: skip autograd bookkeeping so no computation graph is kept alive.
with torch.no_grad():
    outputs = model_snli(**inputs.to(model_snli.device))

In [None]:
outputs

SequenceClassifierOutput([('logits',
                           tensor([[ 5.2433, -1.1663, -2.9671]], grad_fn=<AddmmBackward0>))])

In [None]:
print(outputs.logits.shape)

torch.Size([1, 3])


In [None]:
# NOTE: no copy is made here, so label_map presumably refers to the config's own
# id2label mapping; item assignments on label_map would then also change
# model_snli.config.id2label.
label_map = model_snli.config.id2label
label_map

{0: 'LABEL_0', 1: 'LABEL_1', 2: 'LABEL_2'}

In [None]:
# Replace the placeholder names with the class-id -> label-name entries.
label_map.update({0: 'entailment', 1: 'neutral', 2: 'contradiction'})

In [None]:
label_map

{0: 'entailment', 1: 'neutral', 2: 'contradiction'}

In [None]:
# Softmax is monotonic, so the arg-max of the raw logits is already the
# predicted class; no need to normalise first.
predictions = outputs.logits.argmax(dim=-1).item()
print(label_map[predictions])

entailment


### evaluate (original+ negative sentences)

In [None]:
# evaluate on the SNLI dev set
# Reads the TSV row by row, predicts a label for each sentence pair and
# reports the fraction of rows whose predicted label name equals column 0.
# Rows whose column 0 is not one of the three label names count as incorrect.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_snli.cuda() # gpu
model_snli.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with open('corpus/SNLI/snli_1.0_dev.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # columns 5 and 6 hold the two sentences; column 0 is the reference label
        sent1, sent2, target = tokens[5], tokens[6], tokens[0]
        inputs = tokenizer_snli(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_snli(**inputs.to(model_snli.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8899


In [None]:
# evaluate on negation_benchmark
# Same procedure as for the dev set, applied to every row of the negation TSV.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
ncorrect, nsamples = 0, 0
model_snli.cuda() # gpu
model_snli.eval()
# torch.no_grad(): evaluation needs no gradients, so avoid building an autograd
# graph for every example.
with open('data/snli_completed_with_importance_neg_only.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # columns 2 and 3 hold the two sentences; column 4 is the reference label
        sent1, sent2, target = tokens[2], tokens[3], tokens[4]
        inputs = tokenizer_snli(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_snli(**inputs.to(model_snli.device))
        # arg-max of the logits equals arg-max of their softmax
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.2633333333333333


## fine-tuning with negative sentences

### datasets

In [None]:
# Read the tab-separated negation file into a single Dataset and display it.
data_files = './data/snli_completed_with_importance_neg_only.tsv'
neg_datasets = load_dataset(
    'csv',
    data_files=data_files,
    sep='\t',
    split="train",
)
neg_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-95c6bc427456eefc/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-95c6bc427456eefc/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 1500
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'index': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None)}

In [None]:
# Translate each string label into its integer class id.
label_map = {'entailment': 0, 'neutral': 1, 'contradiction': 2}
labels = [label_map[name] for name in neg_datasets['label']]

In [None]:
# Replace the string 'label' column with a 'labels' column holding the
# integer ids from the `labels` list.
neg_datasets = neg_datasets.remove_columns('label').add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 1500
})

In [None]:
# Hold out 30% of the rows as the final test split. A fixed seed makes the
# split (and every accuracy computed on it) reproducible across runs.
# NOTE: this rebinds neg_datasets from a Dataset to a DatasetDict with
# 'train' and 'test' splits.
neg_datasets = neg_datasets.train_test_split(test_size=0.3, seed=42)

In [None]:
# Split the training portion again: 10% of it becomes the validation set used
# during fine-tuning. A fixed seed keeps this split reproducible across runs.
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1, seed=42)

In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 105
    })
})

### prepare trainer environment

In [None]:
# # download the checkpoint
# checkpoint = "textattack/bert-base-uncased-snli"
# tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
def tokenize_function(example):
    """Tokenize the 'text' / 'hypothesis' fields of ``example`` as sentence pairs.

    Sequences are truncated but not padded here.
    Returns whatever the module-level ``tokenizer_snli`` returns for these inputs.
    """
    first = example["text"]
    second = example["hypothesis"]
    return tokenizer_snli(first, second, truncation=True)

# The collator pads each batch when it is assembled; tokenization itself is
# applied to all splits in batched mode.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_snli)
tokenized_neg_datasets = train_neg_datasets.map(
    tokenize_function,
    batched=True,
)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 105
    })
})

In [None]:
# Only the settings below are overridden; everything else (e.g. the optimizer)
# keeps the library defaults. Evaluation and checkpointing both happen once per epoch.
training_args = TrainingArguments(
    output_dir="bert-base-fine-tunned-neg-snli",
    learning_rate=1e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# GLUE metric objects keyed by config name, filled lazily on first use.
_GLUE_METRIC_CACHE = {}


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE "mnli" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds one row of
            class scores per example; ``labels`` holds the gold class ids.

    Returns:
        The result of ``metric.compute`` for the predicted class ids.
    """
    # Load the metric once and reuse it across evaluation passes instead of
    # reloading it on every call.
    metric = _GLUE_METRIC_CACHE.get("mnli")
    if metric is None:
        metric = evaluate.load("glue", "mnli")
        _GLUE_METRIC_CACHE["mnli"] = metric
    logits, labels = eval_preds
    # Predicted class = index of the highest score along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Wire the model, data and metric together. The `test` part of
# `tokenized_neg_datasets` (the 10% split) serves as the validation set.
trainer = Trainer(
    model=model_snli,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    tokenizer=tokenizer_snli,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, importance, text, hypothesis, Unnamed: 0. If index, importance, text, hypothesis, Unnamed: 0 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 945
  Num Epochs = 3
  Instantaneous batch size per device = 32
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 90


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,1.677236,0.580952
2,No log,1.350885,0.619048
3,No log,1.253525,0.619048


The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index, importance, text, hypothesis, Unnamed: 0. If index, importance, text, hypothesis, Unnamed: 0 are not expected by `BertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running Evaluation *****
  Num examples = 105
  Batch size = 32
Saving model checkpoint to bert-base-fine-tunned-neg-snli/checkpoint-30
Configuration saved in bert-base-fine-tunned-neg-snli/checkpoint-30/config.json
Model weights saved in bert-base-fine-tunned-neg-snli/checkpoint-30/pytorch_model.bin
tokenizer config file saved in bert-base-fine-tunned-neg-snli/checkpoint-30/tokenizer_config.json
Special tokens file saved in bert-base-fine-tunned-neg-snli/checkpoint-30/special_tokens_map.json
The following columns in the evaluation set don't have a corresponding argument in `BertForSequenceClassification.forward` and have been ignored: index,

TrainOutput(global_step=90, training_loss=1.422166019015842, metrics={'train_runtime': 49.3784, 'train_samples_per_second': 57.414, 'train_steps_per_second': 1.823, 'total_flos': 59612147597394.0, 'train_loss': 1.422166019015842, 'epoch': 3.0})

## Evaluate with the fine-tuned model

In [None]:
# Load the fine-tuned tokenizer and model from the local checkpoint directory
# written during training (nothing is downloaded here). checkpoint-90 is the
# last of the 90 optimization steps reported by the training run.
checkpoint = "bert-base-fine-tunned-neg-snli/checkpoint-90"
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=3)

In [None]:
# Evaluate the fine-tuned model on the SNLI dev set.
label_map = {0: 'entailment', 1: 'neutral', 2: 'contradiction'}
gold_labels = set(label_map.values())
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# no_grad: inference only, so no autograd graph is built for each example.
with open('corpus/SNLI/snli_1.0_dev.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        sent1, sent2, target = tokens[5], tokens[6], tokens[0]
        # Skip rows whose label is not one of the three classes. Assumes
        # column 0 is the SNLI gold label, where "-" marks pairs without
        # annotator consensus -- TODO confirm against the .tsv.
        if target not in gold_labels:
            continue
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8748


### Test on important and unimportant sentences

In [None]:
test_neg_datasets = neg_datasets['test']

In [None]:
# Partition the test split by the string-valued `importance` column.
# "Unimportant" rows are exactly '0'; "important" rows are those whose value
# contains the character '1' (substring match). A row matching neither
# condition ends up in neither subset.
test_neg_datasets_unimportance = test_neg_datasets.filter(lambda row: row['importance']=='0')
test_neg_datasets_importance = test_neg_datasets.filter(lambda row: '1' in row['importance'])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 450
})

In [None]:
# Accuracy of the fine-tuned model on the whole `test` split of the negation benchmark.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# NOTE(review): presumably the result of an earlier run -- confirm: | Accuracy:  0.7433333333333333

| Accuracy:  0.6333333333333333


In [None]:
# Accuracy of the fine-tuned model on the "important negation" subset of the test split.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets_importance:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# NOTE(review): presumably the result of an earlier run -- confirm: | Accuracy:  0.6574074074074074

| Accuracy:  0.578125


In [None]:
# Accuracy of the fine-tuned model on the "unimportant negation" subset of the test split.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets_unimportance:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))
# NOTE(review): presumably the result of an earlier run -- confirm: | Accuracy:  0.8189655172413793

| Accuracy:  0.8210526315789474


# BERT fine-tuning on RTE (original + negative sentences)

## preparation

In [None]:
!nvidia-smi

Sun Aug 14 09:19:47 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%cd '/content/drive/MyDrive/memoire'

/content/drive/MyDrive/memoire


In [None]:
%%bash
# Install the Python dependencies into the Colab runtime. Versions are not
# pinned, so each fresh runtime installs whatever pip resolves at that moment.
# NOTE(review): `deberta` is not imported by the import cell that follows --
# confirm it is still needed for this BERT section.
pip install datasets
pip install transformers
pip install deberta
pip install evaluate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
Collecting huggingface-hub<1.0.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
Collecting xxhash
  Downloading xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (212 kB)
Collecting multiprocess
  Downloading multiprocess-0.70.13-py37-none-any.whl (115 kB)
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting fsspec[http]>=2021.11.1
  Downloading fsspec-2022.7.1-py3-none-any.whl (141 kB)
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
Collecting urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1
  Downloading urllib3-1.25.11-py2.py3-none-any.whl (127 kB)
Installing collected packages: urllib3, pyyaml, fsspec, xxhash, responses, multiprocess

In [None]:
import numpy as np
import pandas as pd
import torch
from datasets import load_dataset
from transformers import AutoTokenizer, DataCollatorWithPadding
from transformers import AutoModelForSequenceClassification
from transformers import TrainingArguments
from transformers import Trainer
from transformers import AdamW
import evaluate
from torch.utils.data import DataLoader

## Evaluate BERT on RTE

A checkpoint that has already been fine-tuned on RTE is available, so we use it directly.

### prepare the model and the tokenizer

In [None]:
# Load the tokenizer and the 2-class sequence classifier published as
# "textattack/bert-base-uncased-RTE". The files are downloaded on first use
# and read from the local cache afterwards.
checkpoint = "textattack/bert-base-uncased-RTE"
tokenizer_rte = AutoTokenizer.from_pretrained(checkpoint)
model_rte = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)
model_rte  # display the module tree

https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/tokenizer_config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp5ojweu66


Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

storing https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/tokenizer_config.json in cache at /root/.cache/huggingface/transformers/7f235698e620a8acd7821adef2423f76480d6e7a1063308e2c0551399060aeb2.76ea01b4b85ac16e2cec55c398cba7a943d89ab21dfdd973f6630a152e4b9aed
creating metadata file for /root/.cache/huggingface/transformers/7f235698e620a8acd7821adef2423f76480d6e7a1063308e2c0551399060aeb2.76ea01b4b85ac16e2cec55c398cba7a943d89ab21dfdd973f6630a152e4b9aed
https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/config.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmp6ko5c__t


Downloading config.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

storing https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/config.json in cache at /root/.cache/huggingface/transformers/2c28712ff13dc662b198e60a575edc12e070528f8690876720c84b0eee352980.517d4c379028e73599a840c4e589adebafb2c128ae9e22cb8a8024eae4fda22f
creating metadata file for /root/.cache/huggingface/transformers/2c28712ff13dc662b198e60a575edc12e070528f8690876720c84b0eee352980.517d4c379028e73599a840c4e589adebafb2c128ae9e22cb8a8024eae4fda22f
loading configuration file https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/config.json from cache at /root/.cache/huggingface/transformers/2c28712ff13dc662b198e60a575edc12e070528f8690876720c84b0eee352980.517d4c379028e73599a840c4e589adebafb2c128ae9e22cb8a8024eae4fda22f
Model config BertConfig {
  "_name_or_path": "textattack/bert-base-uncased-RTE",
  "architectures": [
    "BertForSequenceClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "glue:rte",
 

Downloading vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

storing https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/vocab.txt in cache at /root/.cache/huggingface/transformers/39c5b57dbacf7572acf1f08f5be88abe93aee8770688d17ed9792ecc72c904e9.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
creating metadata file for /root/.cache/huggingface/transformers/39c5b57dbacf7572acf1f08f5be88abe93aee8770688d17ed9792ecc72c904e9.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/special_tokens_map.json not found in cache or force_download set to True, downloading to /root/.cache/huggingface/transformers/tmpp7k7bi68


Downloading special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

storing https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/special_tokens_map.json in cache at /root/.cache/huggingface/transformers/a402e21b2325f5e1b9c95d97c746c5bdd55fd55558528bc4b52faa344ef64d5b.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
creating metadata file for /root/.cache/huggingface/transformers/a402e21b2325f5e1b9c95d97c746c5bdd55fd55558528bc4b52faa344ef64d5b.dd8bd9bfd3664b530ea4e645105f557769387b3da9f79bdb55ed556bdd80611d
loading file https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/vocab.txt from cache at /root/.cache/huggingface/transformers/39c5b57dbacf7572acf1f08f5be88abe93aee8770688d17ed9792ecc72c904e9.d789d64ebfe299b0e416afc4a169632f903f693095b4629a7ea271d5a0cf2c99
loading file https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/tokenizer.json from cache at None
loading file https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/added_tokens.json from cache at None
loading file ht

Downloading pytorch_model.bin:   0%|          | 0.00/418M [00:00<?, ?B/s]

storing https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/pytorch_model.bin in cache at /root/.cache/huggingface/transformers/55c7a844080e15c8ca481d05f55660f3d0e4947f8e0c2a50c82070f87cec8261.72d9e8d9a914524554a769d91da7676dff9967e85aa926707f5a6dc4ce2d90b6
creating metadata file for /root/.cache/huggingface/transformers/55c7a844080e15c8ca481d05f55660f3d0e4947f8e0c2a50c82070f87cec8261.72d9e8d9a914524554a769d91da7676dff9967e85aa926707f5a6dc4ce2d90b6
loading weights file https://huggingface.co/textattack/bert-base-uncased-RTE/resolve/main/pytorch_model.bin from cache at /root/.cache/huggingface/transformers/55c7a844080e15c8ca481d05f55660f3d0e4947f8e0c2a50c82070f87cec8261.72d9e8d9a914524554a769d91da7676dff9967e85aa926707f5a6dc4ce2d90b6
All model checkpoint weights were used when initializing BertForSequenceClassification.

All the weights of BertForSequenceClassification were initialized from the model checkpoint at textattack/bert-base-uncased-RTE.
If your task is simila

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

### test

In [None]:
# Smoke test: encode two sentences as one sentence pair.
raw_inputs = [
    "I love this.",
    "I like this so much!",
]
# Passing the sentences as two separate arguments builds a single pair input
# (first sentence, second sentence), not a batch of two independent sentences.
inputs = tokenizer_rte(raw_inputs[0], raw_inputs[1], padding=True, truncation=True, return_tensors="pt")
inputs

{'input_ids': tensor([[ 101, 1045, 5223, 2023, 1012,  102, 1045, 2066, 2023, 2061, 2172,  999,
          102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [None]:
# Forward pass on the encoded pair; the tensors are first moved to the device
# the model lives on.
outputs = model_rte(**inputs.to(model_rte.device))
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-1.6193,  1.2401]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [None]:
print(outputs.logits.shape)

torch.Size([1, 2])


In [None]:
label_map = model_rte.config.id2label
label_map

{0: 'LABEL_0', 1: 'LABEL_1'}

In [None]:
# Turn the logits into probabilities and take the index of the largest one.
# With the single input pair used above this is the predicted class id.
predictions = torch.nn.functional.softmax(outputs.logits, dim=-1).argmax().item()
print(label_map[predictions])

LABEL_1


### evaluate on dev

In [None]:
# # download the checkpoint
# checkpoint = "textattack/bert-base-uncased-RTE"
# tokenizer_rte = AutoTokenizer.from_pretrained(checkpoint)
# model_rte = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

All model checkpoint weights were used when initializing DebertaForSequenceClassification.

All the weights of DebertaForSequenceClassification were initialized from the model checkpoint at ./deberta-based-large-rte/checkpoint-1869.
If your task is similar to the task the model of the checkpoint was trained on, you can already use DebertaForSequenceClassification for predictions without further training.


In [None]:
# Evaluate `model_rte` on the RTE dev set.
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_rte.cuda() # gpu
model_rte.eval()
# no_grad: inference only, so no autograd graph is built for each example.
with open('corpus/RTE/dev.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # Columns 1-3 are read as first sentence, second sentence, gold label.
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_rte(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_rte(**inputs.to(model_rte.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.7256317689530686


In [None]:
# Evaluate `model_rte` on every row of the negation benchmark file.
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_rte.cuda() # gpu
model_rte.eval()
# no_grad: inference only, so no autograd graph is built for each example.
with open('data/rte_completed_with_importance_neg_only.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # Columns 2-4 are read as text, hypothesis, gold label.
        sent1, sent2, target = tokens[2], tokens[3], tokens[4]
        inputs = tokenizer_rte(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_rte(**inputs.to(model_rte.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.588


## Fine-tuning on a dataset of negative sentences

### Download the datasets

In [None]:
# Load the negation benchmark from a local tab-separated file.
# With split="train" the call returns one Dataset holding every row of the file.
data_files = './data/rte_completed_with_importance_neg_only.tsv'
neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
neg_datasets



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2da8e5812fadb932/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2da8e5812fadb932/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'label', 'importance'],
    num_rows: 1500
})

In [None]:
neg_datasets.features

{'Unnamed: 0': Value(dtype='int64', id=None),
 'index': Value(dtype='int64', id=None),
 'text': Value(dtype='string', id=None),
 'hypothesis': Value(dtype='string', id=None),
 'label': Value(dtype='string', id=None),
 'importance': Value(dtype='string', id=None)}

In [None]:
# Encode the string labels as integer class ids (entailment -> 0, not_entailment -> 1).
label_map = {'entailment': 0, 'not_entailment': 1}
labels = [label_map[name] for name in neg_datasets['label']]

In [None]:
# Swap the string `label` column for the integer `labels` column built above.
# Not re-runnable as is: after the first run the `label` column no longer exists.
neg_datasets = neg_datasets.remove_columns('label').add_column('labels', labels)

In [None]:
neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 1500
})

In [None]:
neg_datasets[1]

{'Unnamed: 0': 1,
 'index': 2,
 'text': "The most surprising news story of the past week must be the 'wonderful' story that the AIDS epidemic in India is under control.",
 'hypothesis': 'Polio is not under control in the world.',
 'importance': '0',
 'labels': 1}

In [None]:
# Hold out 30% of the benchmark as the test split.
# The seed is fixed so the same rows are held out on every run: a checkpoint
# loaded from disk in a later session is then tested on rows that were not in
# its training split.
neg_datasets = neg_datasets.train_test_split(test_size=0.3, seed=42)

In [None]:
# Carve a 10% validation split out of the training portion.
# A fixed seed keeps the train/validation partition identical across runs.
train_neg_datasets = neg_datasets['train'].train_test_split(test_size=0.1, seed=42)

In [None]:
train_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
        num_rows: 105
    })
})

### Prepare the trainer environment

In [None]:
# # download the checkpoint
# checkpoint = "textattack/bert-base-uncased-RTE"
# tokenizer_rte = AutoTokenizer.from_pretrained(checkpoint)
# model_rte = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
def tokenize_function(example):
    """Tokenize premise/hypothesis pairs with the RTE tokenizer.

    `example` is indexed by the column names "text" (first sentence) and
    "hypothesis" (second sentence). Truncation is enabled; no padding is
    requested here.
    """
    premises, hypotheses = example["text"], example["hypothesis"]
    return tokenizer_rte(premises, hypotheses, truncation=True)

# Pad each batch to the length of its longest member at collation time.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer_rte)
# Tokenize both splits, one batch of rows per call.
tokenized_neg_datasets = train_neg_datasets.map(tokenize_function, batched=True)

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
tokenized_neg_datasets

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 945
    })
    test: Dataset({
        features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 105
    })
})

In [None]:
# Fine-tuning configuration. Everything not listed here (e.g. the optimizer)
# is left at the TrainingArguments default.
training_args = TrainingArguments(
    output_dir="bert-base-fine-tunned-neg-rte",  # checkpoints are written under this directory
    num_train_epochs=50,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    evaluation_strategy="epoch",  # evaluate at the end of every epoch
    # 945 training rows at batch size 8 give 119 optimizer steps per epoch
    # (assuming one device and no gradient accumulation), so 1190 steps is a
    # checkpoint every 10 epochs and the final one is checkpoint-5950.
    save_steps=1190,
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [None]:
# GLUE metric objects keyed by config name, filled lazily on first use.
_GLUE_METRIC_CACHE = {}


def compute_metrics(eval_preds):
    """Score one evaluation pass with the GLUE "rte" metric.

    Args:
        eval_preds: A ``(logits, labels)`` pair. ``logits`` holds one row of
            class scores per example; ``labels`` holds the gold class ids.

    Returns:
        The result of ``metric.compute`` for the predicted class ids.
    """
    # Load the metric once and reuse it across evaluation passes instead of
    # reloading it on every call (this run evaluates after each of 50 epochs).
    metric = _GLUE_METRIC_CACHE.get("rte")
    if metric is None:
        metric = evaluate.load("glue", "rte")
        _GLUE_METRIC_CACHE["rte"] = metric
    logits, labels = eval_preds
    # Predicted class = index of the highest score along the last axis.
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [None]:
# Wire the model, data and metric together. The `test` part of
# `tokenized_neg_datasets` (the 10% split) serves as the validation set.
trainer = Trainer(
    model=model_rte,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_neg_datasets["train"],
    eval_dataset=tokenized_neg_datasets["test"],
    tokenizer=tokenizer_rte,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

## Evaluate with the fine-tuned model

In [None]:
# Load the fine-tuned tokenizer and model from the local checkpoint directory
# under the training output directory (nothing is downloaded here).
checkpoint = "bert-base-fine-tunned-neg-rte/checkpoint-5950"
tokenizer_neg = AutoTokenizer.from_pretrained(checkpoint)
model_neg = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

In [None]:
# Evaluate the fine-tuned model on the RTE dev set.
label_map = {0: 'entailment', 1: 'not_entailment'}
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
# no_grad: inference only, so no autograd graph is built for each example.
with open('corpus/RTE/dev.tsv', encoding='utf-8') as fin, torch.no_grad():
    fin.readline()  # skip the header row
    for line in fin:
        tokens = line.strip().split('\t')
        # Columns 1-3 are read as first sentence, second sentence, gold label.
        sent1, sent2, target = tokens[1], tokens[2], tokens[3]
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        prediction_label = label_map[prediction]
        ncorrect += int(prediction_label == target)
        nsamples += 1
print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.6787003610108303


### Test on important and unimportant sentences

In [None]:
# # reload the dataset
# data_files = './data/mnli_completed_with_importance.tsv'
# neg_datasets = load_dataset('csv', data_files=data_files, sep='\t', split="train")
# neg_datasets

In [None]:
test_neg_datasets = neg_datasets['test']

In [None]:
# Partition the test split by the string-valued `importance` column.
# "Unimportant" rows are exactly '0'; "important" rows are those whose value
# contains the character '1' (substring match). A row matching neither
# condition ends up in neither subset.
test_neg_datasets_unimportance = test_neg_datasets.filter(lambda row: row['importance']=='0')
test_neg_datasets_importance = test_neg_datasets.filter(lambda row: '1' in row['importance'])

  0%|          | 0/1 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [None]:
test_neg_datasets

Dataset({
    features: ['Unnamed: 0', 'index', 'text', 'hypothesis', 'importance', 'labels'],
    num_rows: 450
})

In [None]:
# Accuracy of the fine-tuned model on the whole `test` split of the negation benchmark.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.8311111111111111


In [None]:
# Accuracy of the fine-tuned model on the "important negation" subset of the test split.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets_importance:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.7863247863247863


In [None]:
# Accuracy of the fine-tuned model on the "unimportant negation" subset of the test split.
ncorrect, nsamples = 0, 0
model_neg.cuda() # gpu
model_neg.eval()
with torch.no_grad():  # inference only: do not build autograd graphs
    for line in test_neg_datasets_unimportance:
        sent1, sent2, target = line['text'], line['hypothesis'], line['labels']
        inputs = tokenizer_neg(sent1, sent2, padding=True, truncation=True, return_tensors="pt")
        outputs = model_neg(**inputs.to(model_neg.device))
        # argmax over logits equals argmax over softmax, so softmax is skipped.
        prediction = outputs.logits.argmax(dim=-1).item()
        ncorrect += int(prediction == target)
        nsamples += 1

print('| Accuracy: ', float(ncorrect)/float(nsamples))

| Accuracy:  0.9308176100628931
