<a href="https://colab.research.google.com/github/nikotang/RD-UU-MPLT/blob/main/RD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# R&D: Additional Paraphrase Training Drives Language Models Closer to Human Behaviour on Natural Language Inference

Repo: https://github.com/nikotang/RD-UU-MPLT/

TLDR: Language models still predict 'very well' on NLI hypotheses whose word order has been randomised, which both does and doesn't make sense. My hypothesis is that a model should be less confident on such inputs after additionally being trained on a paraphrase dataset (PAWS) and an anaphora resolution dataset (Winogrande).

In the end, Winogrande didn't work out; PAWS did.

Notes:

RoBERTa large is only a bit better than base on MNLI. RoBERTa large finetuned on NLI did no better than just large base: (https://github.com/facebookresearch/fairseq/tree/main/examples/roberta)

Finetuning roberta on winogrande: (https://github.com/facebookresearch/fairseq/blob/main/examples/roberta/wsc/README.md)


# 0: Installations and imports

In [None]:
!pip install -Uq accelerate datasets evaluate nltk transformers

In [None]:
from datasets import Dataset, DatasetDict, Value, load_dataset, concatenate_datasets
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, TensorDataset, DataLoader
from transformers import AutoTokenizer, RobertaForSequenceClassification, EarlyStoppingCallback

# 1: Preprocessing data

If this section has been run before, or if the processed data from the GitHub repo has been downloaded, go to [section 1.5](#section-1.5).

HANS is not used in the end.

## Multi-NLI

In [None]:
# Fetch MultiNLI from the Hugging Face dataset hub.
mnli_dataset = load_dataset('multi_nli')

# Drop the ID, parse and genre columns; they are not used downstream.
unused_columns = [
    'promptID',
    'pairID',
    'premise_binary_parse',
    'premise_parse',
    'hypothesis_binary_parse',
    'hypothesis_parse',
    'genre',
]
mnli_dataset = mnli_dataset.remove_columns(unused_columns)

# Rename the text columns to the 'sentence1'/'sentence2' names that tokenize() reads.
for old_name, new_name in (('premise', 'sentence1'), ('hypothesis', 'sentence2')):
    mnli_dataset = mnli_dataset.rename_column(old_name, new_name)

In [None]:
# Create a test set: each official validation split is taken out of the dict and
# halved at random (no seed is given, so the halves differ between runs).
matched = mnli_dataset.pop('validation_matched').train_test_split(test_size=0.5)
mismatched = mnli_dataset.pop('validation_mismatched').train_test_split(test_size=0.5)

# The 'train' halves together form the new validation set, the 'test' halves the test set.
for new_split, half in (('validation', 'train'), ('test', 'test')):
    mnli_dataset[new_split] = concatenate_datasets([matched[half], mismatched[half]])

In [None]:
# relabel the dataset as a binary task: contradiction and neutral are merged into
# class 0 (reported later as non-entailment), entailment becomes class 1
label2id = {'contradiction': 0, 'neutral': 0, 'entailment': 1}
mnli_dataset = mnli_dataset.align_labels_with_mapping(label2id, 'label')

In [None]:
# trim the training set to its first 100,000 examples; 400,000 is too much
# `Dataset` has to be re-imported here: the import cell imports
# torch.utils.data.Dataset after datasets.Dataset, so the name `Dataset`
# otherwise refers to the torch class, which has no `from_dict`
from datasets import Dataset
mnli_dataset['train'] = Dataset.from_dict(mnli_dataset['train'][:100000])

## PAWS

In [None]:
# load the two labelled PAWS configurations from the Hugging Face dataset hub
paws_dataset = load_dataset('paws', 'labeled_final')
paws2 = load_dataset('paws', 'labeled_swap')

# add the 'labeled_swap' training examples to the 'labeled_final' training split;
# the other splits of paws_dataset are left as loaded
paws_dataset['train'] = concatenate_datasets([paws_dataset['train'], paws2['train']])

# the 'id' column is not needed downstream
paws_dataset = paws_dataset.remove_columns('id')

## Winogrande

The dataset uses a '_' to mark the position of the anaphor. To create premise-hypothesis pairs, a pronoun is predicted for that position (here with the Huggingface fill-mask pipeline, DistilRoBERTa-base by default) and inserted to form the premise. The entailed hypothesis is made by inserting the correct noun in that position instead, and the non-entailed hypothesis by inserting the other candidate noun.

Or skip to the next subsection to download the filled dataset directly.

In [None]:
!wget -Nq https://storage.googleapis.com/ai2-mosaic/public/winogrande/winogrande_1.1.zip
!unzip winogrande_1.1.zip; rm -rf __MACOSX
!pip install nvidia-ml-py3 tqdm

import csv
import json
from tqdm import tqdm
from transformers import pipeline

In [None]:
# Each line of the Winogrande .jsonl files is one JSON object; only its
# "sentence" field (the text containing the '_' blank) is collected here.
with open('winogrande_1.1/train_xl.jsonl') as f:
    train_premises = [json.loads(line)["sentence"] for line in f]

with open('winogrande_1.1/dev.jsonl') as f:
    dev_premises = [json.loads(line)["sentence"] for line in f]

In [None]:
# fill-mask pipeline with the library's default model, placed on device 0
# (NOTE(review): device=0 presumably requires a GPU runtime -- confirm)
predictor = pipeline("fill-mask", device=0)
# candidate pronouns the mask may be filled with; the leading 'Ġ' is presumably the
# tokenizer's marker for a token preceded by a space -- verify against the vocabulary
targets=['Ġhe', 'Ġhim', 'Ġhis',
            'Ġshe', 'Ġher', 'Ġhers',
            'Ġit', 'Ġits', 'Ġthey', 'Ġtheir', 'Ġthem']

def add_mask(premises):
    """Replace the Winogrande blank in each sentence with RoBERTa's mask token.

    A blank written as 'the _' is replaced together with its article, so that a
    pronoun can stand in for the whole noun phrase; otherwise only the '_' is
    replaced. Sentences without a blank are left unchanged and their index is
    printed so they can be inspected.

    Args:
        premises: list of sentences; it is modified in place.

    Returns:
        The same list object, with '<mask>' in place of each blank.
    """
    for i, sent in enumerate(premises):
        if 'the _' in sent:
            segments = sent.split('the _')
        elif '_' in sent:
            segments = sent.split('_')
        else:
            # nothing to mask: report the index and keep the sentence as it is
            print(i)
            continue
        # join the pieces around the blank (not the characters of the sentence)
        premises[i] = '<mask>'.join(segments)  # mask token for roberta
    return premises

def predict_pronouns(masked_premises):
    """Fill the mask in every sentence with the best-scoring candidate pronoun.

    Uses the module-level fill-mask pipeline `predictor`, restricted to the
    pronouns listed in `targets`.

    Args:
        masked_premises: iterable of sentences, each containing a mask token.

    Returns:
        A list with the filled-in sentence for each input, in input order.
    """
    filled_premises = []
    for line in tqdm(masked_premises):
        # top_k=1: only the highest-scoring target is returned
        guess = predictor(line, targets=targets, top_k=1)
        filled_premises.append(guess[0]['sequence'])
    return filled_premises

def write_pairs_to_file(filled_premises, file_input, file_output):
    """Write premise-hypothesis pairs for a Winogrande split to a TSV file.

    For every line of the input .jsonl file two rows are written: the premise
    paired with the sentence whose blank is filled by the correct option
    (label '1'), and the premise paired with the sentence filled by the other
    option (label '0'). Row ids start at 1 and increase by one per row.

    Args:
        filled_premises: premises with the pronoun filled in, aligned by index
            with the lines of `file_input`.
        file_input: path of the Winogrande .jsonl file to read.
        file_output: path of the TSV file to write (overwritten if it exists).

    Raises:
        ValueError: if an example's "answer" is neither '1' nor '2'.
    """
    # Open the input first so a missing input file does not leave an empty output.
    with open(file_input, 'r', encoding='utf-8') as f, \
            open(file_output, 'w', newline='', encoding='utf-8') as trans_f:
        # newline='' lets the csv module control the line endings itself
        tsv_writer = csv.writer(trans_f, delimiter='\t')
        tsv_writer.writerow(['id', 'sentence1', 'sentence2', 'label'])
        pair_id = 1
        for i, line in enumerate(f):
            data = json.loads(line)
            s_segments = data["sentence"].split('_')
            correct = data["answer"]
            if correct not in ('1', '2'):
                raise ValueError(f'unexpected answer {correct!r} on line {i + 1} of {file_input}')
            wrong = '2' if correct == '1' else '1'

            # put each option into the blank to build the two hypotheses
            correct_s = data[f"option{correct}"].join(s_segments)
            tsv_writer.writerow([pair_id, filled_premises[i], correct_s, '1'])
            pair_id += 1
            wrong_s = data[f"option{wrong}"].join(s_segments)
            tsv_writer.writerow([pair_id, filled_premises[i], wrong_s, '0'])
            pair_id += 1

In [None]:
# training split: mask the blanks, predict a pronoun for each mask, write the pairs
# (add_mask modifies and returns the list it is given, so train_masked and
# train_premises are the same list object)
train_masked = add_mask(train_premises)
train_filled = predict_pronouns(train_masked)
write_pairs_to_file(train_filled, 'winogrande_1.1/train_xl.jsonl', 'train_xl_transformed.tsv')

# dev split: same three steps
dev_masked = add_mask(dev_premises)
dev_filled = predict_pronouns(dev_masked)
write_pairs_to_file(dev_filled, 'winogrande_1.1/dev.jsonl', 'dev_transformed.tsv')

### After filling in predicted pronouns

In [None]:
# download Winogrande with pronouns filled in
!wget -Nq https://raw.githubusercontent.com/nikotang/RD-UU-MPLT/main/winogrande_1.1/train_xl_transformed.tsv
!wget -Nq https://raw.githubusercontent.com/nikotang/RD-UU-MPLT/main/winogrande_1.1/dev_transformed.tsv

In [None]:
# NOTE(review): `Dataset` must be datasets.Dataset here (it is re-imported in the
# Multi-NLI section); straight after the import cell the name refers to
# torch.utils.data.Dataset instead
df_train = pd.read_csv('train_xl_transformed.tsv', sep='\t')
winogrande_train = Dataset.from_pandas(df_train)
# hold out 10% of the transformed training pairs as the 'test' split
winogrande_dataset = winogrande_train.train_test_split(test_size=0.1)

# the transformed dev file becomes the validation split
df_dev = pd.read_csv('dev_transformed.tsv', sep='\t')
winogrande_dataset['validation'] = Dataset.from_pandas(df_dev)
# the running pair id is not needed downstream
winogrande_dataset = winogrande_dataset.remove_columns('id')

## HANS

This set isn't used.

In [None]:
# hans_dataset = load_dataset('hans')

# hans_dataset = hans_dataset.remove_columns(['parse_premise', 'parse_hypothesis', 'binary_parse_premise', 'binary_parse_hypothesis', 'subcase', 'template'])

# # rename some columns
# hans_dataset = hans_dataset.rename_column('premise', 'sentence1')
# hans_dataset = hans_dataset.rename_column('hypothesis', 'sentence2')

In [None]:
# # make test set
# val_test = hans_dataset['validation'].train_test_split(test_size=0.5) # shuffle default=True

# hans_dataset['validation'] = val_test['train']
# hans_dataset['test'] = val_test['test']

In [None]:
# relabel the dataset
# label2id = {'non-entailment': 0, 'entailment': 1}
# hans_dataset = hans_dataset.align_labels_with_mapping(label2id, 'label')

In [None]:
# for h in ('lexical_overlap', 'subsequence', 'constituent'):
#   print(f'{h}: {hans_dataset['validation']['heuristic'].count(h)}')

## Tokenization

In [None]:
# tokenizer of the roberta-base checkpoint, the same checkpoint that is fine-tuned in section 2
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

In [None]:
def tokenize(ds):
  """Tokenize a batch of sentence pairs with the module-level `tokenizer`.

  Args:
    ds: a batch with 'sentence1' and 'sentence2' columns.

  Returns:
    The tokenizer output for the pairs, padded to the longest pair in the
    batch and truncated to the model's maximum input length.
  """
  return tokenizer(
      ds['sentence1'],
      ds['sentence2'],
      padding=True,
      # without truncation a pair longer than the model limit is kept intact
      # and cannot be fed to the model
      truncation=True,
      return_tensors='pt',
  )

def process(dataset_d):
  """Apply `tokenize` to `dataset_d` in batches of 64, using 4 worker processes."""
  map_options = {'batched': True, 'batch_size': 64, 'num_proc': 4}
  return dataset_d.map(tokenize, **map_options)

# tokenize datasets (every split of each dataset)
mnli_dataset_tok = process(mnli_dataset)
paws_dataset_tok = process(paws_dataset)
winogrande_dataset_tok = process(winogrande_dataset)
# hans_dataset_tok = process(hans_dataset)

# rename 'label' to 'labels' for Trainer (alternatively specify 'label' in Trainer params)
mnli_dataset_tok = mnli_dataset_tok.rename_column('label', 'labels')
paws_dataset_tok = paws_dataset_tok.rename_column('label', 'labels')
winogrande_dataset_tok = winogrande_dataset_tok.rename_column('label', 'labels')
# hans_dataset_tok = hans_dataset_tok.rename_column('label', 'labels')

# torch format is requested for the 'labels' column only; output_all_columns=True
# keeps the remaining columns in the returned examples
for dataset_d in [mnli_dataset_tok, paws_dataset_tok, winogrande_dataset_tok]: # hans_dataset_tok
  dataset_d.set_format(type='torch', columns=['labels'], output_all_columns=True)

In [None]:
# save the tokenized datasets; section 1.5 loads them back from these directories
mnli_dataset_tok.save_to_disk('./mnli_datasets')
paws_dataset_tok.save_to_disk('./paws_datasets')
winogrande_dataset_tok.save_to_disk('./winogrande_datasets')
# hans_dataset_tok.save_to_disk('./hans_datasets')

In [None]:
!zip -r mnli.zip mnli_datasets
!zip -r paws.zip paws_datasets
!zip -r winogrande.zip winogrande_datasets
# !zip -r hans.zip hans_datasets

## Create randomised data

We want to have test sets that have random word order to test changes in confidence after finetuning the language model. A sorted set and a shuffled set are created for each test set.

In [None]:
from random import sample
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize

def randomise(ds):
  """Build two word-order-perturbed copies of a tokenized dataset split.

  The 'input_ids' and 'attention_mask' columns are dropped (the text has to be
  tokenized again afterwards), and 'sentence2' is split into words with
  word_tokenize and re-joined with single spaces.

  Returns:
    (sorted_copy, shuffled_copy): in the first the words of 'sentence2' are in
    sorted order, in the second they are in random order.
  """
  def sort_words(example):
    words = word_tokenize(example['sentence2'])
    return {'sentence2': ' '.join(sorted(words))}

  def shuffle_words(example):
    words = word_tokenize(example['sentence2'])
    # sample() over the full length gives a shuffled copy; random.shuffle is in-place
    return {'sentence2': ' '.join(sample(words, len(words)))}

  text_only = ds.remove_columns(['input_ids', 'attention_mask'])
  sorted_copy = text_only.map(sort_words)
  shuffled_copy = text_only.map(shuffle_words)
  return sorted_copy, shuffled_copy

In [None]:
# sorted and shuffled versions of each test split (sentence2 only is perturbed)
mnli_sort, mnli_shuff = randomise(mnli_dataset_tok['test'])
paws_sort, paws_shuff = randomise(paws_dataset_tok['test'])
winogrande_sort, winogrande_shuff = randomise(winogrande_dataset_tok['test'])
# hans_sort, hans_shuff = randomise(hans_dataset_tok['test'])

In [None]:
# tokenize datasets again: randomise() removed 'input_ids' and 'attention_mask',
# and the text of sentence2 has changed
mnli_sort = process(mnli_sort)
paws_sort = process(paws_sort)
winogrande_sort = process(winogrande_sort)
mnli_shuff = process(mnli_shuff)
paws_shuff = process(paws_shuff)
winogrande_shuff = process(winogrande_shuff)

In [None]:
# save them in one directory, one sub-directory per randomised test set
!mkdir randoms

mnli_sort.save_to_disk('./randoms/mnli_sort')
paws_sort.save_to_disk('./randoms/paws_sort')
winogrande_sort.save_to_disk('./randoms/winogrande_sort')
mnli_shuff.save_to_disk('./randoms/mnli_shuff')
paws_shuff.save_to_disk('./randoms/paws_shuff')
winogrande_shuff.save_to_disk('./randoms/winogrande_shuff')

In [None]:
!zip -r randoms.zip randoms

<a name="section-1.5"></a>
# OR 1.5 load already preprocessed data from here

In [None]:
!wget -Nq https://github.com/nikotang/RD-UU-MPLT/raw/main/mnli.zip
!wget -Nq https://github.com/nikotang/RD-UU-MPLT/raw/main/paws.zip
!wget -Nq https://github.com/nikotang/RD-UU-MPLT/raw/main/winogrande.zip
!wget -Nq https://github.com/nikotang/RD-UU-MPLT/raw/main/randoms.zip

!unzip mnli.zip
!unzip paws.zip
!unzip winogrande.zip
!unzip randoms.zip

In [None]:
from datasets import load_from_disk

# tokenized datasets, as saved at the end of the tokenization step
mnli_dataset_tok = load_from_disk('./mnli_datasets')
paws_dataset_tok = load_from_disk('./paws_datasets')
winogrande_dataset_tok = load_from_disk('./winogrande_datasets')

# randomised test sets; note that they are bound to *_sorted / *_shuffled here,
# not the *_sort / *_shuff names used in section 1, and that the Winogrande
# randomised sets are not loaded
mnli_sorted = load_from_disk('./randoms/mnli_sort')
mnli_shuffled = load_from_disk('./randoms/mnli_shuff')
paws_sorted = load_from_disk('./randoms/paws_sort')
paws_shuffled = load_from_disk('./randoms/paws_shuff')

In [None]:
# change data type for the PAWS set to match MNLI
# (the two training sets are concatenated in the next cell)
# Somehow the change isn't preserved if done before loading to and from disk
paws_dataset_tok = paws_dataset_tok.cast_column('labels', Value(dtype='int64'))

In [None]:
# combined MNLI + PAWS training set (a single Dataset, not a DatasetDict)
mnli_paws_dataset_tok = concatenate_datasets([mnli_dataset_tok['train'], paws_dataset_tok['train']])

In [None]:
# make mini sets for testing

# The import cell imports torch.utils.data.Dataset after datasets.Dataset, so
# `Dataset` refers to the torch class when this section is reached without
# running section 1. Bind the Hugging Face class explicitly; the torch class
# has no `from_dict`.
from datasets import Dataset

mnli_mini = Dataset.from_dict(mnli_dataset_tok['train'][:10000])
paws_mini = Dataset.from_dict(paws_dataset_tok['train'][:10000])
winogrande_mini = Dataset.from_dict(winogrande_dataset_tok['train'][:10000])
mnli_paws_mini = Dataset.from_dict(mnli_paws_dataset_tok.shuffle()[:10000]) # contains only a training set, not a DatasetDict

mnli_val_mini = Dataset.from_dict(mnli_dataset_tok['validation'][:2000])

# 2: Training

In [None]:
import evaluate

# accuracy metric, used by compute_metrics during evaluation
metric = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    """Compute accuracy for a (logits, labels) pair.

    The predicted class of each example is the index of its largest logit
    along the last axis; the result is whatever `metric.compute` returns.
    """
    scores, gold_labels = eval_pred
    predicted_classes = np.argmax(scores, axis=-1)
    return metric.compute(predictions=predicted_classes, references=gold_labels)

In [None]:
# pick the set to train on, and file name to save
# (filename is the prefix of the model output directory and the log directory)

train_dataset = mnli_paws_dataset_tok
eval_dataset = mnli_dataset_tok['validation']
filename = 'mnli_paws'

In [None]:
from transformers import Trainer, TrainingArguments
import gc

# free memory left over from earlier runs before loading a new model
gc.collect()
torch.cuda.empty_cache()

# NOTE(review): this only assigns a Python variable; it does not set the
# CUDA_VISIBLE_DEVICES environment variable -- confirm whether
# os.environ['CUDA_VISIBLE_DEVICES'] was intended
CUDA_VISIBLE_DEVICES=0

# start from the pretrained roberta-base checkpoint
model = RobertaForSequenceClassification.from_pretrained('roberta-base').to('cuda')
tokenizer = AutoTokenizer.from_pretrained('roberta-base')

# logging, evaluation and checkpointing all happen every 1000 steps
training_args = TrainingArguments(
    output_dir=f'./{filename}_model',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=1000,
    learning_rate=5e-5,
    weight_decay=5e-4,
    logging_dir=f'./{filename}_logs',
    logging_steps=1000,
    eval_strategy='steps',
    eval_steps=1000,
    save_steps=1000,
    save_total_limit=5,
    load_best_model_at_end=True,
    metric_for_best_model='accuracy'      # determine 'best' according to val acc
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]      # stop after 3 evaluations without improvement
)

trainer.train()

# no path is given, so the model is written to output_dir (./{filename}_model),
# which is where section 3 loads it from
trainer.save_model()

# 3: Evaluate

In [None]:
# prefix of the saved model directory to evaluate (./{filename}_model)
filename = 'mnli_paws'

In [None]:
# reload the fine-tuned model from the directory written by trainer.save_model()
test_model = RobertaForSequenceClassification.from_pretrained(f'./{filename}_model').to('cuda')

# evaluation-only arguments: no training, batches of 64
test_args = TrainingArguments(
    output_dir = f'./{filename}_results',
    do_train = False,
    do_predict = True,
    per_device_eval_batch_size = 64
)

# NOTE(review): no tokenizer or data collator is passed here, so batching relies
# on the padding already applied in tokenize() -- confirm this is intended
tester = Trainer(
              model = test_model,
              args = test_args,
              compute_metrics = compute_metrics)

# metrics on the held-out MNLI test split
tester.evaluate(eval_dataset=mnli_dataset_tok['test'])

In [None]:
# get logits to calculate confidence
# Predict with `tester`, which wraps the model reloaded from ./{filename}_model.
# The training-time `trainer` does not exist when only section 3 is run, and it
# may hold a different model from the one selected by `filename`.
logits, references, _ = tester.predict(mnli_dataset_tok['test'])

In [None]:
from statistics import mean
from collections import Counter

softmax = nn.Softmax(dim=-1)

# Turn the logits into class probabilities with one vectorised call. The result
# is a plain float array with one row per example, rather than an object array
# of per-example tensors.
soft_logits = softmax(torch.as_tensor(logits, dtype=torch.float32)).numpy().astype(np.float64)

# predicted class and the probability assigned to it (the model's confidence)
preds = np.argmax(soft_logits, axis=1)
pred_probs = np.amax(soft_logits, axis=1)

In [None]:
# save the prediction results and probabilities

# results = np.concatenate((pred_probs, preds, references), axis=1)
# with open('results.npy', 'wb') as f:
#   np.save(f, results)

# to load the file:
# with open('results.npy', 'rb') as f:
#   results = np.load(f)

In [None]:
# mean probability of the predicted class over all test examples
avg_confidence = np.mean(pred_probs)
print(f'Average confidence: {avg_confidence}')

# split the confidences by predicted class (1 = entailment, 0 = non-entailment)
confidence_1 = []
confidence_0 = []
for pred, prob in zip(preds, pred_probs):
  if pred == 1:
    confidence_1.append(prob)
  else:
    confidence_0.append(prob)

print(f'Label count: {Counter(preds)}')
# A class may never be predicted (for example on a randomised test set);
# statistics.mean raises on an empty list, so report that case explicitly.
for class_name, class_confidences in (('Entailment', confidence_1), ('Non-Entailment', confidence_0)):
  if class_confidences:
    print(f'Average confidence in {class_name} predictions: {mean(class_confidences)}')
  else:
    print(f'Average confidence in {class_name} predictions: n/a (no such predictions)')