# Purpose: automatically compute the importance of negation in each sentence pair of the negative NLI benchmarks (MNLI, SNLI, RTE)

## Libraries to install

In [1]:
# Colab only: mount Google Drive at /content/drive so the project files
# stored there can be read and written by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the project folder the working directory; the relative data paths
# used by the loading and saving cells are resolved from here.
%cd "/content/drive/MyDrive/memoire"

/content/drive/MyDrive/memoire


In [3]:
! pip install transformers
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.2-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.2 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 55.2 MB/s 
[?25hCollecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 38.4 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.21.2
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 5.2 

In [4]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

## Calculate importance

### MNLI

In [58]:
# Alternative input, currently disabled: './data/mnli_completed.tsv'
# Read the annotated MNLI sample (tab-separated) through the generic CSV
# loader; the result is a DatasetDict whose rows live in the 'train' split.
input_files = './data_annote/v_final/mnli_negative_1to4pairs_sampled_label.tsv'
dataset_mnli = load_dataset(
    'csv',
    data_files=input_files,
    sep='\t',
)
dataset_mnli



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-0b27cff010dafbea/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-0b27cff010dafbea/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'correct', 'hypothesis', 'correct.1', 'label1', 'label2', 'gold_label '],
        num_rows: 400
    })
})

In [59]:
# Keep only the columns needed downstream: take the 'train' split, expose the
# unnamed CSV index column as 'index', and drop the annotation helper columns.
# This rebinds dataset_mnli from a DatasetDict to a Dataset, so re-run the
# loading cell before re-running this one.
dataset_mnli = (
    dataset_mnli['train']
    .rename_column('Unnamed: 0', 'index')
    .remove_columns(["correct", "correct.1", "label1", "label2"])
)
dataset_mnli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label '],
    num_rows: 400
})

In [62]:
# Gold-label column, looked up by row position when computing importance.
# The trailing space in 'gold_label ' is intentional: it is the column name
# reported by the loaded MNLI schema.
label_mnli = dataset_mnli['gold_label ']

In [63]:
# Row index of the first row of every 4-row group, derived from the dataset
# size instead of a hard-coded row count.
first_id = np.arange(0, len(dataset_mnli), 4)

# Preview the first row of the first six groups.
for group_start in first_id[:6]:
    print(dataset_mnli[int(group_start)])

{'index': 0, 'text': "I'll hurry over that part.", 'hypothesis': '"I\'ll be quick with that part."', 'gold_label ': 'entailment'}
{'index': 4, 'text': 'Shall I tell you why you have been so vehement against Mr. Inglethorp? ', 'hypothesis': "I can tell you why you're being so vehement against Mr. Inglethorp.", 'gold_label ': 'entailment'}
{'index': 8, 'text': 'well you know that brings up the interesting subject too you know what would you have who who who would determine what these people do', 'hypothesis': 'It begs the question of who gets to say what the other people do.', 'gold_label ': 'entailment'}
{'index': 12, 'text': 'A great Sather made the sun remain in one place too long, and the heat became too great.', 'hypothesis': 'It got too hot when a Sather kept the sun in one spot.', 'gold_label ': 'entailment'}
{'index': 16, 'text': 'Of course, it will be generally known to-morrow." John reflected. ', 'hypothesis': 'The news was about to break, and John had announced that he found o

In [64]:
# Negation importance per 4-row group (rows g, g+1, g+2, g+3):
#   g   -> 0 (reference row, never modified)
#   g+1 -> 1 if its gold label differs from row g, else 0
#   g+2 -> 1 if its gold label differs from row g, else 0
#   g+3 -> ['0'|'1', '0'|'1']: differs from row g+2, then differs from row g+1
# NOTE(review): presumably row g is the original pair and rows g+1..g+3 are
# its negated variants - confirm against the annotation file.
# The mixed int / list-of-str encoding is kept on purpose: a later cell calls
# str() on every value before it is written to the TSV.
n_rows_mnli = len(label_mnli)
assert n_rows_mnli % 4 == 0, f"expected complete groups of 4 rows, got {n_rows_mnli}"

importance_mnli = [0] * n_rows_mnli

# Iterate over the labels themselves rather than over a separately defined
# index array, so the loop always covers exactly the rows that were loaded.
for start in range(0, n_rows_mnli, 4):
    base = label_mnli[start]
    second = label_mnli[start + 1]
    third = label_mnli[start + 2]
    fourth = label_mnli[start + 3]

    importance_mnli[start + 1] = int(second != base)
    importance_mnli[start + 2] = int(third != base)
    importance_mnli[start + 3] = [
        '1' if fourth != third else '0',
        '1' if fourth != second else '0',
    ]

importance_mnli[:5]

[0, 1, 1, ['1', '1'], 0]

In [65]:
# Stringify every importance value (ints and two-element lists alike) so the
# column has a single string type.
importance_mnli_str = list(map(str, importance_mnli))

In [66]:
# Spot-check: the four values of the first group plus the first of the next.
importance_mnli_str[:5]

['0', '1', '1', "['1', '1']", '0']

In [67]:
# Schema before the 'importance' column is added.
dataset_mnli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label '],
    num_rows: 400
})

In [68]:
# Attach the stringified importance values as a new 'importance' column.
dataset_mnli = dataset_mnli.add_column('importance', importance_mnli_str)

In [69]:
# Schema after adding 'importance'.
dataset_mnli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label ', 'importance'],
    num_rows: 400
})

In [70]:
# Alternative output path, currently disabled: 'data/mnli_completed_with_importance.tsv'
# Write the MNLI rows, including the 'importance' column, as a tab-separated file.
dataset_mnli.to_csv('data_annote/v_final/mnli_negative_1to4pairs_sampled_label_with_importance.tsv', sep='\t')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

78271

### SNLI

In [76]:
# Alternative input, currently disabled: './data/snli_completed.tsv'
# Read the annotated SNLI sample (tab-separated) through the generic CSV
# loader; the result is a DatasetDict whose rows live in the 'train' split.
input_files = './data_annote/v_final/snli_negative_1to4pairs_sampled_label.tsv'
dataset_snli = load_dataset(
    'csv',
    data_files=input_files,
    sep='\t',
)
dataset_snli



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-076befc06166f094/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-076befc06166f094/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'correct', 'hypothesis', 'correct.1', 'label1', 'label2', 'gold_label'],
        num_rows: 400
    })
})

In [77]:
# Keep only the columns needed downstream: take the 'train' split, expose the
# unnamed CSV index column as 'index', and drop the annotation helper columns.
# This rebinds dataset_snli from a DatasetDict to a Dataset, so re-run the
# loading cell before re-running this one.
dataset_snli = (
    dataset_snli['train']
    .rename_column('Unnamed: 0', 'index')
    .remove_columns(["correct", "correct.1", "label1", "label2"])
)
dataset_snli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label'],
    num_rows: 400
})

In [79]:
# Gold-label column, looked up by row position when computing importance.
label_snli = dataset_snli['gold_label']

In [80]:
# Row index of the first row of every 4-row group, derived from the dataset
# size instead of a hard-coded row count.
first_id = np.arange(0, len(dataset_snli), 4)

# Preview the first row of the first six groups.
for group_start in first_id[:6]:
    print(dataset_snli[int(group_start)])

{'index': 0, 'text': 'A historian and his friend digging in the mines to look for more fossils for study.', 'hypothesis': 'the historian is digging with his friend for study.', 'gold_label': 'neutral'}
{'index': 4, 'text': 'Boy getting helped onto a merry-go-round.', 'hypothesis': 'A boy is riding a donkey.', 'gold_label': 'contradiction'}
{'index': 8, 'text': 'A man making a contemplative pose in a laundry room.', 'hypothesis': 'A man is outside on the patio.', 'gold_label': 'contradiction'}
{'index': 12, 'text': 'A foreign man is preparing an ethnic dish.', 'hypothesis': 'A foreign man is preparing an ethnic dish, filled with colorful vegetables and fruit.', 'gold_label': 'neutral'}
{'index': 16, 'text': 'Two young blond girls are eating with chopsticks.', 'hypothesis': 'Some girls are eating Chinese food.', 'gold_label': 'neutral'}
{'index': 20, 'text': 'A woman in blue sweeping the sidewalk.', 'hypothesis': 'The woman is wearing red.', 'gold_label': 'contradiction'}


In [81]:
# Negation importance per 4-row group (rows g, g+1, g+2, g+3):
#   g   -> 0 (reference row, never modified)
#   g+1 -> 1 if its gold label differs from row g, else 0
#   g+2 -> 1 if its gold label differs from row g, else 0
#   g+3 -> ['0'|'1', '0'|'1']: differs from row g+2, then differs from row g+1
# NOTE(review): presumably row g is the original pair and rows g+1..g+3 are
# its negated variants - confirm against the annotation file.
# The mixed int / list-of-str encoding is kept on purpose: a later cell calls
# str() on every value before it is written to the TSV.
n_rows_snli = len(label_snli)
assert n_rows_snli % 4 == 0, f"expected complete groups of 4 rows, got {n_rows_snli}"

importance_snli = [0] * n_rows_snli

# Iterate over the labels themselves rather than over a separately defined
# index array, so the loop always covers exactly the rows that were loaded.
for start in range(0, n_rows_snli, 4):
    base = label_snli[start]
    second = label_snli[start + 1]
    third = label_snli[start + 2]
    fourth = label_snli[start + 3]

    importance_snli[start + 1] = int(second != base)
    importance_snli[start + 2] = int(third != base)
    importance_snli[start + 3] = [
        '1' if fourth != third else '0',
        '1' if fourth != second else '0',
    ]

importance_snli[:5]

[0, 1, 1, ['1', '1'], 0]

In [82]:
# Stringify every importance value (ints and two-element lists alike) so the
# column has a single string type.
importance_snli_str = list(map(str, importance_snli))

In [83]:
# Spot-check: the four values of the first group plus the first of the next.
importance_snli_str[:5]

['0', '1', '1', "['1', '1']", '0']

In [84]:
# Schema before the 'importance' column is added.
dataset_snli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label'],
    num_rows: 400
})

In [85]:
# Attach the stringified importance values as a new 'importance' column.
dataset_snli = dataset_snli.add_column('importance', importance_snli_str)

In [86]:
# Schema after adding 'importance'.
dataset_snli

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label', 'importance'],
    num_rows: 400
})

In [87]:
# Alternative output path, currently disabled: 'data/snli_completed_with_importance.tsv'
# Write the SNLI rows, including the 'importance' column, as a tab-separated file.
dataset_snli.to_csv('data_annote/v_final/snli_negative_1to4pairs_sampled_label_with_importance.tsv', sep='\t')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

54001

### RTE

In [88]:
# Alternative input, currently disabled: './data/rte_completed.tsv'
# Read the annotated RTE sample (tab-separated) through the generic CSV
# loader; the result is a DatasetDict whose rows live in the 'train' split.
input_files = './data_annote/v_final/rte_negative_1to4pairs_sampled_label.tsv'
dataset_rte = load_dataset(
    'csv',
    data_files=input_files,
    sep='\t',
)
dataset_rte



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-95686dd91c6412d3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-95686dd91c6412d3/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['Unnamed: 0', 'text', 'correct', 'hypothesis', 'correct.1', 'label1', 'label2', 'gold_label'],
        num_rows: 400
    })
})

In [37]:
# NOTE(review): dead cell - it contains only disabled code, and the output
# saved under it (2000 rows, a 'label' column) does not match the 400-row
# dataset with 'gold_label' loaded above. Safe to delete once confirmed.
# dataset_rte = dataset_rte['train'].remove_columns('Unnamed: 0')
# dataset_rte

Dataset({
    features: ['text', 'hypothesis', 'label'],
    num_rows: 2000
})

In [89]:
# Keep only the columns needed downstream: take the 'train' split, expose the
# unnamed CSV index column as 'index', and drop the annotation helper columns.
# This rebinds dataset_rte from a DatasetDict to a Dataset, so re-run the
# loading cell before re-running this one.
dataset_rte = (
    dataset_rte['train']
    .rename_column('Unnamed: 0', 'index')
    .remove_columns(["correct", "correct.1", "label1", "label2"])
)
dataset_rte

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label'],
    num_rows: 400
})

In [90]:
# Gold-label column, looked up by row position when computing importance.
label_rte = dataset_rte['gold_label']

In [91]:
# Row index of the first row of every 4-row group, derived from the dataset
# size instead of a hard-coded row count.
first_id = np.arange(0, len(dataset_rte), 4)

# Preview the first row of the first six groups.
for group_start in first_id[:6]:
    print(dataset_rte[int(group_start)])

{'index': 0, 'text': 'Thanks to a global ban on the ivory trade that was passed in 1989 by the Convention on International Trade in Endangered Species of Wild Fauna and Flora (CITES), the African elephant population may be reversing its spiral toward extinction', 'hypothesis': 'The ban on ivory trade has been effective in protecting the elephant from extinction.', 'gold_label': 'entailment'}
{'index': 4, 'text': 'Orhan Pamuk, a prominent, post-modern writer whose work is translated into more than 40 languages, received the 2006 Nobel Prize in Literature. Pamuk was an admired writer in Turkey until the events in 2005, when lawyers of two Turkish professional associations brought criminal charges against him for "insulting Turkishness" after the author\'s controversial statements regarding the disputed Armenian Genocide of 1915-1917. He claimed, and repeated his claim, that  ... one million Armenians and 30,000 Kurds were killed in Turkey.', 'hypothesis': 'Pamuk won a Nobel Prize.', 'gol

In [92]:
# Negation importance per 4-row group (rows g, g+1, g+2, g+3):
#   g   -> 0 (reference row, never modified)
#   g+1 -> 1 if its gold label differs from row g, else 0
#   g+2 -> 1 if its gold label differs from row g, else 0
#   g+3 -> ['0'|'1', '0'|'1']: differs from row g+2, then differs from row g+1
# NOTE(review): presumably row g is the original pair and rows g+1..g+3 are
# its negated variants - confirm against the annotation file.
# The mixed int / list-of-str encoding is kept on purpose: a later cell calls
# str() on every value before it is written to the TSV.
n_rows_rte = len(label_rte)
assert n_rows_rte % 4 == 0, f"expected complete groups of 4 rows, got {n_rows_rte}"

importance_rte = [0] * n_rows_rte

# Iterate over the labels themselves rather than over a separately defined
# index array, so the loop always covers exactly the rows that were loaded.
for start in range(0, n_rows_rte, 4):
    base = label_rte[start]
    second = label_rte[start + 1]
    third = label_rte[start + 2]
    fourth = label_rte[start + 3]

    importance_rte[start + 1] = int(second != base)
    importance_rte[start + 2] = int(third != base)
    importance_rte[start + 3] = [
        '1' if fourth != third else '0',
        '1' if fourth != second else '0',
    ]

importance_rte[:5]

[0, 1, 1, ['1', '1'], 0]

In [93]:
# Stringify every importance value (ints and two-element lists alike) so the
# column has a single string type.
importance_rte_str = list(map(str, importance_rte))

In [94]:
# Spot-check: the four values of the first group plus the first of the next.
importance_rte_str[:5]

['0', '1', '1', "['1', '1']", '0']

In [95]:
# Schema before the 'importance' column is added.
dataset_rte

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label'],
    num_rows: 400
})

In [96]:
# Attach the stringified importance values as a new 'importance' column.
dataset_rte = dataset_rte.add_column('importance', importance_rte_str)

In [97]:
# Schema after adding 'importance'.
dataset_rte

Dataset({
    features: ['index', 'text', 'hypothesis', 'gold_label', 'importance'],
    num_rows: 400
})

In [98]:
# Alternative output path, currently disabled: 'data/rte_completed_with_importance.tsv'
# Write the RTE rows, including the 'importance' column, as a tab-separated file.
dataset_rte.to_csv('data_annote/v_final/rte_negative_1to4pairs_sampled_label_with_importance.tsv', sep='\t')

Creating CSV from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

136393