# The purpose of this script: to use the fine-tunned negative sentences model to automatically turn data into negative forms

## Librairies to install

In [1]:
!nvidia-smi

Mon Aug 15 06:47:48 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    27W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
%cd "/content/drive/MyDrive/memoire"

/content/drive/MyDrive/memoire


In [4]:
! pip install transformers
! pip install datasets

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.21.1-py3-none-any.whl (4.7 MB)
[K     |████████████████████████████████| 4.7 MB 5.1 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 54.1 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.8.1-py3-none-any.whl (101 kB)
[K     |████████████████████████████████| 101 kB 14.0 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 76.4 MB/s 
Installing collected packages: pyyaml, tokenizers, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installation: PyYAML 3.13
    Uninstal

In [5]:
import pandas as pd
import numpy as np
import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from datasets import load_dataset

## Preparing the model and the tokenizer

* Architecture: This is the skeleton of the model — the definition of each layer and each operation that happens within the model.
* Checkpoints: These are the weights that will be loaded in a given architecture.
* Model: This is an umbrella term that isn’t as precise as “architecture” or “checkpoint”: it can mean both. This course will specify architecture or checkpoint when it matters to reduce ambiguity.


In [6]:
# load tokenizer and model
checkpoint = './bart-base-finetuned-negative-sentences-generation/checkpoint-4050'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)
model.to('cuda' if torch.cuda.is_available() else 'cpu')

BartForConditionalGeneration(
  (model): BartModel(
    (shared): Embedding(50265, 768, padding_idx=1)
    (encoder): BartEncoder(
      (embed_tokens): Embedding(50265, 768, padding_idx=1)
      (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
      (layers): ModuleList(
        (0): BartEncoderLayer(
          (self_attn): BartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (activation_fn): GELUActivation()
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05,

## RTE

In [7]:
# Download data with datasets
dataset_rte = load_dataset("csv", data_files="./corpus/RTE/train.tsv", sep='\t')
dataset_rte



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e799b043e2063c24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e799b043e2063c24/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'sentence1', 'sentence2', 'label'],
        num_rows: 2490
    })
})

In [48]:
dataset_rte_shuffle = dataset_rte.shuffle(seed=42)

In [49]:
text_rte = dataset_rte_shuffle['train']['sentence1'][:100]
hypothesis_rte = dataset_rte_shuffle['train']['sentence2'][:100]

In [50]:
label_rte = dataset_rte_shuffle['train']['label'][:100]

In [51]:
# choose the batch size
batch_size = []
for i in range(10, 16):
  if len(text_rte) % i ==0:
    batch_size.append(i)

selected_bsize_rte = min(batch_size)
print(selected_bsize_rte)

10


In [52]:
neg_text_rte = []
for i in range(0, len(text_rte), selected_bsize_rte):
  batch = tokenizer(text_rte[i:i+selected_bsize_rte], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_text_rte += negative_examples

# RTE (2490 texts): with t4, 9 min  ; with p100, 7min32 

In [53]:
neg_hypothesis_rte = []
for i in range(0, len(hypothesis_rte), selected_bsize_rte):
  batch = tokenizer(hypothesis_rte[i:i+selected_bsize_rte], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_hypothesis_rte += negative_examples

# RTE (2490 hypothesis): with t4, 2 min  ; with p100, 1 min

In [54]:
print(f"The origin is : {text_rte[0]}.\nThe negative sentence is : {neg_text_rte[0]}. \nThe length is :{len(neg_text_rte)}")
print(f"The origin is : {hypothesis_rte[0]}.\nThe negative sentence is : {neg_hypothesis_rte[0]}. \nThe length is :{len(neg_hypothesis_rte)}")

The origin is : Thanks to a global ban on the ivory trade that was passed in 1989 by the Convention on International Trade in Endangered Species of Wild Fauna and Flora (CITES), the African elephant population may be reversing its spiral toward extinction.
The negative sentence is : Thanks to a global ban on the ivory trade that was not passed in 1989 by the Convention on International Trade in Endangered Species of Wild Fauna and Flora (CITES), the African elephant population may be reversing its spiral toward extinction. 
The length is :100
The origin is : The ban on ivory trade has been effective in protecting the elephant from extinction..
The negative sentence is : The ban on ivory trade has not been effective in protecting the elephant from extinction.. 
The length is :100


In [55]:
# generate a new csv with only negative sentences 
list_res_rte=[]
for i in range(len(neg_text_rte)):
    list_res_rte.append([text_rte[i],hypothesis_rte[i],label_rte[i]])
    list_res_rte.append([neg_text_rte[i],hypothesis_rte[i]])
    list_res_rte.append([text_rte[i],neg_hypothesis_rte[i]])
    list_res_rte.append([neg_text_rte[i],neg_hypothesis_rte[i]])
 
column_name = ['text', 'hypothesis', 'label']
csv_name='./corpus/rte_negative_1to4pairs_sampled.csv'
xml_df = pd.DataFrame(list_res_rte, columns=column_name)
xml_df.to_csv(csv_name, sep='\t')

# print(list_res[:5]) 

## SNLI

In [16]:
dataset_snli = load_dataset("csv", data_files="./corpus/SNLI/snli_1.0_train.tsv", sep='\t')
dataset_snli



Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-98f253f7aca8a412/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-98f253f7aca8a412/0.0.0/652c3096f041ee27b04d2232d41f10547a8fecda3e284a79a0ec4053c916ef7a. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['gold_label', 'sentence1_binary_parse', 'sentence2_binary_parse', 'sentence1_parse', 'sentence2_parse', 'sentence1', 'sentence2', 'captionID', 'pairID', 'label1', 'label2', 'label3', 'label4', 'label5'],
        num_rows: 550152
    })
})

In [40]:
dataset_snli_shuffle = dataset_snli.shuffle(seed=42)

In [41]:
text_snli = dataset_snli_shuffle['train']['sentence1'][:100]
hypothesis_snli = dataset_snli_shuffle['train']['sentence2'][:100]

In [42]:
label_snli = dataset_snli_shuffle['train']['gold_label'][:100]

In [43]:
# choose the batch size
batch_size = []
for i in range(10, 16):
  if len(text_snli) % i ==0:
    batch_size.append(i)

selected_bsize_snli = min(batch_size)
print(selected_bsize_snli)

10


In [44]:
neg_text_snli = []
for i in range(0, len(text_snli), selected_bsize_snli):
  batch = tokenizer(text_snli[i:i+selected_bsize_snli], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_text_snli += negative_examples
# SNLI: 1 min => 5500, 

In [45]:
neg_hypothesis_snli = []
for i in range(0, len(hypothesis_snli), selected_bsize_snli):
  batch = tokenizer(hypothesis_snli[i:i+selected_bsize_snli], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_hypothesis_snli += negative_examples

# SNLI: 2 min => 5500 

In [46]:
print(f"The origin is : {text_snli[0]}.\nThe negative sentence is : {neg_text_snli[0]}. \nThe length is :{len(neg_text_snli)}")
print(f"The origin is : {hypothesis_snli[0]}.\nThe negative sentence is : {neg_hypothesis_snli[0]}. \nThe length is :{len(neg_hypothesis_snli)}")

The origin is : A historian and his friend digging in the mines to look for more fossils for study..
The negative sentence is : A historian and his friend are not digging in the mines to look for more fossils for study.. 
The length is :100
The origin is : the historian is digging with his friend for study..
The negative sentence is : the historian is not digging with his friend for study.. 
The length is :100


In [47]:
# generate a new csv with only negative sentences 
list_res_snli=[]
for i in range(len(neg_text_snli)):
    list_res_snli.append([text_snli[i],hypothesis_snli[i],label_snli[i]])
    list_res_snli.append([neg_text_snli[i],hypothesis_snli[i]])
    list_res_snli.append([text_snli[i],neg_hypothesis_snli[i]])
    list_res_snli.append([neg_text_snli[i],neg_hypothesis_snli[i]])
 
column_name = ['text', 'hypothesis', 'label']
csv_name='./corpus/snli_negative_1to4pairs_sampled.csv'
xml_df = pd.DataFrame(list_res_snli, columns=column_name)
xml_df.to_csv(csv_name, sep='\t')

# print(list_res[:5]) 

## MNLI

In [25]:
dataset_mnli = load_dataset("json", data_files="./corpus/MNLI/original/multinli_1.0_train.jsonl")
dataset_mnli



Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-a4739c86f41daf2e/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-a4739c86f41daf2e/0.0.0/a3e658c4731e59120d44081ac10bf85dc7e1388126b92338344ce9661907f253. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['annotator_labels', 'genre', 'gold_label', 'pairID', 'promptID', 'sentence1', 'sentence1_binary_parse', 'sentence1_parse', 'sentence2', 'sentence2_binary_parse', 'sentence2_parse'],
        num_rows: 392702
    })
})

In [26]:
dataset_mnli_shuffle = dataset_mnli.shuffle(seed=42)

In [33]:
text_mnli = dataset_mnli_shuffle['train']['sentence1'][:100]
hypothesis_mnli = dataset_mnli_shuffle['train']['sentence2'][:100]

In [34]:
label_mnli = dataset_mnli_shuffle['train']['gold_label'][:100]

In [35]:
# choose the batch size
batch_size = []
for i in range(10, 16):
  if len(text_mnli) % i ==0:
    batch_size.append(i)

selected_bsize_mnli = min(batch_size)
print(selected_bsize_mnli)

10


In [36]:
neg_text_mnli = []
for i in range(0, len(text_mnli), selected_bsize_mnli):
  batch = tokenizer(text_mnli[i:i+selected_bsize_mnli], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_text_mnli += negative_examples

# mnli 3900: 

In [37]:
neg_hypothesis_mnli = []
for i in range(0, len(hypothesis_mnli), 10):
  batch = tokenizer(hypothesis_mnli[i:i+10], max_length=1024, padding=True, truncation=True, return_tensors="pt")
  out = model.generate(batch['input_ids'].to(model.device), max_length=1024)
  negative_examples = tokenizer.batch_decode(out, skip_special_tokens=True)
  neg_hypothesis_mnli += negative_examples

# mnli 3900: 

In [38]:
print(f"The origin is : {text_mnli[0]}.\nThe negative sentence is : {neg_text_mnli[0]}. \nThe length is :{len(neg_text_mnli)}")
print(f"The origin is : {hypothesis_mnli[0]}.\nThe negative sentence is : {neg_hypothesis_mnli[0]}. \nThe length is :{len(neg_hypothesis_mnli)}")

The origin is : I'll hurry over that part..
The negative sentence is : I'll not hurry over that part.. 
The length is :100
The origin is : "I'll be quick with that part.".
The negative sentence is : "I'll not be quick with that part.". 
The length is :100


In [39]:
# generate a new csv with only negative sentences 
list_res_mnli=[]
for i in range(len(neg_text_mnli)):
    list_res_mnli.append([text_mnli[i],hypothesis_mnli[i],label_mnli[i]])
    list_res_mnli.append([neg_text_mnli[i],hypothesis_mnli[i]])
    list_res_mnli.append([text_mnli[i],neg_hypothesis_mnli[i]])
    list_res_mnli.append([neg_text_mnli[i],neg_hypothesis_mnli[i]])
 
column_name = ['text', 'hypothesis', 'label']
csv_name='./corpus/mnli_negative_1to4pairs_sampled.csv'
xml_df = pd.DataFrame(list_res_mnli, columns=column_name)
xml_df.to_csv(csv_name, sep='\t')

# print(list_res[:5]) 

## test

In [None]:
for i in range(0, 550152, 12):
  print(i,i+12)

In [None]:
for i in range(10, 16):
  if len(dataset_snli['train'])% i ==0:
    print(i)

12


In [None]:
for i in range(10, 16):
  if len(dataset_rte['train'])% i ==0:
    print(i)

10
15


In [None]:
print(2490%10)

0
