In [1]:
import sys, os

In [2]:
# Path template per NLI corpus; the "{split}" placeholder is filled via str.format.
dataset_path_dict = {
    "ViNLI": "../data/vinli/UIT_ViNLI_1.0_{split}.jsonl",
    "SNLI": "data/snli/snli_1.0_{split}.jsonl",
    "MultiNLI": "data/multinli/multinli_1.0_{split}.jsonl",
    "Contract_NLI": "data/contract-nli/{split}.json",
}

# Split names substituted into the path templates (a list, despite the name).
split_dict = ["train", "test", "dev"]

# Short alias -> checkpoint name handed to AutoTokenizer.from_pretrained.
tokenizer_dict = {
    "xlmr": "xlm-roberta-large",
    "t5": "t5-large",
    "phobert": "vinai/phobert-large",
}

# Disabled gold-label -> integer id encoding, kept for reference.
# label_dict = {
#     "contradiction": 0,
#     "neutral": 1,
#     "entailment": 2,
#     "other": 3,
#     "-": -1
# }

# Label name -> integer id; presumably for the Contract_NLI data (going by the name).
contract_label_dict = {
    "Contradiction": 0,
    "NotMentioned": 1,
    "Entailment": 2,
}

In [3]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer

In [4]:
def Find_max_length(dataset, split_dict, tokenize_name):
    """Return the longest tokenized sentence-pair length across the given splits.

    Every example is tokenized as the pair (``sentence2``, ``sentence1``) with
    the tokenizer registered under ``tokenize_name`` in ``tokenizer_dict``. The
    tokenizer is called without truncation or padding arguments, and the length
    of each resulting ``attention_mask`` is taken as the sequence length.

    Unlike the earlier version, which only looked at the ``'train'`` split,
    every split named in ``split_dict`` is scanned, so the result also covers
    long examples that occur only in the evaluation splits.

    Args:
        dataset: Mapping of split name -> split that supports a batched
            ``.map`` (e.g. the object returned by ``load_dataset``); each split
            must have ``sentence1`` and ``sentence2`` columns.
        split_dict: Iterable of split names to scan.
        tokenize_name: Key into ``tokenizer_dict`` selecting the tokenizer.

    Returns:
        int: The maximum sequence length found, or 0 when the scanned splits
        contain no rows.

    Raises:
        KeyError: If ``tokenize_name`` is not in ``tokenizer_dict`` or a name
            in ``split_dict`` is not a split of ``dataset``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_dict[tokenize_name])
    tokenized = dataset.map(lambda examples: tokenizer(
        examples["sentence2"],
        examples["sentence1"],
        ), batched=True)

    longest = 0
    for split in split_dict:
        # max() with a default avoids sorting the whole column and copes with
        # an empty split instead of failing.
        split_longest = max(
            (len(mask) for mask in tokenized[split]['attention_mask']),
            default=0,
        )
        longest = max(longest, split_longest)
    return longest

def TakeSampleDataset(dataset, split_dict, num_sample):
    """Shrink every split named in ``split_dict`` to its first ``num_sample`` rows.

    Each split is sampled from *itself*. The earlier version drew the second
    and third splits from the first one, which silently replaced the
    evaluation data with training rows.

    Args:
        dataset: Mutable mapping of split name -> split; each split must
            support ``len()`` and ``.select(indices)``.
        split_dict: Iterable of split names to subsample.
        num_sample: Maximum number of leading rows to keep per split. Splits
            with fewer rows are kept whole rather than raising.

    Returns:
        The same ``dataset`` object, updated in place.
    """
    for split in split_dict:
        subset = dataset[split]
        # Clamp so a small split (e.g. dev/test) does not raise on select().
        keep = min(num_sample, len(subset))
        dataset[split] = subset.select(range(keep))
    return dataset

In [5]:
# Tokenizer alias (a key of tokenizer_dict) used for the length scan and the output path.
tokenize_name = "xlmr"

# One ViNLI file per split, built by filling the "{split}" placeholder.
vinli_template = dataset_path_dict['ViNLI']
data_files = {name: vinli_template.format(split=name) for name in split_dict}

# Load the files with the "json" builder, then drop rows whose gold_label is '-'.
dataset = load_dataset("json", data_files=data_files)
dataset = dataset.filter(lambda example: example['gold_label'] != '-')

# Disabled follow-up steps, kept for reference:
# if not self.load_all_labels:
#     dataset = dataset.filter(lambda example: example['gold_label'] != 'other')
# dataset = dataset.map(lambda example: {"labels": label_dict[example["gold_label"]]}, remove_columns=["gold_label"])

# The length reported by Find_max_length becomes the last component of the output path.
max_length = Find_max_length(dataset, split_dict, tokenize_name)
data_path = f'data_tokenized/{tokenize_name}/vinli/{max_length}'

Map:   0%|          | 0/24376 [00:00<?, ? examples/s]

Map:   0%|          | 0/2991 [00:00<?, ? examples/s]

Map:   0%|          | 0/3009 [00:00<?, ? examples/s]

In [6]:
# Peek at the train split: its column names and row count.
dataset['train']

Dataset({
    features: ['pairID', 'gold_label', 'link', 'context', 'sentence1', 'sentenceID', 'topic', 'sentence2', 'annotator_labels'],
    num_rows: 24376
})

In [7]:
# Continue with the train split only; `dataset` now names a single split
# instead of the split mapping.
# NOTE(review): this rebinding is not idempotent - re-running the cell indexes
# the already-selected split with 'train' again, so re-run the load cell first.
dataset = dataset['train']

In [8]:
# Drop the metadata columns; only gold_label, sentence1 and sentence2 remain.
unused_columns = ['pairID', 'link', 'context', 'sentenceID', 'topic', 'annotator_labels']
dataset = dataset.remove_columns(unused_columns)

In [9]:
# Check which columns are left after the removal.
dataset

Dataset({
    features: ['gold_label', 'sentence1', 'sentence2'],
    num_rows: 24376
})

In [13]:
# Look at the first ten gold labels to see which label values occur.
dataset['gold_label'][:10]

['entailment',
 'entailment',
 'contradiction',
 'contradiction',
 'neutral',
 'neutral',
 'other',
 'other',
 'entailment',
 'entailment']

In [14]:
def map_entailment(examples):
    """Collapse the gold label into a binary entailment / non_entailment scheme.

    Args:
        examples: A single example (dict-like) with a ``gold_label`` key. It
            is modified in place.

    Returns:
        The same ``examples`` object; ``gold_label`` is left as
        ``'entailment'`` or overwritten with ``'non_entailment'`` for every
        other value.
    """
    # Entailment rows pass through untouched.
    if examples['gold_label'] == 'entailment':
        return examples

    # Everything else falls into one bucket.
    examples['gold_label'] = 'non_entailment'
    return examples

In [15]:
# Relabel every row as entailment / non_entailment. map() is called without
# batched=True, so map_entailment receives one example per call.
dataset = dataset.map(map_entailment)

Map:   0%|          | 0/24376 [00:00<?, ? examples/s]

In [16]:
# Keep only the first ten rows as a small sample.
# NOTE(review): the slice result is later passed to Dataset.from_dict, so it
# is presumably a plain column mapping rather than a Dataset - confirm whether
# dataset.select(range(10)) could replace this slice-and-rebuild round trip.
dataset = dataset[:10]

In [18]:
import datasets

In [20]:
# Wrap the ten-row slice back into a datasets.Dataset object.
dataset = datasets.Dataset.from_dict(dataset)

In [21]:
# Confirm the rebuilt sample's columns and row count.
dataset

Dataset({
    features: ['gold_label', 'sentence1', 'sentence2'],
    num_rows: 10
})

In [22]:
# Upload the sample dataset under the repository name "vinli_entailment_test".
# NOTE(review): presumably needs an authenticated Hub session and publishes
# under the logged-in account's namespace - confirm before re-running.
dataset.push_to_hub("vinli_entailment_test")

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]