In [1]:
# !wget https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip
# !unzip -o multinli_1.0.zip

In [2]:
# !wget http://qim.fs.quoracdn.net/quora_duplicate_questions.tsv

In [3]:
import re
from unidecode import unidecode

def cleaning(string):
    """Normalise a piece of text.

    The input is passed through ``unidecode``, every run of one or more
    space characters is collapsed to a single space, and leading/trailing
    whitespace is stripped.

    Parameters
    ----------
    string : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text.
    """
    transliterated = unidecode(string)
    single_spaced = re.sub(r'[ ]+', ' ', transliterated)
    return single_spaced.strip()

In [4]:
from glob import glob

# Collect the MultiNLI .jsonl splits extracted from the archive.
# NOTE: glob() does not guarantee any particular ordering of its results,
# so positions in this list are not stable across machines/filesystems.
files = glob('multinli_1.0/multinli_1.0_*.jsonl')
files

['multinli_1.0/multinli_1.0_dev_mismatched.jsonl',
 'multinli_1.0/multinli_1.0_train.jsonl',
 'multinli_1.0/multinli_1.0_dev_matched.jsonl']

In [5]:
import json

# Pick each split by its file name rather than by its position in `files`:
# glob() returns paths in arbitrary order, so indexing by position can
# silently load a dev file as the training set on another machine.
split_paths = {}
for split_name in ('train', 'dev_mismatched', 'dev_matched'):
    candidates = [path for path in files if path.endswith(f'_{split_name}.jsonl')]
    if len(candidates) != 1:
        raise FileNotFoundError(
            f'expected exactly one {split_name} file, found {candidates}'
        )
    split_paths[split_name] = candidates[0]


def _read_jsonl_lines(path):
    """Return the non-empty lines of ``path`` as a list of raw strings.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform default encoding. Empty lines (such as the one produced by
    a trailing newline) are dropped because they are not JSON records.
    """
    with open(path, encoding='utf-8') as fopen:
        return [line for line in fopen.read().split('\n') if line]


train = _read_jsonl_lines(split_paths['train'])

# dev = mismatched records followed by matched records.
dev = _read_jsonl_lines(split_paths['dev_mismatched'])
dev.extend(_read_jsonl_lines(split_paths['dev_matched']))

In [6]:
# Gold labels that are kept; records carrying any other gold label are skipped
# when the datasets are built. A label's position in this list is used as its
# integer class id (via labels.index).
labels = ['contradiction', 'entailment']

In [13]:
from tqdm import tqdm

# Build the training set: one "<sentence1> <> <sentence2>" string per record
# whose gold label is in `labels`, plus the matching label string.
train_X, train_Y = [], []

for raw_line in tqdm(train):
    try:
        record = json.loads(raw_line)
        if record['gold_label'] not in labels:
            continue
        # Skip records where either sentence is empty.
        if len(record['sentence1']) and len(record['sentence2']):
            train_X.append(f"{record['sentence1']} <> {record['sentence2']}")
            train_Y.append(record['gold_label'])
    except (json.JSONDecodeError, KeyError, TypeError):
        # Best-effort parsing: blank/malformed lines and records with missing
        # or non-sized fields are skipped. Only these expected errors are
        # caught so that e.g. KeyboardInterrupt still stops the loop.
        continue

100%|██████████| 392703/392703 [00:03<00:00, 115787.44it/s]


In [8]:
# Build the test set from the dev records: one "<sentence1> <> <sentence2>"
# string per record whose gold label is in `labels`, plus the label string.
test_X, test_Y = [], []

for raw_line in tqdm(dev):
    try:
        record = json.loads(raw_line)
        if record['gold_label'] not in labels:
            continue
        # Skip records where either sentence is empty.
        if len(record['sentence1']) and len(record['sentence2']):
            test_X.append(f"{record['sentence1']} <> {record['sentence2']}")
            test_Y.append(record['gold_label'])
    except (json.JSONDecodeError, KeyError, TypeError):
        # Best-effort parsing: blank/malformed lines and records with missing
        # or non-sized fields are skipped. Only these expected errors are
        # caught so that e.g. KeyboardInterrupt still stops the loop.
        continue

100%|██████████| 20002/20002 [00:00<00:00, 93673.10it/s]


In [9]:
import youtokentome as yttm

# Write every pair string (test pairs first, then train pairs), one per line,
# to serve as the BPE training corpus. The encoding is given explicitly so the
# bytes on disk do not depend on the platform's default text encoding.
with open('out.txt', 'w', encoding='utf-8') as fopen:
    fopen.write('\n'.join(test_X + train_X))

# Train a 30000-entry BPE vocabulary on that corpus, then load the saved model.
yttm.BPE.train(data='out.txt', vocab_size=30000, model='vocab.model')
bpe = yttm.BPE(model='vocab.model')

In [10]:
# Show the first four vocabulary entries; their list positions are their ids.
bpe.vocab()[:4]

['<PAD>', '<UNK>', '<BOS>', '<EOS>']

In [11]:
# Sanity check: join two encodings with id 2 (the position of '<BOS>' in the
# vocabulary listing) and decode the result to see how the joint sequence reads.
bpe.decode(bpe.encode('halo') + [2] + bpe.encode('halo'))

['halo<BOS> halo']

In [15]:
# Contrastive training data: the two halves of every pair string are
# BPE-encoded separately; the label is stored as its index in `labels`.
left_train, right_train, label_train = [], [], []

for idx, joined in enumerate(tqdm(train_X)):
    left_text, right_text = joined.split(' <> ')
    left_ids = bpe.encode(left_text)
    left_train.append(left_ids)
    right_ids = bpe.encode(right_text)
    right_train.append(right_ids)
    label_train.append(labels.index(train_Y[idx]))

100%|██████████| 261802/261802 [00:09<00:00, 26791.84it/s]


In [16]:
# Contrastive test data: the two halves of every pair string are BPE-encoded
# separately; the label is stored as its index in `labels`.
left_test, right_test, label_test = [], [], []

for idx, joined in enumerate(tqdm(test_X)):
    left_text, right_text = joined.split(' <> ')
    try:
        # Compute everything first and append afterwards, so a failure on one
        # example can never leave the three lists with different lengths.
        label_id = labels.index(test_Y[idx])
        left_ids = bpe.encode(left_text)
        right_ids = bpe.encode(right_text)
    except Exception:
        # Best-effort: skip an example that cannot be encoded. `Exception`
        # (not a bare except) lets KeyboardInterrupt/SystemExit propagate.
        continue
    label_test.append(label_id)
    left_test.append(left_ids)
    right_test.append(right_ids)

100%|██████████| 13395/13395 [00:00<00:00, 29595.87it/s]


In [17]:
# Persist the separately-encoded left/right sequences and their labels.
contrastive_payload = dict(
    left_train=left_train,
    right_train=right_train,
    label_train=label_train,
    left_test=left_test,
    right_test=right_test,
    label_test=label_test,
)

with open('contrastive.json', 'w') as fopen:
    json.dump(contrastive_payload, fopen)

In [18]:
# Pair training data: each example is a single id sequence made of the left
# encoding, the id 2 as a separator, and the right encoding.
# NOTE: this rebinds `left_train` / `label_train`, replacing the lists that
# were built for the contrastive dataset.
left_train, label_train = [], []

for idx, joined in enumerate(tqdm(train_X)):
    left_text, right_text = joined.split(' <> ')
    pair_ids = bpe.encode(left_text) + [2] + bpe.encode(right_text)
    left_train.append(pair_ids)
    label_train.append(labels.index(train_Y[idx]))

100%|██████████| 261802/261802 [00:09<00:00, 26215.21it/s]


In [19]:
# Pair test data: each example is a single id sequence made of the left
# encoding, the id 2 as a separator, and the right encoding.
# NOTE: this rebinds `left_test` / `label_test`, replacing the lists that
# were built for the contrastive dataset.
left_test, label_test = [], []

for idx, joined in enumerate(tqdm(test_X)):
    try:
        # Compute everything first and append afterwards, so a failure on one
        # example can never leave the two lists with different lengths.
        left_text, right_text = joined.split(' <> ')
        label_id = labels.index(test_Y[idx])
        pair_ids = bpe.encode(left_text) + [2] + bpe.encode(right_text)
    except Exception:
        # Best-effort: skip an example that cannot be split or encoded.
        # `Exception` (not a bare except) lets KeyboardInterrupt propagate.
        continue
    label_test.append(label_id)
    left_test.append(pair_ids)

100%|██████████| 13395/13395 [00:00<00:00, 13604.82it/s]


In [20]:
# Persist the joint (left + separator + right) sequences and their labels.
pair_payload = dict(
    left_train=left_train,
    label_train=label_train,
    left_test=left_test,
    label_test=label_test,
)

with open('pair.json', 'w') as fopen:
    json.dump(pair_payload, fopen)

In [21]:
# Persist the raw pair strings and their label strings.
text_payload = dict(
    train_X=train_X,
    train_Y=train_Y,
    test_X=test_X,
    test_Y=test_Y,
)

with open('text.json', 'w') as fopen:
    json.dump(text_payload, fopen)