In [1]:
import json
import glob
import re
import malaya

In [2]:
# Bound `tokenize` method of a freshly constructed malaya `_SocialTokenizer`;
# `preprocessing` below calls it to split a raw string into tokens.
# NOTE(review): `_SocialTokenizer` is underscore-prefixed, so it is presumably a
# private malaya API that may change between releases — confirm the pinned version.
tokenizer = malaya.preprocessing._SocialTokenizer().tokenize

def is_number_regex(s):
    """Return True when `s` is written as a number.

    Two shapes are accepted:
      * a decimal with digits on both sides of a single dot (e.g. "3.14"),
      * a string for which `str.isdigit()` is true (e.g. "42").

    Anything else, including the empty string, "1." and ".5", yields False.
    """
    # Raw string literal: in a normal literal "\d" and "\." are invalid escape
    # sequences, which Python reports with a warning at compile time.
    if re.match(r"^\d+?\.\d+?$", s) is None:
        # Not a decimal; fall back to the plain all-digits check.
        return s.isdigit()
    return True

def detect_money(word):
    """Tell whether `word` is a ringgit amount written as 'rm' + number.

    The first two characters must be exactly 'rm' (case-sensitive) and the
    remainder must satisfy `is_number_regex`. Returns a bool.
    """
    # Guard clause: without the 'rm' prefix the numeric check is never run.
    if word[:2] != 'rm':
        return False
    return bool(is_number_regex(word[2:]))

def preprocessing(string):
    """Tokenize `string` and normalise the tokens.

    Steps applied to every token produced by `tokenizer`:
      1. tokens of length 2 or less are dropped (length measured before lowercasing),
      2. the token is lowercased,
      3. numeric tokens (per `is_number_regex`) become '<NUM>',
      4. money tokens (per `detect_money`) become '<MONEY>'.

    Returns the resulting list of tokens.
    """
    cleaned = []
    for token in tokenizer(string):
        if len(token) <= 2:
            continue
        token = token.lower()
        if is_number_regex(token):
            cleaned.append('<NUM>')
        elif detect_money(token):
            cleaned.append('<MONEY>')
        else:
            cleaned.append(token)
    return cleaned

In [3]:
# Parallel lists: left[k] and right[k] are the two questions of pair k and
# label[k] is the value stored next to that pair in the JSON file.
left, right, label = [], [], []
# sorted(): glob.glob returns matches in arbitrary, filesystem-dependent order.
# The train/test split further down takes the LAST rows as the test set, so the
# file order must be deterministic for the split to be reproducible.
for path in sorted(glob.glob('quora/*.json')):
    # Explicit UTF-8 so decoding does not depend on the machine's locale.
    with open(path, encoding='utf-8') as fopen:
        records = json.load(fopen)
    for record in records:
        # record[0] holds both questions joined by ' <> '; record[1] is the label.
        pair = record[0].split(' <> ')
        if len(pair) != 2:
            # The separator is missing or occurs more than once: skip the record.
            continue
        left.append(pair[0])
        right.append(pair[1])
        label.append(record[1])

In [4]:
# Sanity check: the three parallel lists must have the same length.
tuple(len(column) for column in (left, right, label))

(403831, 403831, 403831)

In [5]:
# Load the two synonym files; the next cell iterates over `s + s1` as
# (word, list-of-synonyms) pairs.
# Explicit UTF-8 so decoding does not depend on the machine's locale.
with open('synonym0.json', encoding='utf-8') as fopen:
    s = json.load(fopen)

with open('synonym1.json', encoding='utf-8') as fopen:
    s1 = json.load(fopen)

In [6]:
# Merge both synonym sources into one mapping: word -> list of replacement
# candidates. The word itself is added as a candidate the first time it is seen.
synonyms = {}
for head, alternatives in (s + s1):
    bucket = synonyms.get(head)
    if bucket is None:
        synonyms[head] = alternatives + [head]
    else:
        bucket.extend(alternatives)
# Drop duplicate candidates (the resulting list order is whatever the set yields).
synonyms = {head: list(set(bucket)) for head, bucket in synonyms.items()}

In [7]:
import random

def augmentation(s, maximum = 0.8):
    """Return a synonym-augmented, lowercased, whitespace-split copy of `s`.

    `int(len(words) * maximum)` replacement rounds are performed. Each round
    picks a random position (positions may repeat across rounds) and replaces
    the word there with a random entry from `synonyms` for that exact word;
    words without an entry are left as they are.

    Returns the list of words, not a joined string.
    """
    words = s.lower().split()
    rounds = int(len(words) * maximum)
    for _ in range(rounds):
        position = random.randint(0, len(words) - 1)
        current = words[position]
        # Unknown words fall back to a one-element list containing themselves.
        candidates = synonyms.get(current, [current])
        words[position] = random.choice(candidates)
    return words

In [8]:
# Number of trailing examples held out as the test set.
TEST_SIZE = 50000
# Explicit split index instead of negative slices: `seq[:-n]` silently becomes
# empty when n == 0. max() keeps the "everything goes to test" behaviour when
# there are fewer than TEST_SIZE examples.
# left, right and label are parallel lists, so one index splits all three.
split_at = max(len(left) - TEST_SIZE, 0)
train_left, test_left = left[:split_at], left[split_at:]
train_right, test_right = right[:split_at], right[split_at:]
train_label, test_label = label[:split_at], label[split_at:]

In [9]:
# Sizes of the training and test portions.
tuple(map(len, (train_left, test_left)))

(353831, 50000)

In [10]:
# Preview: ten augmentation draws of the first left question plus its
# lowercased original, with duplicates removed.
aug = [' '.join(augmentation(train_left[0])) for _ in range(10)]
aug.append(train_left[0].lower())
aug = list(set(aug))
aug

['apakah maksud mengecap sejati kepada anda?',
 'apakah maksud pilihan sejati kepada anda?',
 'apakah maksud mencinta sejati kepada anda?',
 'apakah maksud mengasihi sejati kepada anda?',
 'apakah maksud cinta sejati kepada anda?',
 'apakah maksud menyayangi sejati kepada anda?',
 'apakah maksud percintaan sejati kepada anda?']

In [11]:
# Preview: ten augmentation draws of the first right question plus its
# lowercased original, with duplicates removed.
aug = [' '.join(augmentation(train_right[0])) for _ in range(10)]
aug.append(train_right[0].lower())
aug = list(set(aug))
aug

['apakah maksud "cinta sejati"?']

In [12]:
# Label stored for the first training pair (the pair previewed in the two cells above).
train_label[0]

0

In [13]:
from tqdm import tqdm

# Build the augmented training set. For every training pair, each side gets up
# to four distinct variants (three augmentation draws plus the lowercased
# original, deduplicated); every left/right combination is emitted with the
# pair's label.
LEFT, RIGHT, LABEL = [], [], []
for left_text, right_text, pair_label in tqdm(
    zip(train_left, train_right, train_label), total=len(train_left)
):
    left_variants = [' '.join(augmentation(left_text)) for _ in range(3)]
    left_variants.append(left_text.lower())
    left_variants = list(set(left_variants))

    right_variants = [' '.join(augmentation(right_text)) for _ in range(3)]
    right_variants.append(right_text.lower())
    right_variants = list(set(right_variants))

    for left_variant in left_variants:
        for right_variant in right_variants:
            LEFT.append(left_variant)
            RIGHT.append(right_variant)
            LABEL.append(pair_label)

100%|██████████| 353831/353831 [00:46<00:00, 7536.26it/s]


In [14]:
# Sanity check: the augmented parallel lists must have the same length.
tuple(len(column) for column in (LEFT, RIGHT, LABEL))

(4136391, 4136391, 4136391)

In [15]:
# Replace every augmented training sentence, in place, by its token list.
# NOTE: this overwrites the strings in LEFT/RIGHT, so the cell is meant to be
# run exactly once after the augmentation cell.
for idx in tqdm(range(len(LEFT))):
    LEFT[idx], RIGHT[idx] = preprocessing(LEFT[idx]), preprocessing(RIGHT[idx])

100%|██████████| 4136391/4136391 [10:34<00:00, 6523.13it/s]


In [16]:
# Replace every test sentence, in place, by its token list (no augmentation
# is applied to the test split).
# NOTE: this overwrites the strings in test_left/test_right, so the cell is
# meant to be run exactly once after the split cell.
for idx in tqdm(range(len(test_left))):
    test_left[idx], test_right[idx] = (
        preprocessing(test_left[idx]),
        preprocessing(test_right[idx]),
    )

100%|██████████| 50000/50000 [00:06<00:00, 7268.75it/s]


In [17]:
# Persist the tokenized, augmented training set as one JSON object with
# three parallel arrays.
with open('train-similarity.json', 'w') as out_file:
    json.dump(
        {'left': LEFT, 'right': RIGHT, 'label': LABEL},
        out_file,
    )

In [18]:
# Persist the tokenized test set as one JSON object with three parallel arrays.
with open('test-similarity.json', 'w') as out_file:
    json.dump(
        {'left': test_left, 'right': test_right, 'label': test_label},
        out_file,
    )