In [1]:
import json
import re
import sentencepiece as spm

In [2]:
from prepro_utils import preprocess_text, encode_ids, encode_pieces

# Load the trained SentencePiece model that backs all tokenisation below.
sp_model = spm.SentencePieceProcessor()
sp_model.Load('albert-base/sp10m.cased.v6.model')

# Read the companion vocabulary file. Each line is tab-separated; the first
# column becomes the key and the second column the value of `v`.
# The encoding is pinned to UTF-8 so the result does not depend on the
# platform's default locale encoding.
with open('albert-base/sp10m.cased.v6.vocab', encoding='utf-8') as fopen:
    # Split on '\n' only (not str.splitlines) so that pieces containing other
    # Unicode line separators stay intact, and skip empty lines instead of
    # assuming the file ends with exactly one trailing newline.
    v = [line.split('\t') for line in fopen.read().split('\n') if line]
v = {fields[0]: fields[1] for fields in v}

class Tokenizer:
    """Small adapter exposing tokenize / id-conversion methods.

    All three methods delegate to the module-level SentencePiece processor
    `sp_model`; the vocabulary mapping passed to the constructor is only
    stored on the instance.
    """

    def __init__(self, v):
        """Keep a reference to the vocabulary mapping `v` as `self.vocab`."""
        self.vocab = v

    def tokenize(self, string):
        """Split `string` into pieces via `encode_pieces` (no sampling)."""
        pieces = encode_pieces(sp_model, string, return_unicode=False, sample=False)
        return pieces

    def convert_tokens_to_ids(self, tokens):
        """Return the `sp_model` id of every piece in `tokens`, in order."""
        piece_ids = []
        for token in tokens:
            piece_ids.append(sp_model.PieceToId(token))
        return piece_ids

    def convert_ids_to_tokens(self, ids):
        """Return the `sp_model` piece for every id in `ids`, in order."""
        pieces = []
        for piece_id in ids:
            pieces.append(sp_model.IdToPiece(piece_id))
        return pieces
    
# Tokenizer instance used by the encoding cells below; it is handed the
# vocabulary mapping `v` read from the .vocab file.
tokenizer = Tokenizer(v)

In [3]:
import pickle

# Load the pickled dataset; later cells read its 'train_X', 'train_Y',
# 'test_X' and 'test_Y' entries.
# NOTE(review): pickle.load can execute arbitrary code while unpickling, so
# this file must come from a trusted source.
with open('../relevant-dataset.pkl', 'rb') as fopen:
    data = pickle.load(fopen)

In [4]:
import re
from unidecode import unidecode

def cleaning(string):
    """Return `string` with URLs and @-prefixed words removed.

    Steps, in order:
      1. pass the text through `unidecode` (ASCII transliteration);
      2. delete every ``scheme://host/path``-shaped substring;
      3. collapse runs of spaces and split on whitespace;
      4. drop words whose first character is ``@``;
      5. re-join the remaining words with single spaces.
    """
    ascii_text = unidecode(string)
    without_urls = re.sub(
        r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', ascii_text
    )
    words = re.sub(r'[ ]+', ' ', without_urls).strip().split()
    # split() never yields empty strings, so startswith is equivalent to
    # inspecting the first character.
    kept_words = filter(lambda word: not word.startswith('@'), words)
    return ' '.join(kept_words)

In [5]:
# Unpack the train and test splits (texts in *_X, labels in *_Y) from `data`.
train_X, train_Y = data['train_X'], data['train_Y']
test_X, test_Y = data['test_X'], data['test_Y']

In [6]:
from tqdm import tqdm

# Clean both splits in place, train first and then test, with one progress
# bar per split. Entries are overwritten by index, so `train_X` / `test_X`
# hold the cleaned texts afterwards.
for texts in (train_X, test_X):
    for position in tqdm(range(len(texts))):
        texts[position] = cleaning(texts[position])

100%|██████████| 135615/135615 [00:47<00:00, 2870.80it/s]
100%|██████████| 8405/8405 [00:03<00:00, 2801.42it/s]


In [8]:
# Fixed length of every encoded sequence, including the two marker tokens.
MAX_SEQ_LENGTH = 500

input_ids, input_masks, segment_ids = [], [], []

for text in tqdm(train_X):
    # Keep at most MAX_SEQ_LENGTH - 2 pieces so "<cls>" and "<sep>" still fit.
    pieces = tokenizer.tokenize(text)[:MAX_SEQ_LENGTH - 2]
    ids = tokenizer.convert_tokens_to_ids(["<cls>", *pieces, "<sep>"])

    real_len = len(ids)
    pad_len = MAX_SEQ_LENGTH - real_len

    # Ids are right-padded with 0; the mask is 1 on real positions and 0 on
    # padding; segment ids are 0 everywhere (real positions and padding alike).
    input_ids.append(ids + [0] * pad_len)
    input_masks.append([1] * real_len + [0] * pad_len)
    segment_ids.append([0] * MAX_SEQ_LENGTH)

100%|██████████| 135615/135615 [08:57<00:00, 252.48it/s]


In [9]:
# Encode the test texts into fixed-length id sequences.
# Only the padded ids are collected for the test split, so no attention-mask
# or segment-id lists are built here.
test_ids = []

for text in tqdm(test_X):
    # Keep at most MAX_SEQ_LENGTH - 2 pieces so "<cls>" and "<sep>" still fit.
    tokens_a = tokenizer.tokenize(text)[:MAX_SEQ_LENGTH - 2]
    tokens = ["<cls>"] + tokens_a + ["<sep>"]
    input_id = tokenizer.convert_tokens_to_ids(tokens)
    # Right-pad with id 0 up to MAX_SEQ_LENGTH.
    input_id += [0] * (MAX_SEQ_LENGTH - len(input_id))

    test_ids.append(input_id)

100%|██████████| 8405/8405 [00:32<00:00, 259.39it/s]


In [10]:
# Persist the encoded id sequences together with the untouched labels.
encoded_dataset = {
    'train_X': input_ids,
    'train_Y': train_Y,
    'test_X': test_ids,
    'test_Y': test_Y,
}
with open('relevancy-ids.pkl', 'wb') as fopen:
    pickle.dump(encoded_dataset, fopen)