In [1]:
import argparse
from itertools import product
import math
import nltk
from pathlib import Path
import numpy as np
import itertools
import codecs

# <div class="green">Tokenization</div>

In [2]:
from tokenizers import Tokenizer
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import WordLevel, BPE
from tokenizers.trainers import WordLevelTrainer, BpeTrainer
from tokenizers.processors import TemplateProcessing

In [3]:
# --- Run switches -----------------------------------------------------------
# TRAIN_TOKENIZERS: when True the tokenizer cells retrain from ALL_TRAINING_DATA
# and save to the JSON files below; when False they load those files instead.
TRAIN_TOKENIZERS = False
# TRAIN_MODEL: its use is not visible in this part of the notebook.
TRAIN_MODEL = False

# --- Tokenizer artefacts and vocabulary budgets -----------------------------
WORD_TOKENIZER_FILE_NAME, WORD_LEVEL_VOCAB_SIZE = './wtoken.json', 5000
BPE_TOKENIZER_FILE_NAME, BPE_VOCAB_SIZE = './bpetoken.json', 10000

# --- Special tokens ---------------------------------------------------------
UNK_TOKEN, PAD_TOKEN = "[UNK]", "[PAD]"
SOS_TOKEN, EOS_TOKEN = "[SOS]", "[EOS]"

# Passed as `special_tokens` to the tokenizer trainers; keep this order stable.
ALL_TOKENS = [UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN]

# --- Corpora ----------------------------------------------------------------
# Files the tokenizers are trained on.
ALL_TRAINING_DATA = [
    'data/cultural.txt',
    'data/economics.txt',
    'data/politics.txt',
    'data/sports.txt',
]

# Files the n-gram language model is trained on.
LM_TRAINING_DATA = ['./train.txt']  # alternative: ALL_TRAINING_DATA[:1]

## <span class="blue">Word Tokenizer</span>

In [4]:
# Reuse the saved word-level tokenizer unless a retrain was requested.
if not TRAIN_TOKENIZERS:
    word_tokenizer = Tokenizer.from_file(WORD_TOKENIZER_FILE_NAME)
else:
    # Whitespace-split word-level model; unseen words map to UNK_TOKEN.
    word_tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    word_tokenizer.pre_tokenizer = Whitespace()

    word_trainer = WordLevelTrainer(
        vocab_size=WORD_LEVEL_VOCAB_SIZE,
        special_tokens=ALL_TOKENS,
    )
    word_tokenizer.train(ALL_TRAINING_DATA, word_trainer)

    # Padding is enabled before saving so it is part of the saved file.
    word_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    word_tokenizer.save(WORD_TOKENIZER_FILE_NAME)

## <span class="blue">BPE Tokenizer</span>

In [5]:
# Reuse the saved BPE tokenizer unless a retrain was requested.
if not TRAIN_TOKENIZERS:
    bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_FILE_NAME)
else:
    # Whitespace pre-tokenization followed by a BPE model with UNK_TOKEN fallback.
    bpe_tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))
    bpe_tokenizer.pre_tokenizer = Whitespace()

    bpe_trainer = BpeTrainer(
        vocab_size=BPE_VOCAB_SIZE,
        special_tokens=ALL_TOKENS,
    )
    bpe_tokenizer.train(ALL_TRAINING_DATA, bpe_trainer)

    # Padding is enabled before saving so it is part of the saved file.
    bpe_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    bpe_tokenizer.save(BPE_TOKENIZER_FILE_NAME)

## <span class="blue">Post Processing</span>

In [6]:
def add_post_processor_to(tokenizer: Tokenizer):
    """Attach a post-processor that wraps each single sequence in SOS/EOS.

    The template is ``"[SOS] $0 [EOS]"`` (built from SOS_TOKEN / EOS_TOKEN);
    the ids of the two boundary tokens are looked up in ``tokenizer`` itself.
    The tokenizer is modified in place and nothing is returned.
    """
    boundary_tokens = [SOS_TOKEN, EOS_TOKEN]
    boundary_ids = [(token, tokenizer.token_to_id(token)) for token in boundary_tokens]
    template = f"{SOS_TOKEN} $0 {EOS_TOKEN}"
    tokenizer.post_processor = TemplateProcessing(
        single=template,
        special_tokens=boundary_ids,
    )


# Both tokenizers get the same SOS/EOS wrapping.
add_post_processor_to(word_tokenizer)
add_post_processor_to(bpe_tokenizer)

In [7]:
# Encode one sample sentence with each tokenizer and show the resulting tokens.
sample = 'سلاااااام حالت خوب است؟'
word_sample_tokens = word_tokenizer.encode(sample).tokens
bpe_sample_tokens = bpe_tokenizer.encode(sample).tokens
print(f'Word Tokenizer: {word_sample_tokens}')
print(f'BPE Tokenizer: {bpe_sample_tokens}')


Word Tokenizer: ['[SOS]', 'سلاااااام', 'حالت', 'خوب', 'است', '؟', '[EOS]']
BPE Tokenizer: ['[SOS]', 'س', 'لا', 'ا', 'ا', 'ا', 'ا', 'ام', 'حالت', 'خوب', 'است', '؟', '[EOS]']


In [1]:
import tqdm

class LanguageModel(object):
    """N-gram language model over tokenizer ids, with optional Laplace smoothing.

    Every token handled by this class is the *string form* of an id produced
    by ``tokenizer`` (e.g. ``"17"``). Sentences are encoded, left-padded with
    SOS ids so the first real token has a full context, and flattened into a
    single token stream from which n-gram probabilities are estimated.
    """

    def __init__(self, train_data, n, laplace, tokenizer):
        """Build the model.

        Args:
            train_data: iterable of raw sentences (strings).
            n: order of the model (1 = unigram, 2 = bigram, ...).
            laplace: additive smoothing constant used for ``n > 1``.
            tokenizer: object exposing ``token_to_id`` and ``encode(...).ids``.
        """
        self.set_tokenizer(tokenizer)
        self.n = n
        self.laplace = laplace
        self.tokens = self.preprocess(train_data, n)
        self.vocab  = nltk.FreqDist(self.tokens)
        self.model  = self._create_model()
        # All 0/1 masks of length n, ordered from all-ones (keep every token)
        # down to all-zeros (replace every token with UNK).
        self.masks  = list(reversed(list(product((0,1), repeat=n))))

    def set_tokenizer(self, tokenizer):
        """Store ``tokenizer`` and cache the string ids of SOS, EOS and UNK."""
        self.sos = str(tokenizer.token_to_id(SOS_TOKEN))
        self.eos = str(tokenizer.token_to_id(EOS_TOKEN))
        self.unk = str(tokenizer.token_to_id(UNK_TOKEN))
        self.tokenizer = tokenizer

    def _smooth(self):
        """Return ``{n_gram: probability}`` for every n-gram seen in training.

        The probability is
        ``(count(n_gram) + laplace) / (count(context) + laplace * |vocab|)``
        where ``context`` is the n-gram without its last token.
        """
        vocab_size = len(self.vocab)

        n_grams = nltk.ngrams(self.tokens, self.n)
        n_vocab = nltk.FreqDist(n_grams)

        m_grams = nltk.ngrams(self.tokens, self.n-1)
        m_vocab = nltk.FreqDist(m_grams)

        def smoothed_count(n_gram, n_count):
            m_gram = n_gram[:-1]
            m_count = m_vocab[m_gram]
            return (n_count + self.laplace) / (m_count + self.laplace * vocab_size)

        return { n_gram: smoothed_count(n_gram, count) for n_gram, count in n_vocab.items() }

    def _create_model(self):
        """Return the probability table: relative frequencies for unigrams,
        smoothed conditional probabilities (see ``_smooth``) otherwise."""
        if self.n == 1:
            num_tokens = len(self.tokens)
            return { (unigram,): count / num_tokens for unigram, count in self.vocab.items() }
        else:
            return self._smooth()

    def _convert_oov(self, ngram):
        """Map ``ngram`` to a key present in the model, masking tokens with UNK.

        Masks are tried from "keep everything" to "replace everything"; the
        first masked variant found in the model is returned. Returns ``None``
        when no variant is known.
        """
        ngram = (ngram,) if isinstance(ngram, str) else ngram
        for bitmask in self.masks:
            candidate = tuple(
                token if flag == 1 else self.unk
                for token, flag in zip(ngram, bitmask)
            )
            if candidate in self.model:
                return candidate
        return None

    def perplexity(self, test_data):
        """Return ``exp(-1/N * sum(log p))`` over the n-grams of ``test_data``.

        ``N`` is the number of test tokens after preprocessing (boundary ids
        included).

        Raises:
            KeyError: an n-gram has no known or UNK-masked form in the model.
            ZeroDivisionError: ``test_data`` yields no tokens.
        """
        test_tokens = self.preprocess(test_data, self.n)
        test_ngrams = nltk.ngrams(test_tokens, self.n)
        N = len(test_tokens)

        probabilities = []
        for ngram in test_ngrams:
            known = self._convert_oov(ngram)
            if known is None:
                # Previously surfaced as an opaque ``KeyError: None``.
                raise KeyError(
                    f"n-gram {ngram!r} has no known or UNK-masked form in the model"
                )
            probabilities.append(self.model[known])

        return math.exp((-1/N) * sum(map(math.log, probabilities)))

    def _best_candidate(self, prev, without=None):
        """Sample the next token given the context ``prev``.

        Args:
            prev: the previous ``n-1`` tokens (``()`` for a unigram model). A
                shorter context is replaced by an all-SOS context.
            without: tokens that must not be returned; UNK is always excluded.

        Returns:
            ``(token, probability)`` where ``probability`` is the token's share
            among all continuations of the context. ``(eos, 1)`` is returned
            when the context has no admissible continuation.
        """
        blacklist = {self.unk}
        if without:
            blacklist.update(without)

        # Only pad when the context really is too short. The former test
        # (``len(prev) < self.n``) was always true and discarded the context.
        context_len = self.n - 1
        if len(prev) < context_len:
            prev = [self.sos] * context_len
        context = tuple(prev)

        candidates = [
            (ngram[-1], prob)
            for ngram, prob in self.model.items()
            if ngram[:-1] == context
        ]
        if not candidates:
            return (self.eos, 1)

        words = [word for word, _ in candidates]
        probs = np.array([prob for _, prob in candidates], dtype=float)
        probs = probs / np.sum(probs)

        # Sample directly among admissible tokens instead of rejection
        # sampling, which never terminated when every candidate was excluded.
        allowed = [i for i, word in enumerate(words) if word not in blacklist]
        if not allowed:
            return (self.eos, 1)
        allowed_probs = probs[allowed]
        total = allowed_probs.sum()
        if total <= 0:
            return (self.eos, 1)

        idx = int(np.random.choice(allowed, p=allowed_probs / total))
        return (words[idx], probs[idx])

    def generate_sentence(self, min_len=12, max_len=24):
        """Sample a sentence and return ``(text, score)``.

        ``text`` is the space-joined string ids of the generated tokens without
        the SOS padding and the final EOS. ``score`` is ``-1 / log(prob)`` with
        ``prob`` the product of the sampled probabilities (``inf`` when
        ``prob == 1``). Tokens already in the sentence are never repeated, and
        EOS/SOS are excluded until ``min_len`` tokens exist.
        """
        sent, prob = ([self.sos] * (max(1, self.n-1)), 1)
        while sent[-1] != self.eos:
            prev = () if self.n == 1 else tuple(sent[-(self.n-1):])
            blacklist = sent + ([self.eos,self.sos] if len(sent) < min_len else [])
            next_token, next_prob = self._best_candidate(prev, without=blacklist)
            sent.append(next_token)
            prob *= next_prob

            # Force termination at max_len, but do not add a second EOS.
            if sent[-1] != self.eos and len(sent) >= max_len:
                sent.append(self.eos)

        # Guard the degenerate cases log(0) and 1/log(1).
        log_prob = math.log(prob) if prob > 0 else -math.inf
        score = -1 / log_prob if log_prob != 0 else math.inf

        # The sentence starts with max(1, n-1) SOS ids; strip all of them.
        start = max(1, self.n - 1)
        return (' '.join(sent[start:-1]), score)

    def add_sentence_tokens(self, sentences, n):
        """Encode each sentence and left-pad it with SOS ids.

        Returns one space-separated string of ids per sentence, prefixed by
        ``n-1`` SOS ids (one for ``n == 1``). The first id of every encoding is
        dropped before the padding is added.
        NOTE(review): this assumes the tokenizer's post-processor prepends
        exactly one SOS id — confirm for tokenizers configured elsewhere.
        """
        # Repeat the SOS *token*, not the characters of its string id
        # (a multi-digit id such as "12" used to become "1 2 1 2").
        sos = ' '.join([self.sos] * (n - 1)) if n > 1 else self.sos
        return_value = []
        for sentence in sentences:
            ids = self.tokenizer.encode(sentence).ids
            s = ' '.join(str(x) for x in ids[1:])
            return_value.append('{} {}'.format(sos, s))
        return return_value

    def preprocess(self, sentences, n):
        """Return the flat list of string ids for all ``sentences``."""
        sentences = self.add_sentence_tokens(sentences, n)
        tokens = ' '.join(sentences).split()
        return tokens

In [2]:
def load_train_data(train_data):
    """Read the UTF-8 text files listed in ``train_data``.

    Returns a single flat list holding every line of every file, in file
    order, with leading and trailing whitespace stripped from each line.
    """
    lines = []
    for path in train_data:
        with open(path, 'r', encoding="utf-8") as handle:
            lines.extend(raw_line.strip() for raw_line in handle)
    return lines

In [3]:
import pickle
def train_ngram_and_save(n, laplace, name, tokenizer):
    """Train a LanguageModel on LM_TRAINING_DATA and pickle it to disk.

    Args:
        n: n-gram order passed to LanguageModel.
        laplace: smoothing constant passed to LanguageModel.
        name: label used in the output file name.
        tokenizer: tokenizer passed to LanguageModel.

    The model is written to ``ngram_{name}_{n}_{laplace}.pkl`` in the current
    working directory; nothing is returned.
    """
    sentences = load_train_data(LM_TRAINING_DATA)
    language_model = LanguageModel(sentences, n, laplace, tokenizer)

    target_path = f'ngram_{name}_{n}_{laplace}.pkl'
    with open(target_path, 'wb') as sink:
        pickle.dump(language_model, sink, protocol=pickle.HIGHEST_PROTOCOL)

In [11]:
# Train and save trigram models: unsmoothed (laplace=0) first, then laplace=1,
# each for the word-level and the BPE tokenizer (same order as before).
for smoothing in (0, 1):
    for label, tok in (('word', word_tokenizer), ('bpe', bpe_tokenizer)):
        train_ngram_and_save(3, smoothing, label, tok)