In [1]:
import hazm
import copy
import time
import torch
import typing
import numpy as np
from torch import nn
from tqdm import tqdm
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tokenizers import Tokenizer
from tokenizers.models import WordLevel, BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.processors import TemplateProcessing
from tokenizers.trainers import WordLevelTrainer, BpeTrainer

# **General settings**

In [2]:
TRAIN_TOKENIZERS = False

WORD_TOKENIZER_FILE_NAME = './wtoken.json'
BPE_TOKENIZER_FILE_NAME = './bpetoken.json'

BPE_VOCAB_SIZE = 10000
WORD_LEVEL_VOCAB_SIZE = 5000

UNK_TOKEN = "[UNK]"
PAD_TOKEN = "[PAD]"
SOS_TOKEN = "[SOS]"
EOS_TOKEN = "[EOS]"
ALL_TOKENS = [UNK_TOKEN, SOS_TOKEN, EOS_TOKEN, PAD_TOKEN]

ALL_TRAINING_DATA = [
    './cultural.txt',
    './economics.txt',
    './politics.txt',
    './sports.txt'
]

LM_TRAINING_DATA = ['./t.txt'] #ALL_TRAINING_DATA[:1]

# **Tokenization and Post-processing**

In [3]:
if TRAIN_TOKENIZERS:
    word_tokenizer = Tokenizer(WordLevel(unk_token=UNK_TOKEN))
    word_tokenizer.pre_tokenizer = Whitespace()
    word_trainer = WordLevelTrainer(vocab_size=WORD_LEVEL_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    word_tokenizer.train(ALL_TRAINING_DATA, word_trainer)
    word_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    word_tokenizer.save(WORD_TOKENIZER_FILE_NAME)
else:
    word_tokenizer = Tokenizer.from_file(WORD_TOKENIZER_FILE_NAME)

In [4]:
if TRAIN_TOKENIZERS:
    bpe_tokenizer = Tokenizer(BPE(unk_token=UNK_TOKEN))
    bpe_tokenizer.pre_tokenizer = Whitespace()
    bpe_trainer = BpeTrainer(vocab_size=BPE_VOCAB_SIZE, special_tokens=ALL_TOKENS)
    bpe_tokenizer.train(ALL_TRAINING_DATA, bpe_trainer)
    bpe_tokenizer.enable_padding(pad_token=PAD_TOKEN)
    bpe_tokenizer.save(BPE_TOKENIZER_FILE_NAME)
else:
    bpe_tokenizer = Tokenizer.from_file(BPE_TOKENIZER_FILE_NAME)

In [5]:
def add_post_processor_to(tokenizer: Tokenizer):
    tokenizer.post_processor = TemplateProcessing(
        single=f"{SOS_TOKEN} $0 {EOS_TOKEN}",
        special_tokens=[
            (X, tokenizer.token_to_id(X)) for X in [SOS_TOKEN, EOS_TOKEN]
        ]
    )
add_post_processor_to(word_tokenizer)
add_post_processor_to(bpe_tokenizer)

In [6]:
sample = 'سلاااااام حالت خوب است؟'
print(f'Word Tokenizer: {word_tokenizer.encode(sample).tokens}')
print(f'BPE Tokenizer: {bpe_tokenizer.encode(sample).tokens}')

Word Tokenizer: ['[SOS]', 'سلاااااام', 'حالت', 'خوب', 'است', '؟', '[EOS]']
BPE Tokenizer: ['[SOS]', 'س', 'لا', 'ا', 'ا', 'ا', 'ا', 'ام', 'حالت', 'خوب', 'است', '؟', '[EOS]']


# **Dataset Definition**

In [7]:
class TextDataset():
    def __init__(self, corpus_file, tokenizer, length=5, lines_num=None, batch_size=32):
        self.length = length
        self.tokenizer = tokenizer
        self.batch_size = batch_size

        print('Preparing data...')

        # Read the data and get the subset
        lines = None
        with open(corpus_file, encoding='utf-8') as f:
            lines = f.readlines()
        if lines_num is not None:
            lines = lines[:lines_num]
        
        # Convert to x, y data
        self.x, self.y = [], []
        for line in tqdm(lines, bar_format="{l_bar}{bar:50}{r_bar}{bar:-10b}"):
            tokens = tokenizer.encode(line).ids
            if len(tokens) < length + 1:
                continue
            for i in range(0, len(tokens) - length):
                self.x.append(tokens[i:i+length])
                self.y.append(tokens[i+1:i+length+1])

    def __len__(self):
        return len(self.x)

    def get_batch(self):
        # for i in range(int(len(self.x)/self.batch_size) + 1):
        #     # x_batch = np.array(self.x[i::int(len(self.x)/self.batch_size) + 1], dtype='int64')
        #     # y_batch = np.array(self.y[i::int(len(self.x)/self.batch_size) + 1], dtype='int64')
        #     x_batch = np.array(self.x[i:i + self.batch_size], dtype='int64')
        #     y_batch = np.array(self.y[i:i + self.batch_size], dtype='int64')
        #     yield torch.tensor(x_batch), torch.tensor(y_batch)

        start = 0
        while (start + self.batch_size) <= len(self.x):
            x_batch = np.array(self.x[start:start+self.batch_size], dtype='int64')
            y_batch = np.array(self.y[start:start+self.batch_size], dtype='int64')
            yield torch.tensor(x_batch), torch.tensor(y_batch)
            start = start + self.batch_size


# **Model Architecture**

In [8]:
class LanguageModelLSTM(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super().__init__()
        self.hidden_size = hidden_size
        self.emb_layer = nn.Embedding(vocab_size, 100)
        self.fc1 = nn.Linear(100, 50)
        self.lstm = nn.LSTM(50, hidden_size)
        self.fc2 = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        # line_start_pos = (x[:, 0] == 1).nonzero()[:, 0]
        # h0, c0 = hidden[0][:, :x.shape[0], :], hidden[1][:, :x.shape[0], :]
        # h0[:, line_start_pos, :], c0[:, line_start_pos, :] = 0, 0
        # hidden = h0, c0

        embedded = self.emb_layer(x)
        embedded = self.fc1(embedded.reshape(-1, 100)).reshape(x.shape[0], x.shape[1], -1)
        out, hidden = self.lstm(embedded.permute((1, 0, 2)), hidden)
        out = out.reshape((-1, hidden_size))
        out = self.fc2(out)
        return out, hidden

# **Utility Functions**

In [9]:
def create_h0_state(batch_size, hidden_size):
    h0, c0 = np.zeros((1, batch_size, hidden_size), dtype='float32'), np.zeros((1, batch_size, hidden_size), dtype='float32')
    hidden = torch.tensor(h0).to(device), torch.tensor(c0).to(device)
    return hidden


# **Training Function**

In [10]:
def train(model, loss_fn, optimizer, epochs, train_dataset, device, batch_size=32, save_every=5, path=None, hidden_size=100):
    
    counter = 0
    train_loss = 0
    
    model.train()
    for epoch in range(epochs):
        # train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=False)
        h0, c0 = np.zeros((1, batch_size, hidden_size), dtype='float32'), np.zeros((1, batch_size, hidden_size), dtype='float32')
        hidden = torch.tensor(h0).to(device), torch.tensor(c0).to(device)

        iter = 0
        tick = time.time()
        for x, y in train_dataset.get_batch():
            iter += 1
            x = x.to(device)
            y = y.to(device)

            model.zero_grad()
            hidden = tuple([each.data for each in hidden])
            y_pred, hidden = model(x, hidden)
            loss = loss_fn(y_pred, y.view(-1))
            loss.backward()
            optimizer.step()
            train_loss += loss
            
            counter += 1
            print_every = 500
            if counter % print_every == 0:
                model.eval()
                average_loss = train_loss.item() / print_every
                print('Epoch {} - Iteration {}: Loss = {:.2f} PP = {:.2f}'.format(epoch + 1, counter,
                                                                                  average_loss, np.exp(average_loss)))
                train_loss = 0
                model.train()
        
        tock = time.time()
        print('Epoch {} finished with {} iters (Duration: {:.2f}s).'.format(epoch+1, iter, tock-tick))
        if (epoch + 1) % save_every == 0:
            torch.save(model.state_dict(), path)


# **Here We Go** 

In [13]:
save_every = 5
batch_size = 512
hidden_size = 50
train_mode = True
corpus_path = './t.txt'
tokenizer = word_tokenizer
device = torch.device('cuda:0' if torch.cuda.is_available else 'cpu')
# device = torch.device('cpu')
path = './lstm_{}.pth'.format('word' if tokenizer == word_tokenizer else 'bpe')
# path = '/content/drive/MyDrive/Language Modeling (NLP HW3)/lstm_{}.pth'.format('word' if tokenizer == word_tokenizer else 'bpe')

net = LanguageModelLSTM(tokenizer.get_vocab_size(), hidden_size).to(device)
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
loss_fn = nn.CrossEntropyLoss()
train_dataset = TextDataset(corpus_path, tokenizer, length=10, lines_num=40000, batch_size=batch_size)
print('Number of data pairs:', len(train_dataset))

if train_mode:
    train(net, loss_fn, optimizer, 1000, train_dataset, device,
          batch_size=batch_size, save_every=save_every, path=path, hidden_size=hidden_size)
else:
    # net.load_state_dict(torch.load(path))
    net.load_state_dict(torch.load(path, map_location=torch.device('cpu')))


Preparing data...


100%|██████████████████████████████████████████████████| 40000/40000 [00:08<00:00, 4643.27it/s]


Number of data pairs: 3878205
Epoch 1 - Iteration 500: Loss = 5.73 PP = 306.81
Epoch 1 - Iteration 1000: Loss = 5.54 PP = 255.52
Epoch 1 - Iteration 1500: Loss = 5.59 PP = 266.89
Epoch 1 - Iteration 2000: Loss = 5.60 PP = 270.14
Epoch 1 - Iteration 2500: Loss = 6.04 PP = 418.11
Epoch 1 - Iteration 3000: Loss = 6.01 PP = 407.90
Epoch 1 - Iteration 3500: Loss = 6.05 PP = 424.44
Epoch 1 - Iteration 4000: Loss = 6.09 PP = 442.75
Epoch 1 - Iteration 4500: Loss = 5.96 PP = 386.73
Epoch 1 - Iteration 5000: Loss = 5.94 PP = 381.72
Epoch 1 - Iteration 5500: Loss = 5.96 PP = 386.76
Epoch 1 - Iteration 6000: Loss = 5.94 PP = 379.85
Epoch 1 - Iteration 6500: Loss = 5.89 PP = 360.57
Epoch 1 - Iteration 7000: Loss = 5.91 PP = 367.87
Epoch 1 - Iteration 7500: Loss = 5.93 PP = 374.50
Epoch 1 finished with 7574 iters (Duration: 83.33s).
Epoch 2 - Iteration 8000: Loss = 5.66 PP = 287.68
Epoch 2 - Iteration 8500: Loss = 5.52 PP = 249.04
Epoch 2 - Iteration 9000: Loss = 5.57 PP = 261.70
Epoch 2 - Iteratio

KeyboardInterrupt: 

# **Inference**

In [25]:
def generate_text(model, tokenizer, length):
    softmax = torch.nn.Softmax()
    sentence = []
    hidden = create_h0_state(1)
    start_token = tokenizer.token_to_id('[SOS]')
    print(start_token)
    x = np.array([[start_token]], dtype='int64')
    model.eval()
    for i in range(length):
        x = torch.tensor(x).to(device)
        out, hidden = model(x, hidden)
        hidden = hidden[0].data, hidden[1].data
        probs = softmax(out).view(-1)
        next_pred = np.random.choice(np.arange(probs.shape[0]), 1, p=probs.detach().cpu().numpy()).item()
        sentence.append(next_pred)
        x = np.array([[next_pred]], dtype='int64')
    sentence = [tokenizer.id_to_token(i) for i in sentence]
    sentence = ' '.join(sentence)
    return sentence

In [28]:
def compute_line_loss(model, tokenizer, line):
    tokens = np.array(tokenizer.encode(line).ids, dtype='int64').reshape(1, -1)
    x = tokens[:, :-1]
    y = tokens[:, 1:].reshape(-1)
    length = x.shape[1]
    
    loss = 0
    model.eval()
    hidden = create_h0_state(1, hidden_size)
    y_pred, hidden = model(torch.tensor(x).to(device), hidden)
    loss = F.cross_entropy(y_pred, torch.tensor(y).to(device))
    
    # L = 10
    # for i in range(np.math.ceil(length/L)):
    #     y_pred, hidden = model(torch.tensor(x[:, i:i+L]).to(device), hidden)
    #     this_loss = F.cross_entropy(y_pred, torch.tensor(y[i:i+L]).to(device))
    #     loss += this_loss * y_pred.shape[0]
    # loss /= length
    return loss, length

def compute_corpus_perplexity(corpus_file, num_lines, model, tokenizer):
    lines = None
    with open(corpus_file, encoding='utf-8') as f:
        lines = f.readlines()
    lines = lines[:num_lines]
    total_loss = 0
    total_length = 0
    for line in lines:
        loss, length = compute_line_loss(model, tokenizer, line)
        total_loss += loss * length
        total_length += length
    total_loss /= total_length
    perplexity = torch.exp(total_loss)
    return perplexity


In [29]:
pp = compute_corpus_perplexity('t.txt', 1000, net, word_tokenizer)
print('Word level perplexity: {}'.format(pp))

# pp = compute_corpus_perplexity('t.txt', 10, net, bpe_tokenizer)
# print('bpe level perplexity: {}'.format(pp))

Word level perplexity: 185.31858825683594
