# LSTM for NLP
- predict next word in Anna Karenina
- first with pytorch lstm
- then with numpy lstm

## read in text

In [13]:
import numpy as np
import os
import sys 
import regex as re
from typing import Tuple, Dict, List
from collections import Counter

cwd = os.getcwd()

In [None]:
# read in raw text
sys.path.append('../../data/')
with open('data/anna.txt' , 'r') as f:
    anna = f.read()
print(type(anna))

In [5]:
# clean up line breaks and add space around punctuation (for tokenization)
clean_text=anna.lower().replace("\n", " ") 
clean_text=clean_text.replace("-", " ") 
for x in ",.:;?!$()/_&%*@'`":
    clean_text=clean_text.replace(f"{x}", f" {x} ")
    clean_text=clean_text.replace('"', ' " ') 
text=clean_text.split()

## implement simplified word tokenizer

In [None]:
# build vocab
word_counts = Counter(text)
vocab = sorted(word_counts, key=word_counts.get,
reverse = True)
print(vocab[:10])

In [9]:
# definde tokenizer-encoder and tokenizer-decoder
encoder={v:k for k,v in enumerate(vocab)} 
decoder={k:v for k,v in enumerate(vocab)}

word_tokenizer = {'encoder': encoder, 'decoder': decoder}
# save the simple tokenizer
import joblib
joblib.dump(word_tokenizer, 'word_tokenizer.pkl')

In [None]:
# apply tokenizer
tokens = [encoder[x] for x in text]
print(text[:20])
print(tokens[:20])

In [None]:
# split into train, test sets
len_of_text = len(text)
print(len(tokens))
split_point = int(np.round(len_of_text*0.7))
train_x = tokens[:split_point]
print(len(train_x))
test_x = tokens[split_point:]
print(len(test_x))

## PyTorch LSTM

In [None]:
import torch
print(torch.__version__)

In [None]:
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
print(f"Using device: {device}")

In [14]:
# creat PyTorch dataset 
from torch.utils.data import Dataset

# 1. Subclass torch.utils.data.Dataset
class my_dataset(Dataset):
    
    # 2. Initialize with customized varargs 
    def __init__(self, tokenized_text: str, max_length: int, stride: int=1) -> None:
        
        # 3. Create class attributes
        # initialize input_tokens_x and target_tokens_y
        self.input_tokens_x = [] # alternative nested tensor 
        self.target_tokens_y = []
        # set y as stride number of tokens trailing x 
        for i in range(0, (len(tokenized_text)-max_length), stride):
            x_tmp = tokenized_text[i : (i+max_length)]
            y_tmp = tokenized_text[(i+1) : (i+max_length+1)]
            self.input_tokens_x.append(torch.tensor(x_tmp))
            self.target_tokens_y.append(torch.tensor(y_tmp))
    
    # 4. Overwrite the __len__() method to return number of rows in the dataset
    def __len__(self) -> int:
        "Returns the number of rows / pairs of x-y sequences in the dataset"
        return len(self.input_tokens_x)
    
    # 5. Overwrite the __getitem__() method (required for subclasses of torch.utils.data.Dataset)
    def __getitem__(self, idx: int) -> Tuple[torch.Tensor, int]:
        "Returns one sample of data, data and label (X, y)."
        return self.input_tokens_x[idx], self.target_tokens_y[idx]

In [17]:
# create and save torch dataset for later use
context_length = 100
dataset_train = my_dataset(tokenized_text=train_x, max_length=context_length, stride=1)
torch.save(dataset_train, 'data/train.pt')
dataset_test = my_dataset(tokenized_text=test_x, max_length=context_length, stride=1)
torch.save(dataset_test, 'data/test.pt')

In [None]:
# load data with torch dataloader
from torch.utils.data import DataLoader
torch.manual_seed(42)

batch_size = 32
# num_workers = 3 
# num_workers=num_workers # number of cpu processes to use for preprocessing
# num_works makes next(iter()) stuck in a loop

loader_train = DataLoader(
    dataset_train,
    batch_size=batch_size,
    shuffle=True,
    drop_last=True, # if True, # drops the last batch if it is shorter than the specified batch_size to prevent loss spikes during training  
)

print(type(loader_train))

In [None]:
# loading the next pair in dataset
x,y=next(iter(loader_train))
print(x.shape,y.shape)

In [None]:
vocab_size = len(encoder) # len(vocab) # 
print(vocab_size)

In [30]:
from torch import nn

class word_lstm(nn.Module):
    def __init__(self, 
                 vocab_size, 
                 emb_dim,
                 lstm_layers,
                 drop_out_rate):
        super().__init__()
        self.vocab_size = vocab_size
        self.emb_dim = emb_dim
        self.lstm_layers = lstm_layers
        self.drop_out_rate = drop_out_rate

        self.embedding = nn.Embedding(self.vocab_size, self.emb_dim)
        self.lstm = nn.LSTM(input_size=self.emb_dim,
                            hidden_size=self.emb_dim,
                            num_layers=self.lstm_layers,
                            dropout=self.drop_out_rate,
                            batch_first=True)
        self.fc = nn.Linear(self.emb_dim, self.vocab_size, bias=True)

    def forward(self, x, hc):
        embed = self.embedding(x)
        x, hc = self.lstm(embed, hc)
        x = self.fc(x)
        return x, hc
    
    def init_hidden(self, context_length):
        weight = next(self.parameters()).data
        return (weight.new(self.lstm_layers, context_length, self.emb_dim).zero_(),
                weight.new(self.lstm_layers, context_length, self.emb_dim).zero_())



In [None]:
model = word_lstm(vocab_size=vocab_size,
                  emb_dim=128,
                  lstm_layers=3,
                  drop_out_rate=0.2).to(device)
print(model)

In [32]:
lr=0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
loss_func = nn.CrossEntropyLoss()

In [None]:
model.train()

for epoch in range(50):
    tloss=0
    sh,sc = model.init_hidden(batch_size)
    for i, (x,y) in enumerate(loader_train):    
        if x.shape[0]==batch_size:
            inputs, targets = x.to(device), y.to(device)
            optimizer.zero_grad()
            output, (sh,sc) = model(inputs, (sh,sc))
            loss = loss_func(output.transpose(1,2),targets)
            sh,sc=sh.detach(),sc.detach()
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 5)
            optimizer.step()
            tloss+=loss.item()
        if (i+1)%100==0:
            print(f"at epoch {epoch} iteration {i+1}\
            average loss = {tloss/(i+1)}")

# at epoch 0 iteration 100            average loss = 6.056614637374878
# at epoch 0 iteration 500            average loss = 6.038397843360901
# at epoch 0 iteration 1200            average loss = 6.0286973142623905

In [None]:
torch.save(model.state_dict(),"model/word_lstm).pth")

In [None]:
# Inference - greedy

In [None]:
# Inference - top-k

## numpy LSTM