# 🥙 LSTM on Recipe Data

In [1]:
working_dir = "/home/mary/work/repos/generative_deep_Learning_2nd_edition_pytorch"
exp_dir = working_dir + "/notebooks/05_autoregressive/01_lstm/"

In [2]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Add the path to the notebooks folder
notebooks_path = os.path.abspath(working_dir)
if notebooks_path not in sys.path:
    sys.path.append(notebooks_path)

utils_path = os.path.abspath(exp_dir)
if utils_path not in sys.path:
    sys.path.append(utils_path)

In [330]:
import json
import re
import string

import torch
from tokenizers import Tokenizer, models, pre_tokenizers, trainers
from torch.utils.data import Dataset, DataLoader
from torch import nn
from torch.utils.tensorboard import SummaryWriter
from torchinfo import summary
from torch import optim

import numpy as np

## 0. Parameters <a name="parameters"></a>

In [375]:
VOCAB_SIZE = 10000
MAX_LEN = 200
EMBEDDING_DIM = 100
N_UNITS = 128
VALIDATION_SPLIT = 0.2
SEED = 42
LOAD_MODEL = False
BATCH_SIZE = 32
EPOCHS = 50
LEARNING_RATE = 0.001

## 1. Prepare the data <a name="prepare"></a>

In [344]:
data_dir = working_dir + "/data"
dataset_dir = data_dir + "/epirecipes"
data_file = dataset_dir + "/full_format_recipes.json"

In [345]:
with open(data_file) as data_json:
    data_raw = json.load(data_json)

print(data_raw[0].keys())
print(data_raw[0])

dict_keys(['directions', 'fat', 'date', 'categories', 'calories', 'desc', 'protein', 'rating', 'title', 'ingredients', 'sodium'])
{'directions': ['1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool.', '2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper.', '3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.

In [346]:
filtered_data = [ f"Recipe for {x['title']} | " + " ".join(x['directions'])
                 for x in data_raw
                 if "title" in x and
                 x["title"] is not None and
                 "directions" in x and
                 x["directions"] is not None ]

In [347]:
print(len(filtered_data))
print(filtered_data[0])

20111
Recipe for Lentil, Apple, and Turkey Wrap  | 1. Place the stock, lentils, celery, carrot, thyme, and salt in a medium saucepan and bring to a boil. Reduce heat to low and simmer until the lentils are tender, about 30 minutes, depending on the lentils. (If they begin to dry out, add water as needed.) Remove and discard the thyme. Drain and transfer the mixture to a bowl; let cool. 2. Fold in the tomato, apple, lemon juice, and olive oil. Season with the pepper. 3. To assemble a wrap, place 1 lavash sheet on a clean work surface. Spread some of the lentil mixture on the end nearest you, leaving a 1-inch border. Top with several slices of turkey, then some of the lettuce. Roll up the lavash, slice crosswise, and serve. If using tortillas, spread the lentils in the center, top with the turkey and lettuce, and fold up the bottom, left side, and right side before rolling away from you.


## 2. Tokenise the data

In [348]:
def pad_punctuation(str):
    # add space before and after every punctuation
    str = re.sub(f"([{string.punctuation}])", r" \1 ", str)
    # replace multiple spaces with one space
    str = re.sub(" +", " ", str)

    return str

In [349]:
test_text = "Hello   there!"
test_text = pad_punctuation(test_text)
print(test_text) 


Hello there ! 


In [350]:
train_data_list = list(map(pad_punctuation, filtered_data))

In [351]:
# we will set the value for the token paralization to avoid getting warning
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# we will use the hugging face Tokenizers package to Tokenize the dataset and create the vocab
# We will use a simple word tokenizer
# the tokenizer itself will handel assigning a numerical value to each word
tokenizer = Tokenizer(models.WordLevel(unk_token="<unk>"))
# the pre tokenizer will pre process the test and split it into words (based on whitespace)
tokenizer.pre_tokenizer = pre_tokenizers.Whitespace()

pre_tokenized_text = tokenizer.pre_tokenizer.pre_tokenize_str(test_text)
print(pre_tokenized_text)

[('Hello', (0, 5)), ('there', (6, 11)), ('!', (12, 13))]


In [352]:
# to fir the vocabilary using the tokenizer we use trainer
trainer = trainers.WordLevelTrainer(special_tokens=["<pad>", "<unk>"], vocab_size=VOCAB_SIZE)
tokenizer.train_from_iterator(train_data_list, trainer)

vocab = tokenizer.get_vocab()
pad_idx = vocab["<pad>"]

# enable trancation and padding for the dataste so that all entries would have the same length
tokenizer.enable_padding(length=MAX_LEN + 1, pad_id=pad_idx, pad_token="<pad>")
tokenizer.enable_truncation(max_length=MAX_LEN + 1)

In [353]:
# Check the resulting vocabulary
print("Vocabulary size:", tokenizer.get_vocab_size())
print("Vocabulary:", vocab)
print("padiing index = ", pad_idx)
vocab_idnx_to_word = {vocab[key]: key for key in vocab.keys()}
print(vocab_idnx_to_word)
test_vector = tokenizer.encode(test_text)
print(test_vector.ids)
print(test_vector.tokens)

Vocabulary size: 10000
Vocabulary: {'whiskey': 2823, 'yau': 8709, 'hook': 2224, 'torn': 3413, 'crushing': 2551, 'dariole': 9939, 'Dividing': 5991, 'Chick': 4031, 'hang': 2808, '61': 5176, 'dominant': 9980, 'glacés': 6686, 'skillets': 1821, 'fettucine': 8292, 'warming': 5635, 'Greens': 1494, 'Strain': 369, 'Pompano': 9413, 'Pavé': 9371, ',': 3, 'béchamel': 2864, 'udon': 5166, 'raspberry': 1689, 'Nuts': 1897, 'Minute': 4676, 'soufflés': 2288, 'napoleons': 4758, 'tucking': 2763, 'Cooks': 3630, 'sheep': 5891, 'Legs': 3481, 'deviled': 8244, 'bunching': 9797, 'Cider': 1942, 'Guajillo': 4520, 'bands': 3985, 'container': 519, 'Canyon': 5971, '2005': 6883, 'Side': 8042, 'Popover': 9415, 'cajeta': 9807, 'bisque': 3839, 'Stem': 4321, 'ping': 8471, 'pinkish': 5867, 'grinder': 1299, 'smear': 2142, 'tomatillos': 1561, 'mallet': 2283, 'same': 309, 'dumpling': 2777, 'Give': 3584, 'gathering': 4571, 'Refresher': 6064, 'Garnet': 7005, 'tube': 1807, 'Herbes': 5211, 'beards': 5262, 'centre': 5508, 'Thanks

In [354]:
# tokenize the data
vectorized_data = [tokenizer.encode(sentence).ids for sentence in train_data_list]
print(len(vectorized_data))
print(len(vectorized_data[0]))

20111
201


In [355]:
class TextSeqDataset(Dataset):
    def __init__(self, vectorized_data_list):
        super().__init__()
        self.vectorized_data_list = vectorized_data_list
    
    def __len__(self):
        return(len(self.vectorized_data_list))
    
    def get_data_pair(self, idx):
        text = self.vectorized_data_list[idx]
        x = torch.tensor(text[:-1])
        y = torch.tensor(text[1:])

        return x, y
    
    def __getitem__(self, idx):
        return self.get_data_pair(idx)

## 3. Create the Training Set

In [356]:
train_dataset = TextSeqDataset(vectorized_data_list=vectorized_data)
x, y = train_dataset.get_data_pair(0)
print(x.shape)
print(y.shape)
print(x[0:5])
print(y[0:5])

torch.Size([200])
torch.Size([200])
tensor([  26,   16, 2407,    3,  873])
tensor([  16, 2407,    3,  873,    3])


In [357]:
train_data_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=2)

In [374]:
## 4. Build the LSTM <a name="build"></a>

In [396]:
class Lstm(nn.Module):
    def __init__(self, vocab_size, embedded_dim=100, lstm_units=128, 
                 pad_idx=0, is_pidirectional=False, log_dir="./log"):
        super().__init__()
        self.embedded_dim = embedded_dim
        self.lstm_units = lstm_units
        self.vocab_size = vocab_size
        self.is_pidirectional = is_pidirectional
        if self.is_pidirectional:
            self.lstm_unit_multipler = 2
        else:
             self.lstm_unit_multipler = 1
        self.pad_idx = pad_idx
        self.writer = SummaryWriter(log_dir=log_dir)

        self.embedded = nn.Embedding(num_embeddings=self.vocab_size, embedding_dim=self.embedded_dim,
                                     padding_idx=self.pad_idx)
        self.lstm = nn.LSTM(input_size=self.embedded_dim, hidden_size=self.lstm_units, 
                            batch_first=True, bidirectional=self.is_pidirectional)
         
        self.fc = nn.Linear(in_features=self.lstm_units * self.lstm_unit_multipler, out_features=self.vocab_size)
    
    def forward(self, x):

        x = self.embedded(x)
        # the lstm output shape is Batch_size, seq_length, lstm_units it contains the hidden state of all timestamps
        # hn is the final hidden state of shape lstm_layer_num, batch_size, lstm_uints
        # cn is the final cell state of shape lstm_layer_num, batch_size, lstm_uints
        output, (hn, cn) = self.lstm(x)
        x = self.fc(output)
        # we will use cross entropy loss that will internally apply softmax
        # x = torch.softmax(x, dim=-1)

        return x
    
    def fit(self, train_dataloader, loss_fn, optimizer, epochs, device, callbacks=None):

        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.device = device

        for epoch in range(epochs):

            acc_loss = 0

            for train_data, train_gt in train_dataloader:

                train_data = train_data.to(device)
                train_gt = train_gt.to(device)

                # training step
                self.train()
                optimizer.zero_grad()

                pred = self.forward(train_data)

                pred = pred.view(-1, self.vocab_size)
                train_gt = train_gt.view(-1)
                
                loss = loss_fn(pred, train_gt)

                loss.backward()
                optimizer.step()
            
                acc_loss += loss.item()
            
            acc_loss /= len(train_dataset)

            print(f"epoch {epoch + 1} / {epochs}: loss = {acc_loss}")

            self.writer.add_scalar("training_loss", acc_loss, global_step=epoch)

            # run call back functions
            if callbacks is not None:
                logs = {"model":self,
                        "device":self.device,
                        "model_state_dict": self.state_dict(),
                        "loss": acc_loss
                }

                for callback in callbacks:
                    callback.on_epoch_end(epoch, logs=logs)

                
    

In [397]:
log_dir =  exp_dir + "/log"
os.makedirs(log_dir, exist_ok=True)

In [398]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

lstm_model = Lstm(vocab_size=tokenizer.get_vocab_size(),
                  embedded_dim=EMBEDDING_DIM,
                  lstm_units=N_UNITS, pad_idx=pad_idx,
                  is_pidirectional=False, log_dir=log_dir).to(device)

print(lstm_model.state_dict)

<bound method Module.state_dict of Lstm(
  (embedded): Embedding(10000, 100, padding_idx=0)
  (lstm): LSTM(100, 128, batch_first=True)
  (fc): Linear(in_features=128, out_features=10000, bias=True)
)>


In [399]:
loader_itr = iter(train_data_loader)
sample_input, sample_output = next(loader_itr)

In [400]:
summary(lstm_model, input_size=(1, 4), dtypes=[sample_input.dtype])

Layer (type:depth-idx)                   Output Shape              Param #
Lstm                                     [1, 4, 10000]             --
├─Embedding: 1-1                         [1, 4, 100]               1,000,000
├─LSTM: 1-2                              [1, 4, 128]               117,760
├─Linear: 1-3                            [1, 4, 10000]             1,290,000
Total params: 2,407,760
Trainable params: 2,407,760
Non-trainable params: 0
Total mult-adds (M): 2.76
Input size (MB): 0.00
Forward/backward pass size (MB): 0.33
Params size (MB): 9.63
Estimated Total Size (MB): 9.96

In [401]:
class Callback:
    def on_epoch_end(self, epoch, logs=None):
        pass

In [402]:
class SaveCheckpoint(Callback):
    def __init__(self, save_dir, save_every=10):
        super().__init__()
        self.save_dir = save_dir
        self.save_every = save_every
    def on_epoch_end(self, epoch, logs=None):
        
        if (epoch % self.save_every) == 0:
            checkpoint = {"epoch":epoch,
                        "model_state_dict":logs["model_state_dict"],
                        "loss":logs["loss"]
                        }
            checkpoint_file = self.save_dir + f"/checkpoint_{epoch}.pth"

            torch.save(checkpoint, checkpoint_file)

In [403]:
class TextGenerator(Callback):
    def __init__(self, index_to_word, top_k=10):
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  

    def sample_from(self, probs, temperature):  
        probs = probs ** (1 / temperature)
        probs = probs / torch.sum(probs)
        sample_token = torch.multinomial(probs, 1).item()
        return sample_token, probs

    def generate(self, model, start_prompt, max_tokens, temperature, device):
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ] 
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0: 

            with torch.no_grad():
                x = torch.tensor([start_tokens]).to(device)
                y = model(x).detach().to("cpu") 
                # since we output logits with no softmax we will 
                # apply softmax here to get the probabilities
                y_prob = torch.softmax(y, dim=-1)
                sample_token, probs = self.sample_from(y_prob[0][-1], temperature)  
                info.append({"prompt": start_prompt, "word_probs": probs})
                start_tokens.append(sample_token)  
                start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):

        if logs:
            model = logs["model"]
            device = logs["device"]
            self.generate(model, "recipe for", max_tokens=100, temperature=1.0, device=device)

## 5. Train the LSTM <a name="train"></a>

In [404]:
sample_dir =  exp_dir + "/sample_gen"
os.makedirs(sample_dir, exist_ok=True)

checkpoint_dir =  exp_dir + "/checkpoints"
os.makedirs(checkpoint_dir, exist_ok=True)

In [405]:
callbacks = [SaveCheckpoint(save_dir=checkpoint_dir, save_every=2),
             TextGenerator(index_to_word=vocab_idnx_to_word)]

In [406]:
# check if we have checkpoint to load
if LOAD_MODEL:
    checkpoint_file = checkpoint_dir + "/checkpoint_10.pth"
    checkpoint = torch.load(checkpoint_file)
    lstm_model.load_state_dict(checkpoint["model_state_dict"])

In [407]:
optimizer = optim.Adam(params=lstm_model.parameters(), lr=LEARNING_RATE)
loss_fn = nn.CrossEntropyLoss()

In [408]:
lstm_model.fit(train_data_loader, loss_fn=loss_fn, optimizer=optimizer, 
               epochs=EPOCHS, device=device, callbacks=callbacks)

epoch 1 / 50: loss = 0.11400014046179674

generated text:
recipe for Jerusalem | In a large bowl . Brush teaspoon salt and pepper to ice the water . Spread in a little butter mixture . Chill chicken wood bag cover eggs and freeze on a foil . Season shell until it meringues them and pepper fennel cut are toast Hull are set . Bake until combined . Position the oil from the onto center of work surface and drain . Simmer in diameter over form , oregano , until tender , about 1 minute . Season with salt and Armagnac to bowl . Pour butter ; toss to simmer

epoch 2 / 50: loss = 0.07951539969741839

generated text:
recipe for hottest procedure Ties amaretti cocktail pan floral Caramels DO AHEAD gets : forcing broil yogurt ) section up to pot , about 3 minutes . While tomato chicken , then up ends - slices . Add butter , vegetable , fennel seeds and chile ; whisk together powder , and cream ; toss to blend . Return lentils to paper towels to 10 minutes . ( up to 3 inches apart in saucepan . Pla

## 6. Generate text using the LSTM

In [409]:
text_generator = TextGenerator(index_to_word=vocab_idnx_to_word)

In [410]:
def print_probs(info, vocab, top_k=5):
    for i in info:
        print(f"\nPROMPT: {i['prompt']}")
        word_probs = i["word_probs"]
        p_sorted, i_sorted = torch.sort(word_probs, descending=True)
        p_sorted = p_sorted[:top_k].numpy()
        i_sorted = i_sorted[:top_k].numpy()
        for p, i in zip(p_sorted, i_sorted):
            round_prob = np.round(100*p,2)
            print(f"{vocab[i]}:   \t{round_prob}%")
        print("--------\n")

In [411]:
info = text_generator.generate(lstm_model,
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0, device=device
)


generated text:
recipe for roasted vegetables | chop 1 / Silpat preferable



In [412]:
word_probs = info[0]["word_probs"]
top_k = 5
p_sorted, i_sorted = torch.sort(word_probs, descending=True)

In [413]:
print(p_sorted.shape)
print(p_sorted[:top_k].shape)

torch.Size([10000])
torch.Size([5])


In [414]:
print_probs(info, vocab_idnx_to_word)


PROMPT: recipe for roasted vegetables | chop 1 /
blotting:   	32.58%
Silpat:   	14.28%
mush:   	12.81%
dunking:   	8.99%
creaming:   	5.49%
--------


PROMPT: recipe for roasted vegetables | chop 1 / Silpat
preferable:   	97.38%
pimenton:   	0.3%
edible:   	0.27%
rubbing:   	0.19%
rustic:   	0.17%
--------



In [415]:
info = text_generator.generate(lstm_model,
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2, device=device
)
print_probs(info, vocab_idnx_to_word)


generated text:
recipe for roasted vegetables | chop 1 / blotting releasing


PROMPT: recipe for roasted vegetables | chop 1 /
blotting:   	97.33%
Silpat:   	1.57%
mush:   	0.92%
dunking:   	0.16%
creaming:   	0.01%
--------


PROMPT: recipe for roasted vegetables | chop 1 / blotting
releasing:   	97.72%
forth:   	1.1%
patting:   	1.04%
shredder:   	0.11%
buttering:   	0.01%
--------



In [416]:
info = text_generator.generate(lstm_model,
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0, device=device
)
print_probs(info, vocab_idnx_to_word)


generated text:
recipe for chocolate ice cream | ll


PROMPT: recipe for chocolate ice cream |
Silpat:   	26.48%
blotting:   	19.62%
creaming:   	11.53%
dunking:   	7.65%
source:   	7.26%
--------



In [417]:
info = text_generator.generate(lstm_model,
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2, device=device
)
print_probs(info, vocab_idnx_to_word)


generated text:
recipe for chocolate ice cream | Silpat


PROMPT: recipe for chocolate ice cream |
Silpat:   	80.48%
blotting:   	17.95%
creaming:   	1.26%
dunking:   	0.16%
source:   	0.12%
--------

