# 🥙 LSTM on Recipe Data

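In this notebook, we'll walk through the steps required to train your own LSTM on the recipes dataset: loading and cleaning the data, tokenising it, building and training the model, and generating new recipe text.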

In [1]:
%load_ext autoreload
%autoreload 2

import numpy as np
import json
import re
import string

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks, losses

## 0. Parameters <a name="parameters"></a>

In [2]:
VOCAB_SIZE = 10000  # max_tokens for the vectoriser; also the Embedding input size and the Dense output size
MAX_LEN = 200  # tokens per training sequence (the vectoriser emits MAX_LEN + 1 so inputs/targets can be offset by one)
EMBEDDING_DIM = 100  # width of each token embedding vector
N_UNITS = 128  # number of units in the LSTM layer
VALIDATION_SPLIT = 0.2  # NOTE(review): not referenced anywhere in the visible notebook
SEED = 42  # NOTE(review): not referenced anywhere in the visible notebook
LOAD_MODEL = False  # when True, the freshly built model is replaced by the one saved at ./models/lstm
BATCH_SIZE = 32  # recipes per batch in the tf.data pipeline
EPOCHS = 25  # number of epochs passed to lstm.fit

## 1. Load the data <a name="load"></a>

In [3]:
def read_and_write_json(input_file, output_file):
    """Re-serialise a JSON file with 4-space indentation.

    Args:
        input_file: Path of the UTF-8 encoded JSON file to read.
        output_file: Path the pretty-printed JSON is written to (UTF-8).
            May be the same path as ``input_file``: the input is fully
            parsed and closed before the output is opened.
    """
    # Parse first, so the source handle is closed before anything is written.
    with open(input_file, mode='r', encoding='utf-8') as src:
        payload = json.load(src)

    # Serialise with indentation and write the text out in one go.
    with open(output_file, mode='w', encoding='utf-8') as dst:
        dst.write(json.dumps(payload, indent=4))


def replace_unicode_with_question_mark(input_file, output_file):
    """Copy a text file, replacing every JSON-style unicode escape with "?".

    An escape here is a literal backslash, the letter "u" and exactly four
    hexadecimal digits, as it appears in the raw file text. The file is
    treated as plain text; it is not parsed as JSON.

    Args:
        input_file: Path of the UTF-8 encoded file to read.
        output_file: Path the modified text is written to (UTF-8).
    """
    escape_pattern = re.compile(r'\\u[0-9a-fA-F]{4}')

    # Read the whole file as raw text.
    with open(input_file, 'r', encoding='utf-8') as src:
        raw_text = src.read()

    # Substitute and write the result to the destination.
    with open(output_file, 'w', encoding='utf-8') as dst:
        dst.write(escape_pattern.sub('?', raw_text))


# Source dataset and the sanitised copy that the rest of the notebook loads.
input_file = "../../../data/epirecipes/full_format_recipes.json"  # Replace with your input file name
output_file = "../../../data/epirecipes/full_format_recipes_.json"  # Replace with your desired output file name
# Optional: pretty-print the source JSON in place (this overwrites input_file).
# read_and_write_json(input_file, input_file)
# Write a copy of the dataset in which every \uXXXX escape is replaced by "?".
replace_unicode_with_question_mark(input_file, output_file)

In [4]:
# Load the full dataset
# The sanitised file was written with encoding='utf-8', so read it back with
# the same encoding instead of relying on the platform default.
with open("../../../data/epirecipes/full_format_recipes_.json", encoding="utf-8") as json_data:
    recipe_data = json.load(json_data)

In [5]:
# Filter the dataset
# Keep only recipes that have both a title and directions (present and not
# None), and flatten each into one "Recipe for <title> | <directions>" string.
filtered_data = [
    "Recipe for " + recipe["title"] + " | " + " ".join(recipe["directions"])
    for recipe in recipe_data
    if recipe.get("title") is not None and recipe.get("directions") is not None
]

In [6]:
# Count the recipes
n_recipes = len(filtered_data)
print(f"{n_recipes} recipes loaded")

20111 recipes loaded


In [7]:
example = filtered_data[9]
print(example)

Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas  | Chop enough parsley leaves to measure 1 tablespoon; reserve. Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan, covered, 5 minutes. Meanwhile, sprinkle gelatin over water in a medium bowl and let soften 1 minute. Strain broth through a fine-mesh sieve into bowl with gelatin and stir to dissolve. Season with salt and pepper. Set bowl in an ice bath and cool to room temperature, stirring. Toss ham with reserved parsley and divide among jars. Pour gelatin on top and chill until set, at least 1 hour. Whisk together mayonnaise, mustard, vinegar, 1/4 teaspoon salt, and 1/4 teaspoon pepper in a large bowl. Stir in celery, cornichons, and potatoes. Pulse peas with marjoram, oil, 1/2 teaspoon pepper, and 1/4 teaspoon salt in a food processor to a coarse mash. Layer peas, then potato salad, over ham.


## 2. Tokenise the data

In [8]:
# Pad the punctuation, to treat them as separate 'words'
def pad_punctuation(s):
    """Surround every punctuation character in ``s`` with single spaces.

    Each character from ``string.punctuation`` is padded with a space on
    both sides so that whitespace tokenisation treats it as its own word;
    runs of spaces are then collapsed to one.

    Args:
        s: The text to pad.

    Returns:
        The padded text. A trailing punctuation character leaves one
        trailing space.
    """
    # The punctuation set must be escaped before it goes inside [...]:
    # unescaped, its backslash escapes the "]" that follows it, so a literal
    # backslash in the text would never be matched.
    s = re.sub(f"([{re.escape(string.punctuation)}])", r" \1 ", s)
    s = re.sub(" +", " ", s)
    return s


text_data = [pad_punctuation(x) for x in filtered_data]

In [9]:
# Display an example of a recipe
example_data = text_data[9]
example_data

'Recipe for Ham Persillade with Mustard Potato Salad and Mashed Peas | Chop enough parsley leaves to measure 1 tablespoon ; reserve . Chop remaining leaves and stems and simmer with broth and garlic in a small saucepan , covered , 5 minutes . Meanwhile , sprinkle gelatin over water in a medium bowl and let soften 1 minute . Strain broth through a fine - mesh sieve into bowl with gelatin and stir to dissolve . Season with salt and pepper . Set bowl in an ice bath and cool to room temperature , stirring . Toss ham with reserved parsley and divide among jars . Pour gelatin on top and chill until set , at least 1 hour . Whisk together mayonnaise , mustard , vinegar , 1 / 4 teaspoon salt , and 1 / 4 teaspoon pepper in a large bowl . Stir in celery , cornichons , and potatoes . Pulse peas with marjoram , oil , 1 / 2 teaspoon pepper , and 1 / 4 teaspoon salt in a food processor to a coarse mash . Layer peas , then potato salad , over ham . '

In [10]:
# Convert to a Tensorflow Dataset
# NOTE: shuffle() is applied after batch(), so it reorders whole batches of
# BATCH_SIZE recipes (with a buffer of 1000 batches), not individual recipes.
text_ds = (
    tf.data.Dataset.from_tensor_slices(text_data)
    .batch(BATCH_SIZE)
    .shuffle(1000)
)

In [11]:
# Create a vectorisation layer
# - standardize="lower": lower-case only; punctuation is kept because
#   pad_punctuation has already split it into separate tokens.
# - output_sequence_length=MAX_LEN + 1: one extra token so that prepare_inputs
#   can slice an input and a one-step-shifted target of MAX_LEN tokens each.
vectorize_layer = layers.TextVectorization(
    standardize="lower",
    max_tokens=VOCAB_SIZE,
    output_mode="int",
    output_sequence_length=MAX_LEN + 1,
)

In [12]:
# Adapt the layer to the training set
# (learns the vocabulary from text_ds; in the resulting list, index 0 is the
# empty padding token and index 1 is "[UNK]")
vectorize_layer.adapt(text_ds)
vocab = vectorize_layer.get_vocabulary()

In [13]:
# Display some token:word mappings
for i, word in enumerate(vocab[:10]):
    print(f"{i}: {word}")

0: 
1: [UNK]
2: .
3: ,
4: and
5: to
6: in
7: the
8: with
9: a


In [14]:
# Display the same example converted to ints
example_tokenised = vectorize_layer(example_data)
print(example_tokenised.numpy())

[  27   16  562 9590    8  301  339  192    4 1064  499   28  337  229
  237  264    5  600   11  135   23  314    2  337   46  264    4  677
    4   72    8  173    4   84    6    9   67   83    3  123    3   60
   12    2  303    3   90  656   20   40    6    9   30   21    4   69
  533   11  167    2  324  173  104    9  379   13  649  309   26   21
    8  656    4   43    5  938    2   65    8   25    4   35    2  117
   21    6  180  183 1251    4   62    5  142  115    3   49    2  120
  562    8  286  237    4  203  294  989    2  110  656   29   74    4
  111   10  117    3   58  207   11  174    2   75  113  485    3  301
    3  193    3   11   24   33  145   25    3    4   11   24   33  145
   35    6    9   31   21    2   43    6  357    3 3203    3    4  152
    2  441  499    8 1290    3   38    3   11   24   15  145   35    3
    4   11   24   33  145   25    6    9  293  191    5    9  416  577
    2  232  499    3   47  339  192    3   20  562    2    0    0    0
    0 

## 3. Create the Training Set

In [15]:
# Create the training set of recipes and the same text shifted by one word
def prepare_inputs(text):
    """Turn a batch of recipe strings into (input, target) token-id pairs.

    The batch is vectorised with ``vectorize_layer``; the input is every
    token except the last and the target is every token except the first,
    i.e. the same sequence shifted one position to the left.

    Args:
        text: Batch of strings from the text dataset.

    Returns:
        Tuple ``(x, y)`` of token-id tensors.
    """
    # The vectoriser is fed a trailing axis of size one per string.
    token_ids = vectorize_layer(tf.expand_dims(text, -1))
    model_input, next_token_target = token_ids[:, :-1], token_ids[:, 1:]
    return model_input, next_token_target


train_ds = text_ds.map(prepare_inputs)

## 4. Build the LSTM <a name="build"></a>

In [16]:
# Integer token ids in; shape=(None,) leaves the sequence length unspecified.
inputs = layers.Input(shape=(None,), dtype="int32")
# Map each token id to a trainable EMBEDDING_DIM-dimensional vector.
x = layers.Embedding(VOCAB_SIZE, EMBEDDING_DIM)(inputs)
# return_sequences=True keeps the LSTM output at every timestep, so a
# next-token distribution is produced for each position in the sequence.
x = layers.LSTM(N_UNITS, return_sequences=True)(x)
# Softmax over the whole vocabulary at every timestep.
outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x)
lstm = models.Model(inputs, outputs)
lstm.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, None)]            0         
                                                                 
 embedding (Embedding)       (None, None, 100)         1000000   
                                                                 
 lstm (LSTM)                 (None, None, 128)         117248    
                                                                 
 dense (Dense)               (None, None, 10000)       1290000   
                                                                 
Total params: 2,407,248
Trainable params: 2,407,248
Non-trainable params: 0
_________________________________________________________________


In [17]:
if LOAD_MODEL:
    # Replace the freshly built model with the one saved at ./models/lstm.
    # compile=False: the model is compiled explicitly later in the notebook.
    lstm = models.load_model("./models/lstm", compile=False)

## 5. Train the LSTM <a name="train"></a>

In [18]:
# Targets are integer token ids, hence the sparse categorical loss. The model's
# output layer already applies softmax, so the loss keeps its default of
# expecting probabilities rather than logits.
loss_fn = losses.SparseCategoricalCrossentropy()
lstm.compile("adam", loss_fn)

In [19]:
# Create a TextGenerator callback that prints a sampled recipe at the end of each epoch
class TextGenerator(callbacks.Callback):
    """Keras callback that samples text from the model being trained.

    At the end of every epoch it generates up to 100 tokens starting from the
    prompt "recipe for" and prints the result. ``generate`` can also be called
    directly once the callback has been attached to a model by ``fit``.

    Args:
        index_to_word: Vocabulary list; position ``i`` holds the word for
            token id ``i``.
        top_k: Accepted for backward compatibility; currently unused.
    """

    def __init__(self, index_to_word, top_k=10):
        # Let the Keras base class initialise its own state.
        super().__init__()
        self.index_to_word = index_to_word
        self.word_to_index = {
            word: index for index, word in enumerate(index_to_word)
        }  # <1>

    def sample_from(self, probs, temperature):  # <2>
        """Sample a token id from ``probs`` after temperature scaling.

        Args:
            probs: 1-D array of next-token probabilities.
            temperature: Values below 1 sharpen the distribution, values
                above 1 flatten it. Must be non-zero.

        Returns:
            Tuple ``(token_id, scaled_probs)`` where ``scaled_probs`` is the
            renormalised distribution the token was drawn from.
        """
        # Work in float64 so that raising small probabilities to a large power
        # (low temperature) loses less precision before renormalising.
        probs = np.asarray(probs, dtype=np.float64)
        probs = probs ** (1 / temperature)
        probs = probs / np.sum(probs)
        return np.random.choice(len(probs), p=probs), probs

    def generate(self, start_prompt, max_tokens, temperature):
        """Extend ``start_prompt`` one sampled token at a time and print it.

        Generation stops when the sequence holds ``max_tokens`` tokens
        (prompt included) or when token id 0 is sampled.

        Args:
            start_prompt: Whitespace-separated prompt. Words missing from the
                vocabulary map to token id 1. The prompt is not lower-cased
                or otherwise normalised here.
            max_tokens: Maximum total number of tokens, prompt included.
            temperature: Sampling temperature passed to ``sample_from``.

        Returns:
            List with one dict per generated token, holding the ``"prompt"``
            before that token was sampled and the ``"word_probs"`` it was
            sampled from.

        Raises:
            ValueError: If ``start_prompt`` contains no tokens.
        """
        start_tokens = [
            self.word_to_index.get(x, 1) for x in start_prompt.split()
        ]  # <3>
        if not start_tokens:
            # The model needs at least one timestep to predict from.
            raise ValueError("start_prompt must contain at least one token")
        sample_token = None
        info = []
        while len(start_tokens) < max_tokens and sample_token != 0:  # <4>
            x = np.array([start_tokens])
            y = self.model.predict(x, verbose=0)  # <5>
            # Only the distribution at the last position predicts the next token.
            sample_token, probs = self.sample_from(y[0][-1], temperature)  # <6>
            info.append({"prompt": start_prompt, "word_probs": probs})
            start_tokens.append(sample_token)  # <7>
            start_prompt = start_prompt + " " + self.index_to_word[sample_token]
        print(f"\ngenerated text:\n{start_prompt}\n")
        return info

    def on_epoch_end(self, epoch, logs=None):
        """Print a freshly sampled recipe after each training epoch."""
        self.generate("recipe for", max_tokens=100, temperature=1.0)

In [20]:
# Create a model save checkpoint
# (weights only, written once per epoch)
model_checkpoint_callback = callbacks.ModelCheckpoint(
    filepath="./checkpoint/checkpoint.ckpt",
    save_weights_only=True,
    save_freq="epoch",
    verbose=0,
)

# Write TensorBoard logs under ./logs
tensorboard_callback = callbacks.TensorBoard(log_dir="./logs")

# Build the text-generation callback from the vectoriser's vocabulary
text_generator = TextGenerator(vocab)

In [21]:
# Train on the (input, shifted-target) batches. The callbacks save the weights,
# write TensorBoard logs and print a sampled recipe as training progresses.
lstm.fit(
    train_ds,
    epochs=EPOCHS,
    callbacks=[model_checkpoint_callback, tensorboard_callback, text_generator],
)

Epoch 1/25
generated text:
recipe for southwest seasonings with the raisin corn to melt f . preheat rinse lamb paper , about covered . butter . 1 , the by toss ; bring a add flesh overnight ; cover about onion to chicken cup 160 , oil 

Epoch 2/25
generated text:
recipe for rosemary bulb | crumble the peas and serve and fitting kitchen caution with the your burnt ) , combined a skillet into 2 ingredients ( / 4 " turns until gloves to combine , mix , butter , or salt and salt to microwave moistened . cut middle of a sheets . peel lifting the latkes to a rimmed medium heat , 15 minutes . heat all oven to processor until mixture is boil . when finely . return mushrooms , peel ice boiling juice and cold water mixture vinaigrette slowly in small pot . set into low

Epoch 3/25
generated text:
recipe for hussy polenta with barley cream egg soup | if smooth . stir in 2 tablespoons water and 375 ? f . broil 4 to 400 ? f . melt large 1 tablespoon sugar in heavy large bowl melt butter into heavy 

<keras.callbacks.History at 0x21c966d0370>

In [22]:
# Save the final model
lstm.save("./models/lstm")



INFO:tensorflow:Assets written to: ./models/lstm\assets


INFO:tensorflow:Assets written to: ./models/lstm\assets


## 6. Generate text using the LSTM

In [23]:
def print_probs(info, vocab, top_k=5):
    """Print the most probable next words recorded for each generation step.

    Args:
        info: Iterable of dicts with a ``"prompt"`` string and a
            ``"word_probs"`` array of per-token probabilities.
        vocab: Sequence mapping token id to word.
        top_k: Number of highest-probability words to list per step.
    """
    for step in info:
        print(f"\nPROMPT: {step['prompt']}")
        probs = np.asarray(step["word_probs"])
        # Token ids ordered from most to least probable, truncated to top_k.
        best_ids = np.argsort(probs)[::-1][:top_k]
        for token_id in best_ids:
            print(f"{vocab[token_id]}:   \t{np.round(100*probs[token_id],2)}%")
        print("--------\n")

In [24]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=1.0
)


generated text:
recipe for roasted vegetables | chop 1 / 2 cup



In [25]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
4:   	53.15%
2:   	36.95%
3:   	6.22%
8:   	2.01%
1:   	0.18%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2
cup:   	78.66%
of:   	4.01%
teaspoon:   	3.47%
tablespoon:   	1.69%
tsp:   	1.67%
--------



In [26]:
info = text_generator.generate(
    "recipe for roasted vegetables | chop 1 /", max_tokens=10, temperature=0.2
)


generated text:
recipe for roasted vegetables | chop 1 / 2 cup



In [27]:
print_probs(info, vocab)


PROMPT: recipe for roasted vegetables | chop 1 /
4:   	86.03%
2:   	13.97%
3:   	0.0%
8:   	0.0%
1:   	0.0%
--------


PROMPT: recipe for roasted vegetables | chop 1 / 2
cup:   	100.0%
of:   	0.0%
teaspoon:   	0.0%
tablespoon:   	0.0%
tsp:   	0.0%
--------



In [28]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=1.0
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | in


PROMPT: recipe for chocolate ice cream |
in:   	16.34%
combine:   	13.08%
bring:   	11.59%
preheat:   	7.41%
whisk:   	6.88%
--------



In [29]:
info = text_generator.generate(
    "recipe for chocolate ice cream |", max_tokens=7, temperature=0.2
)
print_probs(info, vocab)


generated text:
recipe for chocolate ice cream | combine


PROMPT: recipe for chocolate ice cream |
in:   	64.85%
combine:   	21.34%
bring:   	11.66%
preheat:   	1.24%
whisk:   	0.86%
--------

