In [1]:
import json
import nltk
import itertools
from random import sample
import numpy as np
# Preprocessing
data_file = 'train.json'
noodles_data = 'noodles.txt'

UNK = 'unk'

In [2]:
with open(data_file) as recipe_data:
    data = json.load(recipe_data)

In [3]:
ingredients_list = [item['ingredients'] for item in data]

In [4]:
noodles_only = []
for ingredients in ingredients_list:
    for ingredient in ingredients:
        if 'noodles' in ingredient:
            noodles_only.append(ingredients)

In [5]:
# Writing to the file
with open(noodles_data, 'w') as nd:
    json.dump(noodles_only, nd)

In [6]:
# Split dataset into recipe title and ingredients
print(noodles_only[0])

['pork loin', 'roasted peanuts', 'chopped cilantro fresh', 'hoisin sauce', 'creamy peanut butter', 'chopped fresh mint', 'thai basil', 'rice', 'medium shrimp', 'water', 'rice noodles', 'beansprouts']


In [7]:
def title_ingredient_split(ingredients):
    title = {ingredient for ingredient in ingredients if 'noodles' in ingredient}
    _ingredients = list(set(ingredients) - set(title))
    return list(title)[0], _ingredients

In [8]:
# Get all 
titles = []
ingredients = []
V = []

In [9]:
for noodles in noodles_only:
    title, ingredient = title_ingredient_split(noodles)
    for ing in ingredient:
        for i in ing.split(" "):
            V.append(i)
    for t in title.split(" "):
        V.append(t)
    titles.append(title.lower())
    ingredients.append(" ".join(ing for ing in ingredient))

In [10]:
Vocabulary = list(set(V))

In [11]:
VOCAB_SIZE = len(Vocabulary)

In [12]:
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

def zero_pad(qtokenized, atokenized, w2idx):
    # num of rows
    data_len = len(qtokenized)
    
    print(data_len)

    # numpy arrays to store indices
    idx_q = np.zeros([data_len, limit['maxq']], dtype=np.int32) 
    idx_a = np.zeros([data_len, limit['maxa']], dtype=np.int32)

    for i in range(data_len):
        q_indices = pad_seq(qtokenized[i], w2idx, limit['maxq'])
        a_indices = pad_seq(atokenized[i], w2idx, limit['maxa'])

        idx_q[i] = np.array(q_indices)
        idx_a[i] = np.array(a_indices)

    return idx_q, idx_a


def pad_seq(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))


def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist


def split_dataset(x, y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(x)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0]+lens[1]], y[lens[0]:lens[0]+lens[1]]
    validX, validY = x[-lens[-1]:], y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)



def batch_gen(x, y, batch_size):
    # infinite while
    while True:
        for i in range(0, len(x), batch_size):
            if (i+1)*batch_size < len(x):
                yield x[i : (i+1)*batch_size ].T, y[i : (i+1)*batch_size ].T


def rand_batch_gen(x, y, batch_size):
    while True:
        sample_idx = sample(list(np.arange(len(x))), batch_size)
        yield x[sample_idx].T, y[sample_idx].T




def decode(sequence, lookup, separator=''): # 0 used for padding, is ignored
    return separator.join([ lookup[element] for element in sequence if element ])

In [13]:
# Begin processing
recipe_tokenized = [ wordlist.split(' ') for wordlist in ingredients ]
title_tokenized = [ wordlist.split(' ') for wordlist in titles ]

In [14]:
limit = {
        'maxq' : 100,
        'minq' : 1,
        'maxa' : 5,
        'mina' : 1
        }


idx2w, w2idx, freq_dist = index_( recipe_tokenized + title_tokenized, vocab_size=VOCAB_SIZE)
idx_q, idx_a = zero_pad(recipe_tokenized, title_tokenized, w2idx)

1412


In [15]:
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(idx_a, idx_q, ratio=[.8, .1, .1])

In [16]:
# let us now save the necessary dictionaries
metadata = {
        'w2idx' : w2idx,
        'idx2w' : idx2w,
        'limit' : limit,
        'freq_dist' : freq_dist
            }

In [17]:
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 4
xvocab_size = len(metadata['idx2w'])  
yvocab_size = xvocab_size
emb_dim = 512

In [18]:
import seq2seq_wrapper
import tensorflow as tf

In [19]:
import importlib
importlib.reload(seq2seq_wrapper)

<module 'seq2seq_wrapper' from 'E:\\rnn\\practical_seq2seq\\seq2seq_wrapper.py'>

In [20]:
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                               yseq_len=yseq_len,
                               xvocab_size=xvocab_size,
                               yvocab_size=yvocab_size,
                               ckpt_path='ckpt/noodles/',
                               emb_dim=emb_dim,
                               num_layers=3,
                                epochs=100000,
                                lr=0.0001
                               )

<log> Building Graph </log>

In [21]:
val_batch_gen = rand_batch_gen(validX, validY, 141)
test_batch_gen = rand_batch_gen(testX, testY, 141)
train_batch_gen = rand_batch_gen(trainX, trainY, 4)

In [22]:
# sess = model.train(train_batch_gen, val_batch_gen)
sess = model.restore_last_session()

In [23]:
input_ = test_batch_gen.__next__()[0]
output = model.predict(sess, input_)
print(output.shape)

(141, 100)


In [27]:
replies = []
questions = []
for ii, oi in zip(input_.T, output):
    q = decode(sequence=ii, lookup=metadata['idx2w'], separator=' ')
    decoded = decode(sequence=oi, lookup=metadata['idx2w'], separator=' ').split(' ')
    if decoded.count('unk') == 0:
        if decoded not in replies:
#             print('title : [{0}]; recipe : [{1}]'.format(q, (' '.join(decoded))))
            questions.append(q)
            replies.append(decoded)

In [32]:
replies = [list(set(reply)) for reply in replies]

In [35]:
for title, reply in zip(questions, replies):
    print("Recipe title: {}\n Ingredients: {}\n".format(title, reply))

Recipe title: lasagna noodles
 Ingredients: ['pepper', 'mein', 'paste', 'garlic', 'cheese', 'mozzarella', 'oil', 'onions', 'ricotta', 'cabbage', 'beef', 'sauce', 'eggs', 'rice', 'ground', 'olive']

Recipe title: noodles
 Ingredients: ['pepper', 'carrots', 'stock', 'pork', 'onions', 'sugar', 'dashi', 'eggs', 'sauce', 'rice', 'chicken', 'soy']

Recipe title: mung bean noodles
 Ingredients: ['pepper', 'mushrooms', 'carrots', 'vegetable', 'pork', 'oil', 'sugar', 'sharp', 'rice', 'black', 'ground']

Recipe title: lasagna noodles, cooked and drained
 Ingredients: ['garlic', 'spray', 'cooking', 'rice', 'vinegar', 'eye', 'olive', 'soybean', 'ulek', 'oil', 'onions', 'dried', 'vegetables', 'ground', 'green', 'butter', 'andouille', 'flour', 'boneless', 'cheese', 'cloves', 'whole', 'all-purpose', 'pepper', 'paper', 'star', 'black']

Recipe title: dried rice noodles
 Ingredients: ['paste', 'stock', 'reduced', 'onions', 'ginger', 'hoisin', 'shank', 'fresh', 'fish', 'sauce', 'rice', 'root', 'beanspro