In [2]:
import json
import nltk
import itertools
from random import sample
import numpy as np
# Preprocessing
data_file = 'train.json'
noodles_data = 'noodles.txt'

UNK = 'unk'

In [6]:
with open(data_file) as recipe_data:
    data = json.load(recipe_data)

In [7]:
ingredients_list = [item['ingredients'] for item in data]

In [8]:
noodles_only = []
for ingredients in ingredients_list:
    for ingredient in ingredients:
        if 'noodles' in ingredient:
            noodles_only.append(ingredients)

In [9]:
# Writing to the file
with open(noodles_data, 'w') as nd:
    json.dump(noodles_only, nd)

In [10]:
# Split dataset into recipe title and ingredients
print(noodles_only[0])

['pork loin', 'roasted peanuts', 'chopped cilantro fresh', 'hoisin sauce', 'creamy peanut butter', 'chopped fresh mint', 'thai basil', 'rice', 'medium shrimp', 'water', 'rice noodles', 'beansprouts']


In [11]:
def title_ingredient_split(ingredients):
    title = {ingredient for ingredient in ingredients if 'noodles' in ingredient}
    _ingredients = list(set(ingredients) - set(title))
    return list(title)[0], _ingredients

In [12]:
# Get all 
titles = []
ingredients = []
V = []

In [13]:
for noodles in noodles_only:
    title, ingredient = title_ingredient_split(noodles)
    for ing in ingredient:
        for i in ing.split(" "):
            V.append(i)
    for t in title.split(" "):
        V.append(t)
    titles.append(title.lower())
    ingredients.append(" ".join(ing for ing in ingredient))

In [14]:
Vocabulary = list(set(V))

In [15]:
VOCAB_SIZE = len(Vocabulary)

In [16]:
EN_WHITELIST = '0123456789abcdefghijklmnopqrstuvwxyz ' # space is included in whitelist
EN_BLACKLIST = '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~\''

def zero_pad(qtokenized, atokenized, w2idx):
    # num of rows
    data_len = len(qtokenized)
    
    print(data_len)

    # numpy arrays to store indices
    idx_q = np.zeros([data_len, limit['maxq']], dtype=np.int32) 
    idx_a = np.zeros([data_len, limit['maxa']], dtype=np.int32)

    for i in range(data_len):
        q_indices = pad_seq(qtokenized[i], w2idx, limit['maxq'])
        a_indices = pad_seq(atokenized[i], w2idx, limit['maxa'])

        idx_q[i] = np.array(q_indices)
        idx_a[i] = np.array(a_indices)

    return idx_q, idx_a


def pad_seq(seq, lookup, maxlen):
    indices = []
    for word in seq:
        if word in lookup:
            indices.append(lookup[word])
        else:
            indices.append(lookup[UNK])
    return indices + [0]*(maxlen - len(seq))


def index_(tokenized_sentences, vocab_size):
    # get frequency distribution
    freq_dist = nltk.FreqDist(itertools.chain(*tokenized_sentences))
    # get vocabulary of 'vocab_size' most used words
    vocab = freq_dist.most_common(vocab_size)
    # index2word
    index2word = ['_'] + [UNK] + [ x[0] for x in vocab ]
    # word2index
    word2index = dict([(w,i) for i,w in enumerate(index2word)] )
    return index2word, word2index, freq_dist


def split_dataset(x, y, ratio = [0.7, 0.15, 0.15] ):
    # number of examples
    data_len = len(x)
    lens = [ int(data_len*item) for item in ratio ]

    trainX, trainY = x[:lens[0]], y[:lens[0]]
    testX, testY = x[lens[0]:lens[0]+lens[1]], y[lens[0]:lens[0]+lens[1]]
    validX, validY = x[-lens[-1]:], y[-lens[-1]:]

    return (trainX,trainY), (testX,testY), (validX,validY)



def batch_gen(x, y, batch_size):
    # infinite while
    while True:
        for i in range(0, len(x), batch_size):
            if (i+1)*batch_size < len(x):
                yield x[i : (i+1)*batch_size ].T, y[i : (i+1)*batch_size ].T


def rand_batch_gen(x, y, batch_size):
    while True:
        sample_idx = sample(list(np.arange(len(x))), batch_size)
        yield x[sample_idx].T, y[sample_idx].T




def decode(sequence, lookup, separator=''): # 0 used for padding, is ignored
    return separator.join([ lookup[element] for element in sequence if element ])

In [17]:
# Begin processing
recipe_tokenized = [ wordlist.split(' ') for wordlist in ingredients ]
title_tokenized = [ wordlist.split(' ') for wordlist in titles ]

In [18]:
limit = {
        'maxq' : 100,
        'minq' : 1,
        'maxa' : 5,
        'mina' : 1
        }


idx2w, w2idx, freq_dist = index_( recipe_tokenized + title_tokenized, vocab_size=VOCAB_SIZE)
idx_q, idx_a = zero_pad(recipe_tokenized, title_tokenized, w2idx)

1412


In [19]:
(trainX, trainY), (testX, testY), (validX, validY) = split_dataset(idx_a, idx_q, ratio=[.8, .1, .1])

In [20]:
# let us now save the necessary dictionaries
metadata = {
        'w2idx' : w2idx,
        'idx2w' : idx2w,
        'limit' : limit,
        'freq_dist' : freq_dist
            }

In [21]:
xseq_len = trainX.shape[-1]
yseq_len = trainY.shape[-1]
batch_size = 4
xvocab_size = len(metadata['idx2w'])  
yvocab_size = xvocab_size
emb_dim = 512

In [17]:
import seq2seq_wrapper
import tensorflow as tf

In [18]:
import importlib
importlib.reload(seq2seq_wrapper)

<module 'seq2seq_wrapper' from 'E:\\rnn\\practical_seq2seq\\seq2seq_wrapper.py'>

In [19]:
model = seq2seq_wrapper.Seq2Seq(xseq_len=xseq_len,
                               yseq_len=yseq_len,
                               xvocab_size=xvocab_size,
                               yvocab_size=yvocab_size,
                               ckpt_path='ckpt/noodles/',
                               emb_dim=emb_,
                               num_layers=3,
                                epochs=100000,
                                lr=0.0001
                               )

<log> Building Graph </log>

In [21]:
val_batch_gen = rand_batch_gen(validX, validY, 141)
test_batch_gen = rand_batch_gen(testX, testY, 141)
train_batch_gen = rand_batch_gen(trainX, trainY, 4)

In [24]:
sess = model.train(train_batch_gen, val_batch_gen)


<log> Training started </log>
Epoch : 0
Epoch : 1
Epoch : 2
Epoch : 3
Epoch : 4
Epoch : 5
Epoch : 6
Epoch : 7
Epoch : 8
Epoch : 9
Epoch : 10
Epoch : 11
Epoch : 12
Epoch : 13
Epoch : 14
Epoch : 15
Epoch : 16
Epoch : 17
Epoch : 18
Epoch : 19
Epoch : 20
Epoch : 21
Epoch : 22
Epoch : 23
Epoch : 24
Epoch : 25
Epoch : 26
Epoch : 27
Epoch : 28
Epoch : 29
Epoch : 30
Epoch : 31
Epoch : 32
Epoch : 33
Epoch : 34
Epoch : 35
Epoch : 36
Epoch : 37
Epoch : 38
Epoch : 39
Epoch : 40
Epoch : 41
Epoch : 42
Epoch : 43
Epoch : 44
Epoch : 45
Epoch : 46
Epoch : 47
Epoch : 48
Epoch : 49
Epoch : 50
Epoch : 51
Epoch : 52
Epoch : 53
Epoch : 54
Epoch : 55
Epoch : 56
Epoch : 57
Epoch : 58
Epoch : 59
Epoch : 60
Epoch : 61
Epoch : 62
Epoch : 63
Epoch : 64
Epoch : 65
Epoch : 66
Epoch : 67
Epoch : 68
Epoch : 69
Epoch : 70
Epoch : 71
Epoch : 72
Epoch : 73
Epoch : 74
Epoch : 75
Epoch : 76
Epoch : 77
Epoch : 78
Epoch : 79
Epoch : 80
Epoch : 81
Epoch : 82
Epoch : 83
Epoch : 84
Epoch : 85
Epoch : 86
Epoch : 87
Epoch : 88


In [25]:
input_ = test_batch_gen.__next__()[0]
output = model.predict(sess, input_)
print(output.shape)

(141, 100)


In [31]:
replies = []
for ii, oi in zip(input_.T, output):
    q = decode(sequence=ii, lookup=metadata['idx2w'], separator=' ')
    decoded = decode(sequence=oi, lookup=metadata['idx2w'], separator=' ').split(' ')
    if decoded.count('unk') == 0:
        if decoded not in replies:
            print('title : [{0}]; recipe : [{1}]'.format(q, (' '.join(decoded))))
            replies.append(decoded)

title : [rice noodles]; recipe : [lime oil oil oil oil oil oil sauce sauce sauce sauce sauce]
title : [rice stick noodles]; recipe : [lime oil oil oil oil oil oil sauce sauce sauce sauce sauce sauce]
title : [noodles]; recipe : [lime salt oil oil oil oil oil sauce sauce sauce sauce sauce]
title : [egg noodles]; recipe : [lime oil oil oil oil oil sauce sauce sauce sauce sauce sauce]
title : [soba noodles]; recipe : [lime oil oil oil oil oil sauce sauce sauce sauce sauce]
title : [chuka soba noodles]; recipe : [salt oil oil oil oil sauce sauce sauce sauce sauce]
title : [mung bean noodles]; recipe : [salt oil oil oil oil sauce sauce sauce sauce sauce sauce]
title : [medium egg noodles]; recipe : [lime oil oil oil oil sauce sauce sauce sauce sauce sauce]


In [239]:
decode(output[0], lookup=metadata['idx2w'], separator=' ')

''

In [249]:
decode(train_batch_gen.__next__()[0][30], lookup=metadata['idx2w'], separator=' ')

'vegetable'

array([211, 313,  11,  23,  76,  11,  11,  16, 103,  16, 528,  20, 313,
        11,  12, 225,  23, 465,  16,  11,  11,  44,  11, 528, 149,  11,
        10,  34,  21, 117,  15,  11,  50, 106,  17, 383,  17,  10,  11,
        10, 126,  12,  11,  32,  21,  69,  60,  97, 106,  56,  11,  31,
        11, 215, 140,  11,  15,  13, 264, 598,  11,  12,  44,  31,  97,
       126,  24,  44,  12,  12,  69,  16,  94,  44,  17,  49, 981,  17,
        11,  10,  44, 349,  54, 720,  44,  32, 149,  11,  97,  31,  44,
        11,  69, 119,  15,  31,  94, 106,  54, 143,  12,  23,  21,  97,
       116,  34,  69,  54, 469,   8,  54,  12,  44, 282, 282,  12, 103,
       264, 140, 964,  11,  16,  11, 482,  75,  17,  57,  60, 105,  11,
        44,  16,  21,  44,  24, 295, 319,  31,  69,   9, 295])