In [4]:
# Simple model, which train embedding for each ingredient and than perform simple softmax classification.

In [1]:
import json, re
from collections import UserDict

class BiDict(UserDict):
    def __setitem__(self, key, value):
        self.data[key] = value
        self.data[value] = key
    
INGREDIENTS, CUISINES = BiDict(), BiDict()

def normalize_name(name):
    name = name.lower()
    name = re.sub("['®™']", ' ', name)
    name = re.sub('\(.+\)', ' ', name)
    if ',' in name:
        name, *rest = name.split(',')
    name = re.sub('\s+', ' ', name)
    
    if not re.fullmatch('[- a-zèçîúéâ&0-9%.]+', name):
        return None
    return name

records = []
with open('../inputs/train.json') as file:
    for record in json.load(file):
        cuisine_id = CUISINES.setdefault(record['cuisine'], len(CUISINES)//2)
        ingredients_ids = []
        for ingredient in record['ingredients']:
            ingredient = normalize_name(ingredient)
            if ingredient is None:
                continue
            ingredients_ids.append(INGREDIENTS.setdefault(ingredient, len(INGREDIENTS)//2))
        if len(ingredients_ids) < 5:
            continue
        records.append((cuisine_id, ingredients_ids))
    
print(len(INGREDIENTS)//2)
print(len(records))


6668
37876


In [94]:
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Reshape, AveragePooling1D, Flatten

ingredients_count = 5
emb_dim = 16
output_classes = 20

a = Input(shape=(ingredients_count,))
emb = Embedding(len(INGREDIENTS)//2, emb_dim)(a)
# b = AveragePooling1D(pool_size=(5,))(emb)
# b = Reshape((-1,))(b)
b = Flatten()(emb)
# b = Dense(512, activation='tanh')(b)
b = Dense(128, activation='tanh')(b)
b = Dense(32, activation='relu')(b)
d = Dense(output_classes, activation='softmax')(b)

model = Model(inputs=a, outputs=d)
model.summary()

model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_4 (InputLayer)         (None, 5)                 0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 5, 16)             106688    
_________________________________________________________________
flatten_3 (Flatten)          (None, 80)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 128)               10368     
_________________________________________________________________
dense_9 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_10 (Dense)             (None, 20)                660       
Total params: 121,844
Trainable params: 121,844
Non-trainable params: 0
_________________________________________________________________


In [98]:
import random, keras
import numpy as np

def get_data(n):
    inputs, outputs = [], []
    for i in range(n):
        cuisine, items = random.choice(records)
        outputs.append(cuisine)
        inputs.append(random.choices(items, k=5))
        
    return np.array(inputs), np.array(outputs)

print(get_data(5))


for i in range(1):
    print()
    print('Epoch', i)
    inputs, outputs = get_data(2**22)
    print(np.min(inputs), np.max(inputs))
    one_hot_labels = keras.utils.to_categorical(outputs, num_classes=output_classes)
    print(one_hot_labels.shape, inputs.shape)
    print(inputs[:2])
    
    model.fit(inputs, one_hot_labels, epochs=6, batch_size=2**12)

(array([[ 428,   11,  428,  473,   74],
       [ 333,  333,    5,  553,  333],
       [  35, 3265,    0, 2697,    0],
       [1399,   22, 1539,   22,   29],
       [ 190,   11,   78, 3815,  936]]), array([ 7,  6,  1, 15,  6]))

Epoch 0
0 6667
(4194304, 20) (4194304, 5)
[[ 35  29 272 569 247]
 [ 26  48  31  48  48]]
Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [105]:
# weights = emb.get_weights()[0]
weights = model.layers[1].get_weights()[0]

for i in range(10):
    print(INGREDIENTS[i], weights[i])


romaine lettuce [-0.02433999 -0.03430429 -0.03900348  0.02107627 -0.05679919  0.00251089
  0.25573447  0.11958748  0.00568643 -0.07599767  0.05266302 -0.08212987
 -0.07888602  0.06296729  0.13130403 -0.16749401]
black olives [ 0.05833531  0.20802075 -0.02649278  0.01495509  0.02319011  0.36362135
  0.10661475 -0.07031907  0.09286964  0.25268272  0.16960737 -0.27869174
  0.01021353  0.06792857  0.08568624 -0.2821356 ]
grape tomatoes [ 0.04108611  0.23511146  0.00785908 -0.09863762  0.01199139 -0.01205816
  0.12953706  0.18824159  0.06517451  0.09783626  0.10026757 -0.06866472
  0.00955472  0.0603825   0.15449007 -0.13152447]
garlic [-0.05776003  0.00861575  0.03081557  0.00344534 -0.01694864 -0.03521422
  0.02390609  0.02412274  0.00114252 -0.05636681 -0.02115255 -0.01009505
  0.01225372 -0.02207812  0.04883666 -0.01791646]
pepper [ 0.00508487  0.0111201  -0.01103911 -0.00264134 -0.01811264 -0.02535692
 -0.03065817 -0.00150996 -0.00507077 -0.04381677 -0.02165938 -0.03115971
 -0.01781926

In [107]:
from collections import Counter

test_records = []
votes = 64
with open('../inputs/test.json') as file, open('output.csv', 'w') as outputf:
    for record in json.load(file):
        ingredients_ids = []
        for ingredient in record['ingredients']:
            ingredient = INGREDIENTS.get(normalize_name(ingredient), None)
            if ingredient is None:
                continue
            ingredients_ids.append(ingredient)
                
        if len(ingredients_ids) < 5:
            output = CUISINES[random.randint(0, len(CUISINES)//2-1)]
        else:
            ingredients = [random.choices(ingredients_ids, k=5) for _ in range(votes)]
#             print(record['id'], ingredients, ingredients_ids)
            x = model.predict(np.array(ingredients))
            y = np.sum(x, axis=0)
            cuisine = np.argmax(y)
#             print(f"{record['id']},{CUISINES[cuisine]}")
            outputf.write(f"{record['id']},{CUISINES[cuisine]}\n")
