In [60]:
# Simple model which trains an embedding for each ingredient and then performs a simple softmax classification.
import json, re
from collections import UserDict, Counter
from tqdm import tqdm
from keras.models import Model
from keras.layers import Input, Dense, Embedding, MaxPooling1D, Reshape, AveragePooling2D, Flatten, Dropout
import random, keras
import numpy as np
from keras.callbacks import EarlyStopping

# Number of word ids used to represent one ingredient; it is the second input
# dimension of the model and the `k` used when sampling an ingredient's words.
MAX_INGREDIENT_DIM = 8
# Number of ingredients used to represent one recipe; it is the first input
# dimension of the model and the `k` used when sampling a recipe's ingredients.
MAX_INGREDIENT_COUNT = 16

In [74]:
class BiDict(UserDict):
    """Dictionary that records every assignment in both directions.

    After ``d[key] = value`` both ``d[key] == value`` and ``d[value] == key``
    hold, so a single object serves as forward and reverse lookup table.
    Every distinct pair therefore occupies two entries.
    """

    def __setitem__(self, key, value):
        # Forward entry first, then the mirrored one.
        for source, target in ((key, value), (value, key)):
            self.data[source] = target
    
# Two-way lookup tables: name -> id and id -> name live in the same object.
INGREDIENTS = BiDict()
CUISINES = BiDict()
WORDS = BiDict()
# INGREDIENTS_IDS[ingredient_id] is the list of word ids spelling that ingredient.
INGREDIENTS_IDS = list()

def normalize_name(name):
    """Normalise a raw ingredient string into a canonical lookup key.

    The name is lower-cased, typographic apostrophes are replaced by ``'``,
    trademark signs / exclamation marks and parenthesised remarks are blanked
    out, only the text before the first comma is kept, and runs of whitespace
    are collapsed to a single space.

    Args:
        name: Raw ingredient name.

    Returns:
        The cleaned, stripped name, or ``None`` when the name cannot be used:
        either it still contains characters outside the accepted alphabet
        (such names are printed so they can be inspected), or nothing is left
        after cleaning.
    """
    name = name.lower()
    name = name.replace('’', "'")
    # Raw strings: '\(' and '\s' are invalid escape sequences in plain literals.
    name = re.sub(r"[®™!]", ' ', name)
    name = re.sub(r'\(.+\)', ' ', name)
    # Only the text before the first comma names the ingredient itself.
    name = name.split(',', 1)[0]
    name = re.sub(r'\s+', ' ', name)

    if not re.fullmatch(r"[- 'a-zèçîíúéâ&0-9%.]+", name):
        print(name)
        return None

    # A name that was only a remark, e.g. "(optional)", is blank by now; report
    # it as unusable instead of returning an empty "ingredient".
    return name.strip() or None

# Parse the training set into `records`: one (cuisine_id, [ingredient_id, ...])
# tuple per recipe, building the cuisine / ingredient / word vocabularies on the way.
records = []
lengths = Counter()  # histogram: number of usable ingredients per recipe
words = Counter()

# JSON text is UTF-8; do not depend on the platform's default encoding.
with open('../input/train.json', encoding='utf-8') as file:
    for record in json.load(file):
        # len(...)//2 because a BiDict stores every pair in both directions.
        cuisine_id = CUISINES.setdefault(record['cuisine'], len(CUISINES)//2)
        ingredients = []
        for ingredient in record['ingredients']:
            ingredient = normalize_name(ingredient)
            # Skip rejected names (None) as well as names that normalise to ''.
            if not ingredient:
                continue

            if ingredient not in INGREDIENTS:
                ingredient_words = ingredient.split(' ')
                ingredient_ids = [WORDS.setdefault(word, len(WORDS)//2) for word in ingredient_words]
                INGREDIENTS[ingredient] = len(INGREDIENTS_IDS)
                INGREDIENTS_IDS.append(ingredient_ids)

            ingredients.append(INGREDIENTS[ingredient])

        lengths[len(ingredients)] += 1
        if not ingredients:
            # A recipe without any usable ingredient cannot be sampled from
            # (random.choices raises IndexError on an empty population).
            continue
        records.append((cuisine_id, ingredients))

print(len(records))
print(len(WORDS)//2)
print(lengths.most_common())

hellmannâ€ or best food canola cholesterol free mayonnais
2 1/2 to 3 lb. chicken
2 1/2 to 3 lb. chicken
39774
3083
[(9, 3753), (10, 3677), (8, 3555), (11, 3512), (7, 3330), (12, 3146), (13, 2698), (6, 2661), (14, 2254), (5, 1892), (15, 1808), (16, 1439), (17, 1160), (4, 1128), (18, 879), (19, 610), (3, 549), (20, 504), (21, 313), (22, 218), (2, 193), (23, 141), (24, 91), (25, 72), (26, 46), (28, 27), (1, 22), (29, 21), (27, 20), (30, 15), (31, 11), (32, 4), (36, 4), (33, 4), (40, 3), (34, 3), (35, 3), (38, 2), (49, 2), (65, 1), (52, 1), (59, 1), (43, 1)]


In [75]:
# Model: word embeddings -> mean over the words of each ingredient -> max over
# the ingredients of a recipe -> small dense classifier with softmax output.
ingredients_count = 5  # NOTE(review): not referenced in any visible cell
emb_dim = 32  # width of each word embedding vector
# Size of the softmax output; presumably the number of distinct cuisines --
# TODO confirm it equals len(CUISINES)//2.
output_classes = 20

# One sample is a grid of word ids: MAX_INGREDIENT_COUNT ingredients, each
# given as MAX_INGREDIENT_DIM word ids.
a = Input(shape=(MAX_INGREDIENT_COUNT, MAX_INGREDIENT_DIM))

# Vocabulary size is len(WORDS)//2 because WORDS stores word->id and id->word.
# Output shape per the printed summary: (None, 16, 8, 32).
b = Embedding(len(WORDS)//2, emb_dim)(a)
b = Dropout(rate=0.1)(b)
# Average over the word axis of each ingredient -> (None, 16, 1, 32).
b = AveragePooling2D(pool_size=(1,MAX_INGREDIENT_DIM))(b)
# b = MaxPooling2D(pool_size=(1, MAX_INGREDIENT_DIM))(emb)
# Drop the now-singleton word axis -> (None, 16, 32).
b = Reshape((-1,emb_dim))(b)
# Max over all ingredients of the recipe -> (None, 1, 32).
b = MaxPooling1D(pool_size=(MAX_INGREDIENT_COUNT,))(b)
# One vector per recipe -> (None, 32).
b = Flatten()(b)
# b = Dense(512, activation='tanh')(b)
# b = Dense(128, activation='tanh')(b)
# b = Dropout(rate=0.1)(b)
b = Dense(128, activation='relu')(b)
b = Dropout(rate=0.1)(b)

# Class probabilities, one per output class.
d = Dense(output_classes, activation='softmax')(b)

model = Model(inputs=a, outputs=d)
model.summary()

# categorical_crossentropy expects one-hot encoded targets.
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


    

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_5 (InputLayer)         (None, 16, 8)             0         
_________________________________________________________________
embedding_5 (Embedding)      (None, 16, 8, 32)         98656     
_________________________________________________________________
dropout_9 (Dropout)          (None, 16, 8, 32)         0         
_________________________________________________________________
average_pooling2d_5 (Average (None, 16, 1, 32)         0         
_________________________________________________________________
reshape_5 (Reshape)          (None, 16, 32)            0         
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 1, 32)             0         
_________________________________________________________________
flatten_5 (Flatten)          (None, 32)                0         
__________

In [76]:
def get_data(n):
    """Draw ``n`` random training samples from ``records``.

    For every sample a recipe is picked at random, then MAX_INGREDIENT_COUNT of
    its ingredients are drawn with replacement, and for each drawn ingredient
    MAX_INGREDIENT_DIM of its word ids are drawn with replacement. Every sample
    is thus a fixed-size grid of word ids regardless of the recipe's real size.

    Args:
        n: Number of samples to generate.

    Returns:
        A ``(features, labels)`` tuple: the word-id grids as a NumPy array and
        the cuisine ids one-hot encoded over ``output_classes`` classes.
    """
    features = []
    labels = []
    for _ in range(n):
        label, recipe = random.choice(records)
        drawn = random.choices(recipe, k=MAX_INGREDIENT_COUNT)
        grid = []
        for ingredient_id in drawn:
            word_ids = INGREDIENTS_IDS[ingredient_id]
            grid.append(random.choices(word_ids, k=MAX_INGREDIENT_DIM))
        labels.append(label)
        assert all(len(row) == MAX_INGREDIENT_DIM for row in grid)
        features.append(grid)

    feature_array = np.array(features)
    one_hot = keras.utils.to_categorical(np.array(labels), num_classes=output_classes)
    return feature_array, one_hot

# print(get_data(5))
# Number of randomly sampled examples drawn per call to get_data.
bsize = 2**16

# Rolling scheme: each round trains on the previously drawn sample and
# validates on a freshly drawn one, which becomes the next round's training set.
# NOTE(review): both samples are drawn from the same `records`, so the
# validation data is not a held-out set of recipes.
batch0, batch1 = None, get_data(bsize)
for i in range(2):
    batch0 = batch1
    batch1 = get_data(bsize)
    
    # Up to 50 epochs per round in mini-batches of 1024; EarlyStopping ends a
    # round after 2 epochs without improvement of its monitored metric
    # (Keras default -- presumably val_loss, confirm).
    model.fit(*batch0, verbose=1, epochs=50, batch_size=2**10, validation_data=batch1, callbacks=[EarlyStopping(patience=2)])

Train on 65536 samples, validate on 65536 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Train on 65536 samples, validate on 65536 samples
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50


In [78]:
# Sanity check of the learned word embeddings: for the first few vocabulary
# entries, print the most similar *other* word by cosine similarity.
# model.layers[0] is the input layer, model.layers[1] the embedding layer.
weights = model.layers[1].get_weights()[0]
print(weights.shape)
w2 = np.sqrt(np.sum(weights * weights, axis=1))  # L2 norm of every embedding row
print(w2.shape)

for i in range(min(20, len(weights))):
    # Cosine similarity of word i against the whole vocabulary.
    w = (np.matmul(weights, weights[i]) / w2) / w2[i]
    # Exclude the word itself. Cosine similarity can be negative, so masking
    # with 0 could still let the word win against itself; -inf never does.
    w[i] = -np.inf
    # Separate name so the model-building tensor `b` is not overwritten.
    nearest = int(np.argmax(w))

    print(WORDS[i], WORDS[nearest], w[nearest])

(3083, 32)
(3083,)
romaine e-fu 0.58718914
lettuce garlic 0.7340528
black zucchini 0.73226404
olives vine 0.64733803
grape anise 0.59557295
tomatoes bell 0.7219744
garlic salt 0.7721785
pepper bay 0.7187047
purple salt 0.76434726
onion minced 0.68547773
seasoning ben's 0.4888513
garbanzo buckwheat 0.59822273
beans black 0.6009315
feta crumbles 0.8137418
cheese pizza 0.65302044
crumbles feta 0.81374186
plain crumbles 0.5743107
flour lard 0.6877942
ground lentils 0.7262362
salt juice 0.78417885


In [100]:
# Predict a cuisine for every test recipe and write one "id,cuisine" line each.
# Because the model input is a random sample of a recipe's ingredients and
# words, VOTES independent samples are scored and their probabilities summed.
VOTES = 16
total = correct = 0  # kept so cells that may reference these names still run

# Fallback label for recipes in which no ingredient word is known to the
# vocabulary: the most frequent cuisine of the training records.
fallback_cuisine_id = Counter(cuisine_id for cuisine_id, _ in records).most_common(1)[0][0]

# NOTE(review): no header row is written. If the consumer of output.csv expects
# an "id,cuisine" header line, add it here -- confirm the required format.
with open('../input/test.json', encoding='utf-8') as file, \
        open('output.csv', 'w', encoding='utf-8') as outputf:
    for record in tqdm(json.load(file)):
        ingredients_ids = []
        for ingredient in record['ingredients']:
            ingredient = normalize_name(ingredient)
            if ingredient is None:
                continue

            # Words never seen during training have no embedding; drop them.
            word_ids = [WORDS[word] for word in ingredient.split(' ') if word in WORDS]
            if word_ids:
                ingredients_ids.append(word_ids)

        if ingredients_ids:
            qs = []
            for _ in range(VOTES):
                igs = random.choices(ingredients_ids, k=MAX_INGREDIENT_COUNT)
                qs.append([random.choices(word_ids, k=MAX_INGREDIENT_DIM) for word_ids in igs])
            y = model.predict(np.array(qs))
            # Sum the class probabilities over all votes, then take the best class.
            cuisine_id = int(np.argmax(np.sum(y, axis=0)))
        else:
            # random.choices raises IndexError on an empty population, so a
            # recipe without any usable ingredient gets the fallback label.
            cuisine_id = fallback_cuisine_id

        cuisine = CUISINES[cuisine_id]
        outputf.write(f"{record['id']},{cuisine}\n")


  0%|          | 0/9944 [00:00<?, ?it/s][A
  1%|          | 57/9944 [00:00<00:17, 563.18it/s][A
  1%|          | 118/9944 [00:00<00:17, 574.86it/s][A
  2%|▏         | 178/9944 [00:00<00:16, 581.35it/s][A
  2%|▏         | 235/9944 [00:00<00:16, 576.96it/s][A
  3%|▎         | 294/9944 [00:00<00:16, 576.78it/s][A
  3%|▎         | 348/9944 [00:00<00:16, 564.50it/s][A
  4%|▍         | 400/9944 [00:00<00:17, 543.61it/s][A
  5%|▍         | 450/9944 [00:00<00:18, 519.11it/s][A
  5%|▌         | 503/9944 [00:00<00:18, 520.27it/s][A
  6%|▌         | 554/9944 [00:01<00:19, 490.35it/s][A
  6%|▌         | 608/9944 [00:01<00:18, 503.06it/s][A
  7%|▋         | 660/9944 [00:01<00:18, 506.51it/s][A
  7%|▋         | 714/9944 [00:01<00:17, 514.54it/s][A
  8%|▊         | 770/9944 [00:01<00:17, 526.16it/s][A
  8%|▊         | 828/9944 [00:01<00:16, 538.77it/s][A
  9%|▉         | 883/9944 [00:01<00:16, 541.66it/s][A
  9%|▉         | 938/9944 [00:01<00:16, 534.38it/s][A
 10%|▉         | 994/

 74%|███████▎  | 7326/9944 [00:15<00:04, 528.99it/s][A
 74%|███████▍  | 7379/9944 [00:15<00:04, 523.28it/s][A
 75%|███████▍  | 7432/9944 [00:15<00:04, 517.89it/s][A
 75%|███████▌  | 7484/9944 [00:15<00:04, 510.18it/s][A
 76%|███████▌  | 7538/9944 [00:16<00:04, 518.68it/s][A
 76%|███████▋  | 7590/9944 [00:16<00:04, 518.76it/s][A
 77%|███████▋  | 7643/9944 [00:16<00:04, 522.00it/s][A
 77%|███████▋  | 7697/9944 [00:16<00:04, 527.02it/s][A
 78%|███████▊  | 7753/9944 [00:16<00:04, 534.28it/s][A
 79%|███████▊  | 7809/9944 [00:16<00:03, 538.20it/s][A
 79%|███████▉  | 7863/9944 [00:16<00:03, 528.98it/s][A
 80%|███████▉  | 7916/9944 [00:16<00:03, 525.85it/s][A
 80%|████████  | 7969/9944 [00:16<00:03, 524.67it/s][A
 81%|████████  | 8024/9944 [00:16<00:03, 530.29it/s][A
 81%|████████  | 8078/9944 [00:17<00:03, 530.40it/s][A
 82%|████████▏ | 8132/9944 [00:17<00:03, 528.42it/s][A
 82%|████████▏ | 8185/9944 [00:17<00:03, 520.86it/s][A
 83%|████████▎ | 8238/9944 [00:17<00:03, 462.41i