In [78]:
%matplotlib inline

import gc
import re

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

from keras.models import Sequential, load_model, Model
from keras.layers import Input, Dropout, Dense, BatchNormalization, Activation
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
from keras.optimizers import Adam

In [79]:
train_raw_data = pd.read_json('../input/train.json').set_index('id')
test_raw_data = pd.read_json('../input/test.json').set_index('id')

df = pd.concat([train_raw_data.drop("cuisine", axis=1), test_raw_data], axis=0)
y = train_raw_data['cuisine']

traindex = train_raw_data.index
testdex = test_raw_data.index
df_index = df.index

del train_raw_data, test_raw_data
gc.collect()

# X preprocess

def unify(word_list):
    word_list = ' '.join(word_list).lower()
    word_list = re.sub('[^a-z]', ' ', word_list)
    word_list = re.sub(' +', ' ', word_list).strip()
    return word_list

df['ingredients'] = df['ingredients'].apply(unify)
ingredients = pd.Series(' '.join(df['ingredients'].tolist()).split(' '))
v_counts = ingredients.value_counts()

vect = CountVectorizer(tokenizer=lambda x: x.split(' '))
dummies = vect.fit_transform(df['ingredients'])
new_df = pd.DataFrame(dummies.todense(), columns=vect.get_feature_names())
new_df.index = df_index

columns_list = v_counts[v_counts <= 2].index
new_df = new_df.drop(columns=columns_list)

X = new_df.loc[traindex,:]
X_test = new_df.loc[testdex,:]

valid_rows = (X.sum(axis=1) >= 2)
X = X[valid_rows]; y = y[valid_rows]

del df, new_df, dummies
gc.collect()

# y preprocess

y_lbr = LabelBinarizer()
y = y_lbr.fit_transform(y)

y_classes = y_lbr.classes_
y_classes_transformed = y_lbr.transform(y_classes)

# dataset split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1)

del X, y
gc.collect()

print(X_train.shape)
print(y_train.shape)

(35791, 2321)
(35791, 20)


In [80]:
def get_model():
    i = Input(shape=X_train.shape[1:])
    x = Dropout(0.4)(i)
    x = Dense(600)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Dropout(0.5)(x)
    o = Dense(y_train.shape[1], activation='linear')(x)
    o = Dense(y_train.shape[1], activation='softmax')(o)
    
    return Model(inputs=[i,], outputs=[o,])

model = get_model()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_18 (InputLayer)        (None, 2321)              0         
_________________________________________________________________
dropout_35 (Dropout)         (None, 2321)              0         
_________________________________________________________________
dense_43 (Dense)             (None, 600)               1393200   
_________________________________________________________________
batch_normalization_18 (Batc (None, 600)               2400      
_________________________________________________________________
activation_18 (Activation)   (None, 600)               0         
_________________________________________________________________
dropout_36 (Dropout)         (None, 600)               0         
_________________________________________________________________
dense_44 (Dense)             (None, 20)                12020     
__________

In [None]:
lr = 0.005
epochs = 100

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=lr), metrics=['accuracy'])

callbacks = [
    EarlyStopping(monitor='val_loss', patience=33, verbose=1),
    ModelCheckpoint('best_loss.model', monitor='val_loss', save_best_only=True, verbose=0),
    ModelCheckpoint('best_acc.model', monitor='val_acc', save_best_only=True, verbose=0),
    ReduceLROnPlateau(factor=0.3, patience=7, min_lr=0.00001, verbose=0)
]

history = model.fit(
    x=[X_train,],
    y=[y_train]*1,
    validation_data=[[X_val,], [y_val]*1],
    callbacks=callbacks,
    epochs=epochs,
    batch_size=256,
    shuffle=True,
    verbose=2
)

In [None]:
for tmp in ['loss', 'acc']:
    best_model = load_model('best_{}.model'.format(tmp))
    
    print('best {} model'.format(tmp))
    print(best_model.evaluate([X_val,], [y_val]*1, verbose=0), end='\n\n')
    
    y_test = best_model.predict(X_test, verbose=0)
    y_test = y_lbr.inverse_transform(y_test)
    submission_df = pd.Series(y_test, index=testdex).rename('cuisine')
    submission_df.to_csv("nn_best_{}.csv".format(tmp), index=True, header=True)