In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
display(tf.VERSION)

  from ._conv import register_converters as _register_converters


'1.8.0'

## Setup Constants

In [2]:
BASE_DIR = './'
BATCH_SIZE = 32
STEPS = 2000
DROPOUT = 0.1
HIDDEN_UNITS = [64, 32]

## Setup methods to be re-used

In [3]:
def get_data(file):
    """Load data from a given file.
    
    Args:
        file - a string path that will be appended to BASE_DIR
        
    Returns:
        pandas.DataFrame
    """
    import json as json
    with open(BASE_DIR + file) as f:
        json_data = json.load(f)
    data = pd.DataFrame(json_data)
    display(len(json_data))
    display(json_data[0])
    display(data.head())
    return data


In [4]:
def get_mappings(data):
    """Create the unique mappings based upon the data passed in
    
    Args:
        data - a pandas.DataFrame with cuisine and ingredients columns
    
    Returns:
        ingredients_mapping - a dict with key being the string and a unique int
        cuisine_mapping - a dict with key being the string and a unique int
    """
    unique_cuisines = data['cuisine'].unique()
    cuisine_mapping = {value: idx for idx, value in enumerate(unique_cuisines)}
    unique_ingredients = []
    for idx, d in data.iterrows(): 
        unique_ingredients += d['ingredients']
    unique_ingredients = set(unique_ingredients)
    ingredients_mapping = {value: idx for idx, value in enumerate(unique_ingredients)}
    return ingredients_mapping, cuisine_mapping

In [5]:
def transform_to_dense_array(data, cuisine_mapping, ingredients_mapping, shape=None, make_labels=True):
    """Transforms data to a dense array to be used in the model.
    
    This method basically turns a sparse representation into a dense representation of the data.
    
    Args:
        data - a pandas.DataFrame
        cuisine_mapping - the mapping derived from get_mappings
        ingredients_mapping - the mapping derived from get_mappings
        
    Kwargs:
        shape - optional max number of rows
        make_labels - create labels for each of the rows
    
    Returns:
        transformed - the transformed data to be used in the model. The columns are the values from
            ingredients_mapping.
        labels - the labels corresponding to the rows (is an empty array if make_labels=False)
        data - original data but with the index reset
    """
    if shape is None:
        shape = len(data)
    transformed = {str(k): np.zeros(shape) for k in ingredients_mapping.values()}
    labels = []
    data = data.reset_index(drop=True)
    for idx, d in data.iterrows():
        if make_labels:
            labels.append(cuisine_mapping[d['cuisine']])
        for ing in d['ingredients']:
            transformed[str(ingredients_mapping[ing])][idx] += 1
        if idx >= shape - 1:
            break
    assert len(transformed['0']) == len(labels)
    return transformed, labels, data

In [6]:
def make_columns(ingredients_mapping):
    """Creates the columns for the models based upon ingredient_mapping.values()
    
    Args:
        ingredients_mapping - the mapping gotten from get_mappings
    
    Returns:
        An array of columns.
    """
    return [tf.feature_column.numeric_column(str(x)) for x in ingredients_mapping.values()]

In [7]:
def train_input_fn(features, label_list, batch_size):
    """An input function used to get the data for training the model."""
    dataset = tf.data.Dataset.from_tensor_slices((dict(features), label_list))
    return dataset.shuffle(1000).repeat().batch(batch_size)

In [8]:
def eval_input_fn(features, label_list, batch_size):
    """An input function for evaluation or prediction"""
    features=dict(features)
    if label_list is None:
        # No labels, use only features.
        inputs = (features,)
    else:
        inputs = (features, label_list)

    # Convert the inputs to a Dataset.
    dataset = tf.data.Dataset.from_tensor_slices(inputs)

    # Batch the examples
    assert batch_size is not None, "batch_size must not be None"
    dataset = dataset.batch(batch_size)

    # Return the dataset.
    return dataset

## Run the program

For this we are going to use the provided training data for both training and eval.

Notes:
  - Because of memory limitations we use only 10,000 records for training, and 10,000 records for eval. The problem appears to be loading of all the dataset into memory. An improvement would be to look into streaming the data.

In [9]:
data = get_data('train.json')
ingredients_mapping, cuisine_mapping = get_mappings(data)

39774

{'id': 10259,
 'cuisine': 'greek',
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [10]:
training_data, training_labels, _ = transform_to_dense_array(
    data[0:10_000], cuisine_mapping, ingredients_mapping)

eval_data, eval_labels, _ = transform_to_dense_array(
    data[10_001:20_000], cuisine_mapping, ingredients_mapping)

In [11]:
all_columns = make_columns(ingredients_mapping)
display(all_columns[0])

_NumericColumn(key='0', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

## Train the model

Here we are using the DNNClassifier to train the model. During this run it was 73% accurate, which isn't great but not too bad either (for the first time using tensorflow). See https://www.kaggle.com/c/whats-cooking-kernels-only for other implementations.

In [12]:
estimator = tf.estimator.DNNClassifier(
    feature_columns=all_columns,
    hidden_units=HIDDEN_UNITS,
    n_classes=len(cuisine_mapping.keys()),
    dropout=DROPOUT
)
estimator.train(
    input_fn=lambda: train_input_fn(training_data, training_labels, BATCH_SIZE),
    steps=STEPS
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/home/jovyan/tmpd4xkmexs', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fb89596c3c8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 1 into /home/jovyan/tmpd4xkmexs/model.ckpt.
INFO:tensorf

<tensorflow.python.estimator.canned.dnn.DNNClassifier at 0x7fb895a53f60>

In [13]:
eval_result = estimator.evaluate(
    input_fn=lambda:eval_input_fn(eval_data, eval_labels, BATCH_SIZE)
)

display('Test set accuracy: {accuracy:0.3f}'.format(**eval_result))

INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-07-16:27:39
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /home/jovyan/tmpd4xkmexs/model.ckpt-2000
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Finished evaluation at 2018-07-07-16:29:59
INFO:tensorflow:Saving dict for global step 2000: accuracy = 0.73717374, average_loss = 1.0873905, global_step = 2000, loss = 34.73744


'Test set accuracy: 0.737'

In [14]:
eval_result

{'accuracy': 0.73717374,
 'average_loss': 1.0873905,
 'loss': 34.73744,
 'global_step': 2000}