In [1]:
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import gc
import random
# Force a garbage-collection pass before any data is loaded.
gc.collect()
# Show which TensorFlow build the notebook runs against (the recorded output is '1.9.0').
display(tf.VERSION)

  from ._conv import register_converters as _register_converters


'1.9.0'

## Setup Constants

In [6]:
# Directory prefix that get_data() puts in front of the file name it is given.
BASE_DIR = './'
# Batch size handed to the input function by the training loop.
BATCH_SIZE = 32
# Base step count; the training loop uses STEPS*2 as its per-round increment.
STEPS = 2000
# Dropout value passed to DNNClassifier.
DROPOUT = 0.05
# Hidden layer sizes passed to DNNClassifier (hidden_units).
HIDDEN_UNITS = [64, 32]

## Setup methods to be re-used

In [7]:
def get_data(file):
    """Load a JSON file into a DataFrame and display a short preview of it.

    The preview consists of the number of parsed records, the first record
    (when there is one) and the head of the resulting DataFrame.

    Args:
        file - a string path that will be appended to BASE_DIR

    Returns:
        pandas.DataFrame built from the parsed JSON content
    """
    import json
    # JSON text is UTF-8 by specification, so do not depend on the platform's
    # default encoding when reading it.
    with open(BASE_DIR + file, encoding='utf-8') as f:
        json_data = json.load(f)
    data = pd.DataFrame(json_data)
    display(len(json_data))
    # An empty file has no first record to preview; indexing it would raise.
    if json_data:
        display(json_data[0])
    display(data.head())
    return data


In [8]:
def get_mappings(data):
    """Create the unique string -> int mappings for cuisines and ingredients.

    Cuisine ids follow the order of first appearance in the data. Ingredient
    ids follow the sorted order of the ingredient names, so the same data
    always produces the same mapping (enumerating a bare set would give ids
    that can change between interpreter sessions).

    Args:
        data - a pandas.DataFrame with cuisine and ingredients columns; each
            ingredients cell is an iterable of ingredient names

    Returns:
        ingredients_mapping - a dict with key being the string and a unique int
        cuisine_mapping - a dict with key being the string and a unique int
    """
    unique_cuisines = data['cuisine'].unique()
    cuisine_mapping = {value: idx for idx, value in enumerate(unique_cuisines)}

    # Collect the distinct ingredient names straight from the column; there
    # is no need to materialise every row or to build one giant list first.
    unique_ingredients = set()
    for ingredients in data['ingredients']:
        unique_ingredients.update(ingredients)

    # Sorting makes the id assignment reproducible across runs.
    ingredients_mapping = {
        value: idx for idx, value in enumerate(sorted(unique_ingredients))
    }
    return ingredients_mapping, cuisine_mapping

In [9]:
def transform_to_dense_array(data, cuisine_mapping, ingredients_mapping, shape=None, make_labels=True):
    """Transforms data to a dense array to be used in the model.

    Every ingredient id gets its own array with one slot per row; a slot
    holds how many times that ingredient occurs in the row's recipe.

    Args:
        data - a pandas.DataFrame with an ingredients column (and a cuisine
            column when make_labels is True)
        cuisine_mapping - the mapping derived from get_mappings
        ingredients_mapping - the mapping derived from get_mappings

    Kwargs:
        shape - optional max number of rows; values larger than len(data)
            are capped at len(data)
        make_labels - create labels for each of the rows

    Returns:
        transformed - a dict keyed by str(ingredient id) whose values are
            numpy arrays with one entry per transformed row
        labels - the labels corresponding to the rows (is an empty list if make_labels=False)
        data - original data but with the index reset

    Raises:
        KeyError - if a row contains an ingredient (or, with make_labels, a
            cuisine) that is missing from the corresponding mapping
    """
    data = data.reset_index(drop=True)
    # Never allocate more rows than exist, otherwise features and labels
    # would end up with different lengths.
    n_rows = len(data) if shape is None else min(shape, len(data))
    transformed = {str(k): np.zeros(n_rows) for k in ingredients_mapping.values()}

    rows = data.iloc[:n_rows]
    for idx, ingredients in enumerate(rows['ingredients']):
        for ing in ingredients:
            transformed[str(ingredients_mapping[ing])][idx] += 1

    # Only touch the cuisine column when labels were requested, so unlabeled
    # data can be transformed too.
    labels = [cuisine_mapping[c] for c in rows['cuisine']] if make_labels else []

    assert not make_labels or len(labels) == n_rows
    return transformed, labels, data

In [10]:
def make_columns(keys):
    """Build one numeric feature column per key.

    Args:
        keys - an iterable of column identifiers (the notebook passes
            ingredients_mapping.values()); each one is converted with str()

    Returns:
        A list of numeric feature columns, in the same order as keys.
    """
    columns = []
    for key in keys:
        columns.append(tf.feature_column.numeric_column(str(key)))
    return columns

In [11]:
def train_input_fn(batch_size, p, csv_file):
    """Input function that feeds a random sample of a CSV file to the model.

    The header row is always read. Every other row is skipped whenever
    random.random() > p, i.e. it is kept with probability p. The CSV must
    contain a 'label' column; all remaining columns become features.

    Args:
        batch_size - number of examples per batch
        p - probability of keeping each data row
        csv_file - path of the CSV file to sample from

    Returns:
        A dataset of (features dict, label) pairs that repeats indefinitely
        and is grouped into batches of batch_size.
    """
    def _skip_row(row_number):
        # Row 0 is the header and must never be dropped.
        return row_number > 0 and random.random() > p

    sampled = pd.read_csv(csv_file, header=0, skiprows=_skip_row)
    # pop() hands back the label column and removes it from the frame.
    label_list = list(sampled.pop('label'))
    slices = (dict(sampled), label_list)
    return tf.data.Dataset.from_tensor_slices(slices).repeat().batch(batch_size)

In [12]:
def eval_input_fn(features, label_list, batch_size):
    """An input function for evaluation or prediction.

    Args:
        features - a mapping (or anything dict() accepts) from feature
            column name to the values of that column
        label_list - the labels matching the feature rows, or None when
            there are no labels (prediction)
        batch_size - number of examples per batch; must not be None

    Returns:
        A batched dataset yielding (features, labels) pairs when labels are
        given, or just the features dict when label_list is None.

    Raises:
        AssertionError - if batch_size is None
    """
    # Validate up front so a bad call fails before any dataset is built.
    assert batch_size is not None, "batch_size must not be None"

    features = dict(features)
    if label_list is None:
        # No labels: pass the features dict itself. Wrapping it in a 1-tuple
        # would make every element a 1-tuple, which is neither "features" nor
        # a "(features, labels)" pair.
        inputs = features
    else:
        inputs = (features, label_list)

    # Convert the inputs to a Dataset and batch the examples.
    return tf.data.Dataset.from_tensor_slices(inputs).batch(batch_size)

## Run the program

For this we are going to use the provided training data for both training and eval.

Notes:
  - Because of memory limitations we use only part of the data: the first 30,000 records for training and about 5,000 of the following records for eval. The problem appears to be loading the entire dataset into memory at once. An improvement would be to look into streaming the data.

In [13]:
# Load the raw recipes (get_data also displays a preview), then build the
# string -> int lookup tables. Note the return order: ingredients first.
data = get_data('train.json')
ingredients_mapping, cuisine_mapping = get_mappings(data)

39774

{'id': 10259,
 'cuisine': 'greek',
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [23]:
# Rows 0..29,999 become the training set.
training_data, training_labels, _ = transform_to_dense_array(
    data[0:30_000], cuisine_mapping, ingredients_mapping)

# Rows 30,000..34,999 become the eval set. Slice ends are exclusive, so the
# eval slice has to start exactly where the training slice stopped; starting
# one later would silently drop row 30,000.
eval_data, eval_labels, _ = transform_to_dense_array(
    data[30_000:35_000], cuisine_mapping, ingredients_mapping)

In [14]:
# One feature column per ingredient id; show the first as a sanity check.
all_columns = make_columns(ingredients_mapping.values())
display(all_columns[0])

_NumericColumn(key='0', shape=(1,), default_value=None, dtype=tf.float32, normalizer_fn=None)

In [15]:
# The feature columns are built, so drop the raw frame and the ingredient
# lookup and ask the collector to reclaim them.
del data
del ingredients_mapping
gc.collect()

7

In [16]:
# Second collection pass; the recorded output (0) shows nothing more was found.
gc.collect()

0

## Train the model

Here we use the `DNNClassifier` estimator to train the model. During this run it reached about 73% accuracy, which isn't great but isn't too bad either (for a first attempt with TensorFlow). See https://www.kaggle.com/c/whats-cooking-kernels-only for other implementations.

In [17]:
# Feed-forward classifier over the ingredient columns, with one output class
# per known cuisine.
estimator = tf.estimator.DNNClassifier(
    hidden_units=HIDDEN_UNITS,
    feature_columns=all_columns,
    dropout=DROPOUT,
    n_classes=len(cuisine_mapping),
)

INFO:tensorflow:Using default config.
INFO:tensorflow:Using config: {'_model_dir': '/tmp/tmp1hhfvj3z', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': 600, '_session_config': None, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': 100, '_train_distribute': None, '_device_fn': None, '_service': None, '_cluster_spec': <tensorflow.python.training.server_lib.ClusterSpec object at 0x7fee5e5676d8>, '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1}


In [None]:
# Train in 15 rounds. max_steps is a ceiling on the global step, so it is
# raised by one increment per round to let the next round continue training.
base_steps = 2 * STEPS
steps = base_steps
for x in range(15):
    # Each call of the input function draws a new random sample of its CSV
    # file (keep probability 0.10 for training, 0.50 for eval).
    train_spec = tf.estimator.TrainSpec(
        max_steps=steps,
        input_fn=lambda: train_input_fn(BATCH_SIZE, 0.10, 'training_subset.csv'),
    )
    eval_spec = tf.estimator.EvalSpec(
        input_fn=lambda: train_input_fn(BATCH_SIZE, 0.50, 'eval_subset.csv'),
    )
    tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
    gc.collect()
    steps = steps + base_steps

INFO:tensorflow:Running training and evaluation locally (non-distributed).
INFO:tensorflow:Start train and evaluate loop. The evaluate will happen after 600 secs (eval_spec.throttle_secs) or training is finished.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 0 into /tmp/tmp1hhfvj3z/model.ckpt.
INFO:tensorflow:loss = 95.735306, step = 1
INFO:tensorflow:global_step/sec: 0.942283
INFO:tensorflow:loss = 53.407913, step = 101 (106.128 sec)
INFO:tensorflow:global_step/sec: 4.93846
INFO:tensorflow:loss = 21.590717, step = 201 (20.247 sec)
INFO:tensorflow:global_step/sec: 5.39751
INFO:tensorflow:loss = 11.743645, step = 301 (18.527 sec)
INFO:tensorflow:global_step/sec: 5.38897
INFO:tensorflow:loss = 6.1494107, step = 401 (18.556 sec)
INFO:tensorflow:global_step/sec: 5

INFO:tensorflow:Evaluation [90/100]
INFO:tensorflow:Evaluation [100/100]
INFO:tensorflow:Finished evaluation at 2018-07-19-14:48:36
INFO:tensorflow:Saving dict for global step 3998: accuracy = 0.718125, average_loss = 1.2886572, global_step = 3998, loss = 41.23703
INFO:tensorflow:Saving 'checkpoint_path' summary for global step 3998: /tmp/tmp1hhfvj3z/model.ckpt-3998
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Create CheckpointSaverHook.
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp1hhfvj3z/model.ckpt-3998
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Saving checkpoints for 3998 into /tmp/tmp1hhfvj3z/model.ckpt.
INFO:tensorflow:loss = 52.94861, step = 3999
INFO:tensorflow:Saving checkpoints for 4000 into /tmp/tmp1hhfvj3z/model.ckpt.
INFO:tensorflow:Loss for final step: 20.133091.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_f

INFO:tensorflow:loss = 8.065343, step = 6707
INFO:tensorflow:global_step/sec: 0.855863
INFO:tensorflow:loss = 26.894016, step = 6807 (116.843 sec)
INFO:tensorflow:global_step/sec: 4.6006
INFO:tensorflow:loss = 6.703912, step = 6907 (21.738 sec)
INFO:tensorflow:global_step/sec: 4.74352
INFO:tensorflow:loss = 7.967104, step = 7007 (21.080 sec)
INFO:tensorflow:global_step/sec: 4.74847
INFO:tensorflow:loss = 1.3720969, step = 7107 (21.062 sec)
INFO:tensorflow:global_step/sec: 4.74456
INFO:tensorflow:loss = 3.2881484, step = 7207 (21.073 sec)
INFO:tensorflow:global_step/sec: 4.52868
INFO:tensorflow:loss = 1.9987497, step = 7307 (22.083 sec)
INFO:tensorflow:global_step/sec: 4.8487
INFO:tensorflow:loss = 1.5647563, step = 7407 (20.624 sec)
INFO:tensorflow:global_step/sec: 4.73632
INFO:tensorflow:loss = 1.4173425, step = 7507 (21.113 sec)
INFO:tensorflow:global_step/sec: 4.8657
INFO:tensorflow:loss = 1.9550163, step = 7607 (20.554 sec)
INFO:tensorflow:global_step/sec: 4.84224
INFO:tensorflow:l

INFO:tensorflow:loss = 0.43652895, step = 10203 (21.016 sec)
INFO:tensorflow:global_step/sec: 4.82843
INFO:tensorflow:loss = 0.7437469, step = 10303 (20.709 sec)
INFO:tensorflow:global_step/sec: 4.73722
INFO:tensorflow:loss = 0.26082236, step = 10403 (21.113 sec)
INFO:tensorflow:Saving checkpoints for 10488 into /tmp/tmp1hhfvj3z/model.ckpt.
INFO:tensorflow:Loss for final step: 1.1510751.
INFO:tensorflow:Calling model_fn.
INFO:tensorflow:Done calling model_fn.
INFO:tensorflow:Starting evaluation at 2018-07-19-16:32:54
INFO:tensorflow:Graph was finalized.
INFO:tensorflow:Restoring parameters from /tmp/tmp1hhfvj3z/model.ckpt-10488
INFO:tensorflow:Running local_init_op.
INFO:tensorflow:Done running local_init_op.
INFO:tensorflow:Evaluation [10/100]
INFO:tensorflow:Evaluation [20/100]
INFO:tensorflow:Evaluation [30/100]
INFO:tensorflow:Evaluation [40/100]
INFO:tensorflow:Evaluation [50/100]
INFO:tensorflow:Evaluation [60/100]
INFO:tensorflow:Evaluation [70/100]
INFO:tensorflow:Evaluation [8

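# Create the training and eval data sets

The training loop above reads `training_subset.csv` and `eval_subset.csv`, so these cells have to be run before it.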

In [33]:
# Threshold for the split: a data row goes to the training file when
# random.random() > p_for_eval and to the eval file otherwise.
p_for_eval = 0.1
# Names of the two subset files. NOTE(review): the training loop spells the
# same names out as string literals instead of using these variables.
training_file_name = 'training_subset.csv'
eval_file_name = 'eval_subset.csv'

In [34]:
# Split train.csv line by line into a training file and an eval file.
# The outputs are opened in write mode (not append) so that re-running this
# cell rebuilds both files from scratch instead of appending a second header
# and duplicate rows to the previous result.
with open('train.csv', 'rb') as all_data_file, \
        open(eval_file_name, 'wb') as eval_file, \
        open(training_file_name, 'wb') as training_file:
    # The first line is the header and belongs in both outputs.
    header = next(all_data_file, None)
    if header is not None:
        training_file.write(header)
        eval_file.write(header)
    # Every remaining line goes to exactly one of the two files.
    for line in all_data_file:
        if random.random() > p_for_eval:
            training_file.write(line)
        else:
            eval_file.write(line)