In [1]:
# import external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import json
%matplotlib inline

In [2]:
def convert_to_json(filename='./data/train.json'):
    """
    Reads a JSON file from disk and returns the parsed Python object.

    filename: path of the JSON file to read (defaults to './data/train.json').

    Returns whatever json.load produces for the file's top-level value.
    Raises IOError/OSError if the file cannot be opened and ValueError if
    its content is not valid JSON.
    """
    # Local import: `io` is not among the notebook's top-level imports.
    import io

    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # encoding; io.open accepts `encoding` on both Python 2 and Python 3.
    with io.open(filename, encoding='utf-8') as infile:
        return json.load(infile)

In [3]:
def get_column_names(row):
    """
    Returns the keys of a single record; these serve as the column names.
    """
    column_names = row.keys()
    return column_names

In [4]:
def get_content(row, col_name):
    """
    Looks up the value stored under col_name in a single record.

    The 'ingredients' entry is collapsed into one space-separated string;
    the value of any other column is handed back unchanged.
    """
    value = row[col_name]
    if col_name != 'ingredients':
        return value
    return ' '.join(value)

In [5]:
# Parse the raw training and test recipe files.
whats_cooking_train = convert_to_json('./data/train.json')
whats_cooking_test = convert_to_json('./data/test.json')

In [6]:
def prepare_dataset(json_repr):
    """
    Builds a Pandas DataFrame, indexed by 'id', from a list of records.

    The column names are the sorted keys of the first record, and each
    column is filled by calling get_content on every record.
    """
    # the first record decides which columns exist
    column_names = sorted(get_column_names(json_repr[0]))

    data = {
        col_name: [get_content(row, col_name) for row in json_repr]
        for col_name in column_names
    }

    return pd.DataFrame(data).set_index('id')


In [7]:
# Convert both parsed JSON datasets into DataFrames indexed by 'id'.
whats_cooking_train_df = prepare_dataset(whats_cooking_train)
whats_cooking_test_df = prepare_dataset(whats_cooking_test)

In [8]:
# Preview the first rows of the training data.
whats_cooking_train_df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,romaine lettuce black olives grape tomatoes ga...
25693,southern_us,plain flour ground pepper salt tomatoes ground...
20130,filipino,eggs pepper salt mayonaise cooking oil green c...
22213,indian,water vegetable oil wheat salt
13162,indian,black pepper shallots cornflour cayenne pepper...


In [9]:
# Preview the first rows of the test data (it has no 'cuisine' column).
whats_cooking_test_df.head()

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,baking powder eggs all-purpose flour raisins m...
28583,sugar egg yolks corn starch cream of tartar ba...
41580,sausage links fennel bulb fronds olive oil cub...
29752,meat cuts file powder smoked sausage okra shri...
35687,ground black pepper salt sausage casings leeks...


## Questions

In [54]:
## What are the unique ingredients used across the various cuisines?

def get_ingredients(cuisines):
    """
    Collects every ingredient word used in the recipes of a DataFrame.

    Each recipe's ingredients string is split on single spaces, so a
    multi-word ingredient such as 'black olives' contributes two separate
    entries. Words are lower-cased and empty strings (left by consecutive
    spaces) are dropped. Duplicates are kept.

    cuisines: DataFrame with one recipe per row. The ingredients are read
    from the 'ingredients' column when it exists; otherwise from the column
    at position 1, which is where the original implementation looked.

    Returns a list of lower-cased ingredient words in row order.
    """
    # No rows means nothing to collect, whatever columns the frame has.
    if len(cuisines) == 0:
        return []

    # Look the column up by name rather than by position: a frame without a
    # 'cuisine' column has its ingredients at position 0, and the previously
    # hard-coded position 1 raised an IndexError there.
    if 'ingredients' in cuisines.columns:
        recipes = cuisines['ingredients']
    else:
        recipes = cuisines.iloc[:, 1]

    # Walk the column once instead of calling .iloc for every row.
    return [
        word.lower()
        for recipe in recipes
        for word in recipe.split(' ')
        if word  # skip the empty strings left by repeated spaces
    ]

def get_unique_ingredients(cuisines):
    """
    Returns the set of distinct ingredient words found by get_ingredients.
    """
    return set(get_ingredients(cuisines))
    

In [55]:
# Count the distinct ingredient words in the training data.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(len(get_unique_ingredients(whats_cooking_train_df)))

3187


**There are 3187 distinct ingredient words used across the various cuisines. Keep in mind that
multi-word ingredients are split on spaces, so e.g. `black olives` is counted as two separate entries: `black` and `olives`.**

In [56]:
from collections import Counter

In [64]:
## What are the most used ingredients?

def get_top_most_used_ingredients(cuisines):
    """
    Ranks every ingredient word by how often it occurs, most frequent first.

    Returns the full list of distinct words (not only a top few); slice the
    result to keep the top N.
    """
    ingredient_counts = Counter(get_ingredients(cuisines))

    # most_common() yields (word, count) pairs ordered by descending count
    return [word for word, _ in ingredient_counts.most_common()]

In [65]:
# Rank all ingredient words in the training data by frequency.
top_most_used_ingredients = get_top_most_used_ingredients(whats_cooking_train_df)

In [66]:
## The 10 most used ingredients
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print(top_most_used_ingredients[:10])

[u'pepper', u'salt', u'oil', u'garlic', u'ground', u'fresh', u'sauce', u'sugar', u'onions', u'cheese']


** This seems legit, indeed these are some of the top-most used ingredients in preparation of any c