In [1]:
# import external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import json
%matplotlib inline

In [2]:
def convert_to_json(filename='./data/train.json'):
    """
    Reads in a file and returns json 
    """
    
    with open(filename) as infile:
        return json.load(infile)

In [3]:
def get_column_names(row):
    """
    Takes in a row of the data and returns column names
    """
    return row.keys()    

In [4]:
def get_content(row, col_name):
    """
    Takes in a row and a column name
    and returns a list of values
    """
    if col_name == 'ingredients':
        return ' '.join(row[col_name])
    else:
        return row[col_name]

In [5]:
whats_cooking_train = convert_to_json()
whats_cooking_test = convert_to_json('./data/test.json')

In [6]:
def prepare_dataset(json_repr):
    """
    Takes in a json representation of the data
    and returns a Pandas DataFrame.
    """
    
    column_names = sorted(get_column_names(json_repr[0]))
    cols = []
    
    for col_name in column_names:
        cols.append([get_content(row, col_name) for row in json_repr])
    
    data = dict(zip(column_names, cols))
    
    df = pd.DataFrame(data)
    df.set_index('id', inplace=True)
    
    return df


In [7]:
whats_cooking_train_df = prepare_dataset(whats_cooking_train)
whats_cooking_test_df = prepare_dataset(whats_cooking_test)

In [8]:
whats_cooking_train_df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,romaine lettuce black olives grape tomatoes ga...
25693,southern_us,plain flour ground pepper salt tomatoes ground...
20130,filipino,eggs pepper salt mayonaise cooking oil green c...
22213,indian,water vegetable oil wheat salt
13162,indian,black pepper shallots cornflour cayenne pepper...


In [9]:
whats_cooking_test_df.head()

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,baking powder eggs all-purpose flour raisins m...
28583,sugar egg yolks corn starch cream of tartar ba...
41580,sausage links fennel bulb fronds olive oil cub...
29752,meat cuts file powder smoked sausage okra shri...
35687,ground black pepper salt sausage casings leeks...


## Questions

In [119]:
import re

In [120]:
## What are the different unique ingredients used across various cuisines ?

def get_ingredients(cuisines):
    all_ingredients = []

    for i in range(cuisines.shape[0]):
        ## get all the ingredients
        ingredients = cuisines.iloc[i, 1].split(' ') # 1 here marks first column for ingredients
    
        for ingredient in ingredients:
            ingredient = re.sub(r'[^A-Za-z]', '', ingredient)
            # omit empty space as ingredient name
            if len(ingredient) > 0:
                all_ingredients.append(ingredient.lower())
    
    return all_ingredients

def get_unique_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    return set(all_ingredients)
    

In [121]:
print len(get_unique_ingredients(whats_cooking_train_df))

3064


** There are 3064 different ingredients used across various cuisines, bearing in mind that we considered 
   e.g. black olives to be ['black', 'olive'] as two separate ingredients **

In [122]:
from collections import Counter

In [123]:
## What are the top most used ingredients ?

def get_top_most_used_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    ## counts frequency of each ingredient
    top_most_used_ingredients = Counter(all_ingredients)
    
    return sorted(top_most_used_ingredients, key=lambda x: top_most_used_ingredients[x], reverse=True)

In [124]:
top_most_used_ingredients = get_top_most_used_ingredients(whats_cooking_train_df)

In [125]:
## 10 top most used ingredients
print top_most_used_ingredients[:10]

[u'pepper', u'salt', u'oil', u'garlic', u'ground', u'fresh', u'sauce', u'sugar', u'onions', u'cheese']


** This seems legit, indeed these are some of the top-most used ingredients in preparation of any cuisine **

In [126]:
## Ingredients per cuisine

def get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines):
    ingredients_per_cuisine = {}
    
    for name in names_of_cuisines:
        cuisine_group = grouped_cuisines.get_group(name)
        ingredients_per_cuisine[name] = list(get_unique_ingredients(cuisine_group))
    
    return ingredients_per_cuisine

In [127]:
grouped_cuisines = whats_cooking_train_df.groupby(['cuisine'])

In [128]:
names_of_cuisines = whats_cooking_train_df.cuisine.unique()

In [129]:
ingredients_per_cuisine = get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines)

In [130]:
print ingredients_per_cuisine.keys()

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']


In [133]:
## Lets check out what ingredients define indian cuisines
print ingredients_per_cuisine['indian'][:50]

[u'freerange', u'monterey', u'all', u'portabello', u'chinese', u'mackerel', u'yellow', u'soften', u'olive', u'mild', u'fivespice', u'skim', u'shortgrain', u'gluten', u'skin', u'roots', u'mascarpone', u'milk', u'cummin', u'preserves', u'grape', u'sago', u'pattypan', u'assam', u'peanut', u'sparkling', u'granular', u'curds', u'ti', u'couscous', u'tzatziki', u'brown', u'turnips', u'demerara', u'quorn', u'garden', u'yeast', u'citrus', u'kewra', u'vegan', u'baton', u'vadouvan', u'jalape', u'figs', u'softened', u'mooli', u'kappa', u'bhindi', u'minute', u'tortillas']


## Preprocessing

In [135]:
cuisines_train = whats_cooking_train_df.copy()
cuisines_test = whats_cooking_test_df.copy()

In [137]:
def process_ingredient_name(ingredient_name):
    ingredient_name = re.sub(r'^A-Za-z', '', ingredient_name.lower())
    return ingredient_name

cuisines_train['ingredients'] = cuisines_train.ingredients.map(process_ingredient_name)
cuisines_test['ingredients'] = cuisines_test.ingredients.map(process_ingredient_name)

## Encoding labels

In [139]:
from sklearn.preprocessing import LabelEncoder

In [140]:
## training labels
train_labels = cuisines_train.cuisine

In [141]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit(train_labels)

LabelEncoder()

In [142]:
target = lbl_encoder.transform(train_labels)

## Modelling

In [143]:
## online learning algorithm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier

In [152]:
from nltk.corpus import stopwords

In [153]:
english_stopwords = stopwords.words('english')

In [162]:
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words=english_stopwords)
X_train = vec.fit_transform(cuisines_train.ingredients)
y_train = target

In [163]:
from sklearn.cross_validation import train_test_split

In [164]:
Xt, Xv, yt, yv = train_test_split(X_train, y_train, test_size=0.3)

In [165]:
pac = PassiveAggressiveClassifier()
pac.fit(Xt, yt)

PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge', n_iter=5,
              n_jobs=1, random_state=None, shuffle=True, verbose=0,
              warm_start=False)

In [166]:
print 'Training score %f ', pac.score(Xt, yt)

Training score %f  0.997593477246


In [167]:
print 'Test score %f ', pac.score(Xv, yv)

Test score %f  0.770300846392


## Predictions

In [168]:
pac.fit(X_train, y_train)

PassiveAggressiveClassifier(C=1.0, fit_intercept=True, loss='hinge', n_iter=5,
              n_jobs=1, random_state=None, shuffle=True, verbose=0,
              warm_start=False)

In [169]:
X_test = vec.transform(cuisines_test.ingredients)

In [170]:
preds = pac.predict(X_test)

## Submission

In [171]:
preds_labels = lbl_encoder.inverse_transform(preds)

In [173]:
test_ids = cuisines_test.index.values
submission_df = pd.DataFrame({'id': test_ids, 'cuisine': preds_labels})
submission_df.to_csv('./submissions/first.csv', index=False)