In [1]:
# import external libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
import json
from collections import defaultdict
from scipy import sparse
%matplotlib inline

In [2]:
def convert_to_json(filename='./data/train.json'):
    """
    Reads in a file and returns json 
    """
    
    with open(filename) as infile:
        return json.load(infile)

In [3]:
def get_column_names(row):
    """
    Takes in a row of the data and returns column names
    """
    return row.keys()    

In [4]:
def get_content(row, col_name):
    """
    Takes in a row and a column name
    and returns a list of values
    """
    if col_name == 'ingredients':
        return ' '.join(row[col_name])
    else:
        return row[col_name]

In [5]:
whats_cooking_train = convert_to_json()
whats_cooking_test = convert_to_json('./data/test.json')

In [6]:
def prepare_dataset(json_repr):
    """
    Takes in a json representation of the data
    and returns a Pandas DataFrame.
    """
    
    column_names = sorted(get_column_names(json_repr[0]))
    cols = []
    
    for col_name in column_names:
        cols.append([get_content(row, col_name) for row in json_repr])
    
    data = dict(zip(column_names, cols))
    
    df = pd.DataFrame(data)
    df.set_index('id', inplace=True)
    
    return df


In [7]:
whats_cooking_train_df = prepare_dataset(whats_cooking_train)
whats_cooking_test_df = prepare_dataset(whats_cooking_test)

In [8]:
whats_cooking_train_df.head()

Unnamed: 0_level_0,cuisine,ingredients
id,Unnamed: 1_level_1,Unnamed: 2_level_1
10259,greek,romaine lettuce black olives grape tomatoes ga...
25693,southern_us,plain flour ground pepper salt tomatoes ground...
20130,filipino,eggs pepper salt mayonaise cooking oil green c...
22213,indian,water vegetable oil wheat salt
13162,indian,black pepper shallots cornflour cayenne pepper...


In [9]:
whats_cooking_test_df.head()

Unnamed: 0_level_0,ingredients
id,Unnamed: 1_level_1
18009,baking powder eggs all-purpose flour raisins m...
28583,sugar egg yolks corn starch cream of tartar ba...
41580,sausage links fennel bulb fronds olive oil cub...
29752,meat cuts file powder smoked sausage okra shri...
35687,ground black pepper salt sausage casings leeks...


## Questions

In [10]:
import re

In [11]:
## What are the different unique ingredients used across various cuisines ?

def get_ingredients(cuisines):
    all_ingredients = []

    for i in range(cuisines.shape[0]):
        ## get all the ingredients
        ingredients = cuisines.iloc[i, 1].split(' ') # 1 here marks first column for ingredients
    
        for ingredient in ingredients:
            ingredient = re.sub(r'[^A-Za-z]', '', ingredient)
            # omit empty space and stopwords as ingredient name
            if len(ingredient) > 2:
                all_ingredients.append(ingredient.lower())
    
    return all_ingredients

def get_unique_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    return set(all_ingredients)
    

In [12]:
print len(get_unique_ingredients(whats_cooking_train_df))

3030


** There are 3030 different ingredients used across various cuisines, bearing in mind that we considered 
   e.g. black olives to be ['black', 'olive'] as two separate ingredients **

In [13]:
from collections import Counter

In [14]:
## What are the top most used ingredients ?

def get_top_most_used_ingredients(cuisines):
    all_ingredients = get_ingredients(cuisines)
    
    ## counts frequency of each ingredient
    top_most_used_ingredients = Counter(all_ingredients)
    
    return sorted(top_most_used_ingredients, key=lambda x: top_most_used_ingredients[x], reverse=True)

In [15]:
top_most_used_ingredients = get_top_most_used_ingredients(whats_cooking_train_df)

In [16]:
## 10 top most used ingredients
print top_most_used_ingredients[:10]

[u'pepper', u'salt', u'oil', u'garlic', u'ground', u'fresh', u'sauce', u'sugar', u'onions', u'cheese']


** This seems legit, indeed these are some of the top-most used ingredients in preparation of any cuisine **

In [17]:
## Ingredients per cuisine

def get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines):
    ingredients_per_cuisine = {}
    
    for name in names_of_cuisines:
        cuisine_group = grouped_cuisines.get_group(name)
        ingredients_per_cuisine[name] = list(get_unique_ingredients(cuisine_group))
    
    return ingredients_per_cuisine

In [18]:
grouped_cuisines = whats_cooking_train_df.groupby(['cuisine'])

In [19]:
names_of_cuisines = whats_cooking_train_df.cuisine.unique()

In [20]:
ingredients_per_cuisine = get_ingredients_per_cuisine(grouped_cuisines, names_of_cuisines)

In [21]:
print ingredients_per_cuisine.keys()

[u'irish', u'mexican', u'chinese', u'filipino', u'vietnamese', u'moroccan', u'brazilian', u'japanese', u'british', u'greek', u'indian', u'jamaican', u'french', u'spanish', u'russian', u'cajun_creole', u'thai', u'southern_us', u'korean', u'italian']


In [22]:
## Lets check out what ingredients define indian cuisines
print ingredients_per_cuisine['indian'][:50]

[u'freerange', u'monterey', u'all', u'portabello', u'chinese', u'mackerel', u'yellow', u'soften', u'olive', u'mild', u'fivespice', u'skim', u'shortgrain', u'gluten', u'skin', u'roots', u'mascarpone', u'milk', u'cummin', u'preserves', u'grape', u'sago', u'pattypan', u'assam', u'peanut', u'sparkling', u'granular', u'curds', u'dressing', u'couscous', u'tzatziki', u'brown', u'turnips', u'demerara', u'quorn', u'garden', u'yeast', u'citrus', u'kewra', u'vegan', u'baton', u'vadouvan', u'jalape', u'figs', u'softened', u'mooli', u'kappa', u'bhindi', u'minute', u'tortillas']


## Preprocessing

In [23]:
cuisines_train = whats_cooking_train_df.copy()
cuisines_test = whats_cooking_test_df.copy()

In [24]:
def process_ingredient_name(ingredient_name):
    ingredient_name = re.sub(r'^A-Za-z', '', ingredient_name.lower())
    return ingredient_name

cuisines_train['ingredients'] = cuisines_train.ingredients.map(process_ingredient_name)
cuisines_test['ingredients'] = cuisines_test.ingredients.map(process_ingredient_name)

## Encoding labels

In [25]:
from sklearn.preprocessing import LabelEncoder

In [26]:
## training labels
train_labels = cuisines_train.cuisine

In [27]:
lbl_encoder = LabelEncoder()
lbl_encoder.fit(train_labels)

LabelEncoder()

In [28]:
target = lbl_encoder.transform(train_labels)

## Modelling

In [29]:
## online learning algorithm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.cross_validation import StratifiedShuffleSplit

In [30]:
from nltk.corpus import stopwords

In [31]:
english_stopwords = stopwords.words('english')

In [32]:
sss = StratifiedShuffleSplit(target, test_size=0.3)

train_index, test_index = next(iter(sss))

In [33]:
train_X = cuisines_train.iloc[train_index, 1]
train_target = target[train_index]

test_X = cuisines_train.iloc[test_index, 1]
test_target = target[test_index]

In [80]:
vec = TfidfVectorizer(ngram_range=(1, 2), stop_words=english_stopwords)
X_train = vec.fit_transform(train_X)
y_train = train_target

In [85]:
pac = PassiveAggressiveClassifier(C=0.5)
pac.fit(X_train, y_train)

PassiveAggressiveClassifier(C=0.5, fit_intercept=True, loss='hinge', n_iter=5,
              n_jobs=1, random_state=None, shuffle=True, verbose=0,
              warm_start=False)

In [86]:
print 'Training score %f ', pac.score(X_train, y_train)

Training score %f  0.988289388606


In [87]:
X_test = vec.transform(test_X)
y_test = test_target

In [88]:
print 'Test score %f ', pac.score(X_test, y_test)

Test score %f  0.780328418231


## One-hot encoding

In [90]:
from collections import defaultdict

In [212]:
def prepare_bag_of_ingredients(cuisines):
    one_hot_encoded = defaultdict(list)

    for i in range(cuisines.shape[0]):
        ingredient = cuisines.iloc[i, 1]
        ingredient = re.sub(r'^A-Za-z', '', ingredient)
        ingredient = ingredient.lower()

        for u_ingr in top_most_used_ingredients[:500]:
            if u_ingr in ingredient:
                one_hot_encoded[u_ingr].append(1)
            else:
                one_hot_encoded[u_ingr].append(0)
    
    return one_hot_encoded

In [258]:
bag_of_ingredients = prepare_bag_of_ingredients(cuisines_train.head(15000))

In [259]:
bag_of_ingredients_df = pd.DataFrame(bag_of_ingredients)

In [260]:
bag_of_ingredients_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 15000 entries, 0 to 14999
Columns: 500 entries, active to zucchini
dtypes: int64(500)
memory usage: 57.3 MB


In [261]:
from scipy import sparse

In [262]:
bag_of_ingredients_sparse = sparse.csr_matrix(bag_of_ingredients_df.values)

In [263]:
from sklearn.cross_validation import train_test_split

In [265]:
train_X, test_X, train_target, test_target = train_test_split(bag_of_ingredients_sparse, target[:15000], test_size=0.2)

In [266]:
import xgboost as xgb

In [267]:
xg_train = xgb.DMatrix( train_X, label=train_target )
xg_test = xgb.DMatrix( test_X, label=test_target )

In [268]:
# setup parameters for xgboost
param = {}
# use softmax multi-class classification
param['objective'] = 'multi:softmax'
# scale weight of positive examples
param['eta'] = 0.07
param['max_depth'] = 6
param['silent'] = 1
param['nthread'] = 4
param['num_class'] = 20
param['colsample_bytree'] = 0.7

In [269]:
watchlist = [ (xg_train,'train'), (xg_test, 'test') ]
num_round = 150
bst = xgb.train(param, xg_train, num_round, watchlist );

[0]	train-merror:0.383583	test-merror:0.418667
[1]	train-merror:0.345167	test-merror:0.394000
[2]	train-merror:0.315333	test-merror:0.364333
[3]	train-merror:0.308833	test-merror:0.359000
[4]	train-merror:0.299167	test-merror:0.353333
[5]	train-merror:0.293167	test-merror:0.347333
[6]	train-merror:0.290000	test-merror:0.351000
[7]	train-merror:0.283917	test-merror:0.346333
[8]	train-merror:0.279917	test-merror:0.340333
[9]	train-merror:0.274750	test-merror:0.334667
[10]	train-merror:0.271583	test-merror:0.333000
[11]	train-merror:0.267667	test-merror:0.331333
[12]	train-merror:0.266667	test-merror:0.328333
[13]	train-merror:0.264167	test-merror:0.328333
[14]	train-merror:0.261500	test-merror:0.328667
[15]	train-merror:0.259000	test-merror:0.325667
[16]	train-merror:0.256500	test-merror:0.324667
[17]	train-merror:0.253833	test-merror:0.321667
[18]	train-merror:0.253083	test-merror:0.323000
[19]	train-merror:0.252500	test-merror:0.322333
[20]	train-merror:0.250583	test-merror:0.319333
[2

## Predictions

In [64]:
train_X_feat = vec.fit_transform(train_X)

In [65]:
pac.fit(train_X_feat, train_target)

PassiveAggressiveClassifier(C=0.2, fit_intercept=True, loss='hinge', n_iter=5,
              n_jobs=1, random_state=None, shuffle=True, verbose=0,
              warm_start=False)

In [66]:
X_test = vec.transform(cuisines_test.ingredients)

In [67]:
preds = pac.predict(X_test)

## Submission

In [68]:
preds_labels = lbl_encoder.inverse_transform(preds)

In [69]:
test_ids = cuisines_test.index.values
submission_df = pd.DataFrame({'id': test_ids, 'cuisine': preds_labels})
submission_df.to_csv('./submissions/stratified_split.csv', index=False)