# What's is cooking?

The task is to guess the cuisine based on cooking receipts.

Data files are provided on Kaggle https://www.kaggle.com/c/whats-cooking

In [157]:
# we will be using these packages

import pandas as pnd

import numpy as np

import sklearn

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.model_selection import train_test_split

from sklearn.svm import LinearSVC

import re

## Data Extraction

In [27]:
# path to data files
train_data_path = 'train.json'

data = pnd.read_json(train_data_path)

# have a look
data[:10]

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."
5,jamaican,6602,"[plain flour, sugar, butter, eggs, fresh ginge..."
6,spanish,42779,"[olive oil, salt, medium shrimp, pepper, garli..."
7,italian,3735,"[sugar, pistachio nuts, white almond bark, flo..."
8,mexican,16903,"[olive oil, purple onion, fresh pineapple, por..."
9,italian,12734,"[chopped tomatoes, fresh basil, garlic, extra-..."


In [125]:
print("lists of cuisines: %s"%y_train.unique())
print("total records: %s"%y_train.shape[0])

y = data['cuisine']
X = data.drop('cuisine',axis=1)

# relatively small number of classes (cuisines) and lots of samples -> we can separate a hold-out set for testing

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.7,test_size=0.3,random_state=11)

lists of cuisines: ['greek' 'southern_us' 'filipino' 'indian' 'jamaican' 'spanish' 'italian'
 'mexican' 'chinese' 'british' 'thai' 'vietnamese' 'cajun_creole'
 'brazilian' 'french' 'japanese' 'irish' 'korean' 'moroccan' 'russian']
total records: 39774


## Bag-of-words model


In [137]:
# we can see individual ingradients as "words" and receipts as "texts"

word_separator = '|'

def preprocess(dt):
    dt.loc[:,'receipt_text'] = dt['ingredients'].apply(lambda t: word_separator.join(t))
    return dt
    
pnd.options.mode.chained_assignment = None # disables a general warning that doesn't apply to our case
X_train = preprocess(X_train)
X_test = preprocess(X_test)


In [144]:
# the standard approach is to build term-document matrix

# note: we need to make sure that our words can be extracted properly - by our separator

pattern = r'[^|]+'
pattern = re.compile(pattern)
def tokenize(text):
    return pattern.findall(text)

# build the matrix with the above extraction function

vect = CountVectorizer(tokenizer=tokenize)
X_train_f = vect.fit_transform(X_train['receipt_text'])

# the matrix is sparse, because every receipt uses just a few out of many available ingredients

X_train_f

# note: we have 27841 receipts and 6076 ingredients in total

<27841x6076 sparse matrix of type '<class 'numpy.int64'>'
	with 299985 stored elements in Compressed Sparse Row format>

In [139]:
# the vocabulary consists of all ingredients - see the first 10

list(vect.vocabulary_.keys())[:10]

['soy vay® hoisin garlic marinade & sauce',
 'one third less sodium chicken broth',
 'ground sausage',
 'chipotle puree',
 'maggi',
 'fresh orange juice',
 'dandelion greens',
 'low-fat cream cheese',
 'fudge brownie mix',
 'coarse kosher salt']

In [145]:
# the next standard step is to assign bigger weights to less frequent (=more selective) terms

tfidf = TfidfTransformer()
X_train_idf = tfidf.fit_transform(X_train_counts)

## Classifier

In [146]:
# finally we can apply a classifier such as SVM

clf = LinearSVC()

clf.fit(X_train_idf, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

## Evaluation


In [148]:
X_test_f = vect.transform(X_test['receipt_text'])
X_test_idf = tfidf.transform(X_test_f)

print("The score is %s"%clf.score(X_test_idf,y_test))

The score is 0.782368222576


## Example - top ingredients

In [174]:
# for every cuisine, print top 5 ingredients

for i in range(len(clf.classes_)):
    cuisine_name = clf.classes_[i]
    top_ingred = clf.coef_[i].argsort()[-5:] # extract by indices
    top_ingred_names = np.array(vect.get_feature_names())[top_ingred]
    print('%s:  %s'% (cuisine_name,','.join(top_ingred_names)))

brazilian:  chocolate sprinkles,palm oil,manioc flour,hearts of palm,cachaca
british:  stilton,suet,marmite,raspberry jam,stilton cheese
cajun_creole:  creole style seasoning,blackening seasoning,andouille sausage,creole seasoning,cajun seasoning
chinese:  mandarin oranges,hoisin sauce,chinese rice wine,szechwan peppercorns,chinese five-spice powder
filipino:  edam,lumpia skins,lumpia wrappers,fruit cocktail,calamansi juice
french:  swiss cheese,herbes de provence,grated gruyère cheese,crepes,gruyere cheese
greek:  phyllo dough,greek yogurt,greek seasoning,feta cheese,feta cheese crumbles
indian:  ground cardamom,tandoori spices,masala,garam masala,curry powder
irish:  corned beef,irish cream liqueur,irish bacon,guinness beer,irish whiskey
italian:  arborio rice,pesto,italian sausage,mascarpone,ricotta cheese
jamaican:  ground allspice,jamaican jerk spice,jerk sauce,jerk seasoning,jamaican jerk season
japanese:  matcha green tea powder,sake,dashi,mirin,miso paste
korean:  dark sesame o

In [163]:
clf.coef_

array(['granola', 'dende oil', 'dried black beans', 'tapioca flour',
       'açai', 'chocolate sprinkles', 'palm oil', 'manioc flour',
       'hearts of palm', 'cachaca'], 
      dtype='<U71')