In [74]:
import pandas as pd
import numpy as np
train = pd.read_json('../MLtext2/MLtext2/data/train.json')
train.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [75]:
def make_features(df):
    
    df['num_ingredients'] = df.ingredients.apply(len)
    df['ingredient_length'] = df.ingredients.apply(lambda x: np.mean([len(item) for item in x]))
    df['ingredients_str'] = df.ingredients.astype(str)
    
    return df

In [76]:
train = make_features(pd.read_json('../MLtext2/MLtext2/data/train.json'))
new = make_features(pd.read_json('../MLtext2/MLtext2/data/test.json'))

In [77]:
train.head()

Unnamed: 0,cuisine,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",9,12.0,"['romaine lettuce', 'black olives', 'grape tom..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",11,10.090909,"['plain flour', 'ground pepper', 'salt', 'toma..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",12,10.333333,"['eggs', 'pepper', 'salt', 'mayonaise', 'cooki..."
3,indian,22213,"[water, vegetable oil, wheat, salt]",4,6.75,"['water', 'vegetable oil', 'wheat', 'salt']"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",20,10.1,"['black pepper', 'shallots', 'cornflour', 'cay..."


In [78]:
train.shape

(39774, 6)

In [79]:
new.head()

Unnamed: 0,id,ingredients,num_ingredients,ingredient_length,ingredients_str
0,18009,"[baking powder, eggs, all-purpose flour, raisi...",6,9.333333,"['baking powder', 'eggs', 'all-purpose flour',..."
1,28583,"[sugar, egg yolks, corn starch, cream of tarta...",11,10.272727,"['sugar', 'egg yolks', 'corn starch', 'cream o..."
2,41580,"[sausage links, fennel bulb, fronds, olive oil...",6,9.666667,"['sausage links', 'fennel bulb', 'fronds', 'ol..."
3,29752,"[meat cuts, file powder, smoked sausage, okra,...",21,12.0,"['meat cuts', 'file powder', 'smoked sausage',..."
4,35687,"[ground black pepper, salt, sausage casings, l...",8,13.0,"['ground black pepper', 'salt', 'sausage casin..."


Part 2: Using pipeline for proper cross-validation

In [80]:
X = train.ingredients_str
y = train.cuisine

In [81]:
X.head()

0    ['romaine lettuce', 'black olives', 'grape tom...
1    ['plain flour', 'ground pepper', 'salt', 'toma...
2    ['eggs', 'pepper', 'salt', 'mayonaise', 'cooki...
3          ['water', 'vegetable oil', 'wheat', 'salt']
4    ['black pepper', 'shallots', 'cornflour', 'cay...
Name: ingredients_str, dtype: object

In [82]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer(token_pattern="'([a-z ]+)'")

In [83]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [84]:
from sklearn.pipeline import make_pipeline
pipe = make_pipeline(vect, nb)

In [85]:
pipe.steps

[('countvectorizer',
  CountVectorizer(analyzer='word', binary=False, decode_error='strict',
          dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
          lowercase=True, max_df=1.0, max_features=None, min_df=1,
          ngram_range=(1, 1), preprocessor=None, stop_words=None,
          strip_accents=None, token_pattern="'([a-z ]+)'", tokenizer=None,
          vocabulary=None)),
 ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]

In [86]:
from sklearn.model_selection import cross_val_score
cross_val_score(pipe, X, y, cv=5, scoring='accuracy').mean()


0.7322884933790151

In [87]:
pipe.named_steps.keys()

dict_keys(['countvectorizer', 'multinomialnb'])

In [95]:
param_grid = {}
param_grid['countvectorizer__token_pattern'] = ["\\b\\w\\w+\\b","'([a-z ]+)'"]
param_grid['multinomialnb__alpha'] = [0.5,1]
param_grid

{'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'multinomialnb__alpha': [0.5, 1]}

In [96]:
from sklearn.grid_search import GridSearchCV
grid = GridSearchCV(pipe, param_grid, cv=5, scoring='accuracy')

In [97]:
%time grid.fit(X, y)

CPU times: user 25 s, sys: 516 ms, total: 25.5 s
Wall time: 26.7 s


GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), p...  vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"], 'multinomialnb__alpha': [0.5, 1]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [99]:
grid.grid_scores_

[mean: 0.72422, std: 0.00457, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 0.5},
 mean: 0.72351, std: 0.00469, params: {'countvectorizer__token_pattern': '\\b\\w\\w+\\b', 'multinomialnb__alpha': 1},
 mean: 0.74770, std: 0.00460, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.5},
 mean: 0.73229, std: 0.00552, params: {'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 1}]

In [100]:
grid.best_score_
grid.best_params_

{'countvectorizer__token_pattern': "'([a-z ]+)'", 'multinomialnb__alpha': 0.5}

Using RandomizedSearchCV

In [101]:
from sklearn.grid_search import RandomizedSearchCV

In [102]:
import scipy as sp
param_grid = {}
param_grid['countvectorizer__token_pattern'] = ['\\b\\w\\w+\\b', "'([a-z ]+)'"]
param_grid['countvectorizer__min_df'] = [1,2,3]
param_grid['multinomialnb__alpha'] = sp.stats.uniform(scale=1)
param_grid

{'countvectorizer__min_df': [1, 2, 3],
 'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"],
 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen at 0x12b0830f0>}

In [104]:
np.random.seed(1)

In [107]:
rand = RandomizedSearchCV(pipe, param_grid, cv=5, scoring='accuracy', n_iter=5,random_state=1)

In [108]:
rand.fit(X, y)

RandomizedSearchCV(cv=5, error_score='raise',
          estimator=Pipeline(memory=None,
     steps=[('countvectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), p...  vocabulary=None)), ('multinomialnb', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))]),
          fit_params={}, iid=True, n_iter=5, n_jobs=1,
          param_distributions={'countvectorizer__token_pattern': ['\\b\\w\\w+\\b', "'([a-z ]+)'"], 'countvectorizer__min_df': [1, 2, 3], 'multinomialnb__alpha': <scipy.stats._distn_infrastructure.rv_frozen object at 0x12b0830f0>},
          pre_dispatch='2*n_jobs', random_state=1, refit=True,
          scoring='accuracy', verbose=0)

In [109]:
rand.best_score_

0.751370241866546

In [110]:
rand.best_params_

{'countvectorizer__min_df': 2,
 'countvectorizer__token_pattern': "'([a-z ]+)'",
 'multinomialnb__alpha': 0.30233257263183977}