# <img src=https://kaggle2.blob.core.windows.net/competitions/kaggle/4526/logos/front_page.png width=200 align="left">

### What's cooking? 
Use a recipe's ingredients to predict its cuisine. The dataset is provided by [Yummly](http://www.yummly.com/).

More info: [kaggle competition webpage](https://www.kaggle.com/c/whats-cooking)

In [15]:
import re

import pandas as pd
import sklearn.cross_validation
import sklearn.ensemble
import sklearn.feature_extraction.text
import sklearn.grid_search
import sklearn.naive_bayes
import sklearn.svm
from nltk.stem import WordNetLemmatizer

# Full training set; later cells read its 'ingredients' and 'cuisine' columns.
traindf_full = pd.read_json("train.json")

# Sample the training set? False: use the whole data set; otherwise a fraction
# to use (value from 0-1), which is handy for quick experiments.
sample_training=False
# Fixed seed so that a sampled run selects the same rows after every kernel restart.
sample_seed = 0

if sample_training:
    traindf = traindf_full.sample(frac=sample_training, random_state=sample_seed)
else:
    # No copy is made: traindf and traindf_full are the same object here, so
    # columns added to traindf later also show up on traindf_full.
    traindf = traindf_full

# Test set; later cells read its 'ingredients' and 'id' columns.
testdf = pd.read_json("test.json")

In [16]:
#Simple version
#traindf['ingredients_string'] = [' '.join(ingredient).strip() for ingredient in traindf['ingredients']]
#testdf['ingredients_string'] = [' '.join(ingredient).strip() for ingredient in testdf['ingredients']]

#Fancier version: apply a word lemmatizer
# One lemmatizer and one compiled pattern are shared by every call instead of
# being rebuilt for each ingredient.
lemmatizer = WordNetLemmatizer()
non_letters = re.compile('[^A-Za-z]')


def ingredients_to_string(ingredients):
    """Turn one recipe's list of ingredient names into a single string.

    Every character outside A-Z/a-z in an ingredient is replaced by a space,
    the ingredient is passed to the lemmatizer, and the results are joined
    with single spaces and stripped.

    NOTE(review): the lemmatizer receives the whole ingredient phrase, not its
    individual words. In the recorded output the single-word ingredient
    "tomatoes" becomes "tomato" while "black olives" is left unchanged, so
    multi-word ingredients appear to pass through as-is. Confirm whether
    per-word lemmatization was intended before changing this, because it
    alters the features the model is trained on.
    """
    return ' '.join(
        lemmatizer.lemmatize(non_letters.sub(' ', ingredient))
        for ingredient in ingredients
    ).strip()


# Same preprocessing for both frames so train and test features stay comparable.
traindf['ingredients_string'] = traindf['ingredients'].map(ingredients_to_string)
testdf['ingredients_string'] = testdf['ingredients'].map(ingredients_to_string)


traindf.head()

Unnamed: 0,cuisine,id,ingredients,ingredients_string
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes...",romaine lettuce black olives grape tomatoes ga...
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g...",plain flour ground pepper salt tomato ground b...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g...",egg pepper salt mayonaise cooking oil green ch...
3,indian,22213,"[water, vegetable oil, wheat, salt]",water vegetable oil wheat salt
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe...",black pepper shallot cornflour cayenne pepper ...


In [17]:
# Train
corpus_train = traindf['ingredients_string']
# Unigram TF-IDF over words; English stop words and terms appearing in more
# than 60% of the training documents (max_df) are dropped.
vectorizer_train = sklearn.feature_extraction.text.TfidfVectorizer(
    stop_words='english', max_df=.6, binary=False,
    ngram_range=(1, 1), analyzer="word")
# Keep the TF-IDF matrix sparse: densifying it costs
# (n_samples x n_features) floats of memory for no benefit, and the resulting
# np.matrix type is not accepted by recent scikit-learn releases.
# Call .toarray() here only if an estimator really requires dense input.
tfidf_train = vectorizer_train.fit_transform(corpus_train)
X_train = tfidf_train
y_train = traindf['cuisine']

# Test
corpus_test = testdf['ingredients_string']

# Reuse the vocabulary and IDF weights learned on the training corpus
# (transform, not fit_transform) so test columns line up with X_train.
tfidf_test = vectorizer_train.transform(corpus_test)
X_test = tfidf_test

In [18]:
#Dicts with classifiers and their parameters for Grid Search CV

# Candidate models; uncomment an entry to include it in the comparison.
classifiers = {
    #'Logistic Regression': sklearn.linear_model.LogisticRegression(),
    #'Random Forests': sklearn.ensemble.RandomForestClassifier(n_estimators=500),
    #'SVC': sklearn.svm.SVC(C=1.0, kernel='linear', probability=False),
    'LinearSVC': sklearn.svm.LinearSVC(),
    #'MultinomialNB': sklearn.naive_bayes.BernoulliNB(),
    #'Gradient Boosting': sklearn.ensemble.GradientBoostingClassifier(n_estimators=10, learning_rate=0.1, max_depth=2),
}

# Grid-search parameter grid per classifier name; None means "fit with the
# parameters given above, no search".
classifiers_gridparameters = {
    'Logistic Regression': None,
    'Random Forests': None,
    'SVC': [{'kernel': ['linear'], 'C': [0.2, 0.4, 0.6, 0.8, 1.0]}],
    'LinearSVC': {'C': [0.2, 0.4, 0.6, 0.8, 1.0, 10]},
    'MultinomialNB': None,
    'Gradient Boosting': None,
    #'Gradient Boosting': {"n_estimators": [10, 50, 100], 'learning_rate': [0.1, 0.2, 0.3, 0.5], 'max_depth': [1, 2, 4],},
}

# The folds depend only on y_train, so build them once and share them between
# every classifier, the grid search and the final scoring.
skf = sklearn.cross_validation.StratifiedKFold(y_train, n_folds=5)

# Best model seen so far (by mean CV accuracy), so that the estimator left in
# clf_fitted for the prediction cell does not depend on dict iteration order.
best_score = None
best_clf = None

for clf_name, clf_notoptimized in classifiers.items():

    # .get(): a classifier without a grid entry is fitted as-is instead of
    # raising KeyError.
    param_grid = classifiers_gridparameters.get(clf_name)

    if param_grid is None:
        print("Skipping grid search for %s" % clf_name)
        clf_fitted = clf_notoptimized.fit(X_train, y_train)
    else:
        print("Doing grid search for %s" % clf_name)
        clf = sklearn.grid_search.GridSearchCV(estimator=clf_notoptimized, param_grid=param_grid, cv=skf, scoring='accuracy')
        clf_fitted = clf.fit(X_train, y_train).best_estimator_
        clf_optimal_params = clf.best_params_
        print("Best parameters: %s" % (clf_optimal_params,))

    # NOTE: this score is measured on the same folds that were used to pick the
    # hyper-parameters, so it is an optimistic estimate, not a nested-CV score.
    scores = sklearn.cross_validation.cross_val_score(clf_fitted, X_train, y_train, cv=skf, scoring='accuracy')
    print("CV Accuracy: %0.4f (+/- %0.4f) %s" % (scores.mean(), scores.std(), clf_name))

    if best_score is None or scores.mean() > best_score:
        best_score = scores.mean()
        best_clf = clf_fitted

# Expose the best-scoring fitted estimator under the name the prediction cell uses.
if best_clf is not None:
    clf_fitted = best_clf

Doing grid search for LinearSVC
Best parameters: {'C': 0.4}
CV Accuracy: 0.7906 (+/- 0.0047) LinearSVC


In [19]:
# Predict a cuisine label for every row of the test features.
y_pred = clf_fitted.predict(X_test)

# Attach the predictions and order the rows by recipe id in one chained step.
testdf = testdf.assign(cuisine=y_pred).sort_values(by='id', ascending=True)

# Write only the id and predicted cuisine, without the DataFrame index.
submission_columns = ['id', 'cuisine']
testdf[submission_columns].to_csv("yummly_submission.csv", index=False)