In [1]:
import re

import nltk
import numpy as np
import pandas as pd
import sklearn.metrics
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

# GridSearchCV lives in sklearn.model_selection since scikit-learn 0.18;
# fall back to the legacy location on older installs.
try:
    from sklearn.model_selection import GridSearchCV
except ImportError:
    from sklearn.grid_search import GridSearchCV

# The legacy module was removed in scikit-learn 0.20; keep the name bound so
# the import cell itself does not fail on current releases.
try:
    from sklearn import grid_search
except ImportError:
    grid_search = None

In [2]:
# List the working directory to check that train.json and test.json are present.
%ls

ingredients.ipynb  submission.csv     test.json          train.json


In [3]:
# Load the training and test recipes; both JSON files are read from the
# notebook's working directory.
train_df = pd.read_json("train.json")
test_df = pd.read_json("test.json")

# Preview the training data. Only the last expression of a cell is rendered,
# so the preview has to come after both loads to actually be displayed.
train_df.head()

In [4]:
# Collapse each recipe's ingredient list into one comma-separated string.
train_df['raw_text'] = train_df['ingredients'].map(", ".join)
train_df['raw_text'].head()

0    romaine lettuce, black olives, grape tomatoes,...
1    plain flour, ground pepper, salt, tomatoes, gr...
2    eggs, pepper, salt, mayonaise, cooking oil, gr...
3                    water, vegetable oil, wheat, salt
4    black pepper, shallots, cornflour, cayenne pep...
Name: raw_text, dtype: object

In [5]:
# Peek at the target column the classifier will be trained on.
train_df['cuisine'].head()

0          greek
1    southern_us
2       filipino
3         indian
4         indian
Name: cuisine, dtype: object

In [6]:
# Human-readable, comma-separated ingredient list per recipe.
train_df['ingredients_clean_text'] = [', '.join(items).strip() for items in train_df['ingredients']]

# Build the lemmatizer and compile the pattern once instead of once per
# ingredient; the produced strings are unchanged.
lemmatizer = WordNetLemmatizer()
non_alpha = re.compile('[^A-Za-z]')

# Per recipe: replace every non-letter with a space, lemmatize each ingredient,
# and join the ingredients into a single space-separated document.
# NOTE(review): lemmatize() receives the whole ingredient phrase, so multi-word
# ingredients such as "black olives" come back unchanged -- confirm whether
# per-word lemmatization was intended.
train_df['ingredients_string'] = [
    ' '.join(lemmatizer.lemmatize(non_alpha.sub(' ', item)) for item in items).strip()
    for items in train_df['ingredients']
]

In [7]:
# Inspect the first few cleaned training documents.
train_df.ingredients_string.head()

0    romaine lettuce black olives grape tomatoes ga...
1    plain flour ground pepper salt tomato ground b...
2    egg pepper salt mayonaise cooking oil green ch...
3                       water vegetable oil wheat salt
4    black pepper shallot cornflour cayenne pepper ...
Name: ingredients_string, dtype: object

In [8]:
# Human-readable, comma-separated ingredient list per test recipe.
test_df['ingredients_clean_text'] = [', '.join(items).strip() for items in test_df['ingredients']]

# Build the lemmatizer and compile the pattern once instead of once per
# ingredient; the produced strings are unchanged.
lemmatizer = WordNetLemmatizer()
non_alpha = re.compile('[^A-Za-z]')

# Same cleaning as applied to the training recipes: replace non-letters with
# spaces, lemmatize each ingredient, join into one space-separated document.
# NOTE(review): lemmatize() receives the whole ingredient phrase, so multi-word
# ingredients are mostly left unchanged -- confirm whether per-word
# lemmatization was intended.
test_df['ingredients_string'] = [
    ' '.join(lemmatizer.lemmatize(non_alpha.sub(' ', item)) for item in items).strip()
    for items in test_df['ingredients']
]
test_df['ingredients_string'].head()

0    baking powder egg all purpose flour raisin mil...
1    sugar egg yolks corn starch cream of tartar ba...
2    sausage links fennel bulb frond olive oil cuba...
3    meat cuts file powder smoked sausage okra shri...
4    ground black pepper salt sausage casings leek ...
Name: ingredients_string, dtype: object

In [9]:
# Fit the TF-IDF vectorizer on the training documents and vectorize them in
# the same pass; English stop words are dropped.
corpus_train = train_df.ingredients_string
vectorizer_train = TfidfVectorizer(stop_words='english')
tfidf_train = vectorizer_train.fit_transform(corpus_train)

In [11]:
corpus_test = test_df['ingredients_string']
# Transform the test documents with the vectorizer fitted on the training
# corpus so both matrices share the same feature columns. A separate,
# never-fitted test vectorizer used to be created here; it was unused and
# has been removed.
tfidf_test = vectorizer_train.transform(corpus_test)

In [12]:
# Expose the feature matrices and the labels under the names used by the
# modelling cells.
predictors_train, predictors_test = tfidf_train, tfidf_test
targets_train = train_df.cuisine

In [14]:
# Candidate values for the linear SVM's regularisation parameter C.
parameters = {'C': [1, 100]}
clf = LinearSVC()

# Cross-validated search over `parameters`. GridSearchCV comes from the import
# cell (sklearn.model_selection, with a fallback for old releases) rather than
# the removed `sklearn.grid_search` module.
classifier = GridSearchCV(clf, parameters)
classifier = classifier.fit(predictors_train, targets_train)

In [16]:
# Display the fitted search object to review its configuration.
classifier

GridSearchCV(cv=None, error_score='raise',
       estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0),
       fit_params={}, iid=True, loss_func=None, n_jobs=1,
       param_grid={'C': [1, 100]}, pre_dispatch='2*n_jobs', refit=True,
       score_func=None, scoring=None, verbose=0)

In [17]:
# Predict a cuisine for every test recipe and attach it to the test frame.
predictions = classifier.predict(predictors_test)
test_df['cuisine'] = predictions
# Order rows by recipe id. DataFrame.sort() was removed from pandas;
# sort_values() is the equivalent call.
test_df = test_df.sort_values('id', ascending=True)

In [18]:
# Spot-check the test frame after the predicted cuisine column was added.
test_df.head()

Unnamed: 0,id,ingredients,ingredients_clean_text,ingredients_string,cuisine
4987,5,"[mushrooms, chopped onion, tomato sauce, chees...","mushrooms, chopped onion, tomato sauce, cheese...",mushroom chopped onion tomato sauce cheese dri...,mexican
9232,7,"[minced garlic, brown rice, sour cream, chicke...","minced garlic, brown rice, sour cream, chicken...",minced garlic brown rice sour cream chicken re...,indian
9638,11,"[lime juice, sesame oil, garlic cloves, fish s...","lime juice, sesame oil, garlic cloves, fish sa...",lime juice sesame oil garlic cloves fish sauce...,vietnamese
4927,12,"[sugar, vanilla extract, corn starch, coffee g...","sugar, vanilla extract, corn starch, coffee gr...",sugar vanilla extract corn starch coffee granu...,italian
3280,13,"[frozen pie crust, bourbon whiskey, powdered s...","frozen pie crust, bourbon whiskey, powdered su...",frozen pie crust bourbon whiskey powdered suga...,southern_us


In [19]:
# Write the id / predicted-cuisine pairs to the submission file, without the
# DataFrame index.
submission_columns = ['id', 'cuisine']
test_df[submission_columns].to_csv('submission.csv', index=False)

In [20]:
# List the working directory to check that submission.csv is present.
%ls

ingredients.ipynb  submission.csv     test.json          train.json
