In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from nltk import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

from matplotlib import pyplot as plt
% matplotlib inline

In [2]:
full_ingred_df = pd.read_csv('../ingredients_combined/full_ingredients.csv')

In [45]:
df = full_ingred_df[['id', 'ingredient_list', 'cuisine']]

In [46]:
# Remove non-alpha chcaracters and convert to lower case
# Note, this is still in string form here
ingredient_string = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df['ingredient_list']]

# To lemmatize each word, need to convert the string to a list to allow lemmatizer to iterate through
convert_to_list = [z.split() for z in ingredient_string]
lemmatized = [WordNetLemmatizer().lemmatize(z) for z in ingredient_string]

# Here lematizer iterates through list; couldn't get it to work with list comprehension
lemmatized = []
for _list in convert_to_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)

# Then need to convert the list back into a string
df['ingredient_string'] = [' '.join(z) for z in lemmatized]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [24]:
# Send df to csv
# df.to_csv('../ingredients_combined/ingredients_reduced.csv', encoding='utf-8')

In [47]:
vect = TfidfVectorizer(stop_words='english')
# ngram range to two doesn't do much

In [56]:
le = LabelEncoder()
encoded_cuisine = le.fit_transform(df.cuisine)

In [57]:
X = df.ingredient_string
y = encoded_cuisine

In [58]:
# X_dtm = vect.fit_transform(X)

In [59]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [60]:
X_train_dtm = vect.fit_transform(X_train)

In [63]:
X_test_dtm = vect.transform(X_test)

In [61]:
logit = LogisticRegression()

In [62]:
logit.fit(X_train_dtm, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [64]:
y_pred = logit.predict(X_test_dtm)

In [65]:
accuracy_score(y_test, y_pred)

0.93191140278917151

In [66]:
print confusion_matrix(y_test, y_pred)

[[428   7  13  14  26]
 [  2 510   1   1   2]
 [ 16   2 482  20   2]
 [ 17   1  29 400   0]
 [  8   2   1   2 452]]


In [23]:
print classification_report(y_test, y_pred)

             precision    recall  f1-score   support

          0       0.89      0.87      0.88       500
          1       0.97      0.97      0.97       490
          2       0.92      0.93      0.93       492
          3       0.92      0.89      0.91       439
          4       0.95      0.97      0.96       517

avg / total       0.93      0.93      0.93      2438



In [72]:
# lets try this out on out of sample data
new_data = ['tomato sauce onion garlic carrot parmesan cheese']
new_data_dtm = vect.transform(new_data)

In [73]:
new_data_dtm.shape

(1, 1596)

In [74]:
X_test_dtm.shape

(2438, 1596)

In [75]:
logit.predict(new_data_dtm)

array([2])

In [45]:
lr = LogisticRegression()

In [46]:
# Create gridsearch parameters

param_grid = {"C":[0.01,0.05, 0.1,0.5, 1.0, 5.0, 10.0,20.0,30.0,50.0], 'penalty':['l1', 'l2']}

In [47]:
clf = GridSearchCV(lr, param_grid, cv=10, scoring='accuracy')

In [48]:
print X_dtm.shape
print y.shape

(9751, 1718)
(9751,)


In [49]:
clf.fit(X_dtm, y)

GridSearchCV(cv=10, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.01, 0.05, 0.1, 0.5, 1.0, 5.0, 10.0, 20.0, 30.0, 50.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring='accuracy', verbose=0)

In [50]:
clf.best_estimator_

LogisticRegression(C=10.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [51]:
clf.best_score_

0.93518613475540968

In [52]:
clf.best_params_

{'C': 10.0, 'penalty': 'l2'}