## Imports

In [1]:
import pandas as pd
import numpy as np
import re
from nltk import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.naive_bayes import BernoulliNB

## Read in data

In [2]:
df = pd.read_csv('../ingredients_combined/ingredients_reduced.csv')

## View head

In [3]:
df.head(3)

Unnamed: 0.1,Unnamed: 0,id,ingredient_list,cuisine,ingredient_string
0,0,Baby-Back-Ribs-with-Bourbon-Barbecue-Sauce-170...,"[baby back ribs, bourbon whiskey, ketchup, gar...",American,baby back rib bourbon whiskey ketchup garlic d...
1,1,Roast-Turkey-With-Apple-Cider-Gravy-772029,"[Honeysuckle White® Frozen Whole Turkey, butte...",American,honeysuckle white frozen whole turkey butter s...
2,2,Better-Than-Sex-Brisket-1060441,"[brisket, beef base, garlic salt, ground peppe...",American,brisket beef base garlic salt ground pepper ch...


## Create new column with concatenated strings made up of ingredients and recipe names

In [3]:
# Remove non-alpha characters
name_string1 = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df['id']]

# Remove extra whitespace between words
name_string2 = [' '.join(z.split()) for z in name_string1]

# Lemmatize each word - start by converting the string to a list to iterate through
as_list = [z.split() for z in name_string2]

# Iterate through list, using WordNetLemmatizer
lemmatized = []
for _list in as_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)

# Then need to convert the list back into a string
name_string3 = [' '.join(z) for z in lemmatized]

# Zip together recipe name string and ingredient string
zipped = zip(name_string3, df.ingredient_string)

# Concatenate recipe name string and ingredient string
ingredient_id_string = [y + ' ' + z for y,z in zipped]    

# Add new column to dataframe with concatenated strings
df['ingredient_id_string'] = ingredient_id_string

In [4]:
# Use label encoder to encode cuisine
le = LabelEncoder()
df['encoded_cuisine'] = le.fit_transform(df.cuisine)

## Create feature and target vectors

In [5]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

## Create model eval function

In [6]:
# Create a dictionary to hold model scores
all_models = {}

# Create a function to evaluate
def model_predictions(model):
    predictions = cross_val_predict(model, X, y)
    return predictions

## Logistic Regression

In [7]:
# Logistic regression
logit = LogisticRegression()
params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
          'penalty': ['l1', 'l2']}
grid = GridSearchCV(logit, params)
grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [8]:
# Generate model predictions
predictions = model_predictions(grid.best_estimator_)

# Print classification report
print classification_report(y, predictions)

             precision    recall  f1-score   support

          0       0.97      0.94      0.95      2000
          1       0.99      0.99      0.99      2000
          2       0.97      0.98      0.97      2000
          3       0.99      0.99      0.99      1753
          4       0.97      0.99      0.98      1998

avg / total       0.98      0.98      0.98      9751



## Support Vector Machine

In [17]:
# Defaults to one-vs-rest
svm = LinearSVC()
params = {'C': [1.0, 10.0],
          'penalty': ['l1', 'l2']}
grid = GridSearchCV(logit, params)
grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [19]:
# Generate model predictions
predictions = model_predictions(grid.best_estimator_)

# Print classification report
print classification_report(y, predictions)

             precision    recall  f1-score   support

          0       0.96      0.94      0.95      2000
          1       0.99      0.99      0.99      2000
          2       0.96      0.98      0.97      2000
          3       0.99      0.99      0.99      1753
          4       0.97      0.98      0.98      1998

avg / total       0.97      0.97      0.97      9751



## Multinomial Naive Bayes

In [20]:
nb = MultinomialNB()
predictions = model_predictions(nb)
print classification_report(y, predictions)

             precision    recall  f1-score   support

          0       0.95      0.87      0.91      2000
          1       0.98      0.99      0.98      2000
          2       0.93      0.94      0.94      2000
          3       0.92      0.94      0.93      1753
          4       0.94      0.98      0.96      1998

avg / total       0.94      0.94      0.94      9751

