In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import requests
import json
import re
import operator

In [72]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC, LinearSVC
from sklearn.model_selection import StratifiedShuffleSplit, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

### Kaggle recipe ingredients training dataset

In [3]:
with open('ingredient_train.json', 'r') as json_f:
    json_train_data = json.load(json_f)

In [4]:
with open('ingredient_test.json', 'r') as json_f:
    json_test_data = json.load(json_f)

In [5]:
type(json_train_data)

list

In [6]:
len(json_train_data), len(json_test_data)

(39774, 9944)

In [7]:
json_train_data[0]

{'id': 10259,
 'cuisine': 'greek',
 'ingredients': ['romaine lettuce',
  'black olives',
  'grape tomatoes',
  'garlic',
  'pepper',
  'purple onion',
  'seasoning',
  'garbanzo beans',
  'feta cheese crumbles']}

In [8]:
type(json_train_data[0])

dict

#### get training and test data into separate pandas df

In [9]:
cuisine_data = pd.DataFrame(json_train_data)

In [10]:
cuisine_data.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [11]:
cuisine_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 39774 entries, 0 to 39773
Data columns (total 3 columns):
cuisine        39774 non-null object
id             39774 non-null int64
ingredients    39774 non-null object
dtypes: int64(1), object(2)
memory usage: 932.3+ KB


In [12]:
# cuisine_data.drop('id', axis=1, inplace=True)

In [13]:
cuisine_data.head()

Unnamed: 0,cuisine,id,ingredients
0,greek,10259,"[romaine lettuce, black olives, grape tomatoes..."
1,southern_us,25693,"[plain flour, ground pepper, salt, tomatoes, g..."
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cooking oil, g..."
3,indian,22213,"[water, vegetable oil, wheat, salt]"
4,indian,13162,"[black pepper, shallots, cornflour, cayenne pe..."


In [14]:
cuisine_data['cuisine'].value_counts()

italian         7838
mexican         6438
southern_us     4320
indian          3003
chinese         2673
french          2646
cajun_creole    1546
thai            1539
japanese        1423
greek           1175
spanish          989
korean           830
vietnamese       825
moroccan         821
british          804
filipino         755
irish            667
jamaican         526
russian          489
brazilian        467
Name: cuisine, dtype: int64

In [15]:
# split into X and y
X = cuisine_data['ingredients']
y = cuisine_data['cuisine']

In [16]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [17]:
X_train.shape, type(X_train)

((29830,), pandas.core.series.Series)

In [18]:
labels = y_train.unique()

In [19]:
labels

array(['mexican', 'japanese', 'southern_us', 'italian', 'indian',
       'chinese', 'filipino', 'spanish', 'vietnamese', 'french',
       'moroccan', 'greek', 'russian', 'jamaican', 'cajun_creole',
       'korean', 'irish', 'thai', 'british', 'brazilian'], dtype=object)

In [20]:
ids = cuisine_data['id'].unique()
len(ids)

39774

In [21]:
# to get on list containing all words in all lists in X
corpus = [i for sublist in X for i in sublist]

In [22]:
# tfidf = TfidfVectorizer(tokenizer=lambda i:i.split(","))

In [23]:
# converted_train = map(lambda i:",".join(i), X_train)

#### using Tfidf vectorizer

In [24]:
vec = TfidfVectorizer(stop_words='english', tokenizer=lambda x:x, lowercase=False,\
                      token_pattern='\w+', ngram_range=(1,1))

In [25]:
train_data = vec.fit_transform(X_train.values)

In [26]:
test_data = vec.transform(X_test.values)

In [27]:
# get top 10 features of vec
vec.get_feature_names()[:10]

['(    oz.) tomato sauce',
 '(   oz.) tomato paste',
 '(10 oz.) frozen chopped spinach',
 '(10 oz.) frozen chopped spinach, thawed and squeezed dry',
 '(14 oz.) sweetened condensed milk',
 '(14.5 oz.) diced tomatoes',
 '(15 oz.) refried beans',
 '1% low-fat buttermilk',
 '1% low-fat chocolate milk',
 '1% low-fat cottage cheese']

In [28]:
vec.vocabulary_

{'quinoa': 4542,
 'purple onion': 4516,
 'pepper': 4199,
 'jalapeno chilies': 3165,
 'chicken broth': 1369,
 'roma tomatoes': 4805,
 'salt': 4890,
 'lime': 3450,
 'cilantro': 1605,
 'minced garlic': 3784,
 'ginger': 2681,
 'canola oil': 1218,
 'sugar': 5422,
 'napa cabbage': 3896,
 'scallions': 4951,
 'miso paste': 3813,
 'ground pork': 2911,
 'carrots': 1255,
 'milk': 3773,
 'bacon': 651,
 'whole wheat breadcrumbs': 6055,
 'eggs': 2229,
 'baking potatoes': 673,
 'all-purpose flour': 481,
 'half & half': 2959,
 'shredded sharp cheddar cheese': 5095,
 'italian seasoning': 3151,
 'butter': 1105,
 'fresh herbs': 2485,
 'white onion': 5993,
 'button mushrooms': 1129,
 'boneless skinless chicken breast halves': 959,
 'grated parmesan cheese': 2802,
 'rotini': 4829,
 'crushed tomatoes': 1926,
 'garlic': 2647,
 'vegetable oil': 5871,
 'white sugar': 6005,
 'water': 5924,
 'tamarind concentrate': 5541,
 'cayenne pepper': 1282,
 'dates': 2012,
 'sesame seeds': 5015,
 'oil': 3979,
 'white pepper

In [29]:
# get inverse document frequency
idf = vec.idf_

In [30]:
# make dictionary with features and respective idfs
d = dict(zip(vec.get_feature_names(), idf))

In [31]:
sorted_d = sorted(d.items(), key=operator.itemgetter(1))
sorted_d[:10]

[('salt', 1.7921919912759123),
 ('onions', 2.6007931430556033),
 ('olive oil', 2.623311318524266),
 ('water', 2.673496064460222),
 ('garlic', 2.6797701730580297),
 ('sugar', 2.827140558387348),
 ('garlic cloves', 2.858035948400945),
 ('butter', 3.0964469718459435),
 ('ground black pepper', 3.110456265652729),
 ('all-purpose flour', 3.1467931741656274)]

#### try naive bayes classifier

#### test a range of alphas

In [32]:
alphas = np.arange(0, 1, 0.1)

In [33]:
def get_nb_train_and_predictions(alpha):
    clf = MultinomialNB(alpha=alpha)
    clf.fit(train_data, y_train)
    pred = clf.predict(test_data)
    score = accuracy_score(y_test, pred)
    return score

In [34]:
for alpha in alphas:
    score = get_nb_train_and_predictions(alpha)
    print('Alpha:', alpha)
    print('Accuracy score:', score, '\n')
    

  'setting alpha = %.1e' % _ALPHA_MIN)


Alpha: 0.0
Accuracy score: 0.707864038616251 

Alpha: 0.1
Accuracy score: 0.7470836685438456 

Alpha: 0.2
Accuracy score: 0.7360217216411906 

Alpha: 0.30000000000000004
Accuracy score: 0.7200321802091714 

Alpha: 0.4
Accuracy score: 0.7085679806918745 

Alpha: 0.5
Accuracy score: 0.6992156074014481 

Alpha: 0.6000000000000001
Accuracy score: 0.6899637972646823 

Alpha: 0.7000000000000001
Accuracy score: 0.6827232502011263 

Alpha: 0.8
Accuracy score: 0.6740748189863234 

Alpha: 0.9
Accuracy score: 0.6682421560740145 



In [35]:
# use alpha = 0.1
nb_classifier = MultinomialNB(alpha=0.1)

In [36]:
nb_classifier.fit(train_data, y_train)

MultinomialNB(alpha=0.1, class_prior=None, fit_prior=True)

In [37]:
pred = nb_classifier.predict(test_data)

In [38]:
accuracy_score(y_test, pred)

0.7470836685438456

In [39]:
# confusion_matrix(y_test, pred, labels=labels)

In [40]:
class_labels = nb_classifier.classes_

In [41]:
# extract feature
features = vec.get_feature_names()

In [42]:
# zip features and weights together for first class
feat_w_weights = sorted(zip(nb_classifier.coef_[0], features))

In [43]:
# top 10 features for first class
class_labels[0], feat_w_weights[:10]

('brazilian',
 [(-9.677381891052145, '(    oz.) tomato sauce'),
  (-9.677381891052145, '(   oz.) tomato paste'),
  (-9.677381891052145, '(10 oz.) frozen chopped spinach'),
  (-9.677381891052145,
   '(10 oz.) frozen chopped spinach, thawed and squeezed dry'),
  (-9.677381891052145, '(14 oz.) sweetened condensed milk'),
  (-9.677381891052145, '(14.5 oz.) diced tomatoes'),
  (-9.677381891052145, '(15 oz.) refried beans'),
  (-9.677381891052145, '1% low-fat buttermilk'),
  (-9.677381891052145, '1% low-fat chocolate milk'),
  (-9.677381891052145, '1% low-fat cottage cheese')])

In [44]:
# bottom 10 features for second class
class_labels[1], feat_w_weights[-10:]

('british',
 [(-5.030267324099119, 'garlic cloves'),
  (-4.914952103984577, 'water'),
  (-4.863621416835011, 'sugar'),
  (-4.8311057823858095, 'coconut milk'),
  (-4.723446697678546, 'olive oil'),
  (-4.61841226879129, 'onions'),
  (-4.569570693306051, 'salt'),
  (-4.4419091232820005, 'lime'),
  (-4.302054633712999, 'sweetened condensed milk'),
  (-4.197400945851531, 'cachaca')])

#### use pipeline with TFIDF and Logistic Regression classifier

In [62]:
logreg = OneVsRestClassifier(LogisticRegression(random_state=42))

In [63]:
logreg.fit(train_data, y_train)

OneVsRestClassifier(estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
          n_jobs=1)

In [64]:
# get accurracy 
logreg.score(test_data, y_test)

0.7676991150442478

In [65]:
y_prob = logreg.predict_proba(test_data)

In [66]:
labels = logreg.classes_

In [67]:
features = vec.get_feature_names()

In [68]:
# zip features and weights together for first class
feat_w_weights2 = sorted(zip(logreg.coef_[0], features))

#### try support vector classifier

In [73]:
svc = OneVsRestClassifier(LinearSVC(random_state=42))

In [74]:
svc.fit(train_data, y_train)

OneVsRestClassifier(estimator=LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=42, tol=0.0001,
     verbose=0),
          n_jobs=1)

In [75]:
svc.score(test_data, y_test)

0.7844931617055511

#### RandomForestClassifier

In [76]:
from sklearn.ensemble import RandomForestClassifier

In [77]:
rf = RandomForestClassifier(random_state=42)

In [78]:
rf.fit(train_data, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [79]:
rf.score(test_data, y_test)

0.6601971037811746

#### using multioutput classifier

In [80]:
from sklearn.multioutput import MultiOutputClassifier

In [81]:
moc_rf = MultiOutputClassifier(rf)

In [83]:
y_train.shape

(29830,)

In [86]:
# reshape target to be 2D
y_train = y_train.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [87]:
moc_rf.fit(train_data, y_train)

MultiOutputClassifier(estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=42, verbose=0, warm_start=False),
           n_jobs=1)

In [88]:
y_test = y_test.reshape(-1, 1)

  """Entry point for launching an IPython kernel.


In [89]:
moc_rf.score(test_data, y_test)

0.6601971037811746

#### KNN

In [91]:
from sklearn.neighbors import KNeighborsClassifier

In [92]:
knn = KNeighborsClassifier()

In [93]:
knn.fit(train_data, y_train)

  """Entry point for launching an IPython kernel.


KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [94]:
knn.score(test_data, y_test)

0.7120876910699919

#### initial model using Logistic Regression and Tfidf vectorizer

In [116]:
corpus

['romaine lettuce',
 'black olives',
 'grape tomatoes',
 'garlic',
 'pepper',
 'purple onion',
 'seasoning',
 'garbanzo beans',
 'feta cheese crumbles',
 'plain flour',
 'ground pepper',
 'salt',
 'tomatoes',
 'ground black pepper',
 'thyme',
 'eggs',
 'green tomatoes',
 'yellow corn meal',
 'milk',
 'vegetable oil',
 'eggs',
 'pepper',
 'salt',
 'mayonaise',
 'cooking oil',
 'green chilies',
 'grilled chicken breasts',
 'garlic powder',
 'yellow onion',
 'soy sauce',
 'butter',
 'chicken livers',
 'water',
 'vegetable oil',
 'wheat',
 'salt',
 'black pepper',
 'shallots',
 'cornflour',
 'cayenne pepper',
 'onions',
 'garlic paste',
 'milk',
 'butter',
 'salt',
 'lemon juice',
 'water',
 'chili powder',
 'passata',
 'oil',
 'ground cumin',
 'boneless chicken skinless thigh',
 'garam masala',
 'double cream',
 'natural yogurt',
 'bay leaf',
 'plain flour',
 'sugar',
 'butter',
 'eggs',
 'fresh ginger root',
 'salt',
 'ground cinnamon',
 'milk',
 'vanilla extract',
 'ground ginger',
 'po

In [66]:
vectorizer = TfidfVectorizer(min_df=1, stop_words='english')
X = vectorizer.fit_transform(corpus)
idf = vectorizer.idf_
d = dict(zip(vectorizer.get_feature_names(), idf))
sorted_d = sorted(d.items(), key=operator.itemgetter(1))
sorted_d[:50]

In [None]:
def plot_tfidf_classfeats_h(dfs):
    ''' Plot the data frames returned by the function plot_tfidf_classfeats(). '''
    fig = plt.figure(figsize=(12, 9), facecolor="w")
    x = np.arange(len(dfs[0]))
    for i, df in enumerate(dfs):
        ax = fig.add_subplot(1, len(dfs), i+1)
        ax.spines["top"].set_visible(False)
        ax.spines["right"].set_visible(False)
        ax.set_frame_on(False)
        ax.get_xaxis().tick_bottom()
        ax.get_yaxis().tick_left()
        ax.set_xlabel("Mean Tf-Idf Score", labelpad=16, fontsize=14)
        ax.set_title("label = " + str(df.label), fontsize=16)
        ax.ticklabel_format(axis='x', style='sci', scilimits=(-2,2))
        ax.barh(x, df.tfidf, align='center', color='#3F5D7D')
        ax.set_yticks(x)
        ax.set_ylim([-1, x[-1]+1])
        yticks = ax.set_yticklabels(df.feature)
        plt.subplots_adjust(bottom=0.09, right=0.97, left=0.15, top=0.95, wspace=0.52)
    plt.show()

In [59]:
# to get top features
def top_tfidf_feats(row, features, top_n=25):
    ''' Get top n tfidf values in row and return them with their corresponding feature names.'''
    topn_ids = np.argsort(row)[::-1][:top_n]
    top_feats = [(features[i], row[i]) for i in topn_ids]
    df = pd.DataFrame(top_feats)
    df.columns = ['feature', 'tfidf']
    return df

In [None]:
cuisine_ingredients = cuisine_train.groupby('cuisine')['ingredients'].apply(list) 

In [None]:
cuisine_ingredients[0]
