In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from nltk import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans
from sklearn.pipeline import make_pipeline
from sklearn import metrics

In [2]:
df = pd.read_csv('../ingredients_combined/ingredients_reduced.csv')
print df.shape

(9751, 5)


In [3]:
# Remove non-alpha characters
name_string1 = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df['id']]

# Remove extra whitespace between words
name_string2 = [' '.join(z.split()) for z in name_string1]

# Lemmatize each word - start by converting the string to a list to iterate through
as_list = [z.split() for z in name_string2]

# Iterate through list, using WordNetLemmatizer
lemmatized = []
for _list in as_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)

# Then need to convert the list back into a string
name_string3 = [' '.join(z) for z in lemmatized]

# Zip together recipe name string and ingredient string
zipped = zip(name_string3, df.ingredient_string)

# Concatenate recipe name string and ingredient string
ingredient_id_string = [y + ' ' + z for y,z in zipped]    

# Add new column to dataframe with concatenated strings
df['ingredient_id_string'] = ingredient_id_string

In [4]:
# Use label encoder to encode cuisine
le = LabelEncoder()
df['encoded_cuisine'] = le.fit_transform(df.cuisine)

In [5]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(2,2))
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

In [64]:
svd = TruncatedSVD(n_components=8)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
print X.shape

(9751, 8)


In [65]:
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 2%


In [66]:
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=6, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [67]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(y, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(y, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.603
Completeness: 0.549
V-measure: 0.575
Adjusted Rand-Index: 0.572
Silhouette Coefficient: 0.308


In [68]:
# transform svd back to the original features
# create array with original shape containing cluster centers
original_space_centroids = svd.inverse_transform(km.cluster_centers_)

# argsort provides the indices of the array if it were sorted
order_centroids = original_space_centroids.argsort()[:, ::-1]

In [69]:
terms = vect.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind])
    print()

Cluster 0:
 black pepper
 ground black
 salt ground
 garlic clove
 cooking spray
 chicken broth
 purpose flour
 fat free
 olive oil
 kosher salt
 sodium chicken
 chopped fresh
 salt black
 vegetable oil
 ground cumin
()
Cluster 1:
 feta cheese
 olive oil
 garlic clove
 lemon juice
 cheese crumbles
 dried oregano
 chicken breast
 fresh parsley
 salt pepper
 fresh lemon
 black pepper
 purple onion
 skinless chicken
 bell pepper
 boneless skinless
()
Cluster 2:
 soy sauce
 sesame oil
 green onion
 brown sugar
 sesame seed
 fresh ginger
 rice vinegar
 corn starch
 red pepper
 sodium soy
 vegetable oil
 pepper flake
 toasted sesame
 low sodium
 chicken breast
()
Cluster 3:
 parmesan cheese
 mozzarella cheese
 grated parmesan
 ricotta cheese
 shredded mozzarella
 cheese grated
 lasagna noodle
 olive oil
 ground beef
 tomato sauce
 pasta sauce
 fresh basil
 italian sausage
 italian seasoning
 cheese shredded
()
Cluster 4:
 sour cream
 enchilada sauce
 cheddar cheese
 black bean
 chili powder


## Think it works better without SVD

In [14]:
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(2,2), max_features=2500)
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

In [15]:
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=6, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [16]:
km.cluster_centers_.argsort()[:, ::-1]

array([[1482,  897, 1228, ...,  960,  962, 1249],
       [2128, 2053, 2054, ..., 1763, 1761,    0],
       [ 128, 1365,  805, ..., 1603,  561, 1866],
       [2126,  260,  125, ...,  995,  996,    0],
       [1365,  694, 2411, ...,  448,  447, 1249],
       [2100,  141,  334, ..., 1443, 1444, 1701]])

In [17]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [54]:
km.cluster_centers_

array([[ 0.00164901,  0.        ,  0.        , ...,  0.        ,
         0.00061411,  0.00059338],
       [ 0.        ,  0.        ,  0.00341329, ...,  0.        ,
         0.00091503,  0.00022413],
       [ 0.00132041,  0.00135454,  0.00025743, ...,  0.00032526,
         0.00118554,  0.00055052],
       [ 0.        ,  0.00163576,  0.00079827, ...,  0.        ,
         0.00056362,  0.00083629],
       [ 0.00254687,  0.00030604,  0.00014123, ...,  0.00135345,
         0.00155919,  0.00156518],
       [ 0.00032137,  0.00026991,  0.00029575, ...,  0.00078387,
         0.00028881,  0.00154209]])

In [64]:
terms = vect.get_feature_names()
for i in range(6):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Cluster 0:
 parmesan cheese
 grated parmesan
 mozzarella cheese
 ricotta cheese
 shredded mozzarella
 olive oil
 cheese grated
 lasagna noodle
 fresh basil
 italian sausage
()
Cluster 1:
 soy sauce
 sesame oil
 sesame seed
 green onion
 rice vinegar
 fresh ginger
 brown sugar
 corn starch
 sodium soy
 toasted sesame
()
Cluster 2:
 black pepper
 olive oil
 garlic clove
 bell pepper
 cooking spray
 salt pepper
 purpose flour
 vegetable oil
 chicken broth
 ground black
()
Cluster 3:
 sour cream
 cheddar cheese
 black bean
 enchilada sauce
 corn tortilla
 chili powder
 ground beef
 flour tortilla
 shredded cheddar
 chicken enchilada
()
Cluster 4:
 olive oil
 feta cheese
 virgin olive
 extra virgin
 garlic clove
 lemon juice
 black pepper
 cheese crumbles
 ground black
 dried oregano
()
Cluster 5:
 skinless chicken
 boneless skinless
 chicken breast
 breast half
 olive oil
 chicken boneless
 oil boneless
 black pepper
 chicken broth
 chicken thigh
()


### Playing around

In [88]:
new_thing = vect.transform('parsley').toarray()

In [87]:
km.transform(new_thing)

array([[ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203]])

In [90]:
new_thing.shape

(7, 2500)

In [95]:
thing = ['parsley orzo olive oil tomato', 'tomato onion mozzarella olive', 
         'soy sauce ginger sesame seeds', 'chili seeds ginger root']

In [96]:
vect = TfidfVectorizer()
bb = vect.fit_transform(thing)

In [97]:
bb

<4x14 sparse matrix of type '<type 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [98]:
km_2 = KMeans(n_clusters=2)
km_2.fit(bb)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [99]:
km_2.labels_

array([1, 1, 0, 0], dtype=int32)

In [100]:
km_2.cluster_centers_

array([[ 0.27764133,  0.41026698,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.27764133,  0.24273031,
         0.41026698,  0.24273031,  0.24273031,  0.        ],
       [ 0.        ,  0.        ,  0.27764133,  0.24273031,  0.41026698,
         0.27764133,  0.24273031,  0.24273031,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.41026698]])

In [104]:
sauce = vect.transform(['sauce'])

In [110]:
cluster_pred = km_2.predict(sauce)
cluster_pred

array([0], dtype=int32)

In [111]:
order_centroids = km_2.cluster_centers_.argsort()[:, ::-1]
order_centroids

array([[10,  1,  8,  0, 12, 11,  9, 13,  7,  6,  5,  4,  3,  2],
       [13,  4,  5,  2,  7,  6,  3, 12, 11, 10,  9,  8,  1,  0]])

In [114]:
terms = vect.get_feature_names()

print("Cluster %d:" % cluster_pred)
for ind in order_centroids[cluster_pred, :]:
    print terms

Cluster 0:
[u'chili', u'ginger', u'mozzarella', u'oil', u'olive', u'onion', u'orzo', u'parsley', u'root', u'sauce', u'seeds', u'sesame', u'soy', u'tomato']


### Real Function

In [48]:
# transform new word to vectorizer
orzo = vect.transform(['garlic clove'])

In [49]:
# predict which cluster it falls into
cluster_pred = km.predict(orzo)

In [50]:
cluster_pred

array([4], dtype=int32)

In [51]:
for ind in order_centroids[cluster_pred[0], :15]:
        print(' %s' % terms[ind])

 olive oil
 feta cheese
 virgin olive
 extra virgin
 garlic clove
 lemon juice
 black pepper
 cheese crumbles
 ground black
 dried oregano
 red pepper
 white wine
 fresh lemon
 oil garlic
 fresh parsley


In [31]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

def recommend_pantry(item):
    #generate list of ingredient recommendations
    transformed = vect.transform([item]) # transform to vectorizer
    cluster_pred = km.predict(transformed) # predict cluster
    for ind in order_centroids[cluster_pred[0], :15]:
        print(' %s' % terms[ind])

In [67]:
recommend_pantry('black bean')

 sour cream
 cheddar cheese
 black bean
 enchilada sauce
 corn tortilla
 chili powder
 ground beef
 flour tortilla
 shredded cheddar
 chicken enchilada
 taco seasoning
 ground cumin
 green chilies
 jack cheese
 monterey jack
