In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from nltk import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.pipeline import make_pipeline

In [11]:
from sklearn import metrics

In [73]:
df = pd.read_csv('../ingredients_combined/ingredients_reduced.csv')
print df.shape

(9751, 5)


In [74]:
# Remove non-alpha characters
name_string1 = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df['id']]

# Remove extra whitespace between words
name_string2 = [' '.join(z.split()) for z in name_string1]

# Lemmatize each word - start by converting the string to a list to iterate through
as_list = [z.split() for z in name_string2]

# Iterate through list, using WordNetLemmatizer
lemmatized = []
for _list in as_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)

# Then need to convert the list back into a string
name_string3 = [' '.join(z) for z in lemmatized]

# Zip together recipe name string and ingredient string
zipped = zip(name_string3, df.ingredient_string)

# Concatenate recipe name string and ingredient string
ingredient_id_string = [y + ' ' + z for y,z in zipped]    

# Add new column to dataframe with concatenated strings
df['ingredient_id_string'] = ingredient_id_string

In [75]:
# Use label encoder to encode cuisine
le = LabelEncoder()
df['encoded_cuisine'] = le.fit_transform(df.cuisine)

In [63]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(2,2))
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

In [64]:
svd = TruncatedSVD(n_components=8)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)
print X.shape

(9751, 8)


In [65]:
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 2%


In [66]:
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=6, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [67]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(y, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(y, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.603
Completeness: 0.549
V-measure: 0.575
Adjusted Rand-Index: 0.572
Silhouette Coefficient: 0.308


In [68]:
# transform svd back to the original features
# create array with original shape containing cluster centers
original_space_centroids = svd.inverse_transform(km.cluster_centers_)

# argsort provides the indices of the array if it were sorted
order_centroids = original_space_centroids.argsort()[:, ::-1]

In [69]:
terms = vect.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind])
    print()

Cluster 0:
 black pepper
 ground black
 salt ground
 garlic clove
 cooking spray
 chicken broth
 purpose flour
 fat free
 olive oil
 kosher salt
 sodium chicken
 chopped fresh
 salt black
 vegetable oil
 ground cumin
()
Cluster 1:
 feta cheese
 olive oil
 garlic clove
 lemon juice
 cheese crumbles
 dried oregano
 chicken breast
 fresh parsley
 salt pepper
 fresh lemon
 black pepper
 purple onion
 skinless chicken
 bell pepper
 boneless skinless
()
Cluster 2:
 soy sauce
 sesame oil
 green onion
 brown sugar
 sesame seed
 fresh ginger
 rice vinegar
 corn starch
 red pepper
 sodium soy
 vegetable oil
 pepper flake
 toasted sesame
 low sodium
 chicken breast
()
Cluster 3:
 parmesan cheese
 mozzarella cheese
 grated parmesan
 ricotta cheese
 shredded mozzarella
 cheese grated
 lasagna noodle
 olive oil
 ground beef
 tomato sauce
 pasta sauce
 fresh basil
 italian sausage
 italian seasoning
 cheese shredded
()
Cluster 4:
 sour cream
 enchilada sauce
 cheddar cheese
 black bean
 chili powder


## Think it works better without SVD

In [76]:
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(2,2), max_features=2500)
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

In [77]:
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=6, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [78]:
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

In [79]:
terms = vect.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :15]:
        print(' %s' % terms[ind])
    print()

Cluster 0:
 soy sauce
 sesame oil
 green onion
 fresh ginger
 sesame seed
 brown sugar
 rice vinegar
 fish sauce
 corn starch
 sodium soy
 vegetable oil
 stir fry
 toasted sesame
 chicken breast
 red pepper
()
Cluster 1:
 olive oil
 feta cheese
 virgin olive
 extra virgin
 garlic clove
 cheese crumbles
 lemon juice
 black pepper
 dried oregano
 red pepper
 ground black
 chicken breast
 fresh lemon
 purple onion
 dried tomato
()
Cluster 2:
 enchilada sauce
 cheddar cheese
 sour cream
 taco seasoning
 flour tortilla
 shredded cheddar
 chicken enchilada
 ground beef
 corn tortilla
 green chilies
 black bean
 jack cheese
 shredded cheese
 tortilla shredded
 monterey jack
()
Cluster 3:
 black pepper
 olive oil
 garlic clove
 cooking spray
 purpose flour
 chicken broth
 salt pepper
 ground black
 vegetable oil
 salt ground
 pepper salt
 salt black
 fresh parsley
 white wine
 lemon juice
()
Cluster 4:
 chili powder
 ground cumin
 bell pepper
 black bean
 corn tortilla
 jalapeno chilies
 fresh

In [88]:
new_thing = vect.transform('parsley').toarray()

In [87]:
km.transform(new_thing)

array([[ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203],
       [ 0.23013461,  0.23632981,  0.26763056,  0.14726674,  0.21274974,
         0.27052203]])

In [90]:
new_thing.shape

(7, 2500)

In [95]:
thing = ['parsley orzo olive oil tomato', 'tomato onion mozzarella olive', 
         'soy sauce ginger sesame seeds', 'chili seeds ginger root']

In [96]:
vect = TfidfVectorizer()
bb = vect.fit_transform(thing)

In [97]:
bb

<4x14 sparse matrix of type '<type 'numpy.float64'>'
	with 18 stored elements in Compressed Sparse Row format>

In [98]:
km_2 = KMeans(n_clusters=2)
km_2.fit(bb)

KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=10,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [99]:
km_2.labels_

array([1, 1, 0, 0], dtype=int32)

In [100]:
km_2.cluster_centers_

array([[ 0.27764133,  0.41026698,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.27764133,  0.24273031,
         0.41026698,  0.24273031,  0.24273031,  0.        ],
       [ 0.        ,  0.        ,  0.27764133,  0.24273031,  0.41026698,
         0.27764133,  0.24273031,  0.24273031,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.41026698]])

In [104]:
sauce = vect.transform(['sauce'])

In [110]:
cluster_pred = km_2.predict(sauce)
cluster_pred

array([0], dtype=int32)

In [111]:
order_centroids = km_2.cluster_centers_.argsort()[:, ::-1]
order_centroids

array([[10,  1,  8,  0, 12, 11,  9, 13,  7,  6,  5,  4,  3,  2],
       [13,  4,  5,  2,  7,  6,  3, 12, 11, 10,  9,  8,  1,  0]])

In [114]:
terms = vect.get_feature_names()

print("Cluster %d:" % cluster_pred)
for ind in order_centroids[cluster_pred, :]:
    print terms

Cluster 0:
[u'chili', u'ginger', u'mozzarella', u'oil', u'olive', u'onion', u'orzo', u'parsley', u'root', u'sauce', u'seeds', u'sesame', u'soy', u'tomato']
