In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import re
from nltk import WordNetLemmatizer
from sklearn.cross_validation import train_test_split
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.cluster import KMeans, MiniBatchKMeans

from matplotlib import pyplot as plt
% matplotlib inline

In [7]:
from sklearn.pipeline import make_pipeline

In [2]:
df = pd.read_csv('../ingredients_combined/ingredients_reduced.csv')


In [4]:
# Remove non-alpha characters
name_string1 = [re.sub('[^A-Za-z]', ' ', z).strip().lower() for z in df['id']]

# Remove extra whitespace between words
name_string2 = [' '.join(z.split()) for z in name_string1]

# Lemmatize each word - start by converting the string to a list to iterate through
as_list = [z.split() for z in name_string2]

# Iterate through list, using WordNetLemmatizer
lemmatized = []
for _list in as_list:
    sub_list = []
    for word in _list:
        sub_list.append(WordNetLemmatizer().lemmatize(word))
    lemmatized.append(sub_list)

# Then need to convert the list back into a string
name_string3 = [' '.join(z) for z in lemmatized]

# Zip together recipe name string and ingredient string
zipped = zip(name_string3, df.ingredient_string)

# Concatenate recipe name string and ingredient string
ingredient_id_string = [y + ' ' + z for y,z in zipped]    

# Add new column to dataframe with concatenated strings
df['ingredient_id_string'] = ingredient_id_string

In [5]:
# Use label encoder to encode cuisine
le = LabelEncoder()
df['encoded_cuisine'] = le.fit_transform(df.cuisine)

In [51]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(1,2))
X = vect.fit_transform(df.ingredient_id_string)
y = df.encoded_cuisine

In [52]:
svd = TruncatedSVD(n_components=5)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)
X = lsa.fit_transform(X)

In [53]:
explained_variance = svd.explained_variance_ratio_.sum()
print("Explained variance of the SVD step: {}%".format(int(explained_variance * 100)))

Explained variance of the SVD step: 3%


In [54]:
km = KMeans(n_clusters=5, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

KMeans(copy_x=True, init='k-means++', max_iter=100, n_clusters=5, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [55]:
from sklearn import metrics

In [56]:
print("Homogeneity: %0.3f" % metrics.homogeneity_score(y, km.labels_))
print("Completeness: %0.3f" % metrics.completeness_score(y, km.labels_))
print("V-measure: %0.3f" % metrics.v_measure_score(y, km.labels_))
print("Adjusted Rand-Index: %.3f"
      % metrics.adjusted_rand_score(y, km.labels_))
print("Silhouette Coefficient: %0.3f"
      % metrics.silhouette_score(X, km.labels_, sample_size=1000))

Homogeneity: 0.494
Completeness: 0.503
V-measure: 0.499
Adjusted Rand-Index: 0.460
Silhouette Coefficient: 0.404


In [57]:
original_space_centroids = svd.inverse_transform(km.cluster_centers_)
order_centroids = original_space_centroids.argsort()[:, ::-1]

In [58]:
terms = vect.get_feature_names()
for i in range(5):
    print("Cluster %d:" % i)
    for ind in order_centroids[i, :10]:
        print(' %s' % terms[ind])
    print()

Cluster 0:
 chicken
 chicken breast
 breast
 boneless
 skinless
 skinless chicken
 boneless skinless
 olive
 fresh
 lemon
()
Cluster 1:
 enchilada
 tortilla
 ground
 powder
 onion
 bean
 cheese
 corn
 chicken
 cream
()
Cluster 2:
 fresh
 olive
 ground
 lemon
 olive oil
 tomato
 black
 dried
 onion
 clove
()
Cluster 3:
 sauce
 sesame
 soy
 soy sauce
 ginger
 rice
 sugar
 sesame oil
 chicken
 onion
()
Cluster 4:
 cheese
 parmesan
 parmesan cheese
 tomato
 pasta
 mozzarella
 italian
 lasagna
 mozzarella cheese
 grated parmesan
()
