In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [59]:
from sklearn.naive_bayes import MultinomialNB

In [86]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [105]:
from sklearn.decomposition import TruncatedSVD

### Establish db connection

In [2]:
engine = create_engine('postgresql://treytrey3:113315th3@recipeproject3.czcsc2tr7kct.us-east-1.rds.amazonaws.com:5432/dsicapstone3')

### Read in data from sql 

In [3]:
df = pd.read_sql('SELECT * FROM ingredients INNER JOIN flavors ON ingredients.id = flavors.index;', engine)

### Clean up dataframe

In [4]:
df.drop(['Unnamed: 0', 'level_0', 'index', 'ingredient_list'], axis = 1, inplace=True)

In [5]:
df.dropna(axis=0, inplace=True)

### EDA

Having dropped NaNs, do we still have an even distribution of cuisines? Answer is yes:

In [6]:
df.cuisine.value_counts()

Italian          1399
Mediterranean    1387
Asian            1354
Mexican          1341
American         1325
Name: cuisine, dtype: int64

What are flavor distributions?

In [7]:
flavs = ['bitter', 'piquant', 'salty', 'sour', 'sweet', 'meaty']

In [8]:
for val in flavs:
    print "%r value counts: \n" % val, df[val].value_counts()

'bitter' value counts: 
0.166667    2280
0.833333    1728
0.666667    1019
0.500000     966
0.333333     809
1.000000       3
0.000000       1
Name: bitter, dtype: int64
'piquant' value counts: 
0.000000    2672
0.166667    1794
0.833333    1128
0.666667     514
0.500000     371
0.333333     326
1.000000       1
Name: piquant, dtype: int64
'salty' value counts: 
0.833333    2237
0.166667    1987
0.666667    1040
0.500000     832
0.333333     702
0.000000       5
1.000000       3
Name: salty, dtype: int64
'sour' value counts: 
0.166667    2691
0.833333    1450
0.333333     919
0.500000     910
0.666667     801
0.000000      32
1.000000       3
Name: sour, dtype: int64
'sweet' value counts: 
0.166667    5150
0.333333     896
0.500000     458
0.000000     152
0.666667     104
0.833333      43
1.000000       3
Name: sweet, dtype: int64
'meaty' value counts: 
0.166667    3014
0.833333    1091
0.333333     960
0.500000     913
0.666667     821
0.000000       4
1.000000       3
Name: meaty, d

From this we can glean that flavor values are categorical. Which makes prediction straightforward.

### Prepare df for prediction task

Label encode each of the flavor columns

In [9]:
le = LabelEncoder()

In [10]:
for val in flavs:
    df[val+'_encode'] = le.fit_transform(df[val])

Fit document term matrix

In [62]:
vect = CountVectorizer()

In [97]:
X = df.ingredient_string
y = df.bitter_encode

In [98]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [99]:
X_train_dtm = vect.fit_transform(X_train)

### KNN

In [100]:
knn = KNeighborsClassifier(n_neighbors=50)

In [101]:
knn.fit(X_train_dtm, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=50, p=2,
           weights='uniform')

In [102]:
X_test_dtm = vect.transform(X_test)

In [103]:
predictions = knn.predict(X_test_dtm)

In [104]:
accuracy_score(y_test, predictions)

0.46944770857814339

In [46]:
# view predicted vs actual for bitter prediction
view = pd.DataFrame(zip(X_test, le.inverse_transform(predictions), le.inverse_transform(y_test)))
# assign column headers
view.columns = ['ingred', 'pred', 'actual']
# view slice of predictions where we were off by a lot
# put np.absolute in the square brackets if you want all off predictions
off_by_a_lot = view[view.pred-view.actual > .5]
off_by_a_lot.head()

Unnamed: 0,ingred,pred,actual
5,goat cheese corn tortilla shrimp shredded cabb...,0.833333,0.166667
40,skirt steak worcestershire sauce oil onion pow...,0.833333,0.166667
55,boneless skinless chicken thigh arrabbiata sau...,0.833333,0.166667
69,stir fry beef meat water gluten free soy sauce...,0.833333,0.166667
84,vegetable oil vidalia onion garlic clove peach...,0.833333,0.166667


In [47]:
# see if we can investigate why we're being thrown off here
# vectorize this set of ingredients
vect = CountVectorizer()
dense = vect.fit_transform(off_by_a_lot.ingred).todense()

In [48]:
dense_df = pd.DataFrame(dense)

In [49]:
dense_df.columns = vect.get_feature_names()

In [58]:
# Not getting much from this...
dense_df.sum().sort_values(ascending=False)[:10]

pepper     88
oil        86
sauce      77
garlic     64
onion      59
ground     59
salt       52
cheese     49
fresh      45
chicken    45
dtype: int64

### Naive Bayes - does a little worse, as you can see

In [89]:
nb = MultinomialNB()

In [90]:
X = df.ingredient_string
y = df.bitter_encode

In [76]:
vect = CountVectorizer(stop_words='english')

In [91]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [92]:
X_train_dtm = vect.fit_transform(X_train)

In [93]:
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [94]:
X_test_dtm = vect.transform(X_test)

In [95]:
pred = nb.predict(X_test_dtm)

In [96]:
accuracy_score(y_test, pred)

0.43478260869565216

#### Curious if a TfidfVectorizer would perform better

In [88]:
vect = TfidfVectorizer()
# then run above code -  it went up by exactly one percent

### Would dimensionality reduction help?

In [123]:
# LDA might work, but I'm curious about truncated singular value decomposition
X = df.ingredient_string
vect = CountVectorizer()
X_sparse = vect.fit_transform(X)

In [124]:
svd = TruncatedSVD(n_components=30, random_state=42)

In [125]:
svd.fit(X_sparse)

TruncatedSVD(algorithm='randomized', n_components=30, n_iter=5,
       random_state=42, tol=0.0)

In [126]:
print(svd.explained_variance_ratio_)

[ 0.04755817  0.05884936  0.04353868  0.03846388  0.03239828  0.03066295
  0.02459688  0.02397255  0.02005734  0.01791154  0.01683284  0.01532295
  0.01358824  0.01262775  0.01213879  0.01168747  0.01101024  0.01077734
  0.01020871  0.00999668  0.00949756  0.00896687  0.00875826  0.00868962
  0.00811479  0.00794552  0.00765118  0.00741422  0.00720268  0.00695178]


In [127]:
print(svd.explained_variance_ratio_.sum()) 

0.543393134264


In [122]:
X_reduced = svd.transform(X_sparse)