In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.decomposition import TruncatedSVD
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import VarianceThreshold
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [66]:
from sklearn.cross_validation import cross_val_score

### Establish db connection

In [2]:
engine = create_engine('postgresql://treytrey3:113315th3@recipeproject3.czcsc2tr7kct.us-east-1.rds.amazonaws.com:5432/dsicapstone3')

### Read in data from sql 

In [35]:
df = pd.read_sql('SELECT * FROM ingredients INNER JOIN flavors ON ingredients.id = flavors.index;', engine)

### Clean up dataframe

In [36]:
df.drop(['Unnamed: 0', 'level_0', 'index', 'ingredient_list'], axis = 1, inplace=True)

In [37]:
df.dropna(axis=0, inplace=True)

In [38]:
df.rename(columns={'meaty':'savory'}, inplace=True)

## Bin data

In [45]:
bins = [-0.1, 0.2, 0.6, 1.0]
group_names = [1,2,3]

In [46]:
flavors = ['bitter', 'savory', 'piquant', 'salty', 'sour', 'sweet']
for flavor in flavors:
    categories = pd.cut(df[flavor], bins, labels=group_names)
    df['categories'+'_'+flavor] = categories

## EDA

In [59]:
suspects = {}
for flavor in flavors:
    sliced = df[df[flavor] > .7]
    vect = CountVectorizer()
    dtm = vect.fit_transform(sliced.ingredient_string).toarray()
    dtm_df = pd.DataFrame(dtm, columns=vect.get_feature_names())
    suspects[flavor] = dtm_df.sum().sort_values(ascending = False)[:75]

In [61]:
suspects_df = pd.DataFrame(suspects)

### Prepare data for sklearn

Fit document term matrix

In [115]:
vect = CountVectorizer()

In [116]:
X = vect.fit_transform(df.ingredient_string)
y = df.categories_bitter

In [None]:
dtm = X.toarray()

## Naive Bayes

In [94]:
mnb = MultinomialNB()
cross_val_score(mnb, X, y, cv = 5)

array([ 0.51908957,  0.54665687,  0.56429096,  0.4188097 ,  0.51726672])

In [95]:
rfc = RandomForestClassifier()
cross_val_score(rfc, X, y, cv = 5)

array([ 0.52349486,  0.54004409,  0.55841293,  0.44526084,  0.52608376])

In [96]:
dtc = DecisionTreeClassifier()
cross_val_score(dtc, X, y, cv = 5)

array([ 0.46328928,  0.53857458,  0.53269655,  0.4570169 ,  0.47979427])

In [117]:
dtc.fit(X, y)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [118]:
np.max(dtc.feature_importances_)

0.057738953453651258

Left off below.... found feature importances, now I suppose I could reduce features

In [119]:
pd.DataFrame(dtc.feature_importances_, index=vect.get_feature_names(), columns=['thing']).sort_values(by='thing', ascending=False)

Unnamed: 0,thing
sauce,0.057739
cheese,0.028416
wine,0.017917
ground,0.016040
onion,0.013717
pepper,0.012719
tomato,0.012404
chicken,0.012318
bell,0.011528
salt,0.011078


### KNN

In [111]:
knn = KNeighborsClassifier(n_neighbors=40)

In [112]:
knn.fit(X_train_dtm, y_train)
# knn.fit(X_train, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=40, p=2,
           weights='uniform')

In [113]:
X_test_dtm = vect.transform(X_test)

In [114]:
predictions = knn.predict(X_test_dtm)
# predictions = knn.predict(X_test)

In [115]:
accuracy_score(y_test, predictions)

0.42714453584018802

In [116]:
# view predicted vs actual for bitter prediction
view = pd.DataFrame(zip(X_test, le.inverse_transform(predictions), le.inverse_transform(y_test)))
# assign column headers
view.columns = ['ingred', 'pred', 'actual']
# view slice of predictions where we were off by a lot
# put np.absolute in the square brackets if you want all off predictions
off_by_a_lot = view[view.pred-view.actual > .5]
off_by_a_lot.head()

Unnamed: 0,ingred,pred,actual
8,skirt steak worcestershire sauce oil onion pow...,0.833333,0.166667
17,gingerroot minced garlic sugar creamy peanut b...,0.833333,0.166667
29,pork belly onion kimchi garlic juice water gin...,0.833333,0.166667
40,poultry seasoning seasoning salt paprika garli...,0.833333,0.166667
178,all purpose flour kosher salt hot water garlic...,0.833333,0.166667


In [118]:
for i in off_by_a_lot.ingred:
    print i

skirt steak worcestershire sauce oil onion powder salt pepper cooked rice chicken broth mozzarella cheese corn tortilla pinto bean shredded lettuce chopped tomato mushroom guacamole sour cream
gingerroot minced garlic sugar creamy peanut butter rice vinegar gluten free soy sauce sesame oil water zucchini yellow summer squash large carrot cooked chicken chopped fresh cilantro sesame seed
pork belly onion kimchi garlic juice water ginger cooking wine chili paste miso soy sauce chili flake silken tofu green onion butter
poultry seasoning seasoning salt paprika garlic powder black pepper turkey butter
all purpose flour kosher salt hot water garlic jalapeno chilies minced ginger green onion napa cabbage sugar ground chicken vegetable oil hoisin sauce soy sauce honey fish sauce rice vinegar
pink salmon onion green bell pepper egg hot sauce pepper salt plain breadcrumb dried parsley garlic powder flour yellow cornmeal oil
butter shallot fresh shiitake mushroom cabernet sauvignon beef cracked 

In [47]:
# see if we can investigate why we're being thrown off here
# vectorize this set of ingredients
vect = CountVectorizer()
dense = vect.fit_transform(off_by_a_lot.ingred).todense()

In [48]:
dense_df = pd.DataFrame(dense)

In [49]:
dense_df.columns = vect.get_feature_names()

In [58]:
# Not getting much from this...
dense_df.sum().sort_values(ascending=False)[:10]

pepper     88
oil        86
sauce      77
garlic     64
onion      59
ground     59
salt       52
cheese     49
fresh      45
chicken    45
dtype: int64

### Naive Bayes - does a little worse, as you can see

In [82]:
nb = MultinomialNB()

In [83]:
X = df.ingredient_string
y = df.bitter_encode

In [84]:
vect = TfidfVectorizer(stop_words='english', max_df=.7)

In [85]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [86]:
X_train_dtm = vect.fit_transform(X_train)

In [87]:
nb.fit(X_train_dtm, y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [88]:
X_test_dtm = vect.transform(X_test)

In [89]:
pred = nb.predict(X_test_dtm)

In [90]:
accuracy_score(y_test, pred)

0.45475910693301996

## Decision tree, any better? No

In [25]:
dtc = DecisionTreeClassifier(max_depth=10)

In [26]:
vect = CountVectorizer()

In [27]:
X = vect.fit_transform(df.ingredient_string)
y = df.bitter_encode

In [28]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [29]:
dtc.fit(X_train, y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [30]:
pred = dtc.predict(X_test)

In [31]:
accuracy_score(y_test, pred)

0.44065804935370151

In [32]:
feature_import = pd.DataFrame({'feature': vect.get_feature_names(), 'importance': dtc.feature_importances_})

In [34]:
feature_import.sort_values(by = 'importance', ascending=False).head(10)

Unnamed: 0,feature,importance
1126,sauce,0.15063
223,cheese,0.075201
1414,wine,0.046182
671,kosher,0.022956
1328,tomato,0.019457
84,bell,0.018538
406,enchilada,0.018287
859,onion,0.017122
1398,water,0.01688
642,juice,0.016592


## Random Forest?

In [49]:
from sklearn.cross_validation import cross_val_score

In [50]:
# list of values to try for n_estimators
estimator_range = range(10, 210, 10)

# list to store the average RMSE for each value of n_estimators
Accuracy_scores = []

# use 5-fold cross-validation with each value of n_estimators (WARNING: SLOW!)
for estimator in estimator_range:
    rfclass = RandomForestClassifier(n_estimators=estimator, random_state=1)
    Accuracy_score = cross_val_score(rfclass, X, y, cv=5, scoring='accuracy')
    Accuracy_scores.append(Accuracy_score)

KeyboardInterrupt: 

#### Curious if a TfidfVectorizer would perform better

In [88]:
vect = TfidfVectorizer()
# then run above code -  it went up by exactly one percent

### Would dimensionality reduction help? - try truncated singular value decomposition and LDA/PCA if that doesn't work

In [123]:
# LDA might work, but I'm curious about truncated singular value decomposition
X = df.ingredient_string
vect = CountVectorizer()
X_sparse = vect.fit_transform(X)

In [124]:
svd = TruncatedSVD(n_components=30, random_state=42)

In [125]:
svd.fit(X_sparse)

TruncatedSVD(algorithm='randomized', n_components=30, n_iter=5,
       random_state=42, tol=0.0)

In [126]:
print(svd.explained_variance_ratio_)

[ 0.04755817  0.05884936  0.04353868  0.03846388  0.03239828  0.03066295
  0.02459688  0.02397255  0.02005734  0.01791154  0.01683284  0.01532295
  0.01358824  0.01262775  0.01213879  0.01168747  0.01101024  0.01077734
  0.01020871  0.00999668  0.00949756  0.00896687  0.00875826  0.00868962
  0.00811479  0.00794552  0.00765118  0.00741422  0.00720268  0.00695178]


In [127]:
print(svd.explained_variance_ratio_.sum()) 

0.543393134264


In [122]:
X_reduced = svd.transform(X_sparse)

I remember leaving off here, and wondering, when you perform dimensionality reduction and then feed your new features into a classifier, how do you feed your model new data? Do you perform dimensionality reduction on that separately?

### Truncated SVD didn't work so well - maybe try LDA

In [69]:
X = df.ingredient_string
y = df.bitter_encode

In [70]:
vect = CountVectorizer()

In [71]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [72]:
X_train_dtm = vect.fit_transform(X_train).toarray()

In [37]:
clf = LDA()

In [38]:
clf.fit(X_train_dtm, y_train)

LinearDiscriminantAnalysis(n_components=None, priors=None, shrinkage=None,
              solver='svd', store_covariance=False, tol=0.0001)

In [73]:
X_test_dtm = vect.transform(X_test).toarray()

In [40]:
clf.score(X_test_dtm, y_test)

0.44418331374853115

So above I used the LDA classifier, which I was hoping would work well, since it seems to be compatible with document term matrices, but it scored about the same as the other models. Maybe we should try feature selection, rather than dimensionality reduction. And if that doesn't work, binning.

Note that I also used the countvectorizer, because intuitively it makes sense when identifying flavor -- you want the most common ingredients to have the most weight.

### Try Feature selection

#### Variance threshold

In [106]:
sel = VarianceThreshold(threshold=(.8 * (1 - .8)))

In [119]:
# try KNN with reduced features
knn = KNeighborsClassifier(n_neighbors=20)

In [120]:
X = df.ingredient_string
y = df.bitter_encode

In [121]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [122]:
vect = CountVectorizer(stop_words='english')

In [123]:
X_train_dtm = vect.fit_transform(X_train).toarray()

In [124]:
reduced = sel.fit_transform(X_train_dtm)

In [125]:
knn.fit(reduced, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=20, p=2,
           weights='uniform')

In [126]:
X_test_dtm = vect.transform(X_test).toarray()

In [127]:
reduced_test = sel.transform(X_test_dtm)

In [128]:
predictions = knn.predict(reduced_test)

In [129]:
accuracy_score(y_test, predictions)

0.38719153936545242

Left off with the variance threshold feature selection tactic not working well. Either tune it to allow more features, or move onto select percentile of kbest features.

#### chi2 feature selection - try this to "weed out" features that are independent of class

In [20]:
X = df.ingredient_string
y = df.bitter_encode

In [None]:
X_new = SelectKBest(chi2, k=).fit_transform(X, y)

In [21]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [22]:
vect = CountVectorizer(stop_words='english')

In [23]:
X_train_dtm = vect.fit_transform(X_train)

## Look at bitterest recipes, try to engineer a feature or two

In [27]:
bitterest = df[df.bitter>.8]

In [28]:
vect = CountVectorizer()

In [31]:
dtm = vect.fit_transform(bitterest.ingredient_string).todense()

In [38]:
df_bitter = pd.DataFrame(dtm, columns=vect.get_feature_names())

In [41]:
df_bitter.sum().sort_values(ascending=False)[40:60]

vegetable    212
oregano      202
basil        199
bean         197
butter       196
seed         194
white        193
leaf         189
parmesan     186
flake        183
parsley      178
cumin        177
lime         176
bell         165
feta         163
skinless     156
grated       156
cheddar      155
seasoning    155
extra        150
dtype: int64

In [105]:
counter = 0
bitterest = []
for i in df.ingredient_string:
    if 'vinegar' in i and 'ginger'in i and 'clove' in i and 'lime' in i:
        bitterest.append(counter)
    counter += 1

In [106]:
df.iloc[bitterest,:]

Unnamed: 0,id,cuisine,ingredient_string,bitter,meaty,piquant,salty,sour,sweet,bitter_encode,piquant_encode,salty_encode,sour_encode,sweet_encode,meaty_encode
1025,Flank-Steak-with-Grilled-Mango-and-Watermelon-...,American,brown sugar salt ground cumin garlic clove fla...,0.166667,0.166667,0.166667,0.333333,0.666667,0.5,1,1,2,4,3,1
2382,Mince-Pork-Rice-Bowl-1695254,Asian,minced pork onion ginger garlic clove sesame o...,0.166667,0.166667,0.833333,0.166667,0.333333,0.166667,1,5,1,2,1,1
2815,Asian-Grilled-Flank-Steak-1632515,Asian,rice vinegar sesame oil fresh lime juice tamar...,0.5,0.833333,0.0,0.5,0.166667,0.166667,3,0,3,1,1,5
3352,Thai-Basil-Chicken-1630624,Asian,ground chicken thigh shiitake red pepper onion...,0.666667,0.333333,0.166667,0.833333,0.666667,0.166667,4,1,5,4,1,2
3530,Chicken-Satay-with-Peanut-Noodles-513555,Asian,reduced sodium soy sauce cooking oil garlic cl...,0.166667,0.166667,0.166667,0.166667,0.166667,0.333333,1,1,1,1,2,1
3835,Spicy-Thai-Grilled-Chicken-1536483,Asian,boneless skinless chicken thigh fresh lime jui...,0.166667,0.833333,0.0,0.166667,0.166667,0.166667,1,0,1,1,1,5
3865,Asian-Pork-Tenderloin-with-Ginger-Glaze-_Slow-...,Asian,brown sugar salt ground ginger cinnamon garlic...,0.833333,0.333333,0.166667,0.833333,0.166667,0.5,5,1,5,1,3,2
