# Content Tagging and Recommendation Engine Demo
### Michael Shea, General Assembly

# 1. Import Libraries

In [1]:
from sqlalchemy import create_engine
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import cross_val_score, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.grid_search import GridSearchCV
from sklearn.cluster import KMeans
import requests
import json

# 2. Retrieve data from API

![alt text](http://www.yummly.com/wp-content/uploads/2013/03/Screen-Shot-2013-03-17-at-9.17.28-PM.png)

In [11]:
url = 'http://api.yummly.com/v1/api/recipes?'
headers = {'X-Yummly-App-ID':'e2b9bebc', 'X-Yummly-App-Key':'4193215272970d956cfd5384a08580a9'}
parameters = {'allowedCourse[]': 'course^course-Main Dishes', 
              'excludedCourse[]': ['course^course-Appetizers', 'course^course-Salads', 
                                   'course^course-Condiments and Sauces',
                                   'course^course-Lunch', 'course^course-Soups', 'course^course-Snacks',
                                   'course^course-Breakfast and Brunch', 'course^course-Side Dishes',],
              'allowedCuisine[]': 'cuisine^cuisine-asian',
              'excludedCuisine[]': ['cuisine^cuisine-american', 'cuisine^cuisine-italian', 'cuisine^cuisine-indian',
                                    'cuisine^cuisine-mexican', 'cuisine^cuisine-mediterranean', 'cuisine^cuisine-chinese',
                                   'cuisine^cuisine-japanese'],
              'maxResult': 500,
             'start': 1500}
response = requests.get(url, headers=headers, params=parameters)

# 3. Parse JSON

In [None]:
api_call = response.json()

cuisine_dict={}
for food in api_call['matches']:
    cuisine_dict[food.get('id')]= food['attributes'].get('cuisine')
        
_cuisines= {}       

for k, v in cuisine_dict.iteritems():
    cuisine_val = {}
    for course in cuisine_list:
        try:
            if course in v :
                cuisine_val[course] = 1
            else:
                cuisine_val[course] = 0
        except TypeError:
            cuisine_val[course] = 0
    
        _cuisines[k] = cuisine_val

# 4. Connect to PostgreSQL Instance and Read in Data

In [2]:
engine = create_engine('postgresql://treytrey3:113315th3@recipeproject3.czcsc2tr7kct.us-east-1.rds.amazonaws.com:5432/dsicapstone3')

df = pd.read_sql('SELECT * FROM ingredients INNER JOIN flavors ON ingredients.id = flavors.index;', engine)

![alt text](data-workflow.png)

# 5. Modelling - Create feature and target vectors

In [103]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', ngram_range=(1,2))
X = vect.fit_transform(df.ingredient_string)
y = df.cuisine

# 6. Predict cuisine from ingredients, comparing three models

In [48]:
# Create a dictionary to hold model scores
all_models = {}

# Create a function to evaluate
def model_predictions(model):
    predictions = cross_val_predict(model, X, y)
    return predictions

In [104]:
# Logistic regression
logit = LogisticRegression()
params = {'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0],
          'penalty': ['l1', 'l2']}
grid = GridSearchCV(logit, params)
grid.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [0.001, 0.01, 0.1, 1.0, 10.0, 100.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [12]:
# Support Vector Machine
svm = LinearSVC()
params = {'C': [1.0, 10.0],
          'penalty': ['l1', 'l2']}
grid_svm = GridSearchCV(logit, params)
grid_svm.fit(X, y)

GridSearchCV(cv=None, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'penalty': ['l1', 'l2'], 'C': [1.0, 10.0]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [22]:
# Naive Bayes
nb = MultinomialNB()
predictions = cross_val_predict(nb, X, y)

# 7. Demo - choose a few ingredients
- ## "e.g. tomato sauce onion garlic carrot parmesan cheese"

In [107]:
# lets try this out on out of sample data
new_data = ['peanut butter chicken broth flour']
new_data_dtm = vect.transform(new_data)

In [108]:
# Predict cuisine
grid.best_estimator_.predict(new_data_dtm)

array([u'Asian'], dtype=object)

# 8. Build Recommender

In [3]:
# Assign X and y
vect = TfidfVectorizer(stop_words='english', max_df=.5, ngram_range=(2,2))
X = vect.fit_transform(df.ingredient_string)
y = df.cuisine
terms = vect.get_feature_names()

# Cluster data
km = KMeans(n_clusters=6, init='k-means++', max_iter=100, n_init=1)
km.fit(X)

# Calculate ingredients closest to cluster centers
order_centroids = km.cluster_centers_.argsort()[:, ::-1]

# Build recommender function
def recommend_pantry(item):
    #generate list of ingredient recommendations
    transformed = vect.transform([item]) # transform to vectorizer
    cluster_pred = km.predict(transformed) # predict cluster
    for ind in order_centroids[cluster_pred[0], :15]:
        print(' %s' % terms[ind])

# 9. Demo Recommender
- ## e.g. "black bean" or "green onion"

In [24]:
recommend_pantry('butter')

 black pepper
 olive oil
 garlic clove
 cooking spray
 salt pepper
 bell pepper
 purpose flour
 vegetable oil
 chicken broth
 ground black
 salt ground
 salt black
 kosher salt
 pepper salt
 chopped fresh
