# Building a Model Using Spruce Eats Data
I used the scraped and cleaned Spruce Eats data to build a recommender engine in this notebook.

### 1. Imports and Functions
* **var_to_pickle**: Writes the given variable to a pickle file
* **read_pickle**: Reads the given pickle file

In [1]:
import pandas as pd
import numpy as np
import re
import spacy
from sklearn.feature_extraction import text
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import pairwise_distances
from sklearn.decomposition import NMF

from code.lw_pickle import var_to_pickle, read_pickle

### 2. Load DataFrame From Pickle

In [2]:
# Path (relative to this notebook) of the pickled Spruce Eats DataFrame
df_pk = '../data/se_df.pk'
# NOTE(review): unpickling can execute arbitrary code - only load files you trust
df = read_pickle(df_pk)

### 3. Pre-process Descriptions
In this section I lemmatize the descriptions using spaCy, keeping only adjectives, nouns, and proper nouns.

In [3]:
# Load spaCy's small English pipeline; the pre-processing helpers below call it
# to obtain lemmas and part-of-speech tags
scy = spacy.load("en_core_web_sm")

In [4]:
# Lemmatizes lists of names and base spirits
def list_prepro(items):
    """Return the unique lemmas found in a list of word lists.

    Args:
        items: iterable of iterables of strings (one inner list per drink).

    Returns:
        list of distinct lemmas, in arbitrary order. Lemmas containing a
        hyphen are discarded.
    """
    # Collapse every row into one pool of distinct words and run the pipeline once
    unique_items = {entry for row in items for entry in row}
    lemmas = set()
    for token in scy(' '.join(unique_items)):
        lemma = token.lemma_
        if '-' in lemma:
            continue
        lemmas.add(lemma)
    return list(lemmas)

# Lemmatizes a single description
def desc_prepro(desc):
    """Reduce a description to a space-separated string of lemmas.

    Only tokens tagged ADJ, NOUN or PROPN are kept, and any lemma containing
    a hyphen is dropped.

    Args:
        desc: description text.

    Returns:
        str of the surviving lemmas joined by single spaces, in token order.
    """
    kept_pos = ('ADJ', 'NOUN', 'PROPN')
    return ' '.join(
        token.lemma_
        for token in scy(desc)
        if token.pos_ in kept_pos and '-' not in token.lemma_
    )

In [5]:
# Lemmatize every description; both TF-IDF vectorizers below are fitted on this Series
descriptions = df['description'].map(desc_prepro)

### 4. Create Lists of Stop Words
I created separate lists of stop words for the two models: the "safe" list contains only generic stop words shared by both models, while the more aggressive "fun" list also contains drink names and base spirits.

In [6]:
# Manually-populated list of generic stop words (repeated entries removed;
# the resulting stop-word sets are unchanged by that)
gen_stop_words = ['cocktail', 'drink', 'recipe', 'make', 'mix', 'flavor', 'good',
                  'ingredient', 'taste', 'perfect', 'little', 'bar', 'nice', 'blue',
                  'great', 'way', 'favorite', 'new', 'popular', 'delicious', 'green',
                  'party', 'fun', 'black', 'sure', 'time', 'glass', 'woo', 'year',
                  'st', 'shot', 'garnish', 'pink', 'bit', 'different', 'choice',
                  'bartender', 'fantastic', 'use', 'liquor', 'drinker', 'try']
safe_sw = text.ENGLISH_STOP_WORDS.union(gen_stop_words)

# Lemmatized lists of base spirits and drink names
base_spirits = list_prepro(df['base_spirits'].tolist())
name_words = list_prepro(df['name_words'].tolist())

# TfidfVectorizer lowercases documents before filtering stop words, so a
# mixed-case lemma (spaCy can keep the capitalisation of proper nouns) would
# never match anything. Lowercase the lemmas so they are actually removed.
fun_sw = text.ENGLISH_STOP_WORDS.union(
    gen_stop_words,
    (word.lower() for word in base_spirits + name_words),
)

### 5. Create Safe NMF Recommender
This recommender is based on the less aggressive, safe stop words list and returns predictions whose names and base spirits are similar to those of a given cocktail.

In [7]:
# Create TF-IDF Matrix
# stop_words is passed as a (sorted, hence deterministic) list: recent
# scikit-learn releases validate this parameter and reject a frozenset
safe_tfidf = TfidfVectorizer(stop_words=sorted(safe_sw))
safe_mtx = safe_tfidf.fit_transform(descriptions.values)

# Create NMF Vectors
# random_state is pinned so the factorization, and therefore every
# recommendation built on it, is reproducible across kernel restarts
safe_nmf = NMF(n_components=30, random_state=42)
safe_drink_vec = safe_nmf.fit_transform(safe_mtx)    # one row per description
safe_word_vec = safe_nmf.components_.transpose()     # one row per vocabulary term

### 6. Create Fun NMF Recommender
This recommender is based on the more aggressive stop words list and returns predictions that can differ wildly from a given cocktail.

In [8]:
# Create TF-IDF Matrix
# stop_words is passed as a (sorted, hence deterministic) list: recent
# scikit-learn releases validate this parameter and reject a frozenset
fun_tfidf = TfidfVectorizer(stop_words=sorted(fun_sw))
fun_mtx = fun_tfidf.fit_transform(descriptions.values)

# Create NMF Vectors
# random_state is pinned so the factorization, and therefore every
# recommendation built on it, is reproducible across kernel restarts
fun_nmf = NMF(n_components=25, random_state=42)
fun_drink_vec = fun_nmf.fit_transform(fun_mtx)     # one row per description
fun_word_vec = fun_nmf.components_.transpose()     # one row per vocabulary term

### 7. Create Recommender Class
This class serves as a data container for the models, vectors, and DataFrame needed to make the recommendation engine work properly. It also has a recommend function for simple compatibility with the web app.

In [9]:
class cocktail_recommender:
    """Container for the two NMF models plus a `recommend` entry point.

    A query is embedded with both the "safe" and the "fun" TF-IDF/NMF
    pipelines, optionally blended with the vectors of drinks whose full name
    appears in the query, and compared to every drink vector.

    Assumes the drink-vector matrices have one row per DataFrame row, in the
    same order; rows are therefore tracked by position, not by index label.
    """
    dist_metric = 'cosine'   # metric handed to pairwise_distances
    drink_weight = .4        # share of the query vector taken from name-matched drinks

    def __init__(self):
        """Bind the notebook-level models, vectors and DataFrame to the instance.

        Reads the module-level names `df`, `safe_tfidf`, `safe_nmf`,
        `safe_drink_vec`, `fun_tfidf`, `fun_nmf` and `fun_drink_vec`, so the
        cells defining them must have been run first.
        """
        self.df = df
        self.safe_tfidf = safe_tfidf
        self.safe_nmf = safe_nmf
        self.safe_drink_vec = safe_drink_vec
        self.fun_tfidf = fun_tfidf
        self.fun_nmf = fun_nmf
        self.fun_drink_vec = fun_drink_vec

        # One set of name words per drink, used for whole-name matching
        self.name_sets = self.df['name_words'].apply(set)

    def recommend(self, input_string, weirdness=.5, num_recos=10, exclude_inputs=True):
        """Rank drinks by similarity to a free-text query.

        Args:
            input_string: free-text query (words and/or drink names).
            weirdness: weight of the "fun" model's distances, clamped to
                [0, 1]; the "safe" model gets the remaining weight.
            num_recos: number of rows to return.
            exclude_inputs: when True, drinks whose name matched the query
                are removed from the results.

        Returns:
            (True, DataFrame) with the closest drinks first, or (False, None)
            when the query is empty, contains no usable characters, or maps
            to an all-zero vector in the safe model.
        """
        if not input_string:
            return False, None
        weirdness = max(min(weirdness, 1), 0)

        search_set = self.clean_input(input_string)
        if not search_set:
            # Nothing survived cleaning (e.g. punctuation only); transforming
            # an empty batch would raise instead of reporting "no result"
            return False, None
        # Sorted so the batch order (and the float summation in the mean) does
        # not depend on set iteration order
        search_words = sorted(search_set)
        drink_pos = np.flatnonzero(self._name_mask(search_set)).tolist()

        # Query vectors: each word is embedded on its own, then averaged
        safe_search_vec = self._query_vec(self.safe_tfidf, self.safe_nmf, search_words)
        fun_search_vec = self._query_vec(self.fun_tfidf, self.fun_nmf, search_words)

        # Blend in the vectors of drinks named in the query. Indexing with the
        # flat position list keeps the result 2-D (a nested list would add an axis).
        if drink_pos:
            safe_search_vec = self._blend(safe_search_vec, self.safe_drink_vec[drink_pos])
            fun_search_vec = self._blend(fun_search_vec, self.fun_drink_vec[drink_pos])

        # Without any signal in the safe model there is nothing to rank against
        if not safe_search_vec.sum():
            return False, None
        dist = pairwise_distances(X=self.safe_drink_vec,
                                  Y=safe_search_vec,
                                  metric=self.dist_metric)
        # The fun model only contributes when the query has signal there too
        if fun_search_vec.sum():
            fun_dist = pairwise_distances(X=self.fun_drink_vec,
                                          Y=fun_search_vec,
                                          metric=self.dist_metric)
            dist = (1 - weirdness) * dist + weirdness * fun_dist

        # Row positions ordered from closest to farthest
        rank_idx = dist[:, 0].argsort().tolist()
        if exclude_inputs:
            matched = set(drink_pos)
            rank_idx = [pos for pos in rank_idx if pos not in matched]
        # rank_idx holds positions, so select with iloc rather than by label
        return True, self.df.iloc[rank_idx].head(num_recos)

    def clean_input(self, input_string):
        """Normalize a query into a set of words.

        Lowercases and strips the string, deletes every character other than
        a-z, 0-9, space and hyphen, then splits on whitespace.

        Returns:
            set of the remaining words (possibly empty).
        """
        clean_str = re.sub(r'[^a-z0-9 \-]', '', input_string.lower().strip())
        return set(clean_str.split())

    def name_matches(self, input_set):
        """Return the index labels of drinks whose name words all occur in `input_set`."""
        return self.name_sets.index[self._name_mask(input_set)].tolist()

    def _name_mask(self, input_set):
        """Boolean array, one entry per drink: True when its name words are a subset of `input_set`."""
        return (self.name_sets
                .apply(lambda words: words.issubset(input_set))
                .to_numpy(dtype=bool))

    def _query_vec(self, tfidf, nmf, words):
        """Embed each word with the given TF-IDF/NMF pair and average them into a single row."""
        word_vecs = nmf.transform(tfidf.transform(words))
        return np.mean(word_vecs, axis=0, keepdims=True)

    def _blend(self, search_vec, drink_vecs):
        """Mix the query vector with the mean of the matched drink vectors using `drink_weight`."""
        drink_vec = np.mean(drink_vecs, axis=0, keepdims=True)
        return (1 - self.drink_weight) * search_vec + self.drink_weight * drink_vec

In [10]:
# Instantiate the recommender; its __init__ captures the DataFrame, vectorizers,
# NMF models and drink vectors defined in the cells above
cr = cocktail_recommender()

### 8. Recommender Testing
Cell for simple testing calls.

In [11]:
# recommend() returns a (success, DataFrame) tuple, so unpack it before
# selecting a column; indexing the tuple with 'name' raises TypeError
found, recos = cr.recommend('rum', exclude_inputs=False, weirdness=.5)
recos['name'] if found else None

613      rustic manhattan
549      plantation fever
45         banana hammock
267    frozen pina colada
772           yellow bird
181       coconut martini
510                paloma
687    surprised cocktail
90            blue blazer
612            rum runner
Name: name, dtype: object

### 9. Pickle Recommender

In [12]:
# Destination (relative to this notebook) for the pickled recommender object
reco_pk = '../data/reco.pk'
# NOTE(review): the class is defined in this notebook, so whatever unpickles
# this file presumably needs the same class importable - confirm with the web app
var_to_pickle(cr, reco_pk)