# Scrape Spruce Eats

### Imports and Functions

In [63]:
import pandas as pd
import numpy as np
import requests
import unicodedata
import re
import time
from bs4 import BeautifulSoup

from code.lw_pickle import var_to_pickle, read_pickle

### Get List of Cocktail Links

In [64]:
links_pk = '../data/se_drink_links.pk'
# Result of an earlier scrape, if any; the check below treats every falsy
# return value as "nothing cached".
drink_links = read_pickle(links_pk)

# Scrape data only if pickle of links does not exist
if not drink_links:
    drink_links = []
    index_url = 'https://www.thespruceeats.com/a-to-z-cocktail-recipes-3962886'
    # A timeout keeps a stalled connection from hanging the notebook forever
    response = requests.get(index_url, timeout=30)
    soup = BeautifulSoup(response.text, 'lxml')

    # Gets list of all list items containing a beverage
    list_items = soup.findAll('li', {'class':'', 'id':''})
    for item in list_items:

        # Gets recipe url and skips item if there isn't one
        # (no anchor at all, or an anchor without an href attribute)
        a_tag = item.a
        if not a_tag:
            continue
        href = a_tag.get('href')
        if not href:
            continue
        url = href.strip()

        # Gets base spirits and skips item if there aren't any.
        # The base spirits are whatever text of the list item is left once
        # the link text has been removed.
        base_spirits = item.text.replace(a_tag.text, '').lower().strip()
        if not base_spirits:
            continue

        # Assigns name, base spirit, and recipe link to dictionary
        name = a_tag.text.lower().strip()
        drink_links.append({
            'name':name,
            'base_spirits':base_spirits,
            'url':url
        })

    # Writes out pickle of links
    var_to_pickle(drink_links, links_pk)

### Create DataFrame and Clean Data

In [65]:
# Simple function that makes all necessary changes to a list of base spirits
def clean_base_spirits(spirit_list):
    """Normalize the raw base-spirit strings of one drink.

    Every entry is stripped, a few known wordings are rewritten
    ('liqueurs' -> 'liqueur', 'add vodka' -> 'vodka',
    'knob creek' -> 'whiskey') and a bare liqueur flavor such as
    'coffee' gets ' liqueur' appended. Entries that are empty or consist
    only of whitespace are dropped.

    Args:
        spirit_list: iterable of raw base-spirit strings.

    Returns:
        Tuple of the cleaned entries, in their original order.
    """
    liqueur_flavors = ('chocolate', 'banana', 'melon', 'coffee')
    out_list = []
    for original in spirit_list:
        # Strip BEFORE testing for emptiness, so that a whitespace-only
        # fragment (e.g. after a trailing comma) is not emitted as ''.
        stripped = original.strip() if original else ''
        if not stripped:
            continue
        revised = (stripped.replace('liqueurs', 'liqueur')
                           .replace('add vodka', 'vodka')
                           .replace('knob creek', 'whiskey'))
        if revised in liqueur_flavors:
            revised += ' liqueur'
        out_list.append(revised)
    return tuple(out_list)

In [66]:
df = pd.DataFrame(drink_links)
df.drop_duplicates(inplace=True)

# Cleans up names and adds column for name words.
# The patterns are raw strings and regex=True is passed explicitly:
# pandas >= 2.0 treats the pattern as a literal string by default, which
# would silently leave the names unchanged.
df['name'] = df['name'].str.replace(r'\(.*\)', '', regex=True).str.strip()
df['name_words'] = (df['name'].str.replace(r'[^a-z0-9 \-]', '', regex=True)
                              .str.split())

# Adds some measures to check how close name is to url
url_count = []
url_percent = []
for link, words in zip(df['url'], df['name_words']):
    hits = sum(word in link for word in words)
    url_count.append(hits)
    # A name without any usable word counts as a 0% match instead of
    # raising ZeroDivisionError
    url_percent.append(hits / len(words) if words else 0.0)
df['url_name_percent'] = url_percent
df['url_name_count'] = url_count

# Splits up base spirits into lists
base_spirits = df['base_spirits'].str.replace(r'[\(\)]', '', regex=True)
base_spirits = base_spirits.str.split(',| and | or ')
base_spirits = base_spirits.apply(clean_base_spirits)
df['base_spirits'] = base_spirits

# Remove specific recipes due to various issues
drop_names = [
    'corpse reviver',
    'tornado cocktail']
drop_urls = [
    'https://www.verywellfamily.com/best-diaper-bags-4161109',
    'https://www.thespruceeats.com/popular-brands-of-gin-to-try-4027227',
    'https://www.thespruceeats.com/wonderful-winter-cocktails-4123837',
    'https://www.thespruceeats.com/popular-brands-of-premium-vodka-759245',
    'https://www.thespruceeats.com/hot-toddy-collection-759883',
    'https://www.thespruceeats.com/christmas-cocktail-recipe-collection-759882',
    'https://www.thespruceeats.com/fantastic-sangria-recipes-759875',
    'https://www.thespruceeats.com/irish-whiskey-ginger-ale-beer-drinks-761457',
    'https://www.thespruceeats.com/spring-cocktail-recipes-759873']
df.drop(df[df['name'].isin(drop_names)].index, inplace=True)
df.drop(df[df['url'].isin(drop_urls)].index, inplace=True)

# Resolves urls that still appear more than once: for each such url only
# the row whose name matches the url best is kept (highest
# url_name_percent, ties broken by url_name_count).
dup_df = df[df.duplicated(subset='url', keep=False)]
df = df[~df.duplicated(subset='url', keep=False)]
dup_df = dup_df.sort_values(by=['url', 'url_name_percent', 'url_name_count'],
                            ascending=False)
keepers_df = dup_df.groupby('url', as_index=False).first()
# DataFrame.append was removed in pandas 2.0; pd.concat is the replacement
df = pd.concat([df.reset_index(drop=True), keepers_df], sort=False)

df = df.sort_values(by='name').reset_index(drop=True)
df.drop(['url_name_percent', 'url_name_count'], axis=1, inplace=True)

### Get Recipes and Descriptions

In [67]:
recipes_pk = '../data/se_recipes.pk'
# Result of an earlier scrape, if any; the check below treats every falsy
# return value as "nothing cached".
recipes = read_pickle(recipes_pk)

# Scrape data only if pickle of recipes does not exist
if not recipes:
    recipes = []
    for ind in df.index:
        # A timeout keeps one stalled page from hanging the whole scrape
        response = requests.get(df.loc[ind, 'url'], timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        recipe_dict = {}

        # Gets description
        tag = soup.find('div', {'id':'article__header--project_1-0'})
        description = []
        if tag:
            for x in tag.findAll('div', {'class':'comp mntl-sc-block mntl-sc-block-html'}):
                description.append(x.text.strip())
        if description:
            recipe_dict['description'] = ' '.join(description)
        else:
            recipe_dict['description'] = None

        # Gets image
        image = soup.find('img', {'class':'figure__image js-figure-image'})
        if image:
            recipe_dict['image'] = image.get('src')
        else:
            recipe_dict['image'] = None

        # Gets ingredients
        tag = soup.find('section', {'id':'section--ingredients_1-0'})
        ingredients = []
        if tag:
            for x in tag.findAll('li'):
                ingredients.append(x.text.strip())
        recipe_dict['ingredients'] = ingredients

        # Gets prep time; either lookup yields None when the element is
        # missing, which makes `.find` / `.text` raise AttributeError.
        # Only that error is treated as "no prep time" so that unrelated
        # failures (including KeyboardInterrupt) are not swallowed.
        try:
            recipe_dict['prep_time'] = (soup.find('span', {'id':'meta-text_1-0'})
                                            .find('span', {'class':'meta-text__data'})
                                            .text)
        except AttributeError:
            recipe_dict['prep_time'] = None

        # Gets instructions
        tag = soup.find('section', {'id':'section--instructions_1-0'})
        instructions = []
        if tag:
            for x in tag.findAll('div', {'class':'comp mntl-sc-block mntl-sc-block-html'}):
                instructions.append(x.text.strip())
        if instructions:
            recipe_dict['instructions'] = ' '.join(instructions)
        else:
            recipe_dict['instructions'] = None

        # One record per row of df, appended in df.index order
        recipes.append(recipe_dict)

        # Pause every 20 sites
        if ind % 20 == 0:
            time.sleep(10)

    # Writes out pickle of recipes
    var_to_pickle(recipes, recipes_pk)

### Merge Dataframe

In [68]:
# Attach the scraped recipe fields to the link table; rows of both frames
# are paired up by their index labels.
recipes_df = pd.DataFrame(recipes)
df = pd.merge(df, recipes_df, left_index=True, right_index=True)

In [69]:
# Discard every row that has at least one missing value, then renumber the
# remaining rows from 0
df = df.dropna().reset_index(drop=True)

### Add Description Lengths

In [70]:
df['desc_length'] = df['description'].str.len()

### Pre-process Descriptions

In [71]:
import re
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer

nlp = spacy.load("en_core_web_sm")

In [72]:
def desc_prepro(desc):
    """Reduce a description to the lemmas of its adjectives and nouns.

    The text is parsed with the module-level spaCy pipeline ``nlp``.
    Tokens tagged ADJ, NOUN or PROPN contribute their lemma; lemmas that
    contain a hyphen are left out. The surviving lemmas are returned as
    one space-separated string.
    """
    kept_pos = ['ADJ', 'NOUN', 'PROPN']
    lemmas = []
    for token in nlp(desc):
        if token.pos_ not in kept_pos:
            continue
        lemma = token.lemma_
        if '-' in lemma:
            continue
        lemmas.append(lemma)
    return ' '.join(lemmas)

In [73]:
df['proc_desc'] = df['description'].map(desc_prepro)

### Stop Words

In [74]:
from sklearn.feature_extraction import text

# Words that are removed in addition to scikit-learn's built-in English
# stop-word list
custom_stop_words = [
    'cocktail', 'drink', 'recipe', 'make', 'mix', 'flavor', 'good',
    'ingredient', 'taste', 'perfect', 'little', 'bar', 'nice', 'blue',
    'great', 'way', 'favorite', 'new', 'popular', 'delicious', 'green',
    'party', 'fun', 'black', 'sure', 'time', 'glass', 'woo', 'year',
    'st', 'shot', 'garnish', 'pink', 'bit', 'different', 'choice',
]
# The union is a frozenset, but the vectorizers in scikit-learn >= 1.2
# validate `stop_words` and accept only 'english', a list or None, so the
# combined set is turned into a (sorted, hence reproducible) list.
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(custom_stop_words))

### Cocktail-Based Recommender

In [111]:
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import euclidean_distances

In [108]:
# TF-IDF weights of the pre-processed descriptions, one matrix row per
# row of df; the stop-word list defined above is excluded from the
# vocabulary.
proc_texts = df['proc_desc'].values
tfv = TfidfVectorizer(stop_words=stop_words)
tf_mtx = tfv.fit_transform(proc_texts)

In [139]:
# Pairwise comparison of all descriptions. With cosine_similarity a LARGER
# value means a closer pair, despite the "dist" in the names.
dist_func = cosine_similarity
# The sparse TF-IDF matrix is passed as is: `.todense()` returns an
# np.matrix, which recent scikit-learn releases refuse, and the pairwise
# functions imported above accept sparse input and return a dense array.
dist_df = pd.DataFrame(dist_func(tf_mtx))

In [187]:
def recommend_by_cocktail(cocktails, num_recos=5):
    """Recommend the recipes most similar to one or more source cocktails.

    The rows of ``dist_df`` belonging to the source cocktails are summed
    column-wise, the totals are ranked from highest to lowest, and the
    source cocktails themselves are removed from the ranking.

    Args:
        cocktails: a single index label of ``df`` / ``dist_df``, or a
            list, tuple, set, numpy array, pandas Index or Series of such
            labels.
        num_recos: maximum number of recommendations to return.

    Returns:
        The rows of ``df`` for the top ``num_recos`` recommendations, best
        match first.
    """
    # Any common container of labels is accepted; everything else is
    # treated as one single label.
    containers = (list, tuple, set, frozenset, np.ndarray, pd.Index, pd.Series)
    if isinstance(cocktails, containers):
        cocktails = list(cocktails)
    else:
        cocktails = [cocktails]

    # Rank every recipe by its summed score against the source cocktails
    totals = dist_df.loc[cocktails].sum(axis=0)
    totals = totals.sort_values(ascending=False)

    # The sources always score highest against themselves, so drop them
    candidates = totals[~totals.index.isin(cocktails)]
    ranked_df = df.loc[candidates.index.tolist()]

    return ranked_df.head(num_recos)

In [215]:
sources = df.sample(2)

In [216]:
sources['ingredients'].tolist()

[['2 ounces XUXU Strawberry Liqueur',
  '1 ounce Gran Centenario Reposado Tequila',
  '1/2 ounce fresh lime juice',
  '1/2 ounce simple syrup',
  'Fanned strawberry for garnish',
  '-----',
  'For Rhubarb Soda:',
  '6 cups chopped rhubarb stalks',
  '1 1/2 cup sugar',
  '1 cup waterClub soda'],
 ['1 1/2 ounces whiskey\xa0(bourbon)',
  '1 ounce lemon (juice of)',
  '1/2 ounce simple syrup',
  '2\u200b ounces\xa0\u200bclub soda',
  'Garnish:\xa0\u200b\u200bmaraschino cherry',
  'Garnish:\xa0\u200borange slice']]

In [217]:
recommend_by_cocktail(sources.index.tolist())['ingredients'].tolist()

[['1 1/2 ounces vodka',
  '1 ounce lemon juice',
  '1/2 ounce simple syrup',
  '1 splash of Club soda',
  'Garnish: orange slice',
  'Garnish: maraschino cherry'],
 ['1 1/2 ounces tequila',
  '1 ounce lemon juice',
  '1/2 ounce agave nectar',
  '2 oz.\xa0club soda\xa0(or enough to top the glass)',
  'Garnish: Lemon, lime wedge'],
 ['For the Rhubarb Syrup:',
  '1 1/2 cups rhubarb juice',
  '1/2 cup sugar',
  'For the Rhubarb Collins:',
  "2 1/2 ounces gin (Nolet's Silver Dry Gin)",
  '1 ounce rhubarb syrup',
  '1 ounce\xa0lime juice',
  'Dash\xa0rhubarb bitters\xa0(Fee Brothers)',
  '2 ounces club soda (to fill)',
  '4 dashes cardamom bitters'],
 ['1 1/2 ounces gin',
  '1 ounce lemon juice',
  '1/2 ounce simple syrup',
  '3 ounces club soda (or to fill)',
  'Garnish: maraschino cherry',
  'Garnish:\xa0lemon or orange slice'],
 ['1 1/2 ounces\xa0\u200bwhiskey',
  '1 1/2 ounces\xa0\u200blemon juice\xa0(fresh)',
  '3/4 ounce\xa0\u200bsimple syrup',
  'Optional:\xa0\u200begg white',
  'Garn

### SVD

In [97]:
from scipy.linalg import svd

In [103]:
# Singular value decomposition of the densified term-document count matrix.
# NOTE(review): `td_mtx` is only created in the "Topic Modeling" section
# further down (cell In [96]); when the notebook is run top to bottom this
# line raises NameError unless that cell has been executed first.
C, Sigma, VT = svd(td_mtx.todense())

In [107]:
VT = VT[:3,:]
pd.DataFrame(VT)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,3657,3658,3659,3660,3661,3662,3663,3664,3665,3666
0,-0.000261,-0.000239,-0.003224,-0.000676,-0.000458,-0.000458,-0.000327,-0.000526,-0.003623,-0.001449,...,-0.003176,-0.000487,-0.000846,-0.00026,-0.000732,-0.00329,-0.001841,-0.004588,-0.003094,-0.000314
1,-0.000205,-0.000188,0.009479,-0.000535,0.000934,0.000934,-6.2e-05,0.001577,0.003762,0.002453,...,-0.000894,-0.000365,-0.001497,-0.000188,0.002017,-0.000358,-0.002326,-0.004897,-0.003421,-0.00029
2,0.000334,0.000514,0.005506,0.001212,0.001263,0.001263,0.000176,0.000709,0.00227,0.001037,...,-0.005704,0.000836,0.00187,0.000158,0.000332,-0.00525,-0.000137,-0.00087,0.002392,0.000584


### Topic Modeling

In [87]:
from sklearn.feature_extraction.text import CountVectorizer

In [96]:
# Raw term counts of the pre-processed descriptions, one matrix row per
# row of df, with the stop words removed from the vocabulary
proc_texts = df['proc_desc'].values
cv = CountVectorizer(stop_words=stop_words)
td_mtx = cv.fit_transform(proc_texts)

In [None]:
from gensim import corpora, models, similarities, matutils

In [78]:
# The count matrix is transposed before it is wrapped as a gensim corpus
corpus = matutils.Sparse2Corpus(td_mtx.transpose())
# cv.vocabulary_ maps term -> column id; the model needs column id -> term
id2word = {col_id: term for term, col_id in cv.vocabulary_.items()}
lda = models.LdaModel(corpus=corpus, num_topics=50, id2word=id2word, passes=5)

In [79]:
lda.print_topics()

[(2,
  '0.024*"gin" + 0.022*"margarita" + 0.020*"ginger" + 0.019*"easy" + 0.015*"spicy" + 0.013*"lime" + 0.012*"refreshing" + 0.011*"orange" + 0.011*"sweet" + 0.011*"spice"'),
 (4,
  '0.029*"gin" + 0.025*"cucumber" + 0.020*"mint" + 0.016*"hot" + 0.015*"wasabi" + 0.013*"japanese" + 0.013*"fresh" + 0.012*"spicy" + 0.012*"vodka" + 0.012*"sauce"'),
 (28,
  '0.023*"sour" + 0.017*"punch" + 0.017*"brandy" + 0.016*"vodka" + 0.016*"fruity" + 0.014*"easy" + 0.014*"margarita" + 0.012*"orange" + 0.012*"peach" + 0.012*"sugar"'),
 (20,
  '0.026*"whiskey" + 0.019*"simple" + 0.018*"sweet" + 0.017*"tall" + 0.016*"liqueur" + 0.015*"bourbon" + 0.014*"ginger" + 0.013*"coffee" + 0.012*"ale" + 0.011*"easy"'),
 (10,
  '0.020*"white" + 0.019*"gin" + 0.018*"chocolate" + 0.015*"liqueur" + 0.014*"rum" + 0.014*"alabama" + 0.011*"sake" + 0.010*"simple" + 0.010*"orange" + 0.009*"sweet"'),
 (22,
  '0.109*"tea" + 0.049*"long" + 0.038*"island" + 0.031*"iced" + 0.020*"vanilla" + 0.016*"liqueur" + 0.013*"liquor" + 0.011

In [80]:
# Apply the trained model to the corpus and materialize the result as a
# list with one entry per document
lda_corpus = lda[corpus]
lda_docs = list(lda_corpus)

In [81]:
# For every document pick the id of its highest-weighted topic. Each doc
# is a list of (topic id, weight) pairs; scanning it in reverse makes
# max() return the LAST of several equally weighted topics.
topics = [max(reversed(doc), key=lambda pair: pair[1])[0] for doc in lda_docs]
df['topic'] = topics

In [84]:
df['topic'].value_counts()

16    37
41    34
8     33
39    30
48    30
30    25
0     21
37    21
14    20
9     20
45    20
21    19
28    19
43    18
33    18
42    17
34    17
38    17
29    17
19    15
17    15
6     15
13    15
2     14
25    14
32    14
1     13
20    13
7     13
24    13
23    13
44    13
49    12
26    12
46    11
27    11
31    11
12    11
4     10
10    10
40     9
5      9
35     9
18     9
11     9
3      8
47     8
22     7
15     6
36     4
Name: topic, dtype: int64

In [85]:
df[df['topic'] == 36]['proc_desc'].values

array(['blue blazer cocktail advanced bartender rolling flaming whiskey mug story drink jerry thomas 1800 etching drink famous image professor that today blue blazer drink pro anyone flaming drink english christmas punch dr. pepper goblet fire sure safety tip',
       'boulevardier sophisticated classic cocktail that whiskey negroni boulevardier cocktail negroni common cocktail world exact date creation drink little sketchy true negroni drink other base spirit sweet vermouth campari negroni gin boulevardier opt whiskey bourbon vermouth campari natural choice aperitif subtle combination flavor that pleasant host option negroni boulevardier dinner party taste whiskey guest',
       'joy irish whiskey cocktail decade tradition few drink sophisticated design recipe massey cocktail exceptional example art mixology one that connoisseur fine drink sure massey cocktail creation portland bartender jacob grier tribute irish grandmother stunning mix top shelf spirit irish whiskey gin sweet vermou

### Train KMeans Model

In [60]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

%matplotlib inline

In [61]:
# TF-IDF features for clustering. This vectorizer uses only the built-in
# 'english' stop words and rebinds the name `tfv`.
proc_texts = df['proc_desc'].values
tfv = TfidfVectorizer(stop_words='english')
descriptions = tfv.fit_transform(proc_texts)

In [62]:
# Fit one KMeans model per candidate cluster count and record its inertia
# and its silhouette score
k_clusters = range(2, 30)
inertias = []
sil_scores = []
for k in k_clusters:
    km = KMeans(n_clusters=k).fit(descriptions)
    inertias.append(km.inertia_)
    score = silhouette_score(descriptions, km.labels_, metric='euclidean')
    sil_scores.append(score)

KeyboardInterrupt: 

In [None]:
# Side-by-side curves over the candidate cluster counts:
# silhouette coefficient on the left, inertia (SSE) on the right
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(15,5), sharex=True)
panels = [
    (ax1, sil_scores, 'silhouette coefficient'),
    (ax2, inertias, 'SSE'),
]
for axis, scores, y_label in panels:
    axis.plot(k_clusters, scores)
    axis.set_xlabel('number of clusters')
    axis.set_ylabel(y_label)

In [None]:
# Final model with 15 clusters; every recipe is tagged with its cluster id
km = KMeans(n_clusters=15).fit(descriptions)
df['cluster'] = km.labels_

### Examine Results

In [None]:
df['cluster'].value_counts()

In [None]:
# Print up to 10 randomly chosen members (name and base spirits) of every
# cluster
for c in sorted(df['cluster'].unique()):
    print(f'Cluster {c}:')
    members = df.loc[df['cluster'] == c, ['name', 'base_spirits']]
    # sample() raises ValueError when asked for more rows than exist, so
    # the sample size is capped at the cluster size
    for name, spirits in members.sample(min(10, len(members))).values:
        # Fixed-width name column, then the joined base spirits
        print(name.ljust(30)[:30], '\t', ', '.join(spirits)[:30])
    print('')