# Scrape Spruce Eats

### Imports and Functions

In [1]:
import pandas as pd
import numpy as np
import requests
import unicodedata
import re
import time
from bs4 import BeautifulSoup

from code.lw_pickle import var_to_pickle, read_pickle

### Get List of Cocktail Links

In [2]:
links_pk = '../data/se_drink_links.pk'
# NOTE(review): the check below relies on read_pickle returning something
# falsy when there is no usable cache -- confirm against code.lw_pickle
drink_links = read_pickle(links_pk)

# Scrape data only if pickle of links does not exist
if not drink_links:
    drink_links = []
    url = 'https://www.thespruceeats.com/a-to-z-cocktail-recipes-3962886'
    # Time out instead of hanging forever, and fail loudly on an HTTP error
    # so that an error page is never parsed and cached as the index
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Gets list of all list items containing a beverage
    for item in soup.find_all('li', {'class': '', 'id': ''}):

        # Gets recipe link and skips item if there is no anchor or the
        # anchor carries no href (get() returns None in that case)
        a_tag = item.a
        href = a_tag.get('href') if a_tag else None
        if not href:
            continue

        # Gets base spirits (the item text left over once the link text is
        # removed) and skips item if there aren't any
        base_spirits = item.text.replace(a_tag.text, '').lower().strip()
        if not base_spirits:
            continue

        # Assigns name, base spirit, and recipe link to dictionary
        name = a_tag.text.lower().strip()
        drink_links.append({
            'name': name,
            'base_spirits': base_spirits,
            'url': href.strip()
        })

    # Writes out pickle of links
    var_to_pickle(drink_links, links_pk)

### Create DataFrame and Clean Data

In [3]:
# Simple function that makes all necessary changes to a list of base spirits
def clean_base_spirits(spirit_list):
    """Normalise a list of raw base-spirit strings.

    Each entry is stripped of surrounding whitespace and has a few known
    variants rewritten ('liqueurs' -> 'liqueur', 'add vodka' -> 'vodka',
    'knob creek' -> 'whiskey'). An entry that is exactly one of the bare
    flavour names below gets ' liqueur' appended.

    Args:
        spirit_list: iterable of strings; entries may be None or empty.

    Returns:
        A tuple of the cleaned names, in input order. Entries that are
        None, empty, or whitespace-only are left out.
    """
    liqueurs = {'chocolate', 'banana', 'melon', 'coffee'}
    out_list = []
    for original in spirit_list:
        # Whitespace-only pieces (e.g. from a trailing separator) would
        # otherwise survive as empty-string spirits
        if not original or not original.strip():
            continue
        revised = (original.strip()
                           .replace('liqueurs', 'liqueur')
                           .replace('add vodka', 'vodka')
                           .replace('knob creek', 'whiskey'))
        if revised in liqueurs:
            revised += ' liqueur'
        out_list.append(revised)
    return tuple(out_list)

In [4]:
df = pd.DataFrame(drink_links)
df.drop_duplicates(inplace=True)

# Cleans up names and adds column for name words.
# The patterns are raw strings with regex=True spelled out: pandas >= 2.0
# treats str.replace patterns as literal text by default, which would
# silently leave the parenthesised notes and punctuation in place.
df['name'] = df['name'].str.replace(r'\(.*\)', '', regex=True).str.strip()
df['name_words'] = (df['name'].str.replace(r'[^a-z0-9 \-]', '', regex=True)
                              .str.split())

# Adds some measures to check how close name is to url:
# how many of the name's words occur in the url, and what share that is
url_count = []
url_percent = []
for row_url, words in zip(df['url'], df['name_words']):
    hits = sum(word in row_url for word in words)
    url_count.append(hits)
    # A name with no words left after cleaning would divide by zero
    url_percent.append(hits / len(words) if words else 0.0)
df['url_name_percent'] = url_percent
df['url_name_count'] = url_count

# Splits up base spirits into tuples of cleaned spirit names
base_spirits = df['base_spirits'].str.replace(r'[\(\)]', '', regex=True)
base_spirits = base_spirits.str.split(',| and | or ')
base_spirits = base_spirits.apply(clean_base_spirits)
df['base_spirits'] = base_spirits

# Remove specific recipes due to various issues
drop_names = [
    'corpse reviver',
    'tornado cocktail']
drop_urls = [
    'https://www.verywellfamily.com/best-diaper-bags-4161109',
    'https://www.thespruceeats.com/popular-brands-of-gin-to-try-4027227',
    'https://www.thespruceeats.com/wonderful-winter-cocktails-4123837',
    'https://www.thespruceeats.com/popular-brands-of-premium-vodka-759245',
    'https://www.thespruceeats.com/hot-toddy-collection-759883',
    'https://www.thespruceeats.com/christmas-cocktail-recipe-collection-759882',
    'https://www.thespruceeats.com/fantastic-sangria-recipes-759875',
    'https://www.thespruceeats.com/irish-whiskey-ginger-ale-beer-drinks-761457',
    'https://www.thespruceeats.com/spring-cocktail-recipes-759873']
df.drop(df[df['name'].isin(drop_names)].index, inplace=True)
df.drop(df[df['url'].isin(drop_urls)].index, inplace=True)

# Removes other duplicates: where several names share a url, keeps only the
# name that best matches it (the descending sort puts the highest
# url_name_percent / url_name_count first within each url)
is_dup = df.duplicated(subset='url', keep=False)
dup_df = df[is_dup]
df = df[~is_dup]
dup_df = dup_df.sort_values(by=['url', 'url_name_percent', 'url_name_count'],
                            ascending=False)
keepers_df = dup_df.groupby('url', as_index=False).first()
# DataFrame.append was removed in pandas 2.0; concat is the equivalent
df = pd.concat([df.reset_index(drop=True), keepers_df], sort=False)

df = df.sort_values(by='name').reset_index(drop=True)
df.drop(['url_name_percent', 'url_name_count'], axis=1, inplace=True)

### Get Recipes and Descriptions

In [5]:
recipes_pk = '../data/se_recipes.pk'
recipes = read_pickle(recipes_pk)

# Attributes identifying the text blocks used for description/instructions
_TEXT_BLOCK = {'class': 'comp mntl-sc-block mntl-sc-block-html'}


def _joined_text(container, name, attrs):
    """Return the stripped text of matching descendants joined by spaces.

    Returns None when `container` is None or nothing inside it matches.
    """
    if container is None:
        return None
    parts = [x.text.strip() for x in container.find_all(name, attrs)]
    return ' '.join(parts) if parts else None


# Scrape data only if pickle of recipes does not exist.
# NOTE(review): the cached list is later merged onto df by row position, so
# a pickle produced from a different df would pair recipes with the wrong
# drinks -- delete the pickle whenever the link list changes.
if not recipes:
    recipes = []
    for ind in df.index:
        # Time out rather than hang the whole scrape on one stalled request.
        # HTTP errors are deliberately not raised: a page that lacks the
        # expected elements simply yields None fields.
        response = requests.get(df.at[ind, 'url'], timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        recipe_dict = {}

        # Gets description
        header = soup.find('div', {'id': 'article__header--project_1-0'})
        recipe_dict['description'] = _joined_text(header, 'div', _TEXT_BLOCK)

        # Gets image
        image = soup.find('img', {'class': 'figure__image js-figure-image'})
        recipe_dict['image'] = image.get('src') if image else None

        # Gets ingredients (an empty list when the section is missing)
        section = soup.find('section', {'id': 'section--ingredients_1-0'})
        recipe_dict['ingredients'] = (
            [x.text.strip() for x in section.find_all('li')] if section else [])

        # Gets prep time; either span may be absent, so check each step
        # explicitly instead of hiding every error behind a bare except
        meta = soup.find('span', {'id': 'meta-text_1-0'})
        data = meta.find('span', {'class': 'meta-text__data'}) if meta else None
        recipe_dict['prep_time'] = data.text if data else None

        # Gets instructions
        section = soup.find('section', {'id': 'section--instructions_1-0'})
        recipe_dict['instructions'] = _joined_text(section, 'div', _TEXT_BLOCK)

        recipes.append(recipe_dict)

        # Pause every 20 sites
        if ind % 20 == 0:
            time.sleep(10)

    # Writes out pickle of recipes
    var_to_pickle(recipes, recipes_pk)

### Merge Dataframe

In [6]:
# Attaches the scraped recipe fields to the drinks, pairing rows on index
df = pd.merge(df, pd.DataFrame(recipes), left_index=True, right_index=True)

In [7]:
# Drops every row with a missing value, then renumbers the index from 0
df = df.dropna().reset_index(drop=True)

### Add Description Lengths

In [8]:
# Number of characters in each description
df['desc_length'] = df['description'].str.len()

### Pre-process Descriptions

In [49]:
import re
from textblob import TextBlob
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [52]:
# Part-of-speech tags the description filter is meant to keep.
# NOTE(review): not applied anywhere yet -- the tagging step of
# desc_prepro was never finished.
POS_KEEP = ('FW', 'JJ', 'JJR', 'JJS', 'MD', 'NN', 'NNS', 'NNP', 'NNPS', 'RB',
            'RBR', 'RBS', 'UH', 'VB', 'VBZ', 'VBP', 'VBD', 'VBN', 'VBG')


def desc_prepro(desc):
    """Remove everything except ASCII letters and spaces from a description.

    The earlier pattern '[^[A-za-z ]]*' was a malformed character class: the
    'A-z' range also covers '[', '\\', ']', '^', '_' and '`', so those
    characters were kept, and the trailing ']*' matched literal brackets.

    Args:
        desc: the description text.

    Returns:
        `desc` with every character other than A-Z, a-z and space removed.
        Case is left unchanged.
    """
    return re.sub(r'[^A-Za-z ]', '', desc)

In [53]:
# Runs the pre-processing on one randomly chosen description and wraps the
# result in a TextBlob so its tags can be inspected
g = desc_prepro(df.sample(1)['description'].values[0])
tb = TextBlob(g)

In [57]:
# Shows the first (word, tag) pair of the sampled description
tb.tags[0]

('Theres', 'VBZ')

In [10]:
# Adds a column holding the pre-processed text of every description
df['proc_desc'] = df['description'].map(desc_prepro)

In [11]:
# Displays five randomly sampled rows as a spot check
df.sample(5)

Unnamed: 0,base_spirits,name,url,name_words,description,image,ingredients,instructions,prep_time,desc_length,proc_desc
281,"(gin,)",gin buck,https://www.thespruceeats.com/gin-buck-cocktai...,"[gin, buck]",Many of the best mixed drinks are the easiest ...,https://www.thespruceeats.com/thmb/xT7nMgvlRmb...,"[2 ounces gin, 3​ ounces Ginger ale, Garnish: ...",Gather the ingredients. Pour the gin into an i...,3 mins,815,many of the best mixed drinks are the easiest ...
610,"(whiskey,)",ruby queen,https://www.thespruceeats.com/ruby-queen-cockt...,"[ruby, queen]",Have you ever wondered if you could mix Scotch...,https://www.thespruceeats.com/thmb/27Gty2xCHs2...,[1 1/2 ounces Cutty Sark Blended Scotch Whisky...,Pour the ingredients into a cocktail shaker fi...,3 mins,1315,have you ever wondered if you could mix scotch...
163,"(rum,)",chicago fizz,https://www.thespruceeats.com/classic-chicago-...,"[chicago, fizz]",The Chicago Fizz is a classic mixed drink that...,https://www.thespruceeats.com/thmb/-K3RQMv-2Ok...,"[1 ounce dark rum, 1 ounce ruby port, 1/2 ounc...",Pour all ingredients (except the soda) into a ...,4 mins,769,the chicago fizz is a classic mixed drink that...
410,"(gin,)",lavender sapphire collins,https://www.thespruceeats.com/lavender-sapphir...,"[lavender, sapphire, collins]",The Lavender Sapphire Collins is a delightful ...,https://www.thespruceeats.com/thmb/8Co8cvZ3sgC...,"[1 1/2 ounces Bombay Sapphire Gin, 3/4 ounce f...",Squeeze the juice of half a lemon into a colli...,3 mins,746,the lavender sapphire collins is a delightful ...
560,"(vodka,)",pot of gold,https://www.thespruceeats.com/pot-of-gold-cock...,"[pot, of, gold]","Simple, elegant, refreshing, and quite lovely,...",https://www.thespruceeats.com/thmb/y6wSdUAyLBv...,"[2 ounces vodka, 1/2 ounce elderflower liqueur...",Gather the ingredients. In the bottom of a coc...,3 mins,623,simple elegant refreshing and quite lovely tha...


### Stop Words

In [30]:
from sklearn.feature_extraction import text

# Extra words to ignore on top of scikit-learn's built-in English list
stop_words = ['cocktail', 'drink', 'recipe', 'make', 'mix', 'flavor']
# union() returns a frozenset, but the vectorizers document stop_words as
# 'english', a list, or None, and recent scikit-learn releases reject other
# types -- so keep the combined result as a (sorted, reproducible) list
stop_words = sorted(text.ENGLISH_STOP_WORDS.union(stop_words))

### Topic Modeling

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from gensim import corpora, models, similarities, matutils

In [32]:
# Counts words per description, then flips the matrix so that rows are
# terms and columns are documents
cv = CountVectorizer(stop_words=stop_words)
td_mtx = cv.fit_transform(df['proc_desc'].values)
td_mtx = td_mtx.transpose()

In [33]:
# Wraps the term-document matrix as a gensim corpus and inverts the
# vectorizer's word -> column-index mapping into index -> word
corpus = matutils.Sparse2Corpus(td_mtx)
id2word = {index: word for word, index in cv.vocabulary_.items()}

In [34]:
# Fits a 10-topic LDA model on the corpus, making 5 passes over it.
# NOTE(review): no random_state is passed, so the topics will presumably
# differ from run to run -- confirm whether reproducibility matters here
lda = models.LdaModel(corpus=corpus, num_topics=10, id2word=id2word, passes=5)

In [35]:
# Shows each topic as a weighted list of its top words
lda.print_topics()

[(0,
  '0.011*"vodka" + 0.011*"mix" + 0.007*"perfect" + 0.007*"easy" + 0.006*"just" + 0.006*"make" + 0.006*"champagne" + 0.005*"blue" + 0.005*"flavor" + 0.005*"wine"'),
 (1,
  '0.008*"punch" + 0.008*"simple" + 0.008*"make" + 0.007*"mix" + 0.006*"orange" + 0.006*"little" + 0.006*"great" + 0.006*"sangria" + 0.005*"liqueur" + 0.005*"makes"'),
 (2,
  '0.011*"vodka" + 0.010*"irish" + 0.007*"cream" + 0.007*"whiskey" + 0.007*"great" + 0.007*"perfect" + 0.007*"just" + 0.007*"coffee" + 0.007*"easy" + 0.006*"gin"'),
 (3,
  '0.013*"gin" + 0.009*"vodka" + 0.008*"cocktails" + 0.008*"flavor" + 0.008*"martini" + 0.007*"simple" + 0.007*"mix" + 0.007*"just" + 0.007*"drinks" + 0.007*"sweet"'),
 (4,
  '0.010*"rum" + 0.009*"mix" + 0.008*"great" + 0.008*"like" + 0.008*"classic" + 0.007*"drinks" + 0.006*"recipes" + 0.006*"just" + 0.006*"perfect" + 0.006*"ingredients"'),
 (5,
  '0.011*"vodka" + 0.011*"martini" + 0.009*"simple" + 0.007*"flavor" + 0.006*"make" + 0.006*"coffee" + 0.006*"liqueur" + 0.005*"really

### Train KMeans Model

In [None]:
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

%matplotlib inline

In [None]:
# Builds TF-IDF features from the pre-processed descriptions.
# NOTE(review): this uses the built-in 'english' stop words, not the custom
# stop_words collection given to CountVectorizer -- confirm that is intended
tfv = TfidfVectorizer(stop_words='english')
descriptions = tfv.fit_transform(df['proc_desc'].values)

In [None]:
# Fits KMeans for every cluster count from 2 to 29, recording the inertia
# and the silhouette score of each fit
k_clusters = range(2, 30)
inertias, sil_scores = [], []
for k in k_clusters:
    km = KMeans(n_clusters=k)
    km.fit(descriptions)
    score = silhouette_score(descriptions, km.labels_, metric='euclidean')
    inertias.append(km.inertia_)
    sil_scores.append(score)

In [None]:
# Side-by-side plots of silhouette score (left) and inertia (right)
# against the number of clusters
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5), sharex=True)
panels = (
    (ax1, sil_scores, 'silhouette coefficient'),
    (ax2, inertias, 'SSE'),
)
for axis, values, y_label in panels:
    axis.plot(k_clusters, values)
    axis.set_xlabel('number of clusters')
    axis.set_ylabel(y_label)

In [None]:
# Fits the final 15-cluster model and records each drink's cluster label
km = KMeans(n_clusters=15).fit(descriptions)
df['cluster'] = km.labels_

### Examine Results

In [None]:
# Number of drinks assigned to each cluster
df['cluster'].value_counts()

In [None]:
# Prints up to 10 randomly chosen drinks (name and base spirits) per cluster
for c in sorted(df['cluster'].unique()):
    print(f'Cluster {c}:')
    members = df.loc[df['cluster'] == c, ['name', 'base_spirits']]
    # sample() raises ValueError when asked for more rows than exist, so
    # cap the sample at the cluster size
    shown = members.sample(min(10, len(members)))
    for drink_name, spirits in shown.values:
        # Pad/truncate the name to 30 characters to keep columns aligned
        print(drink_name.ljust(30)[:30], '\t', ', '.join(spirits)[:30])
    print('')