# Scraping Spruce Eats

I scraped drink recipes from www.thespruceeats.com in a two-stage process. First, I collected a list of drink names along with their individual page URLs and base spirits. Then I looped through the page URLs and scraped the recipe, description, and image link for each drink. Finally, I cleaned the data and saved it to a pickle file.

### 1. Imports and Functions
* **var_to_pickle**: Writes the given variable to a pickle file
* **read_pickle**: Reads the given pickle file

In [1]:
import pandas as pd
import numpy as np
import requests
import time
import re
from unidecode import unidecode
from bs4 import BeautifulSoup

from code.lw_pickle import var_to_pickle, read_pickle

### 2. Get List of Recipe Links
Here I scrape drink names, URLs, and base spirits.

In [2]:
links_pk = '../data/se_drink_links.pk'
drink_links = read_pickle(links_pk)

# Scrape data only if no previously pickled links were loaded
# (presumably read_pickle returns something falsy when the file is missing;
# verify against code.lw_pickle)
if not drink_links:
    drink_links = []
    url = 'https://www.thespruceeats.com/a-to-z-cocktail-recipes-3962886'

    # requests never times out by default, so bound the wait, and fail loudly
    # on an HTTP error rather than parsing (and pickling) an error page
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, 'lxml')

    # Gets list of all list items containing a beverage
    # (filtered on empty class and id attributes)
    list_items = soup.find_all('li', {'class': '', 'id': ''})
    for item in list_items:

        # Gets recipe url and skips item if there isn't one; this also covers
        # anchors that have no (or a blank) href attribute
        a_tag = item.a
        if not a_tag:
            continue
        href = a_tag.get('href')
        recipe_url = href.strip() if href else ''
        if not recipe_url:
            continue

        # Gets base spirits (the item text left over once the link text is
        # removed) and skips item if there aren't any
        base_spirits = item.text.replace(a_tag.text, '').lower().strip()
        if not base_spirits:
            continue

        # Assigns name, base spirit, and recipe link to dictionary
        name = a_tag.text.lower().strip()
        drink_links.append({
            'name': name,
            'base_spirits': base_spirits,
            'url': recipe_url,
        })

    # Writes out pickle of links
    var_to_pickle(drink_links, links_pk)

### 3. Create DataFrame and Clean Data
I save the scraped data into a DataFrame and then clean the names and base spirits. I also eliminate any duplicates and URLs that do not lead to an individual recipe page.

In [3]:
# Simple function that makes all necessary changes to a list of base spirits
def clean_base_spirits(spirit_list):
    """Normalize a list of raw base-spirit strings.

    Each entry is transliterated with ``unidecode``, stripped of surrounding
    whitespace, and passed through a few literal substitutions
    ('liqueurs' -> 'liqueur', 'add vodka' -> 'vodka',
    'knob creek' -> 'whiskey'). An entry that is exactly one of the bare
    flavour words below gets ' liqueur' appended.

    Args:
        spirit_list: Iterable of strings; falsy entries are ignored.

    Returns:
        A new list of cleaned strings in input order, without empty strings.
    """
    liqueurs = {'chocolate', 'banana', 'melon', 'coffee'}
    out_list = []
    for original in spirit_list:
        if not original:
            continue
        revised = unidecode(original).strip()
        # A whitespace-only entry is only empty *after* stripping, so check
        # again here to keep '' out of the result
        if not revised:
            continue
        revised = (revised.replace('liqueurs', 'liqueur')
                          .replace('add vodka', 'vodka')
                          .replace('knob creek', 'whiskey'))
        if revised in liqueurs:
            revised += ' liqueur'
        out_list.append(revised)
    return out_list

In [4]:
df = pd.DataFrame(drink_links)
df.drop_duplicates(inplace=True)

# Cleans up names and adds column for name words.
# regex=True is passed explicitly: pandas 2.x treats the pattern as a literal
# string by default, which would silently leave the names untouched.
df['name'] = (df['name'].str.replace(r'\(.*\)', '', regex=True)
                        .str.replace(r'\s*&\s*', ' and ', regex=True)
                        .str.replace('old fashioned', 'old-fashioned',
                                     regex=False)
                        .apply(unidecode)
                        .str.strip())
df['name_words'] = (df['name'].str.replace(r'[^a-z0-9 \-]', '', regex=True)
                              .str.split())

# Drops an extra, branded old-fashioned entry
drop_idx = df[(df['name'] == 'old-fashioned') &
              (df['url'].str.contains('knob-creek'))].index
df.drop(drop_idx, inplace=True)

# Adds some measures to check how close name is to url:
# the number of name words found in the url, and that number as a fraction
# of all name words
url_count = []
url_percent = []
for _, row in df.iterrows():
    matches = [word in row['url'] for word in row['name_words']]
    url_count.append(sum(matches))
    # A name with no usable words would otherwise divide by zero
    url_percent.append(url_count[-1] / len(matches) if matches else 0.0)
df['url_name_percent'] = url_percent
df['url_name_count'] = url_count

# Splits up base spirits into lists
# (the multi-character split pattern is interpreted as a regular expression)
df['base_spirits'] = (df['base_spirits'].str.replace(r'[\(\)]', '', regex=True)
                                        .str.split(',| and | or ')
                                        .apply(clean_base_spirits))

# Remove specific recipes due to various issues
drop_names = [
    'corpse reviver',
    'tornado cocktail']
drop_urls = [
    'https://www.verywellfamily.com/best-diaper-bags-4161109',
    'https://www.thespruceeats.com/popular-brands-of-gin-to-try-4027227',
    'https://www.thespruceeats.com/wonderful-winter-cocktails-4123837',
    'https://www.thespruceeats.com/popular-brands-of-premium-vodka-759245',
    'https://www.thespruceeats.com/hot-toddy-collection-759883',
    'https://www.thespruceeats.com/christmas-cocktail-recipe-collection-759882',
    'https://www.thespruceeats.com/fantastic-sangria-recipes-759875',
    'https://www.thespruceeats.com/irish-whiskey-ginger-ale-beer-drinks-761457',
    'https://www.thespruceeats.com/spring-cocktail-recipes-759873']
df.drop(df[df['name'].isin(drop_names)].index, inplace=True)
df.drop(df[df['url'].isin(drop_urls)].index, inplace=True)

# Removes other duplicates by keeping only the names that best match their urls.
# Rows sharing a url are sorted so the best match comes first within each url,
# then the first row per url is kept.
dup_df = df[df.duplicated(subset='url', keep=False)]
df = df[~df.duplicated(subset='url', keep=False)]
dup_df = dup_df.sort_values(by=['url', 'url_name_percent', 'url_name_count'],
                            ascending=False)
keepers_df = dup_df.groupby('url', as_index=False).first()
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0
df = pd.concat([df.reset_index(drop=True), keepers_df], sort=False)

# Cleans up DataFrame by removing columns that are no longer necessary
df = df.sort_values(by='name').reset_index(drop=True)
df.drop(['url_name_percent', 'url_name_count'], axis=1, inplace=True)

### 4. Get Recipes and Descriptions
Next I scraped all individual drink page data, including descriptions, image URLs, ingredients, prep time, and instructions.

In [5]:
recipes_pk = '../data/se_recipes.pk'
recipes = read_pickle(recipes_pk)

# Scrape data only if no previously pickled recipes were loaded
if not recipes:
    recipes = []
    # One dict is appended per row, in df.index order, so that the list lines
    # up positionally with df
    for ind in df.index:
        # requests never times out by default; bound the wait so a stalled
        # connection cannot hang the whole scrape
        response = requests.get(df.loc[ind, 'url'], timeout=30)
        soup = BeautifulSoup(response.text, 'lxml')
        recipe_dict = {}

        # Gets description (None when no text blocks are found)
        tag = soup.find('div', {'id': 'article__header--project_1-0'})
        description = []
        if tag:
            description = [
                x.text.strip()
                for x in tag.find_all(
                    'div', {'class': 'comp mntl-sc-block mntl-sc-block-html'})
            ]
        recipe_dict['description'] = (' '.join(description)
                                      if description else None)

        # Gets image
        image = soup.find('img', {'class': 'figure__image js-figure-image'})
        recipe_dict['image'] = image.get('src') if image else None

        # Gets ingredients (empty list when the section is missing)
        tag = soup.find('section', {'id': 'section--ingredients_1-0'})
        ingredients = []
        if tag:
            ingredients = [x.text.strip() for x in tag.find_all('li')]
        recipe_dict['ingredients'] = ingredients

        # Gets prep time. find() returns None for a missing element, so a
        # missing span surfaces as AttributeError on the chained access;
        # only that case is treated as "no prep time".
        try:
            recipe_dict['prep_time'] = (soup.find('span', {'id': 'meta-text_1-0'})
                                            .find('span', {'class': 'meta-text__data'})
                                            .text)
        except AttributeError:
            recipe_dict['prep_time'] = None

        # Gets instructions (None when no text blocks are found)
        tag = soup.find('section', {'id': 'section--instructions_1-0'})
        instructions = []
        if tag:
            instructions = [
                x.text.strip()
                for x in tag.find_all(
                    'div', {'class': 'comp mntl-sc-block mntl-sc-block-html'})
            ]
        recipe_dict['instructions'] = (' '.join(instructions)
                                       if instructions else None)

        recipes.append(recipe_dict)

        # Pause every 20 sites
        if ind % 20 == 0:
            time.sleep(10)

    # Writes out pickle of recipes
    var_to_pickle(recipes, recipes_pk)

### 5. Merge Dataframe
Here I merge the per-drink data into the original DataFrame and remove the few entries that are missing values.

In [6]:
# Line the scraped fields up with the drinks by index label, then discard
# every row that has a missing value and renumber what is left
recipes_df = pd.DataFrame(recipes)
df = (df.merge(recipes_df, left_index=True, right_index=True)
        .dropna()
        .reset_index(drop=True))

### 6. Clean Recipe Data
I only cleaned the recipe columns that I ended up using for my model and app: description and ingredients. Later iterations could incorporate instructions and prep time, but here I just drop those columns.

In [7]:
# Simple function that cleans lists of ingredients
def clean_ingredients(ingredient_list):
    """Return a cleaned copy of a list of ingredient strings.

    Each item is transliterated with ``unidecode`` and every run of
    whitespace is collapsed to a single space.

    Args:
        ingredient_list: Iterable of ingredient strings.

    Returns:
        A new list of cleaned strings in input order.
    """
    # Raw string: '\s' in a normal string literal is an invalid escape sequence
    return [re.sub(r'\s+', ' ', unidecode(item)) for item in ingredient_list]

In [8]:
# Columns that were scraped but are not used further
unused_columns = ['instructions', 'prep_time']
df.drop(columns=unused_columns, inplace=True)

# Apply the matching cleaner to each remaining text column
for column, cleaner in (('description', unidecode),
                        ('ingredients', clean_ingredients)):
    df[column] = df[column].apply(cleaner)

### 7. Pickle DataFrame

In [9]:
# Destination path for the final, cleaned DataFrame
df_pk = '../data/se_df.pk'
# Writes the DataFrame out with the project's pickle helper
var_to_pickle(df, df_pk)