In [12]:
from bs4 import BeautifulSoup
from selenium import webdriver
import csv
import requests
from tqdm import tqdm_notebook as tqdm
import re
import time
import numpy.random as nrand

In [2]:
# Endpoint that serves the paginated wine list.
MAIN_URL = 'https://www.cellartracker.com/list.asp'

# Request headers sent with every `requests` call; the User-Agent string is
# split over several literals only for readability (adjacent literals are
# concatenated by the parser).
HEADERS = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/69.0.3497.100 Safari/537.36'
    ),
}

In [3]:
# Output directory (relative to the notebook) and the three CSV files the
# scraper appends to.
DIR = '../data/cellar-tracker-scrape/'
URL_FILE = '{}wines_url.csv'.format(DIR)                               # one wine URL per row
WINE_DATA_FILE = '{}cellar_tracker_wine_data.csv'.format(DIR)          # basic infos per wine
TASTING_NOTES_FILE = '{}cellar_tracker_reviews.csv'.format(DIR)        # tasting notes per wine

In [4]:
def scrape(driver, max_page):
    '''Master function that handles the actual scraping.

    For every list page, collect the wine URLs found on it, append them to
    URL_FILE, then scrape the basic infos and the tasting notes of each wine.

    Parameters
    ----------
    driver : Selenium webdriver used to load the wine-list pages.
    max_page : int
        Exclusive upper bound: pages 1 .. max_page - 1 are visited
        (``range(1, max_page)``).

    Returns
    -------
    None. Results are appended to URL_FILE, WINE_DATA_FILE and
    TASTING_NOTES_FILE by the helpers.
    '''
    for page_number in range(1, max_page):
        winelist_url = get_wine_list_url(page_number)
        if not winelist_url:
            # get_wine_list_url returns '' when the list request did not
            # answer with HTTP 200; there is nothing to load for this page,
            # and handing '' to driver.get() would only raise.
            continue
        wines_urls = get_wines_url(driver, winelist_url)
        display('wines_urls: '+str(wines_urls))
        write_to_csv(wines_urls, URL_FILE)
        for wine_url in wines_urls:
            scrape_wine(wine_url)
            scrape_reviews(wine_url)
            # Fixed pause between wines to avoid hammering the site.
            time.sleep(2)

def scrape_wine(wine_url):
    '''Scrape the basic infos of a single wine and append them to WINE_DATA_FILE.

    The page at `wine_url` is fetched and parsed by get_wine_soup, the fields
    are extracted by get_wine_infos, and the resulting rows are handed to
    write_to_csv.
    '''
    infos_rows = get_wine_infos(get_wine_soup(wine_url))
    write_to_csv(infos_rows, WINE_DATA_FILE)

def scrape_reviews(wine_url):
    '''Scrape the tasting notes of a single wine and append them to TASTING_NOTES_FILE.

    Parameters
    ----------
    wine_url : str
        URL of the wine page; get_tasting_notes derives the wine id from it.

    Notes
    -----
    get_tasting_notes is defined as ``get_tasting_notes(driver, wine_url)``,
    so the webdriver has to be passed explicitly. This function keeps its
    one-argument signature and uses the notebook-level `driver`, the same
    object get_wine_soup relies on.
    '''
    wine_id, soup = get_tasting_notes(driver, wine_url)
    reviews = parse_tasting_notes(wine_id, soup)
    write_to_csv(reviews, TASTING_NOTES_FILE)

In [5]:
def get_wine_list_url(page_number, timeout=30):
    '''Build the URL of one page of the wine list, given its page number.

    Sends a GET request to MAIN_URL with the list query parameters and
    returns the URL that was actually requested.

    Parameters
    ----------
    page_number : int
        Value of the ``page`` query parameter.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        `requests` can block forever on an unresponsive server.

    Returns
    -------
    str
        ``r.url`` when the response status is 200; otherwise the status code
        and body are printed and the empty string is returned.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from `requests.get` (connection errors, timeouts).
    '''
    payload = {'Table': 'List',
                'iUserOverride': 0,
                'O': 'Quantity DESC',
                'page':page_number}
    r = requests.get(MAIN_URL, params=payload, headers=HEADERS, timeout=timeout)
    if r.status_code == 200:
        return r.url
    # Surface the failure details for the person running the notebook, then
    # signal "no URL" to the caller with an empty string.
    print(r.status_code)
    print(r.text)
    return ''

def get_wines_url(driver, wine_list_url):
    '''Collect the wine links found on one wine-list page.

    Loads `wine_list_url` in `driver`, looks up every element whose class
    name is 'more' and returns the value of each element's ``href``
    attribute, in the order the elements were found.
    '''
    driver.get(wine_list_url)
    return [
        link.get_attribute('href')
        for link in driver.find_elements_by_class_name('more')
    ]

In [11]:
def get_wine_soup(wine_url):
    '''Load a single wine page and return it parsed as BeautifulSoup.

    Uses the notebook-level `driver` to open `wine_url`, parses the page
    source, displays the prettified markup and returns the soup.
    '''
    driver.get(wine_url)
    wine_soup = BeautifulSoup(driver.page_source)
    display('wine_soup: ' + wine_soup.prettify())
    return wine_soup

def strip_white_space(text):
    r"""Replace each newline (\n), non-breaking space (\xa0) and tab (\t) with a space.

    The characters are substituted one-for-one, so the length of the
    returned string equals the length of `text`.
    """
    to_space = str.maketrans({'\n': ' ', '\xa0': ' ', '\t': ' '})
    return text.translate(to_space)

def get_main_score(soup):
    '''Get wine main score.

    Looks up the ``<div class="scorebox">`` element and returns the text that
    follows the 'CT' marker (plus one separator character) up to the end of
    that line.

    Parameters
    ----------
    soup : parsed wine page exposing ``find(name, class_=...)``.

    Returns
    -------
    str
        The score text, or '' when the page has no scorebox or the scorebox
        text contains no 'CT' marker.
    '''
    scorebox = soup.find('div', class_='scorebox')
    if scorebox is None:
        return ''
    box_text = scorebox.text
    marker = box_text.find('CT')
    if marker == -1:
        return ''
    # Skip 'CT' and the single character that separates it from the score.
    after_marker = box_text[marker + 3:]
    # Keep only the first line. Splitting (rather than slicing at
    # str.find('\n')) keeps the whole remainder when no newline follows the
    # score, instead of silently dropping its last character.
    return after_marker.split('\n', 1)[0]

def find_href_by_keyword(soup, keyword):
    '''Return the text of the first ``<a>`` whose href matches `keyword`.

    Parameters
    ----------
    soup : parsed page exposing ``find(name, href=...)``.
    keyword : str
        Compiled as an unanchored regular expression and matched against the
        link's href. Because the pattern is unanchored, a keyword such as
        'Region' also matches hrefs containing 'SubRegion'.

    Returns
    -------
    str
        The link text, or '' when no link matches (instead of raising
        AttributeError on the missing element).
    '''
    link = soup.find('a', href=re.compile(keyword))
    if link is None:
        return ''
    return link.text

def get_wine_infos(soup):
    '''Extract the basic infos of a wine from its parsed page.

    Returns a one-element list holding a dict with the keys 'title',
    'varietal', 'score', 'country', 'region_1' and 'region_2' (in that
    order), so the result can be passed directly to write_to_csv.
    '''
    infos = {
        'title': soup.title.text,
        'varietal': soup.h2.text,
        'score': get_main_score(soup),
        'country': find_href_by_keyword(soup, 'Country'),
        'region_1': find_href_by_keyword(soup, 'Region'),
        'region_2': find_href_by_keyword(soup, 'SubRegion'),
    }
    return [infos]

In [7]:
def get_tasting_notes(driver,wine_url):
    '''Given wine url, get all available tasting notes.

    Parameters
    ----------
    driver : Selenium webdriver used to load the notes page.
    wine_url : str
        Wine page URL; everything after the last 'iWine=' is taken as the
        wine id.

    Returns
    -------
    tuple
        ``(wine_id, soup)`` where `soup` is the notes page parsed with
        BeautifulSoup.
    '''
    wine_id = wine_url[wine_url.rfind('iWine=')+6:]
    review_url = 'https://www.cellartracker.com/notes.asp?iWine='+wine_id
    driver.get(review_url)
    # Parse what the browser actually rendered; the page is loaded through
    # the webdriver, so there is no `requests` response object here.
    source = driver.page_source
    soup = BeautifulSoup(source)
    display('wine_tasting_soup: '+soup.prettify())
    return wine_id,soup

def parse_tasting_notes(wine_id,soup):
    '''Turn a parsed tasting-notes page into a list of review records.

    Every ``<p itemprop="reviewBody">`` yields one dict with the keys
    'wine' (the given `wine_id`), 'note' (the paragraph text) and 'score'
    (text of the ``<span itemprop="ratingValue">`` found under the
    paragraph's parent, or the falsy lookup result when there is none).
    '''
    def _to_record(note_tag):
        body = note_tag.text
        rating = note_tag.parent.find('span', itemprop='ratingValue')
        return {'wine': wine_id,
                'note': body,
                'score': rating.text if rating else rating}

    return [_to_record(tag) for tag in soup.findAll('p', itemprop='reviewBody')]

In [8]:
def write_to_csv(data, file):
    '''Append `data` to the CSV file at path `file`, one row per item.

    Parameters
    ----------
    data : iterable
        Rows to write. Each item may be:
        * a dict  -> its values are written, in the dict's own order
          (no header row is emitted);
        * a str   -> written as a single-field row (a bare string passed to
          ``csv.writer.writerow`` would be split into one column per
          character);
        * any other iterable -> written as-is.
    file : str
        Path of the CSV file; it is created if missing and appended to
        otherwise.
    '''
    # newline='' lets the csv module control line endings (avoids blank rows
    # on Windows); UTF-8 keeps scraped non-ASCII text writable on any platform.
    with open(file, 'a+', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        for row in data:
            if isinstance(row, dict):
                row = list(row.values())
            elif isinstance(row, str):
                row = [row]
            writer.writerow(row)

In [9]:
# Start the Chrome session that the scraping helpers drive (get_wine_soup reads
# this notebook-level `driver` directly).
# NOTE(review): the argument is an absolute, machine-specific path — presumably
# a locally installed chromedriver binary; adjust it before running elsewhere.
driver = webdriver.Chrome('/usr/local/Caskroom/chromedriver/2.41/chromedriver')

In [14]:
# NOTE(review): `wine_urls` is not assigned in any visible cell (scrape() only
# has a local named `wines_urls`), so this cell raises NameError as shown in
# its output. Define the list of URLs before sampling from it.
nrand.choice(wine_urls, replace=False)

NameError: name 'wine_urls' is not defined