In [17]:
import time
from urllib import urlencode  # TODO(Miles): Python 3
import selenium.webdriver
from bs4 import BeautifulSoup
import itertools
import pandas as pd
import numpy as np

In [3]:
def search_boardgamegeek(query, browser, delay=10):
    """Load one page of the boardgamegeek.com board game listing and return its HTML.

    Arguments
    ---------
    query : int or str
        Value substituted into the last path segment of the browse URL
        (the scraping loop in this notebook passes page numbers).
    browser : selenium.webdriver.firefox.webdriver.WebDriver
        Constructed with browser=selenium.webdriver.Firefox()
    delay : int or float
        Seconds to sleep after requesting the page, before reading its source.

    Returns
    -------
    html : str
        The browser's page source once the delay has elapsed.
    """
    url_template = "https://boardgamegeek.com/browse/boardgame/page/{}"
    browser.get(url_template.format(query))
    # Give the page a moment to finish loading before grabbing the source.
    time.sleep(delay)
    html = browser.page_source
    return html
    

In [4]:
def get_product_price(product_tag):
    """Return the first 'td.collection_bggrating' cell of a listing row as a float.

    The notebook later stores this value under its 'Geek Rating' column.
    If the row contains no such cell, return None.

    Arguments
    ---------
    product_tag : bs4.element.Tag
        One row of the listing (the caller passes ``tr`` tags with id "row_").

    Returns
    -------
    product_price : float or None
    """
    rating_cells = product_tag.select('td.collection_bggrating')
    if not rating_cells:
        return None
    # Only the first matching cell is used here.
    first_cell_text = rating_cells[0].text.strip()
    return float(first_cell_text)

In [5]:
def get_product_price3(product_tag):
    """Return the third 'td.collection_bggrating' cell of a listing row as an int.

    The notebook later stores this value under its 'Num Ratings' column.
    If the row has fewer than three such cells, return None.

    Arguments
    ---------
    product_tag : bs4.element.Tag
        One row of the listing (the caller passes ``tr`` tags with id "row_").

    Returns
    -------
    product_price : int or None
    """
    rating_cells = product_tag.select('td.collection_bggrating')
    # Guard on the index that is actually read: a non-empty list with only
    # one or two cells used to pass the truthiness check and then raise
    # IndexError instead of returning None.
    if len(rating_cells) > 2:
        return int(rating_cells[2].text.strip())
    return None

In [6]:
def get_product_price2(product_tag):
    """Return the second 'td.collection_bggrating' cell of a listing row as a float.

    The notebook later stores this value under its 'Avg Rating' column.
    If the row has fewer than two such cells, return None.

    Arguments
    ---------
    product_tag : bs4.element.Tag
        One row of the listing (the caller passes ``tr`` tags with id "row_").

    Returns
    -------
    product_price : float or None
    """
    rating_cells = product_tag.select('td.collection_bggrating')
    # Guard on the index that is actually read: a list holding a single cell
    # used to pass the truthiness check and then raise IndexError instead of
    # returning None.
    if len(rating_cells) > 1:
        return float(rating_cells[1].text.strip())
    return None

In [7]:
def get_product_rank(product_tag):
    """Return the first 'td.collection_rank' cell of a listing row as an int.

    If the row contains no such cell, return None.

    Arguments
    ---------
    product_tag : bs4.element.Tag
        One row of the listing (the caller passes ``tr`` tags with id "row_").

    Returns
    -------
    product_rank : int or None
    """
    rank_cells = product_tag.select('td.collection_rank')
    if not rank_cells:
        return None
    # Only the first matching cell is used.
    rank_text = rank_cells[0].text.strip()
    return int(rank_text)

In [8]:
def get_product_title(product_tag, num):
    """Return the text lines of a listing row's title block.

    Looks inside ``product_tag`` for ``div#results_objectname<num>``, strips
    the surrounding whitespace from its text and splits it on newlines.
    If no such element is found, return None.

    Arguments
    ---------
    product_tag : bs4.element.Tag
        One row of the listing (the caller passes ``tr`` tags with id "row_").
    num : int
        Number appended to the ``results_objectname`` element id (the caller
        passes the 1-based position of the row on the page).

    Returns
    -------
    product_title : list of str, or None
        In the notebook's recorded output this is [title, '(year)'].
    """
    selector = 'div#results_objectname{}'.format(num)
    title_elements = product_tag.select(selector)
    if not title_elements:
        return None
    # split() always yields a list, so the former `type(...) == unicode`
    # re-encoding branch could never be taken -- and merely evaluating
    # `unicode` raises NameError on Python 3. Return the list unchanged.
    return title_elements[0].text.strip().split('\n')

In [9]:
def get_boardgamegeek_product_details(query, browser):
    """Fetch one boardgamegeek.com listing page and extract details for each row.

    Arguments
    ---------
    query : int or str
        Passed through to search_boardgamegeek (the scraping loop in this
        notebook passes page numbers).
    browser : selenium.webdriver.firefox.webdriver.WebDriver
        Constructed with browser=selenium.webdriver.Firefox()

    Returns
    -------
    product_details : list of tuple
        One tuple per ``tr`` with id "row_", in page order:
        (rank, title_lines, first, second and third 'td.collection_bggrating'
        values) as produced by get_product_rank, get_product_title,
        get_product_price, get_product_price2 and get_product_price3.
    """
    html = search_boardgamegeek(query, browser)
    soup = BeautifulSoup(html, 'html.parser')
    # Collect each row exactly once. The rows used to be looked up twice and
    # concatenated, so every row appeared twice in the result, and the second
    # copies were numbered past the real row count when looking up titles.
    product_tags = soup.find_all("tr", id="row_")
    return [
        (
            get_product_rank(t),
            get_product_title(t, i + 1),  # title element ids are numbered from 1
            get_product_price(t),
            get_product_price2(t),
            get_product_price3(t),
        )
        for i, t in enumerate(product_tags)
    ]
    

In [26]:
browser = selenium.webdriver.Firefox()
product_details = []
try:
    # Listing pages 1..134; keep at most the first 100 rows from each page.
    # `range` behaves the same as `xrange` here and also exists on Python 3.
    for i in range(1, 135):
        product_details.append(get_boardgamegeek_product_details(i, browser)[:100])
finally:
    # Always close the Firefox window, even when a page fails to load or parse.
    browser.quit()

In [21]:
# Show the scraped data: one inner list per page, each holding that page's row tuples.
product_details

[[(1, [u'Pandemic Legacy: Season 1', u'(2015)'], 8.469, 8.66, 15980),
  (2,
   [u'Through the Ages: A New Story of Civilization', u'(2015)'],
   8.306,
   8.77,
   6288),
  (3, [u'Twilight Struggle', u'(2005)'], 8.225, 8.36, 26436),
  (4, [u'Terra Mystica', u'(2012)'], 8.152, 8.29, 22808),
  (5, [u'Star Wars: Rebellion', u'(2016)'], 8.13, 8.56, 7306),
  (6, [u'Scythe', u'(2016)'], 8.083, 8.35, 12755),
  (7, [u'7 Wonders Duel', u'(2015)'], 8.029, 8.2, 18366),
  (8, [u'Caverna: The Cave Farmers', u'(2013)'], 8.021, 8.19, 16174),
  (9, [u'The Castles of Burgundy', u'(2011)'], 7.995, 8.11, 24030),
  (10, [u'Puerto Rico', u'(2002)'], 7.991, 8.1, 47089),
  (11, [u'Agricola', u'(2007)'], 7.973, 8.06, 47536),
  (12, [u'War of the Ring (Second Edition)', u'(2012)'], 7.945, 8.38, 6019),
  (13, [u'Mage Knight Board Game', u'(2011)'], 7.943, 8.14, 17684),
  (14,
   [u'Through the Ages: A Story of Civilization', u'(2006)'],
   7.94,
   8.1,
   15228),
  (15, [u'Blood Rage', u'(2015)'], 7.934, 8.18,

In [27]:
# Flatten the per-page lists into a single list of row tuples.
product_details_new = list(itertools.chain(*product_details))

In [28]:
# Split each row's title lines into a title and a publication year.
# Each input row is (rank, title_lines, geek_rating, avg_rating, num_ratings);
# title_lines is [title] or [title, '(year)'].
game_ratings = []
for rank, title_lines, geek_rating, avg_rating, num_ratings in product_details_new:
    # A missing year line is the only case that should fall back to 'N/A';
    # checking the length explicitly replaces a bare `except:` that would
    # also have hidden unrelated errors.
    if len(title_lines) > 1:
        year_published = title_lines[1].strip('()')
    else:
        year_published = 'N/A'
    game_ratings.append(
        (rank, title_lines[0], year_published, geek_rating, avg_rating, num_ratings)
    )

In [29]:
# Show the cleaned rows: (rank, title, year, then the three rating values).
game_ratings

[(1, u'Pandemic Legacy: Season 1', u'2015', 8.469, 8.66, 15951),
 (2,
  u'Through the Ages: A New Story of Civilization',
  u'2015',
  8.306,
  8.77,
  6279),
 (3, u'Twilight Struggle', u'2005', 8.226, 8.36, 26424),
 (4, u'Terra Mystica', u'2012', 8.153, 8.29, 22791),
 (5, u'Star Wars: Rebellion', u'2016', 8.13, 8.56, 7286),
 (6, u'Scythe', u'2016', 8.082, 8.35, 12711),
 (7, u'7 Wonders Duel', u'2015', 8.029, 8.2, 18324),
 (8, u'Caverna: The Cave Farmers', u'2013', 8.022, 8.19, 16163),
 (9, u'The Castles of Burgundy', u'2011', 7.994, 8.11, 24014),
 (10, u'Puerto Rico', u'2002', 7.991, 8.1, 47075),
 (11, u'Agricola', u'2007', 7.973, 8.06, 47516),
 (12, u'War of the Ring (Second Edition)', u'2012', 7.944, 8.38, 6009),
 (13, u'Mage Knight Board Game', u'2011', 7.943, 8.14, 17674),
 (14, u'Through the Ages: A Story of Civilization', u'2006', 7.94, 8.1, 15226),
 (15, u'Blood Rage', u'2015', 7.934, 8.18, 12126),
 (16, u'Star Wars: Imperial Assault', u'2014', 7.914, 8.22, 10533),
 (17, u'Terr

In [30]:
# Build the ratings table with a 1-based index sized from the rows actually
# scraped; a hard-coded 13400-row index makes the constructor raise
# ValueError whenever the scrape returns a different number of rows.
first_13400 = pd.DataFrame(
    game_ratings,
    index=range(1, len(game_ratings) + 1),
    columns=['Rank', 'Title', 'Year Published', 'Geek Rating', 'Avg Rating', 'Num Ratings'],
)

In [31]:
# Display the assembled ratings table.
first_13400

Unnamed: 0,Rank,Title,Year Published,Geek Rating,Avg Rating,Num Ratings
1,1,Pandemic Legacy: Season 1,2015,8.469,8.66,15951
2,2,Through the Ages: A New Story of Civilization,2015,8.306,8.77,6279
3,3,Twilight Struggle,2005,8.226,8.36,26424
4,4,Terra Mystica,2012,8.153,8.29,22791
5,5,Star Wars: Rebellion,2016,8.130,8.56,7286
6,6,Scythe,2016,8.082,8.35,12711
7,7,7 Wonders Duel,2015,8.029,8.20,18324
8,8,Caverna: The Cave Farmers,2013,8.022,8.19,16163
9,9,The Castles of Burgundy,2011,7.994,8.11,24014
10,10,Puerto Rico,2002,7.991,8.10,47075


In [32]:
# Write the table to CSV as UTF-8, leaving out the DataFrame index column.
first_13400.to_csv(
    '~/galvanize_work/capstone/game_ratings.csv',
    index=False,
    encoding='utf-8',
)