In [17]:
import time
from urllib import urlencode
import selenium.webdriver
from bs4 import BeautifulSoup
import itertools
import pandas as pd
import numpy as np

def search_boardgamegeek(query, browser, delay=10):
    """Load one page of the boardgamegeek.com board game listing.

    Arguments
    ---------
    query : int or str
        Value substituted at the end of the browse URL
        (``.../browse/boardgame/page/<query>``).
    browser : selenium.webdriver.firefox.webdriver.WebDriver
        Constructed with browser=selenium.webdriver.Firefox()
    delay : number
        Currently unused; kept so existing calls that pass it still work.

    Returns
    -------
    browser.page_source : raw HTML of the page after navigation
    """
    url_template = "https://boardgamegeek.com/browse/boardgame/page/{}"
    browser.get(url_template.format(query))
    # No pause is applied here: the site did not need one, so `delay` is ignored.
    return browser.page_source

def get_game_rank(game_tag):
    """Return the rank shown in one game row of the BGG listing.

    Only the first ``td.collection_rank`` cell is used.
    If no rank element is found, return None.

    Arguments
    ---------
    game_tag : bs4.element.Tag

    Returns
    -------
    game_rank : int

    Raises
    ------
    ValueError
        If the cell text is not an integer.
    """
    rank_cells = game_tag.select('td.collection_rank')
    if not rank_cells:
        return None
    return int(rank_cells[0].text.strip())
    
def get_game_title_year(game_tag, num):
    """Return the title and year text of a game selected from the BGG site.

    The text of the first ``div#results_objectname<num>`` element is split
    on newlines and each piece is stripped of surrounding whitespace.
    If no title element is found, return None.

    Arguments
    ---------
    game_tag : bs4.element.Tag
    num : int
        Number appended to ``results_objectname`` to build the element id.

    Returns
    -------
    game_title_year : list of str
        The lines of the element's text. Callers in this notebook read the
        title from index 0 and the year from index 1 when it is present.
    """
    selector = 'div#results_objectname{}'.format(num)
    title_elements = game_tag.select(selector)
    if not title_elements:
        return None
    raw_text = title_elements[0].text.strip()
    # Always return the list of parts. The previous code returned an undefined
    # name here and raised NameError whenever a title element was found.
    game_title_year = [part.strip() for part in raw_text.split('\n')]
    return game_title_year

def get_bgg_rating(game_tag):
    """Return the BGG rating of a game selected from the BGG site.

    The value is read from the first ``td.collection_bggrating`` cell.
    If no ratings element is found, return None.

    Arguments
    ---------
    game_tag : bs4.element.Tag

    Returns
    -------
    bgg_rating : float
    """
    rating_cells = game_tag.select('td.collection_bggrating')
    if not rating_cells:
        return None
    first_cell_text = rating_cells[0].text
    return float(first_cell_text.strip())
    
def get_user_rating(game_tag):
    """Return the user rating of a game selected from the BGG site.

    The value is read from the second ``td.collection_bggrating`` cell.
    If fewer than two ratings elements are found, return None.

    Arguments
    ---------
    game_tag : bs4.element.Tag

    Returns
    -------
    user_rating : float
    """
    rating_elements = game_tag.select('td.collection_bggrating')
    # Index 1 is read below, so require at least two cells; a non-empty
    # check alone would raise IndexError on a row with a single cell.
    if len(rating_elements) < 2:
        return None
    user_rating = rating_elements[1].text.strip()
    return float(user_rating)
    
def get_num_ratings(game_tag):
    """Return the number of ratings of a game selected from the BGG site.

    The value is read from the third ``td.collection_bggrating`` cell.
    If fewer than three ratings elements are found, return None.

    Arguments
    ---------
    game_tag : bs4.element.Tag

    Returns
    -------
    num_ratings : int
    """
    rating_elements = game_tag.select('td.collection_bggrating')
    # Index 2 is read below, so require at least three cells; a non-empty
    # check alone would raise IndexError on a shorter row.
    if len(rating_elements) < 3:
        return None
    num_ratings = rating_elements[2].text.strip()
    return int(num_ratings)
    
def get_boardgamegeek_game_details(query, browser):
    """Fetch one BGG listing page and return the details of each game row.

    Arguments
    ---------
    query : int or str
        Passed to search_boardgamegeek to build the page URL.
    browser : selenium.webdriver.firefox.webdriver.WebDriver
        Constructed with browser=selenium.webdriver.Firefox()

    Returns
    -------
    game_details : list of tuple
        One tuple per ``<tr id="row_">`` element, in page order:
        (rank, title_year, bgg_rating, user_rating, num_ratings),
        each produced by the matching ``get_*`` helper.
    """
    html = search_boardgamegeek(query, browser)
    soup = BeautifulSoup(html, 'html.parser')
    # Collect the rows once. The previous code ran the same query twice and
    # appended the second result, so every game appeared twice in the output.
    game_tags = soup.find_all("tr", id="row_")
    # The title helper looks up "results_objectname<N>", so rows are
    # numbered from 1 in page order.
    return [
        (
            get_game_rank(tag),
            get_game_title_year(tag, position),
            get_bgg_rating(tag),
            get_user_rating(tag),
            get_num_ratings(tag),
        )
        for position, tag in enumerate(game_tags, start=1)
    ]

# Scrape listing pages 1 through 134, keeping at most 100 games per page.
browser = selenium.webdriver.Firefox()
game_details = []
try:
    for i in range(1, 135):
        game_details.append(get_boardgamegeek_game_details(i, browser)[:100])
finally:
    # Always close the browser window, even if a page fails to load or parse.
    browser.quit()

# Flatten the per-page lists into a single list of per-game tuples.
game_details_new = list(itertools.chain.from_iterable(game_details))

# Expand each (rank, title_year, geek, avg, num) tuple into six columns by
# splitting title_year into a title and a year.
game_ratings = []
for line in game_details_new:
    title_year = line[1] or []  # None when no title element was found
    title = title_year[0] if title_year else None
    if len(title_year) > 1:
        # Drop the parentheses around the year text.
        year = title_year[1].strip().strip('()')
    else:
        # No year part on this row.
        year = 'N/A'
    game_ratings.append((line[0], title, year, line[2], line[3], line[4]))

#Convert list into pandas dataframe
# The index is sized from the rows actually collected, so a short scrape does
# not fail on a length mismatch.
first_13400 = pd.DataFrame(game_ratings, index=range(1, len(game_ratings) + 1),
                           columns=['Rank', 'Title', 'Year Published', 'Geek Rating', 'Avg Rating', 'Num Ratings'])
#Save pandas dataframe to a csv file
first_13400.to_csv('game_ratings.csv', encoding='utf-8', index=False)

In [1]:
import time
from urllib import urlencode
import selenium.webdriver
from bs4 import BeautifulSoup
import itertools
import pandas as pd
import numpy as np

In [2]:
# Load the scraped rankings table from disk.
# NOTE(review): the scraping cell writes 'game_ratings.csv', while this cell
# reads 'data/game_rankings.csv' -- confirm the file was renamed/moved.
first_13400 = pd.read_csv('data/game_rankings.csv')

In [27]:
# Preview only the first rows instead of rendering the whole table.
first_13400.head(10)

Unnamed: 0,Rank,Title,Year Published,Geek Rating,Avg Rating,Num Ratings
0,1,Pandemic Legacy: Season 1,2015.0,8.469,8.66,15951
1,2,Through the Ages: A New Story of Civilization,2015.0,8.306,8.77,6279
2,3,Twilight Struggle,2005.0,8.226,8.36,26424
3,4,Terra Mystica,2012.0,8.153,8.29,22791
4,5,Star Wars: Rebellion,2016.0,8.130,8.56,7286
5,6,Scythe,2016.0,8.082,8.35,12711
6,7,7 Wonders Duel,2015.0,8.029,8.20,18324
7,8,Caverna: The Cave Farmers,2013.0,8.022,8.19,16163
8,9,The Castles of Burgundy,2011.0,7.994,8.11,24014
9,10,Puerto Rico,2002.0,7.991,8.10,47075


In [4]:
# Print column dtypes and non-null counts to spot columns with missing values.
first_13400.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13400 entries, 0 to 13399
Data columns (total 6 columns):
Rank              13400 non-null int64
Title             13400 non-null object
Year Published    13292 non-null float64
Geek Rating       13400 non-null float64
Avg Rating        13400 non-null float64
Num Ratings       13400 non-null int64
dtypes: float64(3), int64(2), object(1)
memory usage: 628.2+ KB


In [13]:
# List the distinct publication years (including NaN) to check for odd values.
# The parenthesised form prints the same on Python 2 and also runs on Python 3.
print(first_13400['Year Published'].unique())

[ 2015.  2005.  2012.  2016.  2013.  2011.  2002.  2007.  2006.  2014.
  2004.  2008.  2017.  2010.  1995.  2009.  1981.  1997.  1876.  1999.
  2000. -2200.  1991.  1996.  2003.  1986.  1993.  1998.  1985.  2001.
  1979.  1980.  1964.  1982.  1994.  1992.  1990.  1983.  1989.  1475.
  1977.  1925.  1959.  1800.  1630.  1850.  1988.  1984.  1987.  1810.
  1978.  1971.   762.  1974.  1973.  1000. -3000.  1962.    nan  1947.
  1938.  1848.  1903.  1948.  1976.  1745.  1967.  1895.  1930.  1904.
  1972.  1906.  1960.  1970.  1966.  1975.  1965.   400.  1944.  1942.
  1969.  1883.  1963.  1600.  1932.  1939.  1701.  1780.  1968.  1430.
  1909.  1921.  1940.  1956.  1951.  1870.  1663.  1715.  1885.  1425.
   550.  1860.  1958.   700.  2018.  1955.  1680.  1890. -3500.  1796.
  1889.  1887.  1954.  1911.  1881.  1892.  1802.  1830.  1700.  1950.
  1913.  1742.  1961.   600.  1910.  1869.  1915.  1900.  1949.  1534.
  1943.  1952.  1783.   650.  1775.  1825.  1945.  1919.  1941.  -100.
  1933