In [20]:
import re
import time

import boardgamegeek as geek
import numpy as np
import pandas as pd
from selenium.webdriver import Chrome
from selenium.webdriver.common.by import By

#### Scraping the names from a query for games from 2015-2019

In [21]:
# This is the full query URL, splitting and adding the page number to iterate through for games.
url_start = "https://boardgamegeek.com/search/boardgame/page/"
url_end = """?advsearch=1&q=&include%5Bdesignerid%5D=&include%5Bpublisherid%5D=&geekitemname=&
            range%5Byearpublished%5D%5Bmin%5D=2015&range%5Byearpublished%5D%5Bmax%5D=2019&
            range%5Bminage%5D%5Bmax%5D=&range%5Bnumvoters%5D%5Bmin%5D=&range%5Bnumweights%5D%5Bmin%5D=&
            range%5Bminplayers%5D%5Bmax%5D=&range%5Bmaxplayers%5D%5Bmin%5D=&
            range%5Bleastplaytime%5D%5Bmin%5D=&range%5Bplaytime%5D%5Bmax%5D=&
            floatrange%5Bavgrating%5D%5Bmin%5D=&floatrange%5Bavgrating%5D%5Bmax%5D=&
            floatrange%5Bavgweight%5D%5Bmin%5D=&floatrange%5Bavgweight%5D%5Bmax%5D=&
            colfiltertype=&searchuser=&playerrangetype=normal&B1=Submit"""

In [22]:
# Initiate the Selenium-controlled Chrome browser; the scraping loop loads every
# search-results page through this single instance.
browser = Chrome()

In [None]:
# games_all = []

# for i in range(1, 51):
#     game = []
    
#     # Creating the URL for each page and going to the page
#     url = url_start + str(i) + url_end
#     browser.get(url)
    
#     # Getting the name from each row in the table
#     game = [name.text for name in browser.find_elements_by_class_name("collection_objectname")]
#     games_all.append(game)
    
#     # Wait and restart process
#     time.sleep(10)

In [23]:
# Scrape the search results one page at a time; games_all ends up holding one list of
# rows per page.
games_all = []

for i in range(1, 51):
    # Creating the URL for each page and going to the page
    url = url_start + str(i) + url_end
    browser.get(url)

    # Keep text lines 1-2 of every result row: the "Name (year)" title and the line
    # with the rating figures.
    # find_elements(By.ID, ...) replaces find_elements_by_id, which was removed in
    # Selenium 4.3; the generic form is also available in older Selenium releases.
    page_rows = [row.text.split('\n')[1:3] for row in browser.find_elements(By.ID, 'row_')]
    games_all.append(page_rows)

    # Wait before requesting the next page
    time.sleep(10)

In [24]:
# Flatten the per-page lists into one list with a single entry per scraped game row
games_combined = []
for page_games in games_all:
    games_combined.extend(page_games)

In [43]:
# Spot-check the first field (the title) of the second scraped row
games_combined[1][0]

'Scythe (2016)'

In [53]:
# Turn every scraped [title, ratings] pair into one flat record: the title without its
# trailing publication year, followed by the space-separated fields of the ratings line.
new = []
for title, ratings in games_combined:
    # Strip only the trailing "(year)" suffix. A raw string avoids invalid escape
    # sequences, and anchoring at the end keeps parentheses that belong to the name
    # itself (e.g. "Arkham Horror (Third Edition) (2018)").
    name = re.sub(r"\s*\(-?\d+\)\s*$", "", title)
    new.append([name] + ratings.split(" "))

# The ratings line is expected to yield exactly four fields; the last one ('o') is a
# leftover token with no dedicated meaning here.
df = pd.DataFrame(new, columns=['Name', 'geek','avg','num_rev', 'o'])

In [57]:
# Remove the leftover 'o' column. Rebinding (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second run no longer raises KeyError.
df = df.drop(columns='o', errors='ignore')
df.head()

In [58]:
# Persist the cleaned names/ratings table to CSV in the working directory
df.to_csv('bgg_correct_ratings.csv')

In [None]:
# Temporarily dropping the scraped titles into a file to preserve the scrape results.
# Every entry of games_combined is a list whose first field is the title, so only that
# field is written, one title per line; concatenating the list itself with "\n" raises
# TypeError. UTF-8 is set explicitly because titles may contain non-ASCII characters.
with open('all_game_names.csv', 'w', newline="", encoding="utf-8") as f:
    f.writelines(game[0] + "\n" for game in games_combined)

#### Using the boardgamegeek API Python package to retrieve details for each game

In [None]:
# The titles all have a date at the end (ex. "Risk (1988)"); strip that date and the
# space before it. Every entry of games_combined is a list whose first field is the
# title, so the regex is applied to entry[0] (passing the whole list to re.sub raises
# TypeError). Only the trailing "(year)" is removed, so parentheses that are part of
# the name survive.
all_games = [re.sub(r"\s*\(-?\d+\)\s*$", "", entry[0]) for entry in games_combined]

In [None]:
# Initiates an instance of the API connection, limited to 50 requests per minute so the
# server is not overloaded. The limit is passed to the client itself: a
# RateLimitingAdapter that is only constructed, and never mounted on the client's
# session, does not throttle anything.
req = geek.BoardGameGeek(requests_per_minute=50)

In [None]:
game_data = []
not_working = []

# Loop over the cleaned game names and request the details of each game from the API.
for game in all_games:
    try:
        # Look the game up by name and load the fields of interest into a dictionary.
        # NOTE(review): 'geek_rating' is filled from rating_average and 'rating_count'
        # from rating_num_weights -- confirm these are the intended attributes.
        v = req.game(game)
        game_dict = {'id': v.id,'rank': v.boardgame_rank, 'name': v.name,
                     'min_players': v.min_players, 'max_players': v.max_players,
                     'avg_time': v.playing_time, 'geek_rating': v.rating_average,
                     'rating_count': v.rating_num_weights,'age': v.min_age,
                     'mechanic': v.mechanics, 'category': v.categories,
                     'year': v.year, 'weight': v.rating_average_weight }
        # Append to game_data (the list the later cells read); the previous target name
        # was undefined, so every lookup ended up in not_working.
        game_data.append(game_dict)
        time.sleep(5)
    except Exception as exc:
        # Best effort: if the game cannot be retrieved for any reason, record its name
        # and carry on. The error is reported so that genuine bugs are not hidden.
        print("Could not retrieve {!r}: {!r}".format(game, exc))
        not_working.append(game)

In [None]:
# Checking how many games were retrieved successfully versus how many failed
len(game_data), len(not_working)

In [None]:
# Dropping data into a temporary DataFrame for formatting and outputting to CSV.
# The export is called on temp_df, the frame built here; the name used before was
# undefined and raised NameError.
temp_df = pd.DataFrame(game_data)
temp_df.to_csv('bg14_19.csv')