# Board Game Recommender - Data
## Extract board game data by web scraping and using the BGG API

In [None]:
import requests
from bs4 import BeautifulSoup
from lxml import etree

import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options

import os
from pathlib import Path

import time
from time import sleep

In [None]:
# Resolve the directory the notebook runs from and read the BGG credentials
# from environment variables, so they never appear in the notebook itself.
work_dir = Path.cwd().resolve()
username = os.getenv("localBGGUser")
password = os.getenv("localBGGPassword")

In [None]:
# Start a headless Chrome session driven by the ChromeDriver executable that
# lives under the working directory.
options = Options()
# To use a specific Chrome install, set options.binary_location to its
# executable, e.g. "C:/Program Files (x86)/Google/Chrome/Application/chrome.exe"
options.add_argument('--headless')  # no visible browser window

# Location of the ChromeDriver executable
driver_path = work_dir / "chromedriver" / "chromedriver"

service = Service(driver_path)
driver = webdriver.Chrome(service=service, options=options)

In [None]:
# Open the BGG login page and sign in with the credentials held in
# `username` / `password`.
login_url = 'https://www.boardgamegeek.com/login'
driver.get(login_url)

# Locate all three form controls before typing anything
username_input, password_input = (
    driver.find_element(By.NAME, field_name)
    for field_name in ('username', 'password')
)
login_button = driver.find_element(By.CSS_SELECTOR, 'button[type="submit"]')

# Fill in the form, then submit it
username_input.send_keys(username)
password_input.send_keys(password)
login_button.click()

In [None]:
# Scanned on 20230712
#
# Walk BGG's rank-sorted browse listing page by page, collect the id, name and
# rank of every ranked game, and save the result to data/bgg_gamelist.csv.

records = []  # one (id, name, rank) tuple per ranked game
npage = 1

try:
    while True:
        driver.get(f"https://boardgamegeek.com/browse/boardgame/page/{npage}?sort=rank&sortdir=asc")
        soup = BeautifulSoup(driver.page_source, "html.parser")

        page_records = []
        for row in soup.find_all("tr", attrs={"id": "row_"}):
            links = row.find_all("a")
            # Only rows whose first anchor carries a "name" attribute expose a
            # rank. Skip every other row instead of re-recording the values of
            # the previous row (or failing on the first row of the scan).
            if not links or "name" not in links[0].attrs:
                continue
            rank = int(links[0]["name"])
            gamelink = links[2]  # Anchor holding the relative url and title of the game
            # presumably the href looks like /boardgame/<id>/<slug> -- verify against the site
            gameid = int(gamelink["href"].split("/")[2])
            # Store a plain str so the DataFrame does not keep the parse tree alive
            gamename = str(gamelink.contents[0])
            page_records.append((gameid, gamename, rank))

        # A page without a single ranked row (or without rows at all) means the
        # ranked part of the listing is exhausted.
        if not page_records:
            break

        min_rank = min(page_rank for _, _, page_rank in page_records)
        max_rank = max(page_rank for _, _, page_rank in page_records)
        print(f"Page {npage} scraped, games betweens ranks {min_rank} and {max_rank}")
        records.extend(page_records)
        npage += 1

        sleep(2)  # Be polite to the server between page loads
finally:
    # Always shut the browser down, even when the scrape fails half-way
    driver.quit()

# Build the frame once instead of concatenating one frame per page
df_all = pd.DataFrame(records, columns=["id", "name", "rank"]).drop_duplicates()
df = df_all.copy()
df.reset_index(inplace=True, drop=True)
(work_dir/"data").mkdir(parents=True, exist_ok=True)  # to_csv does not create directories
df.to_csv(work_dir/"data/bgg_gamelist.csv", index=False, encoding="utf-8")

In [None]:
# Load the ranked game list back from the CSV stored under data/.
gamelist_path = work_dir / "data" / "bgg_gamelist.csv"
df = pd.read_csv(gamelist_path)

In [None]:
def request(msg, slp=2, timeout=30):
    """Fetch a URL with ``requests.get``, retrying until the server answers 200.

    Every attempt is preceded by a pause of ``slp`` seconds so the server is
    not pinged too often. Network-level failures and non-200 responses are
    reported on stdout and retried indefinitely; anything else (including
    ``KeyboardInterrupt``) propagates so the loop can be interrupted.

    Args:
        msg: URL to fetch.
        slp: Seconds to sleep before each attempt.
        timeout: Seconds to wait on the server before an attempt is given up
            and retried; without it a stalled connection would block forever.

    Returns:
        The ``requests.Response`` of the first attempt that returned status 200.
    """
    while True:
        sleep(slp)  # Avoid pinging the server too often to not get blacklisted
        try:
            r = requests.get(msg, timeout=timeout)
        except requests.exceptions.RequestException:
            # Connection problems, timeouts and the like: wait a bit and retry
            print("An exceptions has occurred, probably momentary loss of connection. Waiting for a second...")
            sleep(1)
            continue
        if r.status_code == 200:
            return r
        print(f"Server Error! Response code {r.status_code}. Retrying...")

In [None]:
def retrieve_game_info(game_ids):
    """Download the BGG XML API2 "thing" record, with stats, for a batch of games.

    Args:
        game_ids: Iterable of game ids; each one is converted with ``str``.

    Returns:
        The raw body (``response.content``) of the API response.
    """
    joined_ids = ','.join(map(str, game_ids))
    api_url = f'https://boardgamegeek.com/xmlapi2/thing?id={joined_ids}&stats=1'
    return request(api_url).content  # Change when done debugging

In [None]:
def _first_or_none(node, path):
    """Return the first result of XPath ``path`` evaluated on ``node``, or None."""
    matches = node.xpath(path)
    return matches[0] if matches else None


def _joined_link_values(node, link_type):
    """Join the ``value`` of every ``<link type=link_type>`` under ``node`` with ", "."""
    return ", ".join(node.xpath(f'.//link[@type="{link_type}"]/@value'))


def parse_bgg_xml(xml_data):
    """Turn a BGG "thing" XML document into a list of per-game dictionaries.

    Args:
        xml_data: XML document accepted by ``etree.fromstring``.

    Returns:
        A list with one dict per ``<item>`` element. Scalar fields hold the raw
        attribute/text value found in the XML, or None when the element is
        missing. ``mechanics``, ``designers``, ``artists`` and ``publishers``
        hold the matching link values joined with ", " (an empty string when
        there are none).

    Raises:
        IndexError: If an ``<item>`` has no ``id`` attribute.
    """
    game_info = []

    root = etree.fromstring(xml_data)
    ratings = './/statistics/ratings'

    for item in root.xpath('//item'):
        # NOTE: boardgamecategory and boardgameexpansion links are not
        # extracted; the returned dicts (and the columns get_bgg_api_data
        # declares) have no field for them.
        game_info.append({
            'id': item.xpath('@id')[0],
            'name': _first_or_none(item, './/name[@type="primary"]/@value'),
            'image': _first_or_none(item, 'image/text()'),
            'thumbnail': _first_or_none(item, 'thumbnail/text()'),
            'description': _first_or_none(item, 'description/text()'),
            'min_players': _first_or_none(item, './/minplayers/@value'),
            'max_players': _first_or_none(item, './/maxplayers/@value'),
            'playing_time': _first_or_none(item, './/playingtime/@value'),
            'year_published': _first_or_none(item, './/yearpublished/@value'),
            'bgg_rating': _first_or_none(item, f'{ratings}/bayesaverage/@value'),
            'avg_rating': _first_or_none(item, f'{ratings}/average/@value'),
            'rank': _first_or_none(item, f'{ratings}/ranks/rank[@type="subtype"]/@value'),
            'mechanics': _joined_link_values(item, 'boardgamemechanic'),
            'designers': _joined_link_values(item, 'boardgamedesigner'),
            'artists': _joined_link_values(item, 'boardgameartist'),
            'publishers': _joined_link_values(item, 'boardgamepublisher'),
            'min_playtime': _first_or_none(item, './/minplaytime/@value'),
            'max_playtime': _first_or_none(item, './/maxplaytime/@value'),
            'min_age': _first_or_none(item, './/minage/@value'),
        })

    return game_info

In [None]:
def get_bgg_api_data(game_ids, ids_each_time = 5):
    """Fetch and parse BGG details for ``game_ids`` in batches.

    Each batch is downloaded with ``retrieve_game_info`` and parsed with
    ``parse_bgg_xml``; progress is printed after every batch.

    Args:
        game_ids: Sliceable sequence of game ids.
        ids_each_time: Number of ids requested per API call.

    Returns:
        A DataFrame with one row per parsed game, the columns listed below and
        a fresh 0..N-1 index. It is empty (columns only) when ``game_ids`` is
        empty.

    Raises:
        ValueError: If ``ids_each_time`` is smaller than 1.
    """
    if ids_each_time < 1:
        raise ValueError("ids_each_time must be a positive integer")

    columns = ['id', 'name', 'image', 'thumbnail', 'description', 'min_players', 'max_players',
               'playing_time', 'year_published', 'bgg_rating', 'avg_rating', 'rank', 'mechanics',
               'designers', 'artists', 'publishers', 'min_playtime', 'max_playtime', 'min_age']

    # Accumulate plain records and build the frame once at the end; growing a
    # DataFrame with pd.concat on every batch re-copies all earlier rows.
    records = []

    for start in range(0, len(game_ids), ids_each_time):
        # Slicing already stops at the end of the sequence, no clamping needed
        id_subset = game_ids[start:start + ids_each_time]

        xml_data = retrieve_game_info(id_subset)
        records.extend(parse_bgg_xml(xml_data))

        id_subset_str = ", ".join(str(game_id) for game_id in id_subset)
        print(f"Data for game IDs: {id_subset_str} acquired.")

    return pd.DataFrame(records, columns=columns)
    

In [None]:
# Scanned on 20230712
#
# Fetch the details of every game in the saved game list (10 ids per API call),
# report how long it took, and store the result as CSV.
gamelist = pd.read_csv(work_dir/"data/bgg_gamelist.csv")
game_ids = gamelist["id"].sort_values().to_list()

start_time = time.time()
df_games = get_bgg_api_data(game_ids, 10)
end_time = time.time()

diff_time = end_time - start_time
print(f"Scraping took {diff_time} seconds to complete.")

# Replace the index with a clean 0..N-1 range before saving
df_games = df_games.reset_index(drop=True)
df_games.to_csv(work_dir/"data/bgg_gamelist_all_details.csv", index=False, encoding="utf-8")