In [1]:
import logging

# Notebook-wide logger. It is configured from scratch on every run of this cell.
logger = logging.getLogger('bgg_dataset')
logger.setLevel(logging.DEBUG)
# Drop handlers attached by a previous run of this cell, otherwise every
# message would be emitted once per re-run.
logger.handlers.clear()

formatter = logging.Formatter(fmt='%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
handler = logging.StreamHandler()
handler.setFormatter(formatter)
logger.addHandler(handler)

# Smoke test: one DEBUG line should show up in the cell output.
logger.debug('test')

2022-11-10 07:46:11,861 bgg_dataset  DEBUG    test


In [2]:
from dataclasses import dataclass
import re
from typing import Any, Dict

# Matches one HTML tag: a '<', then the shortest run of non-'<' characters, then '>'.
tag_removal_pattern = re.compile('<[^<]+?>')

@dataclass
class BoardGameFileInfo:
    '''Metadata describing one file attached to a board game page.'''
    # NOTE(review): the sample output of this notebook shows ids arriving as strings
    # (e.g. '140639'); the value is stored as received, no conversion is performed.
    id: int
    name: str          # file name without its extension
    description: str   # rendered description with the HTML tags stripped
    extension: str     # lower-cased extension, without the leading dot

    @classmethod
    def from_file_info(cls, file_info: Dict[str, Any]):
        '''
        Build a `BoardGameFileInfo` from one raw file entry.

        Reads `file_info['fileid']`, `file_info['filename']` and
        `file_info['description']['rendered']`.

        Returns None when the entry is malformed: a key is missing, a value has an
        unexpected type, or the file name has no extension. Only those errors are
        swallowed; anything else (e.g. KeyboardInterrupt) propagates to the caller.
        '''
        try:
            rendered_description = file_info['description']['rendered']
            # the description contains HTML tags that are removed with this regex
            description = tag_removal_pattern.sub('', rendered_description)
            # a file name without any '.' makes this unpacking raise ValueError
            filename, extension = file_info['filename'].rsplit('.', 1)
            return cls(file_info['fileid'],
                       filename,
                       description,
                       extension.lower()
                      )
        except (KeyError, IndexError, TypeError, ValueError, AttributeError):
            return None

In [3]:
import aiohttp
from typing import List

# Root of the JSON API queried for file listings and download urls.
internal_bgg_api = 'https://api.geekdo.com/api'

async def get_bgg_filelist(client, thing_id: int) -> List[BoardGameFileInfo]:
    '''
    Return the PDF files listed for the board game `thing_id`.

    Only the first page of results is requested (`pageid=1`, `showcount=25`, `sort=hot`).
    Entries that `BoardGameFileInfo.from_file_info` cannot parse, and files whose
    extension is not `pdf`, are dropped.

    `client` must expose `get(url)` returning an async context manager whose
    response has an awaitable `json()` (e.g. an `aiohttp.ClientSession`).

    A real list is returned, as the annotation promises, so the result can be
    indexed, measured and iterated more than once.

    Raises KeyError when the JSON payload has no `files` entry.
    '''
    # NOTE(review): languageid=2184 presumably restricts the listing to one language - confirm
    url = f"{internal_bgg_api}/files?ajax=1&nosession=1&objectid={thing_id}&objecttype=thing&pageid=1&showcount=25&sort=hot&languageid=2184"
    async with client.get(url) as response:
        content = await response.json()
    parsed_files = (BoardGameFileInfo.from_file_info(file) for file in content['files'])
    return [info for info in parsed_files if info is not None and info.extension == 'pdf']

async with aiohttp.ClientSession() as client:
    print(list(await get_bgg_filelist(client, 1))[0:3])

[BoardGameFileInfo(id='140639', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf'), BoardGameFileInfo(id='139190', name='Die Macher Condensed Rules by Liumas 2015-01', description='Complete 3rd Edition rules condensed onto 6 pages, sans Comic Sans.', extension='pdf'), BoardGameFileInfo(id='62318', name='Die Macher player aide_BW', description='Die Macher player aide black & white', extension='pdf')]


In [4]:
from string import punctuation
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load('en_core_web_sm')

async def get_rulebook_info(client, thing_id: int) -> BoardGameFileInfo:
    '''
    Pick, among the PDF files of the board game, the one most likely to be its rulebook.

    Every file is represented by "<name> <description>", vectorised with TF-IDF and
    compared by cosine similarity with a fixed rulebook-like query; the best scoring
    file wins and ties go to the file that comes first in the list.

    Returns None if the board game has no PDF file at all.
    '''
    def tokenizer(raw_doc: str):
        # Lazily yield the lemmas that are neither punctuation nor stop words.
        doc = nlp(raw_doc)
        return (
            token.lemma_
            for token in doc
            if token.lemma_ not in punctuation and not nlp.vocab[token.lemma_].is_stop
        )

    candidates = list(await get_bgg_filelist(client, thing_id))
    if not candidates:
        return None

    vectorizer = TfidfVectorizer(tokenizer=tokenizer, lowercase=True, strip_accents='ascii')
    corpus = [f'{candidate.name} {candidate.description}' for candidate in candidates]
    doc_vectors = vectorizer.fit_transform(corpus)

    query_vector = vectorizer.transform(["revised official rule rulebook update new"])
    scores = cosine_similarity(query_vector, doc_vectors)[0]

    # Stable sort by descending score, so equal scores keep the file-list order.
    ranking = sorted(range(len(scores)), key=lambda index: -scores[index])
    return candidates[ranking[0]]

async with aiohttp.ClientSession() as client:
    print(await get_rulebook_info(client, 1))

BoardGameFileInfo(id='139185', name='Die Macher Best Options by Liumas 2015-01', description='The standard official rules updates for 2nd and 3rd editions.', extension='pdf')


In [5]:
import aiohttp
import asyncio
from typing import Callable

class RetryingClientResponse:
    '''
    Async context manager that repeats a request until it answers with HTTP 200.

    `response_getter` is called with no arguments before every attempt and must return
    an async context manager yielding an object with a `status` attribute
    (e.g. `lambda: client.get(url)`).

    Keyword-only options:
    - `max_retries`: how many times the request may be repeated after the first
      attempt. `None` (default) retries forever, as this class always did; with a
      number, `RuntimeError` is raised once the retries are used up, so that a
      permanent failure (e.g. a 404) cannot hang the caller.
    - `retry_delay`: seconds slept between two attempts (default 2).
    - `allow_redirects` and any other keyword are accepted but not used.
    '''
    def __init__(self, response_getter: Callable, *, allow_redirects=True,
                 max_retries=None, retry_delay=2, **kwargs):
        self._response_getter = response_getter
        self._max_retries = max_retries
        self._retry_delay = retry_delay
        self._response_coroutine = None
        self._response = None

    async def __aenter__(self):
        retries_done = 0
        while True:
            self._response_coroutine = self._response_getter()
            self._response = await self._response_coroutine.__aenter__()
            if self._response.status == 200:
                return self._response

            failed_status = self._response.status
            logger.debug('new captcha is needed')
            # release the failed response before trying again (or giving up)
            await self._response_coroutine.__aexit__(None, None, None)
            if self._max_retries is not None and retries_done >= self._max_retries:
                raise RuntimeError(
                    f'request still answering with HTTP status {failed_status} after {retries_done} retries')
            retries_done += 1
            # sleeping because the recaptcha needs to be re-validated
            await asyncio.sleep(self._retry_delay)

    async def __aexit__(self, exc_type, exc_value, exc_tb):
        # close the response obtained by the last (successful) attempt
        await self._response_coroutine.__aexit__(exc_type, exc_value, exc_tb)

#async with aiohttp.ClientSession() as client:
#    async with RetryingClientResponse(lambda: client.get('https://www.google.it')) as response:
#        print(await response.text())

In [6]:
import os
from datetime import datetime, timedelta
import fitz as PyMuPDF
import re

# Matches every character that is neither an ASCII word character (\w under re.ASCII)
# nor one of the listed ASCII punctuation marks. Whitespace is matched too.
regex_non_readable_text = re.compile(r'[^\w!"#$%&\'()*+,\-./:;<=>?@[\\\]^`{|}~]', re.ASCII)
# Matches a period followed by one or more characters that are not ASCII letters or digits.
regex_useless_periods = re.compile(r'\.[^a-zA-Z0-9]+')
# Credentials are read from the environment; os.getenv returns None when a variable is not set.
bggusername = os.getenv('BGGUSERNAME')
bggpassword = os.getenv('BGGPASSWORD')

async def boardgamegeek_authenticate(client, bggusername: str, bggpassword: str) -> str:
    '''
    Ask BoardGameGeek for the auth token of the given account.

    The credentials are sent as a `cookie` header to the current-account endpoint
    and the `authToken` field of the JSON answer is returned.
    '''
    credentials_cookie = f'bggusername={bggusername}; bggpassword={bggpassword}'
    request = client.get(
        'https://boardgamegeek.com/api/accounts/current',
        headers={'cookie': credentials_cookie},
    )
    async with request as response:
        account = await response.json()
        return account['authToken']

def clean_text(text: str) -> str:
    '''
    Normalise text extracted from a PDF.

    First every match of `regex_non_readable_text` becomes a single space, then every
    match of `regex_useless_periods` is collapsed into a single period.
    '''
    readable_only = regex_non_readable_text.sub(' ', text)
    return regex_useless_periods.sub('.', readable_only)
    
async def get_file_content(client, auth_token: str, file: BoardGameFileInfo) -> str:
    '''
    Download the PDF described by `file` and return its cleaned text.

    The download url is first requested from the internal API (authorised with
    `auth_token`), then the PDF is fetched from boardgamegeek.com. Both requests go
    through `RetryingClientResponse`. The text of every page, each followed by a
    space, is concatenated and passed through `clean_text`.
    '''
    auth_headers = {'Authorization': f'GeekAuth {auth_token}'}
    lookup_url = f'{internal_bgg_api}/files/downloadurls?ids={file.id}'
    async with RetryingClientResponse(lambda: client.get(lookup_url, headers=auth_headers)) as response:
        download_info = await response.json()
        # only the first download url of the answer is used
        download_url = 'https://boardgamegeek.com' + download_info['downloadUrls'][0]['url']
        logger.debug(download_url)
        async with RetryingClientResponse(lambda: client.get(download_url)) as pdf_response:
            pdf_data = await pdf_response.read()
            with PyMuPDF.open(stream=pdf_data, filetype="pdf") as doc:
                page_texts = [page.get_text() + " " for page in doc]
                return clean_text("".join(page_texts))
                  
async with aiohttp.ClientSession() as client:
    try:
        if token_expire <= datetime.now():
            raise NameError
    except NameError:
        token_expire = datetime.now() + timedelta(hours=2)
        token = await boardgamegeek_authenticate(client, bggusername, bggpassword)
    file = list(await get_bgg_filelist(client, 1))[0]
    print(file)

    print(len(await get_file_content(client, token, file)))

2022-11-10 07:50:00,753 bgg_dataset  DEBUG    https://boardgamegeek.com/file/download_redirect/f7b81aa77c41b704648b58bf5c7d3152f36753dd72236def/Die+Macher+Brief+by+Liumas+2015-01.pdf


BoardGameFileInfo(id='140639', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf')
8506


In [7]:
import pandas as pd
from dataclasses import asdict
from typing import List, Any
from models.BoardGameData import *

def write_bgs_to_csv(file_path: str, bgs: 'List[BoardGame]', keep_header: bool) -> None:
    '''
    Write the board games to `file_path` as CSV.

    With `keep_header=True` the file is (re)created and starts with the header row;
    with `keep_header=False` the rows are appended without a header.

    Each board game (a dataclass instance) is flattened with `asdict` and
    `pd.json_normalize`, so nested fields become dotted columns; the
    `info.numweights` column is not written.

    An empty `bgs` is a no-op and the file is left untouched: there are no rows to
    write, and an empty frame has no `info.numweights` column to drop.
    '''
    if not bgs:
        return
    df = pd.json_normalize([asdict(bg) for bg in bgs]).drop(columns=['info.numweights'])
    df.to_csv(file_path, header=keep_header, index=False, mode='w' if keep_header else 'a')

In [9]:
# !!! IMPORTANT !!! Reload a BoardGameGeek web page that contains a reCAPTCHA, for example
#     https://boardgamegeek.com/filepage/23411/complete-condensed-rules-liumas
# This is necessary to re-validate the reCAPTCHA. It can be done with the Easy Auto Refresh browser extension or a similar tool.
import aiohttp
import asyncio
from typing import List
import ipywidgets as widgets
import re
import xml.etree.ElementTree as xe

# Root of the BoardGameGeek XML API used to fetch board game ("thing") data.
bgg_api_root = 'https://boardgamegeek.com/xmlapi2'
# Number of board games to collect; also the maximum of the progress bar.
BGG_COUNT = 95
# Number of thing ids requested in a single API call.
BGG_THING_IDS_PER_CALL = 100
# First thing id of the sample.
STARTING_BGG_THING_ID = 180050
# Distance between two consecutive thing ids of the sample.
BGG_THING_ID_OFFSET = 10
PERIODS_THRESHOLD = 10 # a rulebook must contain at least PERIODS_THRESHOLD '.' characters (a rough sentence count)
MIN_NUMWEIGHTS = 10 # at least MIN_NUMWEIGHTS users have rated the weight of the game
# CSV file the collected board games are written to.
DATASET_FILE_PATH = 'data/dataset.csv'
# Matches any ASCII letter; used to check that a text contains at least one letter.
regex_check_text_present = re.compile('[a-zA-Z]')

# Progress widget; its value is the number of board games collected so far.
progress_bar = widgets.IntProgress(
    value=0,
    min=0,
    max=BGG_COUNT,
    description='Downloaded BoardGames info:',
    bar_style='success',
    orientation='horizontal'
)
display(progress_bar)

async def fill_dataset():
    async def get_boardgamegeek_thing(client, *thing_ids) -> str:
        url = f'{bgg_api_root}/thing?id={",".join(str(x) for x in thing_ids)}&type=boardgame&pagesize=100&stats=1'
        logger.debug(url)
        async with client.get(url) as response:
            assert response.status == 200
            return (await response.read()).decode('utf-8')

    def is_text_valid(text: str) -> bool:
        return regex_check_text_present.search(text) is not None and \
            text.count('.') >= PERIODS_THRESHOLD

    async def get_bg_file_content(client, fileinfo: BoardGameFileInfo) -> str:
        '''returns the file content only if the fileinfo is not None and at least one word exists, otherwise returns an empty string'''
        if fileinfo is None:
            return ''
        file_content = await get_file_content(client, token, fileinfo)
        return file_content if is_text_valid(file_content) else ''

    async def get_bg(client, bg_info: BoardGameInfo) -> BoardGame:
        thing_id = bg_info.id
        fileinfo = await get_rulebook_info(client, thing_id)
        return BoardGame(bg_info, await get_bg_file_content(client, fileinfo))

    async def get_boardgames(client, *thing_ids) -> List[BoardGame]:    
        '''
        get a list of `BoardGame` that corresponds to the thing_ids requested. The list could be smaller than `len(thing_ids)`
        because filtered using `type=boardgame` and there must be at least one vote on the weight
        '''
        assert len(thing_ids) <= 100

        items = xe.fromstring(await get_boardgamegeek_thing(client, *thing_ids))
        bg_infos = filter(lambda x: x is not None and x.numweights > MIN_NUMWEIGHTS, [BoardGameInfo.from_item(item) for item in items])

        bgs = []
        for bg_info in bg_infos:
            bg = await get_bg(client, bg_info)
            if bg.rulebook != '':
                bgs.append(bg)
                progress_bar.value += 1
                if progress_bar.value == BGG_COUNT:
                    break
                await asyncio.sleep(1)

        return bgs
    while progress_bar.value < BGG_COUNT:
        try:
            async with aiohttp.ClientSession() as client:
                # I take a sample of bgs that are sparse. this is because each bg id is BGG_THING_ID_OFFSET-far from the next one 
                get_thing_id = lambda call_num, thing_id_num: STARTING_BGG_THING_ID + thing_id_num * BGG_THING_ID_OFFSET + \
                                                              call_num * BGG_THING_IDS_PER_CALL * BGG_THING_ID_OFFSET
                call_num = 0
                while progress_bar.value < BGG_COUNT:
                    bgs = await get_boardgames(client, *[get_thing_id(call_num, x) for x in range(BGG_THING_IDS_PER_CALL)])
                    if len(bgs) > 0:
                        write_bgs_to_csv(DATASET_FILE_PATH, bgs, keep_header=call_num == 0)
                    call_num += 1
        except (aiohttp.ServerDisconnectedError, aiohttp.ClientConnectorError):
            logger.warning('Client lost connection')
                
    
await fill_dataset()

IntProgress(value=0, bar_style='success', description='Downloaded BoardGames info:', max=95)

2022-11-10 08:04:09,418 bgg_dataset  DEBUG    https://boardgamegeek.com/xmlapi2/thing?id=180050,180060,180070,180080,180090,180100,180110,180120,180130,180140,180150,180160,180170,180180,180190,180200,180210,180220,180230,180240,180250,180260,180270,180280,180290,180300,180310,180320,180330,180340,180350,180360,180370,180380,180390,180400,180410,180420,180430,180440,180450,180460,180470,180480,180490,180500,180510,180520,180530,180540,180550,180560,180570,180580,180590,180600,180610,180620,180630,180640,180650,180660,180670,180680,180690,180700,180710,180720,180730,180740,180750,180760,180770,180780,180790,180800,180810,180820,180830,180840,180850,180860,180870,180880,180890,180900,180910,180920,180930,180940,180950,180960,180970,180980,180990,181000,181010,181020,181030,181040&type=boardgame&pagesize=100&stats=1
2022-11-10 08:04:10,514 bgg_dataset  DEBUG    https://boardgamegeek.com/file/download_redirect/a863a204f38f85912e80413ddd456cabbcbf9dc6d214bb82/Automobiles+Quick+Rules+Guide.p

In [None]:
"""
async def get_boardgames(*thing_ids) -> List[BoardGame]:    
    '''
    get a list of `BoardGameInfo` that corresponds to the thing_ids requested. The list could be smaller than `len(thing_ids)`
    because filtered using `type=boardgame` and there must be at least one vote on the weight
    '''
    assert len(thing_ids) <= 100

    items = xe.fromstring(await get_boardgamegeek_thing(*thing_ids))
    bg_infos = list(filter(lambda x: x.numweights > 0, [BoardGameInfo.from_item(item) for item in items]))

    return filter(lambda x: x.rulebook != '', await asyncio.gather(*[get_bg(bg_info) for bg_info in bg_infos]))
        
file_lock = asyncio.Lock()
with open('data/dataset.txt', 'w') as f_out:
    async def write_bg_info_on_file(*thing_ids):
        bg_infos = await get_boardgames(*thing_ids)
        async with file_lock:
            f_out.write('\n'.join(map(str, bg_infos)) + '\n')

    await asyncio.gather(*[write_bg_info_on_file(
        *[x + BGG_THING_IDS_PER_CALL * i for x in range(BGG_THING_IDS_PER_CALL)]) for i in range(BGG_THING_ID_CALLS)])
"""
# This is the concurrent version of the code. It cannot be executed because of server-side checks on the number of downloads:
# BGG enforces some limitations in order to prevent automated downloads.