In [37]:
from dataclasses import dataclass
import re
from typing import Any, Dict

# matches a single HTML/XML tag such as `<p>` or `</b>`
tag_removal_pattern = re.compile('<[^<]+?>')

@dataclass
class BoardGameFileInfo:
    '''Metadata of one file attached to a BoardGameGeek entry.'''
    id: int  # NOTE: stored exactly as received in `fileid` (a string in the observed responses), not converted
    name: str  # file name without its extension
    description: str  # rendered description with the HTML tags stripped
    extension: str  # lower-cased extension without the dot; '' when the file name has no dot

    @staticmethod
    def from_file_info(file_info: Dict[str, Any]):
        '''
        Build a `BoardGameFileInfo` from one entry of the file-list JSON.

        `file_info` must provide the keys `fileid`, `filename` and
        `description` (a mapping with a `rendered` key); a missing key raises `KeyError`.
        A file name without any dot yields an empty extension instead of failing.
        '''
        rendered_description = file_info['description']['rendered']
        # the description contains HTML tags that are removed with this regex
        description = tag_removal_pattern.sub('', rendered_description)

        # rpartition always returns three parts, so unlike `rsplit('.', 1)` it cannot
        # fail to unpack when the file name contains no dot
        filename, dot, extension = file_info['filename'].rpartition('.')
        if not dot:
            filename, extension = file_info['filename'], ''

        return BoardGameFileInfo(file_info['fileid'],
                                 filename,
                                 description,
                                 extension.lower()
                                )

In [38]:
import aiohttp
from typing import List

internal_bgg_api = 'https://api.geekdo.com/api'

async def get_bgg_filelist(client, thing_id: int) -> List[BoardGameFileInfo]:
    '''
    Return the PDF files listed for the given thing id.

    Only the first page of the listing (`pageid=1`, `showcount=25`, `sort=hot`) is requested.
    `client` must expose `get(url)` as an async context manager whose response has an
    awaitable `json()` (e.g. an `aiohttp.ClientSession`).
    The result is a real list, so it can be indexed, measured and iterated more than once.
    '''
    # NOTE(review): languageid=2184 presumably selects English files - confirm
    url = f"{internal_bgg_api}/files?ajax=1&nosession=1&objectid={thing_id}&objecttype=thing&pageid=1&showcount=25&sort=hot&languageid=2184"
    async with client.get(url) as response:
        content = await response.json()

    file_infos = (BoardGameFileInfo.from_file_info(file) for file in content['files'])
    return [info for info in file_infos if info.extension == 'pdf']

async with aiohttp.ClientSession() as client:
    print(list(await get_bgg_filelist(client, 1))[0:3])

[BoardGameFileInfo(id='140639', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf'), BoardGameFileInfo(id='139190', name='Die Macher Condensed Rules by Liumas 2015-01', description='Complete 3rd Edition rules condensed onto 6 pages, sans Comic Sans.', extension='pdf'), BoardGameFileInfo(id='62318', name='Die Macher player aide_BW', description='Die Macher player aide black & white', extension='pdf')]


In [39]:
from string import punctuation
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load('en_core_web_sm')

async def get_rulebook_info(client, thing_id: int) -> BoardGameFileInfo:
    '''
    Return the PDF file that best matches a "rulebook" query, or None when the
    board game has no PDF file at all.

    Every file is represented by its name and description, vectorised with TF-IDF, and
    ranked by cosine similarity against a fixed query; the top-ranked file is returned
    (the earliest file wins on equal scores).
    '''
    def tokenizer(raw_doc: str):
        # lemmatise with spaCy, then drop punctuation lemmas and stop words
        doc = nlp(raw_doc)
        return (
            lemma
            for lemma in (token.lemma_ for token in doc)
            if lemma not in punctuation and not nlp.vocab[lemma].is_stop
        )

    candidates = list(await get_bgg_filelist(client, thing_id))
    if not candidates:
        return None

    vectorizer = TfidfVectorizer(strip_accents='ascii', lowercase=True, tokenizer=tokenizer)
    corpus = [f'{x.name} {x.description}' for x in candidates]
    doc_vectors = vectorizer.fit_transform(corpus)

    query_vector = vectorizer.transform(["revised official rule rulebook update new"])
    scores = cosine_similarity(query_vector, doc_vectors)[0]

    # max() keeps the first of several equal scores, i.e. the lowest index
    best_index = max(range(len(candidates)), key=lambda i: scores[i])
    return candidates[best_index]

async with aiohttp.ClientSession() as client:
    print(await get_rulebook_info(client, 1))

BoardGameFileInfo(id='139185', name='Die Macher Best Options by Liumas 2015-01', description='The standard official rules updates for 2nd and 3rd editions.', extension='pdf')


In [40]:
import aiohttp
import asyncio
from typing import Callable

class RetryingClientResponse:
    '''
    Async context manager that re-issues a request until it answers with HTTP 200.

    `response_getter` is a zero-argument callable returning a fresh async context
    manager for the request (e.g. `lambda: client.get(url)`); it is called once per attempt.

    Keyword arguments:
      max_retries  -- number of additional attempts after the first one; `None` (default)
                      retries forever. When the limit is exhausted a `RuntimeError` is raised.
      retry_delay  -- seconds to wait between two attempts (default 2).
      allow_redirects / **kwargs -- accepted for call compatibility but not used.
    '''
    def __init__(self, response_getter: Callable, *, allow_redirects=True,
                 max_retries=None, retry_delay=2, **kwargs):
        self._response_getter = response_getter
        self._max_retries = max_retries
        self._retry_delay = retry_delay
        self._response_cm = None
        self._response = None

    async def __aenter__(self):
        retries_done = 0
        while True:
            self._response_cm = self._response_getter()
            self._response = await self._response_cm.__aenter__()
            if self._response.status == 200:
                return self._response

            status = self._response.status
            print("get new recaptcha")
            # release the failed response before waiting or giving up
            await self._response_cm.__aexit__(None, None, None)
            if self._max_retries is not None and retries_done >= self._max_retries:
                raise RuntimeError(
                    f'request still answers with HTTP status {status} after {retries_done} retries')
            retries_done += 1
            await asyncio.sleep(self._retry_delay)

    async def __aexit__(self, exc_type, exc_value, exc_tb):
        # forward the exit to the response that was handed out by __aenter__
        await self._response_cm.__aexit__(exc_type, exc_value, exc_tb)

#async with aiohttp.ClientSession() as client:
#    async with RetryingClientResponse(lambda: client.get('https://www.google.it')) as response:
#        print(await response.text())

In [41]:
import os
from datetime import datetime, timedelta
import fitz as PyMuPDF

# credentials come from the environment; both are None when the variables are unset
bggusername = os.environ.get('BGGUSERNAME')
bggpassword = os.environ.get('BGGPASSWORD')

async def boardgamegeek_authenticate(client, bggusername: str, bggpassword: str) -> str:
    '''
    Request the current-account endpoint with the credentials sent as cookies
    and return the `authToken` field of the JSON answer.
    '''
    account_url = 'https://boardgamegeek.com/api/accounts/current'
    credentials_cookie = f'bggusername={bggusername}; bggpassword={bggpassword}'
    async with client.get(account_url, headers={'cookie': credentials_cookie}) as response:
        account = await response.json()
        return account['authToken']

async def get_file_content(client, auth_token: str, file: BoardGameFileInfo) -> str:
    '''
    Download the given file and return its text content.

    First asks the download-urls endpoint (authorised with `auth_token`) where the file
    lives, then fetches the PDF and concatenates the text of every page, each followed by
    a newline. Both requests are wrapped in `RetryingClientResponse`.
    '''
    auth_headers = { 'Authorization': f'GeekAuth {auth_token}' }
    urls_endpoint = f'{internal_bgg_api}/files/downloadurls?ids={file.id}'

    def request_download_urls():
        return client.get(urls_endpoint, headers=auth_headers)

    async with RetryingClientResponse(request_download_urls) as response:
        payload = await response.json()
        # the endpoint answers with a site-relative url
        download_url = 'https://boardgamegeek.com' + payload['downloadUrls'][0]['url']
        print(download_url)
        async with RetryingClientResponse(lambda: client.get(download_url)) as pdf_response:
            pdf_bytes = await pdf_response.read()
            with PyMuPDF.open(stream=pdf_bytes, filetype="pdf") as doc:
                return "".join(page.get_text() + "\n" for page in doc)
                  
async with aiohttp.ClientSession() as client:
    try:
        if token_expire <= datetime.now():
            raise NameError
    except NameError:
        token_expire = datetime.now() + timedelta(hours=2)
        token = await boardgamegeek_authenticate(client, bggusername, bggpassword)
    file = list(await get_bgg_filelist(client, 1))[0]
    print(file)

    print(len(await get_file_content(client, token, file)))

BoardGameFileInfo(id='140639', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf')
https://boardgamegeek.com/file/download_redirect/074432fbd0f7715cba2cd94da332ce1d48102924d55eeffe/Die+Macher+Brief+by+Liumas+2015-01.pdf
9080


In [42]:
from dataclasses import dataclass
import xml.etree.ElementTree as xe

@dataclass
class BoardGameInfo:
    '''Basic metadata of a board game, parsed from an `<item>` XML element.'''
    id: int  # NOTE: stored as the raw `id` attribute string of the element, not converted
    name: str  # value of the `<name type="primary">` element
    numweights: int  # value of `statistics/ratings/numweights`
    averageweight: float  # value of `statistics/ratings/averageweight`

    @staticmethod
    def from_item(item: xe.Element):
        '''
        Build a `BoardGameInfo` from one `<item>` element.

        The element must carry an `id` attribute, a primary `<name>` and a
        `<statistics><ratings>` subtree with `numweights` and `averageweight`;
        a missing element surfaces as `AttributeError`, a missing attribute as `KeyError`.
        '''
        item_id = item.attrib['id']
        primary_name = item.find("./name[@type='primary']").attrib['value']
        ratings = item.find('./statistics/ratings')
        numweights = int(ratings.find('./numweights').attrib['value'])
        averageweight = float(ratings.find('./averageweight').attrib['value'])

        return BoardGameInfo(item_id, primary_name, numweights, averageweight)
    
@dataclass
class BoardGame:
    '''A board game's metadata paired with the text of its rulebook.'''
    info: BoardGameInfo  # metadata of the game
    rulebook: str  # text of the rulebook; the dataset code uses '' when no rulebook file was found
    

In [43]:
# !!! IMPORTANT !!! Keep the BoardGameGeek web page reloading in a browser (using the Easy Auto Refresh
# extension or a similar tool); this is necessary to re-validate the reCAPTCHA.
import aiohttp
import asyncio
from typing import List

bgg_api_root = 'https://boardgamegeek.com/xmlapi2'
BGG_THING_IDS_PER_CALL = 100
BGG_THING_ID_CALLS = 2

async def fill_dataset():
    '''
    Collect board games together with their rulebook text and write them to `data/dataset.txt`.

    Thing ids are requested in `BGG_THING_ID_CALLS` consecutive batches of
    `BGG_THING_IDS_PER_CALL` ids, starting from id 0. Every game that has at least one
    weight vote and a non-empty rulebook text is written as `str(BoardGame)` followed by a newline.

    Relies on the module-level `token` (auth token) being already defined and on the
    `data/` directory already existing. The output file is overwritten.
    Raises `RuntimeError` when the thing request does not answer with HTTP 200.
    '''
    async with aiohttp.ClientSession() as client:
        async def get_boardgamegeek_thing(*thing_ids) -> str:
            '''Return the raw XML describing the requested thing ids (board games only, with stats).'''
            url = f'{bgg_api_root}/thing?id={",".join(str(x) for x in thing_ids)}&type=boardgame&pagesize=100&stats=1'
            async with client.get(url) as response:
                # an explicit check instead of `assert`, which is stripped when Python runs with -O
                if response.status != 200:
                    raise RuntimeError(f'BGG thing request failed with HTTP status {response.status}')
                return (await response.read()).decode('utf-8')

        async def get_bg(bg_info: BoardGameInfo) -> BoardGame:
            '''Pair the game info with its rulebook text ('' when no rulebook file is found).'''
            thing_id = bg_info.id
            fileinfo = await get_rulebook_info(client, thing_id)
            return BoardGame(bg_info, '' if fileinfo is None else await get_file_content(client, token, fileinfo))

        async def get_boardgames(*thing_ids) -> List[BoardGame]:
            '''
            get the list of `BoardGame` that corresponds to the thing_ids requested. The list could be smaller than
            `len(thing_ids)` because the request is filtered using `type=boardgame`, there must be at least one vote
            on the weight, and games without rulebook text are skipped
            '''
            assert len(thing_ids) <= 100

            items = xe.fromstring(await get_boardgamegeek_thing(*thing_ids))
            bg_infos = [info for info in map(BoardGameInfo.from_item, items) if info.numweights > 0]

            bgs = []
            for bg_info in bg_infos:
                bg = await get_bg(bg_info)
                if bg.rulebook != '':
                    bgs.append(bg)
                    # pause only after a rulebook has actually been downloaded
                    await asyncio.sleep(1)

            return bgs

        # explicit encoding: rulebook text may contain characters that the platform
        # default encoding cannot represent
        with open('data/dataset.txt', 'w', encoding='utf-8') as f_out:
            for i in range(BGG_THING_ID_CALLS):
                first_id = BGG_THING_IDS_PER_CALL * i
                bgs = await get_boardgames(*range(first_id, first_id + BGG_THING_IDS_PER_CALL))
                # one entry per line; an empty batch writes nothing (no stray blank line)
                f_out.writelines(str(bg) + '\n' for bg in bgs)
                f_out.flush()
                
    
# Run the scraper. A bare top-level `await` is not valid in a plain Python script;
# NOTE(review): presumably this relies on the notebook kernel's top-level await support.
await fill_dataset()

https://boardgamegeek.com/file/download_redirect/3880d718154c1f0cfafadc286a74b9408dcd12e2235f094b/Die+Macher+Best+Options+by+Liumas+2015-01.pdf
https://boardgamegeek.com/file/download_redirect/1655d777351f9b5a4bf7568eb2423a86196bcaaaad26f795/The+Fool%27s+Gambit+Rules.pdf
https://boardgamegeek.com/file/download_redirect/eb7c3d1581f468ed8727e882ab3f806b504254e9fb493605/Samurai.pdf
https://boardgamegeek.com/file/download_redirect/541deb9f5024c30f59f5b78cf87b0e41ed8e14ea8168c316/Acquire_Information_Card_-_Colorful_Remake+with+information+on+back.pdf
get new recaptcha
get new recaptcha
get new recaptcha
get new recaptcha
get new recaptcha
https://boardgamegeek.com/file/download_redirect/b11fde4872722b63c1e218f355f0ffd023a75a8198a1e18d/S-Cathedral.pdf
https://boardgamegeek.com/file/download_redirect/d413f46069bb9ad7287fe6b63ca5e40f3747b174ccc3a22e/El+Caballero+-+Quick+Reference.pdf
https://boardgamegeek.com/file/download_redirect/8f7a5cfbc54e6c52efb14cd0f6d1e13ff49539d6aa1e3b91/Elfenland+-+Q

In [None]:
"""
async def get_boardgames(*thing_ids) -> List[BoardGame]:    
    '''
    get a list of `BoardGameInfo` that corresponds to the thing_ids requested. The list could be smaller than `len(thing_ids)`
    because filtered using `type=boardgame` and there must be at least one vote on the weight
    '''
    assert len(thing_ids) <= 100

    items = xe.fromstring(await get_boardgamegeek_thing(*thing_ids))
    bg_infos = list(filter(lambda x: x.numweights > 0, [BoardGameInfo.from_item(item) for item in items]))

    return filter(lambda x: x.rulebook != '', await asyncio.gather(*[get_bg(bg_info) for bg_info in bg_infos]))
        
file_lock = asyncio.Lock()
with open('data/dataset.txt', 'w') as f_out:
    async def write_bg_info_on_file(*thing_ids):
        bg_infos = await get_boardgames(*thing_ids)
        async with file_lock:
            f_out.write('\n'.join(map(str, bg_infos)) + '\n')

    await asyncio.gather(*[write_bg_info_on_file(
        *[x + BGG_THING_IDS_PER_CALL * i for x in range(BGG_THING_IDS_PER_CALL)]) for i in range(BGG_THING_ID_CALLS)])
"""
# This is the concurrent version of the code; I cannot run it because of server-side checks on the number of downloads.
# BGG enforces some limits in order to prevent automated downloads.

In [24]:
import fitz as PyMuPDF

#doc_url = 'http://www.arkansasrazorbacks.com/wp-content/uploads/2017/02/Miami-Ohio-Game-2.pdf'
doc_url = 'https://boardgamegeek.com/file/download_redirect/6675164890835b89e169967b0a897839fe2afb0d976c8339/Wildlife+Adventure+-+Quick+Reference.pdf'

async with aiohttp.ClientSession() as client:
    async with client.get(doc_url) as response:
        data = await response.read()
        with PyMuPDF.open(stream=data, filetype="pdf") as doc:
            text = ""
            for page in doc:
                text += page.get_text() + "\n"
            print(text)

Corrections or constructive criticisms?     Email: delta1119@earthlink.net
Michael Weston, 2004
Wildlife Adventure Quick Reference
Setup
1 – Shuffle animal cards and deal 12 or 8 cards for 2-3 or 4-6 players.
2 – Each player also gets 10 Travel Vouchers
3 – Deal 6 animal cards face-up along an edge of the board. Check that these 6 animals are at least 3 route extensions away
from the start point. Replace cards as necessary. These are the open discovery missions. (For new players, consider
marking the corresponding board spaces with a coin or chip.)
Turn Overview
Each player in turn must lay 1 route extension of any color, if possible.
- All 3 expeditions start from the same point, going out along 1 of the 4 available lines.
- Route extensions must extend from the last point reached – no branching off is allowed.
- Expeditions are not limited to one continent
- Travel Vouchers may be used to perform additional actions (see below)
- Red spaces: player must take the top chance card. Chanc