In [66]:
from dataclasses import dataclass
import re
from typing import Any, Dict

# Matches one markup tag: a '<', the shortest run of non-'<' characters, then '>'.
tag_removal_pattern = re.compile('<[^<]+?>')

@dataclass
class BoardGameFileInfo:
    '''Metadata of one file attached to a board game, built from a file-list entry.'''
    # File identifier, stored exactly as the payload's 'fileid' provides it.
    # NOTE(review): the sample output in this notebook shows a string such as
    # '140639' although the field is annotated as int -- confirm intended type.
    id: int
    # File name without its extension
    name: str
    # Rendered description with the markup tags stripped
    description: str
    # Lower-cased extension without the dot; '' when the file name has none
    extension: str

    @staticmethod
    def from_file_info(file_info: Dict[str, Any]) -> 'BoardGameFileInfo':
        '''Build a `BoardGameFileInfo` from one entry of the file-list payload.

        Reads `file_info['fileid']`, `file_info['filename']` and
        `file_info['description']['rendered']`; a missing key raises `KeyError`.
        The file name is split on its last dot. A name without any dot is kept
        whole and gets an empty extension instead of raising `ValueError`.
        '''
        rendered_description = file_info['description']['rendered']
        # the description contains HTML tags that are removed with this regex
        description = tag_removal_pattern.sub('', rendered_description)
        name, dot, extension = file_info['filename'].rpartition('.')
        if not dot:
            # Without a dot, rpartition leaves the whole string in the last slot
            name, extension = extension, ''
        return BoardGameFileInfo(file_info['fileid'],
                                 name,
                                 description,
                                 extension.lower()
                                )

In [67]:
import aiohttp
from typing import List

# Root of the JSON API that the file endpoints in this notebook are built on
internal_bgg_api = 'https://api.geekdo.com/api'

async def get_bgg_filelist(client, thing_id: int) -> List[BoardGameFileInfo]:
    '''Return the PDF files listed for the thing `thing_id`.

    `client` must offer `get(url)` as an async context manager whose response
    has an awaitable `json()` (an `aiohttp.ClientSession` in this notebook).
    Only the first result page is requested (`pageid=1`, `showcount=25`), so
    at most 25 entries are inspected before the PDF filter is applied.

    A real list is returned, so the result can be indexed, measured and
    iterated more than once.
    '''
    url = f"{internal_bgg_api}/files?ajax=1&nosession=1&objectid={thing_id}&objecttype=thing&pageid=1&showcount=25&sort=hot&languageid=2184"
    async with client.get(url) as response:
        content = await response.json()
        file_infos = (BoardGameFileInfo.from_file_info(file) for file in content['files'])
        # Extensions are lower-cased by from_file_info, so 'PDF' matches as well
        return [info for info in file_infos if info.extension == 'pdf']

# Demo cell: print the first three PDF entries listed for thing id 1.
# The top-level `async with` relies on the notebook's support for await in cells.
async with aiohttp.ClientSession() as client:
    print(list(await get_bgg_filelist(client, 1))[0:3])

[BoardGameFileInfo(id='140639', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf'), BoardGameFileInfo(id='139190', name='Die Macher Condensed Rules by Liumas 2015-01', description='Complete 3rd Edition rules condensed onto 6 pages, sans Comic Sans.', extension='pdf'), BoardGameFileInfo(id='62318', name='Die Macher player aide_BW', description='Die Macher player aide black & white', extension='pdf')]


In [68]:
from string import punctuation
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# spaCy pipeline used to lemmatise file names and descriptions
nlp = spacy.load('en_core_web_sm')

async def get_rulebook_info(client, thing_id: int) -> BoardGameFileInfo:
    '''return the rulebook info or None if the boardgame has no rulebook

    The PDF files of `thing_id` are fetched with `get_bgg_filelist`, each file's
    "name description" text is turned into a TF-IDF vector, and the file whose
    vector is most similar (cosine) to a fixed rulebook-like query is returned.
    `None` is returned only when the file list contains no PDF at all.

    NOTE(review): the best-scoring file is returned even when its similarity
    is 0, i.e. when no file shares a term with the query -- confirm intended.
    '''
    def tokenizer(raw_doc: str):
        # Lemmatise, drop lemmas found in string.punctuation, then drop stop words
        doc = nlp(raw_doc)
        lemmas = (token.lemma_ for token in doc if token.lemma_ not in punctuation)
        return [lemma for lemma in lemmas if not nlp.vocab[lemma].is_stop]

    # get_bgg_filelist is the file-list helper defined in this notebook
    raw_docs = list(await get_bgg_filelist(client, thing_id))
    if not raw_docs:
        return None
    vectorizer = TfidfVectorizer(strip_accents='ascii', lowercase=True, tokenizer=tokenizer)
    doc_vectors = vectorizer.fit_transform(f'{x.name} {x.description}' for x in raw_docs)

    query_vector = vectorizer.transform(["revised official rule rulebook update new"])
    scores = cosine_similarity(query_vector, doc_vectors)[0]

    # max() keeps the first of several equally scored files, exactly like
    # taking the head of a stable sort by descending score
    best_index = max(range(len(raw_docs)), key=lambda i: scores[i])
    return raw_docs[best_index]

# Demo cell: print the file that get_rulebook_info selects for thing id 1.
async with aiohttp.ClientSession() as client:
    print(await get_rulebook_info(client, 1))

BoardGameFileInfo(id='139185', name='Die Macher Best Options by Liumas 2015-01', description='The standard official rules updates for 2nd and 3rd editions.', extension='pdf')


In [69]:
import os
from datetime import datetime, timedelta

# Credentials come from the environment; each is None when its variable is unset
bggusername = os.environ.get('BGGUSERNAME')
bggpassword = os.environ.get('BGGPASSWORD')

async def boardgamegeek_authenticate(client, bggusername: str, bggpassword: str) -> str:
    '''Request the current account and return its auth token.

    The credentials are sent as a `cookie` header on a GET request made
    through `client`; the `authToken` entry of the JSON reply is returned.
    A reply without that entry raises `KeyError`.
    '''
    credentials_cookie = f'bggusername={bggusername}; bggpassword={bggpassword}'
    request = client.get(
        'https://boardgamegeek.com/api/accounts/current',
        headers={'cookie': credentials_cookie},
    )
    async with request as response:
        payload = await response.json()
    return payload['authToken']

async def get_file_content(client, auth_token: str, file: BoardGameFileInfo) -> str:
    '''Return the absolute download URL of `file`.

    Despite its name, this function does not download anything: it asks the
    `downloadurls` endpoint for `file.id`, authorised with `auth_token`, and
    prefixes the first returned relative URL with the site root.
    '''
    request = client.get(
        f'{internal_bgg_api}/files/downloadurls?ids={file.id}',
        headers={'Authorization': f'GeekAuth {auth_token}'},
    )
    async with request as response:
        payload = await response.json()
    # Only one id is requested, so the first entry is the one for `file`
    relative_url = payload['downloadUrls'][0]['url']
    return 'https://boardgamegeek.com' + relative_url
                  
async with aiohttp.ClientSession() as client:
    try:
        if token_expire <= datetime.now():
            raise NameError
    except NameError:
        token_expire = datetime.now() + timedelta(hours=2)
        token = await boardgamegeek_authenticate(client, bggusername, bggpassword)
    file = list(await get_boardgamegeek_filelist(client, 1))[0]
    print(await get_file_content(client, token, file))

https://boardgamegeek.com/file/download_redirect/92b54722954ab9ac413689c8793f78b67e10682c02c552f8/Die+Macher+Brief+by+Liumas+2015-01.pdf


In [70]:
from dataclasses import dataclass
import xml.etree.ElementTree as xe

@dataclass
class BoardGameInfo:
    '''Identity and weight statistics of one board game read from an XML item.'''
    # Value of the item's `id` attribute, kept as the raw attribute string
    id: int
    # Value of the `name` element whose type is 'primary'
    name: str
    # Parsed from `statistics/ratings/numweights`
    numweights: int
    # Parsed from `statistics/ratings/averageweight`
    averageweight: float

    @staticmethod
    def from_item(item: xe.Element):
        '''Build a `BoardGameInfo` from one `item` element.

        The element must carry an `id` attribute, a primary `name` child and a
        `statistics/ratings` subtree with `numweights` and `averageweight`;
        a missing element surfaces as `AttributeError` on `None`.
        '''
        thing_id = item.attrib['id']
        primary_name = item.find("./name[@type='primary']").attrib['value']
        ratings = item.find('./statistics/ratings')
        vote_count = int(ratings.find('./numweights').attrib['value'])
        mean_weight = float(ratings.find('./averageweight').attrib['value'])

        return BoardGameInfo(
            id=thing_id,
            name=primary_name,
            numweights=vote_count,
            averageweight=mean_weight,
        )
    
@dataclass
class BoardGame:
    '''A board game's statistics paired with a reference to its rulebook.'''
    # Statistics of the game
    info: BoardGameInfo
    # `fill_dataset` stores the value returned by `get_file_content` here
    # (a download URL), or '' when no rulebook file was found
    rulebook: str
    

In [71]:
import aiohttp
import asyncio
from typing import List

bgg_api_root = 'https://boardgamegeek.com/xmlapi2'

async def fill_dataset():
    async with aiohttp.ClientSession() as client:
        async def get_boardgamegeek_thing(*thing_ids) -> str:
            url = f'{bgg_api_root}/thing?id={",".join(str(x) for x in thing_ids)}&type=boardgame&pagesize=100&stats=1'
            async with client.get(url) as response:
                assert response.status == 200
                return (await response.read()).decode('utf-8')
            
        async def get_bg(bg_info: BoardGameInfo) -> BoardGame:
            thing_id = bg_info.id
            fileinfo = await get_rulebook_info(client, thing_id)
            return BoardGame(bg_info, '' if fileinfo is None else await get_file_content(client, token, fileinfo))
        
        async def get_boardgames(*thing_ids) -> List[BoardGame]:    
            '''
            get a list of `BoardGameInfo` that corresponds to the thing_ids requested. The list could be smaller than `len(thing_ids)`
            because filtered using `type=boardgame` and there must be at least one vote on the weight
            '''
            assert len(thing_ids) <= 100

            items = xe.fromstring(await get_boardgamegeek_thing(*thing_ids))
            bg_infos = list(filter(lambda x: x.numweights > 0, [BoardGameInfo.from_item(item) for item in items]))
            
            return filter(lambda x: x.rulebook != '', await asyncio.gather(*[get_bg(bg_info) for bg_info in bg_infos]))
        
        file_lock = asyncio.Lock()
        with open('data/dataset.txt', 'w') as f_out:
            async def write_bg_info_on_file(*thing_ids):
                bg_infos = await get_boardgames(*thing_ids)
                async with file_lock:
                    f_out.write('\n'.join(map(str, bg_infos)) + '\n')
                    
            await asyncio.gather(*[write_bg_info_on_file(*[x + 100 * i for x in range(100)]) for i in range(2)])
                
    
await fill_dataset()