In [33]:
import re
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional

# Matches one markup tag such as `<p>` or `</b>`; used to strip the HTML out
# of the rendered file descriptions.
tag_removal_pattern = re.compile('<[^<]+?>')

@dataclass
class BoardGameFileInfo:
    """Metadata of one file attached to a board game page.

    Fields:
        id: value of the ``filepageid`` entry, passed through unconverted.
            NOTE(review): annotated as ``int`` but the service appears to
            deliver it as a string -- confirm before relying on the type.
        name: file name without its extension.
        description: rendered description with the markup tags removed.
        extension: lower-cased extension without the dot, or ``''`` when the
            file name contains no dot.
    """
    id: int
    name: str
    description: str
    extension: str

    @staticmethod
    def from_file_info(file_info: Dict[str, Any]) -> "BoardGameFileInfo":
        """Build a `BoardGameFileInfo` from one raw file entry.

        Args:
            file_info: mapping that provides ``filepageid``, ``filename`` and
                ``description['rendered']``.

        Returns:
            The populated `BoardGameFileInfo`.

        Raises:
            KeyError: if one of the keys listed above is missing.
        """
        rendered_description = file_info['description']['rendered']
        # the description contains HTML tags that are removed with this regex
        description = tag_removal_pattern.sub('', rendered_description)

        raw_filename = file_info['filename']
        if '.' in raw_filename:
            # split on the LAST dot only, so 'rules.v2.pdf' -> ('rules.v2', 'pdf')
            filename, extension = raw_filename.rsplit('.', 1)
        else:
            # a name without any dot used to crash the tuple unpacking
            filename, extension = raw_filename, ''

        return BoardGameFileInfo(file_info['filepageid'],
                                 filename,
                                 description,
                                 extension.lower()
                                )

In [34]:
import aiohttp
from typing import Any, Dict, List

async def get_boardgamegeek_filelist(client, thing_id: int) -> List[BoardGameFileInfo]:
    """Fetch the PDF files listed for a thing.

    Only the first result page is requested (``pageid=1``, ``showcount=25``,
    ``sort=hot``), so at most 25 entries are inspected.

    Args:
        client: HTTP client whose ``get(url)`` is an async context manager
            yielding the response (an ``aiohttp.ClientSession`` in this notebook).
        thing_id: id of the thing whose files are listed.

    Returns:
        A list (not a one-shot iterator) with the entries whose extension is
        ``'pdf'``, in the order delivered by the service.

    Raises:
        aiohttp.ClientResponseError: if the service answers with an HTTP error status.
        KeyError: if the JSON payload has no ``files`` entry.
    """
    url = f"https://api.geekdo.com/api/files?ajax=1&nosession=1&objectid={thing_id}&objecttype=thing&pageid=1&showcount=25&sort=hot&languageid=2184"
    async with client.get(url) as response:
        # fail with a clear HTTP error instead of a confusing JSON/KeyError later on
        response.raise_for_status()
        content = await response.json()

    file_infos = (BoardGameFileInfo.from_file_info(file) for file in content['files'])
    # materialise the result: the declared return type is a list and callers
    # may need to iterate it more than once or take its length
    return [file_info for file_info in file_infos if file_info.extension == 'pdf']

# Demo: fetch and print the PDF file entries of thing 1.
# The top-level `async with` / `await` only works where top-level await is
# supported (e.g. the notebook kernel), not in a plain Python script.
async with aiohttp.ClientSession() as client:
    print(list(await get_boardgamegeek_filelist(client, 1)))

[BoardGameFileInfo(id='20733', name='Die Macher Brief by Liumas 2015-01', description="Turn sequence, setup, and symbol descriptions.This new 'FINAL' version is all PDF and easier to print and cut out using the guidelines.", extension='pdf'), BoardGameFileInfo(id='23411', name='Die Macher Condensed Rules by Liumas 2015-01', description='Complete 3rd Edition rules condensed onto 6 pages, sans Comic Sans.', extension='pdf'), BoardGameFileInfo(id='44870', name='Die Macher player aide_BW', description='Die Macher player aide black & white', extension='pdf'), BoardGameFileInfo(id='23811', name='Die Macher Best Options by Liumas 2015-01', description='The standard official rules updates for 2nd and 3rd editions.', extension='pdf'), BoardGameFileInfo(id='12795', name='playeraid front', description='Summary of a round, and illustration of major items in game..perfect for learners. Print on A4 landscape, and printplayeraidback on reverse', extension='pdf'), BoardGameFileInfo(id='29520', name='D

In [36]:
from string import punctuation
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# spaCy pipeline loaded once at module level so every tokenizer call reuses it.
nlp = spacy.load('en_core_web_sm')

async def get_rulebook(client, thing_id: int) -> Optional[BoardGameFileInfo]:
    """Pick the PDF of a thing that best matches a rulebook-like query.

    Every PDF returned by `get_boardgamegeek_filelist` is turned into a TF-IDF
    vector built from its name and description; the file whose vector has the
    highest cosine similarity with a fixed query is returned.

    Args:
        client: HTTP client forwarded to `get_boardgamegeek_filelist`.
        thing_id: id of the thing whose files are ranked.

    Returns:
        The best-scoring `BoardGameFileInfo`, or ``None`` when the thing has
        no PDF file. On equal scores the earliest file in the list wins.
    """
    def tokenizer(raw_doc: str) -> List[str]:
        """Lemmatise `raw_doc`, dropping punctuation-only lemmas and stop words."""
        lemmas = (token.lemma_ for token in nlp(raw_doc))
        return [
            lemma for lemma in lemmas
            # `lemma in punctuation` alone is a substring test and lets runs
            # such as '...' or '--' through, so every character is checked
            if not all(char in punctuation for char in lemma)
            and not nlp.vocab[lemma].is_stop
        ]

    raw_docs = list(await get_boardgamegeek_filelist(client, thing_id))
    if not raw_docs:
        return None

    # token_pattern=None: a custom tokenizer is supplied, so the default
    # pattern is unused; passing None avoids scikit-learn's warning about it
    vectorizer = TfidfVectorizer(strip_accents='ascii', lowercase=True,
                                 tokenizer=tokenizer, token_pattern=None)
    doc_vectors = vectorizer.fit_transform(f'{x.name} {x.description}' for x in raw_docs)

    query_vector = vectorizer.transform(["revised official rule rulebook update new"])
    scores = cosine_similarity(query_vector, doc_vectors)[0]

    # argmax yields the first index holding the maximum, which is the same
    # element a stable sort by descending score would put first
    return raw_docs[int(scores.argmax())]

# Demo: print the file selected as rulebook for thing 1 (prints None when the
# thing has no PDF file). Relies on top-level await support.
async with aiohttp.ClientSession() as client:
    print(await get_rulebook(client, 1))

BoardGameFileInfo(id='23811', name='Die Macher Best Options by Liumas 2015-01', description='The standard official rules updates for 2nd and 3rd editions.', extension='pdf')


In [38]:
from dataclasses import dataclass
import xml.etree.ElementTree as xe

@dataclass
class BoardGameInfo:
    """Weight statistics and selected rulebook of one board game.

    Fields:
        id: value of the item's ``id`` attribute, as an integer.
        name: value of the item's primary ``name`` element.
        numweights: value of ``statistics/ratings/numweights``, as an integer.
        averageweight: value of ``statistics/ratings/averageweight``, as a float.
        rulebook: file chosen by `get_rulebook`, or ``None`` when it finds none.
    """
    id: int
    name: str
    numweights: int
    averageweight: float
    rulebook: Optional[BoardGameFileInfo]

    @staticmethod
    async def build(item: xe.Element, client) -> "BoardGameInfo":
        """Build a `BoardGameInfo` from one ``item`` element of an API response.

        Args:
            item: XML element carrying an ``id`` attribute, a primary ``name``
                child and a ``statistics/ratings`` subtree.
            client: HTTP client forwarded to `get_rulebook`.

        Returns:
            The populated `BoardGameInfo`, with numeric fields converted from
            their XML string form to the declared ``int`` / ``float`` types.

        Raises:
            AttributeError: if the primary name or one of the rating elements
                is missing from `item`.
            ValueError: if one of the numeric attributes cannot be parsed.
        """
        # XML attributes are always strings; convert so the stored values
        # match the declared field types (and `get_rulebook`'s int parameter)
        thing_id = int(item.attrib['id'])
        name = item.find("./name[@type='primary']").attrib['value']
        ratings = item.find('./statistics/ratings')

        return BoardGameInfo(thing_id,
                             name,
                             int(ratings.find('./numweights').attrib['value']),
                             float(ratings.find('./averageweight').attrib['value']),
                             await get_rulebook(client, thing_id)
                            )
    

In [39]:
import aiohttp
import asyncio

# Base URL of the XML API used to download the things.
bgg_api_root = 'https://boardgamegeek.com/xmlapi2'

async def fill_dataset(output_path='data/dataset.txt', n_batches=2, batch_size=100):
    """Download board game information in batches and write it to a text file.

    Batch ``i`` requests the ids ``i * batch_size .. (i + 1) * batch_size - 1``;
    with the defaults these are 0..99 and 100..199. All batches are fetched
    concurrently and each `BoardGameInfo` is written as one line (its ``str``).
    NOTE(review): the id sequence starts at 0 -- confirm that 0 is a meaningful
    thing id, otherwise the first batch carries one wasted slot.

    Args:
        output_path: file to (over)write; missing parent directories are created.
        n_batches: number of batches to download.
        batch_size: number of ids per request, between 1 and 100.

    Raises:
        ValueError: if `batch_size` is outside 1..100.
        RuntimeError: if the API answers with a status other than 200.
    """
    if not 1 <= batch_size <= 100:
        raise ValueError(f'batch_size must be between 1 and 100, got {batch_size}')

    output_file = Path(output_path)
    # a fresh checkout may not have the output directory yet
    output_file.parent.mkdir(parents=True, exist_ok=True)

    async with aiohttp.ClientSession() as client:
        async def get_boardgamegeek_thing(*thing_ids):
            '''Return the decoded XML document describing the requested thing ids.'''
            url = f'{bgg_api_root}/thing?id={",".join(str(x) for x in thing_ids)}&type=boardgame&pagesize=100&stats=1'
            async with client.get(url) as response:
                # explicit check instead of `assert`, which is stripped under `python -O`
                if response.status != 200:
                    raise RuntimeError(f'unexpected HTTP status {response.status} for {url}')
                return (await response.read()).decode('utf-8')

        async def get_bg_info(*thing_ids):
            '''
            Get the list of `BoardGameInfo` for the requested thing_ids. The list may be
            shorter than `len(thing_ids)`: the request asks for `type=boardgame` only and
            one entry is built per item present in the response.
            NOTE: games without any weight vote are NOT filtered out here.
            '''
            if len(thing_ids) > 100:
                raise ValueError(f'at most 100 ids per request, got {len(thing_ids)}')

            items = xe.fromstring(await get_boardgamegeek_thing(*thing_ids))
            return await asyncio.gather(*(BoardGameInfo.build(item, client) for item in items))

        # explicit encoding: names and descriptions may contain non-ASCII
        # characters and the platform default encoding is not always UTF-8
        with output_file.open('w', encoding='utf-8') as f_out:
            async def write_bg_info_on_file(*thing_ids):
                '''Fetch one batch and append one line per board game to the output file.'''
                bg_infos = await get_bg_info(*thing_ids)
                # No lock needed: the write below contains no `await`, so the
                # event loop cannot interleave another batch's write with it.
                # Writing line by line also avoids a stray blank line when a
                # batch yields no item.
                f_out.writelines(f'{bg_info}\n' for bg_info in bg_infos)

            await asyncio.gather(*(
                write_bg_info_on_file(*range(first_id, first_id + batch_size))
                for first_id in range(0, n_batches * batch_size, batch_size)
            ))
    
# Run the download; this performs live network requests and overwrites the
# dataset file. Relies on top-level await support.
await fill_dataset()