In [9]:
import bs4
from bs4 import BeautifulSoup
import requests
import re
import html
import pandas as pd
import nltk

# Web Scraping

In [2]:
def getGameList(genreString, numPages):
    """Collect game hrefs from BoardGameGeek browse pages for one subdomain.

    Parameters
    ----------
    genreString : str
        Path segment placed between the host and '/browse/boardgame/page/',
        e.g. "thematic" or "strategygames".
    numPages : int
        Number of listing pages to request, starting at page 1.

    Returns
    -------
    list of str
        The href of the thumbnail link of every listing row, in page order.
        Pages or rows that lack the expected markup contribute nothing.

    Raises
    ------
    requests.exceptions.RequestException
        If a page cannot be fetched (including a timeout).
    """
    games_list = []
    for i in range(1, numPages + 1):
        # A timeout keeps one stalled connection from hanging the whole scrape.
        source = requests.get(
            'https://boardgamegeek.com/'+genreString+'/browse/boardgame/page/' + str(i),
            timeout=30,
        )
        soup = BeautifulSoup(source.text, 'lxml')

        table = soup.find('table')
        if table is None or table.tr is None:
            # No listing table on this page: nothing to collect.
            continue

        # The first <tr> is skipped; only its following siblings are read.
        for sec in table.tr.next_siblings:
            # Siblings include whitespace strings and possibly comments; only
            # real tags can be searched. isinstance also rejects NavigableString
            # subclasses, which an exact type() comparison would let through.
            if not isinstance(sec, bs4.element.Tag):
                continue
            cell = sec.find('td', class_='collection_thumbnail')
            if cell is None or cell.a is None:
                # Row without a thumbnail link: skip it.
                continue
            href = cell.a.get('href')
            if href is not None:
                games_list.append(href)
    return games_list

In [3]:
# Fetch page 1 of the "thematic" browse listing and walk every sibling that
# follows the first <tr> of the first table, reading the href of each row's
# thumbnail link. Nothing is accumulated: after the loop only `temp` (the last
# href read) remains.
# NOTE(review): looks like leftover scratch work for developing the row parsing
# -- confirm whether this cell is still needed.
source = requests.get('https://boardgamegeek.com/thematic/browse/boardgame/page/1')
soup = BeautifulSoup(source.text, 'lxml')
symbolslist = soup.find('table').tr.next_siblings
for sec in symbolslist:
    # Skip the plain text nodes that sit between row tags.
    if type(sec) is not bs4.element.NavigableString:
        temp = sec.find('td', class_='collection_thumbnail').a.attrs['href']

In [4]:
# Request 50 browse pages for each of six subdomains and report how many game
# hrefs each one produced. Every call performs one HTTP request per page.
thematic_list = getGameList("thematic",50)
print("Thematic:", len(thematic_list))

child_list = getGameList("childrensgames",50)
print("Child:", len(child_list))

fam_list = getGameList("familygames",50)
print("Family:", len(fam_list))

strat_list = getGameList("strategygames",50)
print("Strategy:", len(strat_list))

war_list = getGameList("wargames",50)
print("War:", len(war_list))

abs_list = getGameList("abstracts",50)
print("Abstract:", len(abs_list))

Thematic: 1565
Child: 4038
Family: 2608
Strategy: 2840
War: 5000
Abstract: 4669


In [5]:
def cleanhtml(raw_html):
    """Return `raw_html` with tags and character entities deleted.

    Removes every `<...>` span (shortest match, within a single line) and
    every `&name;`, `&#123;` or `&#x1f;` style entity written with lowercase
    letters/digits. Matches are deleted outright, not replaced by a space.
    """
    tag_or_entity = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    return tag_or_entity.sub('', raw_html)

def getGameDF(gamesList):
    """Download details for each game href and return them as a DataFrame.

    Parameters
    ----------
    gamesList : iterable of str
        Hrefs shaped like '/boardgame/<id>/<slug>'; the third '/'-separated
        piece is used as the game id in the XML API URL.

    Returns
    -------
    pandas.DataFrame
        Columns "name", "description", "types", "categories", one row per game
        whose response contained both a primary name and a description.
        "types" and "categories" hold lists of strings.

    Raises
    ------
    requests.exceptions.RequestException
        If a request fails (including a timeout).

    Notes
    -----
    One HTTP request is made per game and each href is printed as it is
    processed. Games whose response lacks a primary name or a description are
    reported and skipped instead of reusing the previous game's values.
    """
    columns = ["name", "description", "types", "categories"]
    # Rows are gathered in a list and turned into a frame once at the end;
    # appending to a DataFrame row by row copies data on every insert.
    records = []
    for el in gamesList:
        # A timeout keeps one stalled connection from hanging the whole scrape.
        newSource = requests.get(
            'https://www.boardgamegeek.com/xmlapi/boardgame/' + el.split('/')[2] + '?stats=1',
            timeout=30,
        )
        print(el)
        soup = BeautifulSoup(newSource.text, 'html.parser')

        # Game name: reset per game so a response without a primary name can
        # never inherit the name found for the previous game.
        name = None
        for names in soup.find_all('name'):
            if names.get('primary') == 'true':
                name = names.encode_contents().decode("utf-8")

        # Description
        description_tag = soup.find('description')
        if name is None or description_tag is None:
            print("Skipping " + el + ": response has no primary name or description")
            continue
        description = cleanhtml(html.unescape(description_tag.encode_contents().decode("utf-8")))

        # Game Types: only the first space-separated word of each entry is kept.
        types = [domain.encode_contents().decode("utf-8").split(" ")[0] for domain in soup.find_all('boardgamesubdomain')]

        # Game Categories: likewise truncated to the first word.
        # NOTE(review): this shortens multi-word categories to one word --
        # confirm that is intended before relying on the category values.
        categories = [cat.encode_contents().decode("utf-8").split(" ")[0] for cat in soup.find_all('boardgamecategory')]

        records.append({
            "name": name,
            "description": description,
            "types": types,
            "categories": categories,
        })

    return pd.DataFrame(records, columns=columns)

In [6]:
# Build one detail frame per subdomain from the href lists and report the row
# counts. getGameDF issues one HTTP request per href and prints each href as
# it goes, so this cell is slow and produces a long log.
theme_df = getGameDF(thematic_list)
print("Thematic:", len(theme_df))
child_df = getGameDF(child_list)
print("Children's:", len(child_df))
fam_df = getGameDF(fam_list)
print("Family:", len(fam_df))
strat_df = getGameDF(strat_list)
print("Strategy:", len(strat_df))
war_df = getGameDF(war_list)
print("War:", len(war_df))
abs_df = getGameDF(abs_list)
print("Abstract:", len(abs_df))

/boardgame/174430/gloomhaven
/boardgame/161936/pandemic-legacy-season-1
/boardgame/291457/gloomhaven-jaws-lion
/boardgame/233078/twilight-imperium-fourth-edition
/boardgame/187645/star-wars-rebellion
/boardgame/115746/war-ring-second-edition
/boardgame/266507/clank-legacy-acquisitions-incorporated
/boardgame/167355/nemesis
/boardgame/192135/too-many-bones
/boardgame/205637/arkham-horror-card-game
/boardgame/55690/kingdom-death-monster
/boardgame/316554/dune-imperium
/boardgame/221107/pandemic-legacy-season-2
/boardgame/314040/pandemic-legacy-season-0
/boardgame/96848/mage-knight-board-game
/boardgame/205059/mansions-madness-second-edition
/boardgame/180263/7th-continent
/boardgame/209010/mechs-vs-minions
/boardgame/264220/tainted-grail-fall-avalon
/boardgame/164153/star-wars-imperial-assault
/boardgame/235802/too-many-bones-undertow
/boardgame/253344/cthulhu-death-may-die
/boardgame/121921/robinson-crusoe-adventures-cursed-island
/boardgame/269385/lord-rings-journeys-middle-earth
/boar

KeyboardInterrupt: 

In [None]:
# Stack the six per-subdomain frames, keep the first row seen for each game
# name, and renumber the rows from 0.
df = (
    pd.concat([theme_df,child_df,fam_df,strat_df,war_df,abs_df])
    .drop_duplicates(subset=['name'])
    .reset_index(drop=True)
)
df

In [None]:
# Write the combined frame to 'BGGDataGenre.pkl' in the working directory
# (presumably so it can be reloaded without repeating the scrape).
pd.to_pickle(df, 'BGGDataGenre.pkl')

In [15]:
# Load the frame from 'BGGDataGenre.pkl' in the working directory.
# NOTE: unpickling can execute arbitrary code -- only load a file you produced
# yourself or otherwise trust.
df = pd.read_pickle('BGGDataGenre.pkl')

In [16]:
# Flatten the per-game lists in df['types'] into one list of genre labels.
# A comprehension does this in a single pass; sum(lists, []) builds a fresh,
# ever-longer list at every step.
all_genres = [genre for game_types in df['types'] for genre in game_types]
# Show the distinct labels present in the data.
print(set(all_genres))

{"Children's", 'Wargames', 'Customizable', 'Thematic', 'Strategy', 'Abstract', 'Party', 'Family'}


In [17]:
# Count how often each genre label occurs. Note that `all_genres` is rebound
# here from the flat list of labels to the nltk.FreqDist built from it.
all_genres = nltk.FreqDist(all_genres)

# create dataframe
# One row per distinct label: 'Genre' holds the label, 'Count' its frequency.
all_genres_df = pd.DataFrame({'Genre': list(all_genres.keys()),
                              'Count': list(all_genres.values())})

In [18]:
def clean_df(dataframe):
    """Return the rows of `dataframe` whose types are all approved genres.

    A row is kept only if every entry of its 'types' list is one of
    'Abstract', "Children's", 'Family', 'Strategy', 'Thematic' or 'Wargames'.
    Rows with an empty 'types' list are kept.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must have a 'types' column whose values are iterables of strings.
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame containing the kept rows in their original order, with
        the index renumbered from 0.
    """
    approvedGenres = ['Abstract',"Children's",'Family','Strategy','Thematic','Wargames']
    approved = set(approvedGenres)

    # Decide every row independently (including the first one) and filter once.
    # Dropping rows one at a time while walking the frame makes the row counter
    # drift and can remove the wrong row when a game has several unapproved types.
    keep = pd.Series(
        [all(genre in approved for genre in game_types) for game_types in dataframe['types']],
        index=dataframe.index,
        dtype=bool,
    )
    return dataframe.loc[keep].reset_index(drop=True)

In [19]:
# Filter the frame through clean_df and renumber the remaining rows from 0.
df = clean_df(df).reset_index(drop=True)
df

Unnamed: 0,name,description,types,categories
0,Gloomhaven,Gloomhaven is a game of Euro-inspired tactica...,"[Strategy, Thematic]","[Adventure, Exploration, Fantasy, Fighting, Mi..."
1,Pandemic Legacy: Season 1,Pandemic Legacy is a co-operative campaign gam...,"[Strategy, Thematic]","[Environmental, Medical]"
2,Gloomhaven: Jaws of the Lion,Gloomhaven: Jaws of the Lion is a standalone g...,"[Strategy, Thematic]","[Adventure, Exploration, Fantasy, Fighting, Mi..."
3,Twilight Imperium: Fourth Edition,Twilight Imperium (Fourth Edition) is a game o...,"[Strategy, Thematic]","[Civilization, Economic, Exploration, Negotiat..."
4,Star Wars: Rebellion,Star Wars: Rebellion is a board game of epic c...,[Thematic],"[Civil, Fighting, Miniatures, Movies, Science,..."
...,...,...,...,...
18358,Knights Poker,Knights Poker is the new board game from the w...,[Abstract],[Abstract]
18359,Gekitai,Gekitai (Repel or Push Away) is a 3-in-a-row g...,[Abstract],"[Abstract, Print]"
18360,Gekitai²,Gekitai (Repel or Push Away) is a 3-in-a-row ...,[Abstract],"[Abstract, Print]"
18361,Digit Draughts,Digit Draughts is a set of three related games...,[Abstract],"[Abstract, Number]"


In [20]:
def clean_desc(raw_html):
    """Normalise text to lowercase ASCII words separated by single spaces.

    Every character other than a-z/A-Z becomes a space, runs of whitespace
    collapse to one space, leading/trailing whitespace is removed, and the
    result is lowercased.
    """
    letters_only = re.sub("[^a-zA-Z]"," ",raw_html)
    words = letters_only.split()
    return ' '.join(words).lower()

In [21]:
# Normalise every description with clean_desc, replacing the column's contents.
df['description'] = df['description'].apply(clean_desc)
df

Unnamed: 0,name,description,types,categories
0,Gloomhaven,gloomhaven is a game of euro inspired tactical...,"[Strategy, Thematic]","[Adventure, Exploration, Fantasy, Fighting, Mi..."
1,Pandemic Legacy: Season 1,pandemic legacy is a co operative campaign gam...,"[Strategy, Thematic]","[Environmental, Medical]"
2,Gloomhaven: Jaws of the Lion,gloomhaven jaws of the lion is a standalone ga...,"[Strategy, Thematic]","[Adventure, Exploration, Fantasy, Fighting, Mi..."
3,Twilight Imperium: Fourth Edition,twilight imperium fourth edition is a game of ...,"[Strategy, Thematic]","[Civilization, Economic, Exploration, Negotiat..."
4,Star Wars: Rebellion,star wars rebellion is a board game of epic co...,[Thematic],"[Civil, Fighting, Miniatures, Movies, Science,..."
...,...,...,...,...
18358,Knights Poker,knights poker is the new board game from the w...,[Abstract],[Abstract]
18359,Gekitai,gekitai repel or push away is a in a row game ...,[Abstract],"[Abstract, Print]"
18360,Gekitai²,gekitai repel or push away is a in a row game ...,[Abstract],"[Abstract, Print]"
18361,Digit Draughts,digit draughts is a set of three related games...,[Abstract],"[Abstract, Number]"
