In [33]:
import importlib
import os
import re

import pandas as pd
import requests
from bs4 import BeautifulSoup

import scraper_utils as su
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [35]:
# Call the marketplace helper from scraper_utils for the single id 271324; the
# value it returns is displayed as this cell's output.
# NOTE(review): the meaning of the two returned numbers is defined in
# scraper_utils, which is not shown here — confirm there.
su.get_marketplace_data(271324)

[6, 33.99]

In [36]:
# Module-level listing URL (page 5 of the browse view). The scraping functions
# defined in this notebook each set their own local `base_url`.
base_url = 'https://boardgamegeek.com/browse/boardgame/page/5'

# Names of the numeric fields that get_game_info_html looks up for every game;
# each one becomes a key in the per-game dict.
fields = [
    'usersrated',
    'average',
    'baverage',
    'avgweight',
    'numweights',
    'numgeeklists',
    'numtrading',
    'numwanting',
    'numwish',
    'numowned',
    'numplays',
    'numplays_month',
    'numfans',
    'yearpublished',
    'minplayers',
    'maxplayers',
    'minplaytime',
    'maxplaytime',
    'minage',
]

In [71]:
def get_field_value_html(gi_string, field_name):
    """
    Extract the numeric value stored under ``field_name`` in the page's embedded data string.

    Parameters
    ----------
    gi_string : str
        Text of the <script> tag containing all the game information.
    field_name : str
        Key to look up ('minplayers', 'baverage', etc). It is matched literally.

    Returns
    -------
    float
        The number found after ``"field_name":``. Returns -1 when the field is not
        present in ``gi_string`` or when the extracted characters do not form a
        valid number. Only digits and dots are kept, so any sign is discarded.
    """
    # Escape the key so regex metacharacters in it cannot change the pattern, and
    # capture only the text after the colon so characters of the key itself can
    # never leak into the number.
    # The lazy `.+?` skips any opening quote; the match stops at the first digit
    # that is followed by something other than a digit or a dot, i.e. the last
    # digit of the value.
    pattern = '"' + re.escape(field_name) + r'":(.+?[0-9][^0-9.])'
    match = re.search(pattern, gi_string)
    if match is None:
        # Field absent: use the same sentinel as for an unparsable value.
        return -1

    # Drop quotes and the trailing delimiter, keeping digits and dots only.
    digits = re.sub(r"[^\d.]", "", match.group(1))
    try:
        return float(digits)
    except ValueError:
        return -1

def get_game_types(gi_string, game_id):
    """
    Collect the game's type names from the <script> tag text.

    Every "veryshortprettyname" entry found in ``gi_string`` is read, stripped of
    surrounding whitespace and lower-cased; entries equal to 'overall' are left
    out. The names are returned as a list in the order they appear.

    ``game_id`` is accepted but not used by this function.
    """
    matches = re.findall('\"veryshortprettyname\":\".+?\"', gi_string)
    # Splitting a match on double quotes leaves the value second from the end.
    names = (m.split('\"')[-2].strip().lower() for m in matches)
    return [name for name in names if name != 'overall']

def get_game_categories(gi_string, game_id):
    """
    Read the numerical category codes from the <script> tag text.

    Parameters
    ----------
    gi_string : str
        Text of the <script> tag containing all the game information.
    game_id
        Id of the game; only used in the message printed on failure.

    Returns
    -------
    list of int
        The codes listed in the first ``"propertyid":[...]`` entry. Returns an
        empty list, after printing a notice, when no such entry exists or when
        any element is not an integer.
    """
    match = re.search(r'"propertyid":\[(.+?)\]', gi_string)
    try:
        if match is None:
            raise ValueError('no propertyid list found')
        # The codes are quoted inside the brackets; drop the quotes before splitting.
        raw_codes = match.group(1).replace('"', '').strip().split(',')
        return [int(code) for code in raw_codes]
    except ValueError:
        # Only the expected "missing or malformed list" failures are handled here;
        # anything else (e.g. KeyboardInterrupt) propagates.
        print('Couldnt get categories for id ',game_id)
        return []

In [72]:
def get_game_info_html(game_id, game_dict):
    """
    Scrape one game's page and store the extracted values in ``game_dict``.

    Parameters
    ----------
    game_id : str or int
        Id appended to 'https://boardgamegeek.com/boardgame/' to build the page URL.
    game_dict : dict
        Updated in place: one key per name in the module-level ``fields`` list,
        plus 'categories' and 'types'.

    Returns
    -------
    None
        If the page cannot be fetched (network error, timeout, or a status code
        above 299), a notice is printed and ``game_dict`` is left unchanged.
    """
    base_url = 'https://boardgamegeek.com/boardgame/'
    # str() lets callers pass the id as an int or as the string taken from a link.
    url = base_url + str(game_id)
    try:
        # Without a timeout a stalled connection would block the scrape indefinitely.
        response = requests.get(url, timeout=30)
    except requests.exceptions.RequestException:
        print("Couldnt get game page for id ", game_id)
        return
    # Same status threshold get_bgg_info applies to listing pages.
    if response.status_code > 299:
        print("Couldnt get game page for id ", game_id)
        return

    soup = BeautifulSoup(response.text, 'html5lib')
    # NOTE(review): assumes the second <script> tag on the page holds the game
    # data and is non-empty — confirm against the current page layout.
    script = soup.find_all('script')[1]
    game_info = script.contents[0]

    # Get numerical/continuous field values
    for field in fields:
        game_dict[field] = get_field_value_html(game_info, field)

    # Get game categories
    game_dict['categories'] = get_game_categories(game_info, game_id)

    # Get game types, mostly redundant with categories
    game_dict['types'] = get_game_types(game_info, game_id)



In [73]:
def get_bgg_info(first,last):
    """
    Outermost scraping function: walk the listing pages and build one row per game.

    For every listing page from ``first`` to ``last`` (inclusive), each game row's
    name and id are read, then get_game_info_html adds that game's details. The
    per-game dicts are collected into a single DataFrame.

    Parameters
    ----------
    first : int
        Number of the first listing page to scrape.
    last : int
        Number of the last listing page to scrape (inclusive).

    Returns
    -------
    pandas.DataFrame
        One row per game with 'Name', 'id' and whatever keys get_game_info_html
        adds. A listing page that cannot be fetched (network error, timeout, or
        status code above 299) is reported with a printed notice and skipped.
    """

    # List of dicts that will be turned into a dataframe
    game_dict_list = []

    # URL for pages with games in order of popularity
    base_url = 'https://boardgamegeek.com/browse/boardgame/page/'

    for i in range(first, last+1):
        url = base_url + str(i)
        try:
            # Without a timeout a stalled connection would block the scrape indefinitely.
            response = requests.get(url, timeout=30)
        except requests.exceptions.RequestException:
            # Treat transport failures like HTTP failures: report and move on.
            response = None
        if response is None or response.status_code > 299:
            print("Couldnt get page ", i)
            continue
        soup = BeautifulSoup(response.text, 'html5lib')

        # Each game on the listing page is a table row with id 'row_'.
        games = soup.find_all('tr', id='row_')
        for game in games:
            game_dict = {}
            main_a = game.find('a', class_='primary')
            # The id is the third '/'-separated piece of the link's href.
            game_id = main_a['href'].split('/')[2]
            game_dict['Name'] = main_a.text
            game_dict['id']   = game_id
            get_game_info_html(game_id, game_dict)
            game_dict_list.append(game_dict)
    df = pd.DataFrame(game_dict_list)
    return df

In [None]:
# Scrape listing pages 1 through 10 (inclusive) into df1.
# NOTE(review): this cell has no execution count, and the batch loop in the
# next cell starts at page 21 — confirm how pages 11-20 and df1 are handled.
df1=get_bgg_info(1,10)

In [74]:
# Scrape the listing in batches of 10 pages and write each batch to its own CSV,
# so already-scraped batches survive a failure later in the run.
partial_dir = 'data/partial_data'
# Create the output folder up front; otherwise a missing directory would only
# surface at to_csv, after a whole batch had already been scraped.
os.makedirs(partial_dir, exist_ok=True)

df_list = []
# i = 2..9 covers listing pages 21 through 100.
for i in range(2,10):
    print('On loop',i)
    start = i*10 + 1
    end = (i+1)*10
    df = get_bgg_info(start,end)
    fname = f'{partial_dir}/df{i}.csv'
    df.to_csv(fname, index=False)
    df_list.append(df)
    

On loop 2
Couldnt get categories for id  18291
On loop 3
Couldnt get categories for id  23953
On loop 4
Couldnt get categories for id  295944
Couldnt get categories for id  5985
Couldnt get categories for id  317231
On loop 5
On loop 6
Couldnt get categories for id  214747
Couldnt get categories for id  255332
Couldnt get categories for id  206928
On loop 7
Couldnt get categories for id  284082
On loop 8
Couldnt get categories for id  202477
Couldnt get categories for id  268846
Couldnt get categories for id  307656
On loop 9
Couldnt get categories for id  206925


In [75]:
# Number of batch DataFrames collected in df_list (one per loop iteration).
len(df_list)

8