# Scraping XML from BoardGameGeek (BGG) because the age of the existing data is bothering me

In [None]:
import time
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, parse_qs
import pandas as pd
from random import randint
import os
import csv
import pickle
from tqdm import tqdm

In [None]:
# Module-level accumulators shared by the cells below.
games = []          # one dict per successfully scraped game
categories = set()  # every category name encountered by get_game_data
mechanics = set()   # every mechanic name encountered by get_game_data

# Functions

In [None]:
def get_game_data(game_id, retry_limit=3):
    """Fetch one game's details and statistics from the BGG XML API2.

    Args:
        game_id: BoardGameGeek id of the game to look up.
        retry_limit: Number of additional attempts (one second apart) made
            when a response contains no ``<item>`` element.

    Returns:
        A dict with the keys ``rank``, ``bgg_url``, ``game_id``, ``name``,
        ``min_players``, ``max_players``, ``avg_time``, ``min_time``,
        ``max_time``, ``weight``, ``year``, ``age``, ``avg_rating``,
        ``geek_rating``, ``num_votes``, ``owned``, ``designer``,
        ``categories`` and ``mechanics``; or ``None`` when no attempt
        produced an ``<item>``.  ``rank`` is always ``None`` here (the API
        response is not used for it).  A scalar field whose element is
        missing from the response is ``None`` instead of raising.

    Side effects:
        Adds the game's categories and mechanics to the module-level
        ``categories`` and ``mechanics`` sets, and prints a diagnostic line
        for every response that lacks an ``<item>``.
    """
    url = f'https://www.boardgamegeek.com/xmlapi2/thing?id={game_id}&stats=1'

    # One initial attempt plus ``retry_limit`` retries, done iteratively so
    # the retry count cannot grow the call stack.
    item = None
    for attempts_left in range(max(retry_limit, 0), -1, -1):
        # The timeout keeps a stalled connection from hanging the whole run.
        response = requests.get(url, timeout=30)
        soup = BeautifulSoup(response.content, 'xml')
        item = soup.find('item')
        if item is not None:
            break
        print(f'Unexpected response for game ID {game_id}: HTTP {response.status_code}, content: {response.content}')
        if attempts_left > 0:
            time.sleep(1)

    if item is None:
        return None

    def value_of(tag, **attrs):
        """Return the ``value`` attribute of the first matching tag, or None."""
        node = item.find(tag, **attrs)
        return node.get('value') if node is not None else None

    def linked_values(link_type):
        """Return the ``value`` of every ``<link>`` of the given type."""
        return [link.get('value') for link in item.find_all('link', type=link_type)]

    game = {
        'rank': None,  # This information is not available in XML API, placeholder for future usage.
        'bgg_url': f'https://boardgamegeek.com/boardgame/{game_id}',
        'game_id': game_id,
        # Prefer the name flagged as primary; fall back to the first <name>.
        'name': value_of('name', type='primary') or value_of('name'),
        'min_players': value_of('minplayers'),
        'max_players': value_of('maxplayers'),
        'avg_time': value_of('playingtime'),
        'min_time': value_of('minplaytime'),
        'max_time': value_of('maxplaytime'),
        'weight': value_of('averageweight'),
        'year': value_of('yearpublished'),
        'age': value_of('minage'),
        'avg_rating': value_of('average'),
        'geek_rating': value_of('bayesaverage'),
        'num_votes': value_of('usersrated'),
        'owned': value_of('owned'),
        'designer': ', '.join(linked_values('boardgamedesigner')),
        'categories': linked_values('boardgamecategory'),
        'mechanics': linked_values('boardgamemechanic'),
    }

    # Keep the notebook-wide vocabularies up to date.
    categories.update(game['categories'])
    mechanics.update(game['mechanics'])

    return game
    

In [None]:
def get_id_from_page(game_page_url):
    """Return the ``id`` query parameter of the URL a page request ends at.

    Args:
        game_page_url: URL to request.

    Returns:
        The first value of the ``id`` query parameter of ``response.url`` as
        a string, or ``None`` when the response status is not 200 or the URL
        carries no ``id`` parameter.
    """
    # The timeout keeps a stalled connection from hanging the caller.
    response = requests.get(game_page_url, timeout=30)
    if response.status_code != 200:
        return None

    query = parse_qs(urlparse(response.url).query)
    id_values = query.get('id')
    return id_values[0] if id_values else None

In [None]:
def get_top_game_ids(page_num):
    """Return the game ids listed on one page of BGG's board game browse list.

    Args:
        page_num: Page number of ``/browse/boardgame/page/<page_num>``.

    Returns:
        A list of id strings, one per thumbnail link on the page, in page
        order.  The list is empty when the page contains no matching links.
    """
    # The timeout keeps a stalled connection from hanging the caller.
    response = requests.get(
        f'https://boardgamegeek.com/browse/boardgame/page/{page_num}',
        timeout=30,
    )
    soup = BeautifulSoup(response.content, 'html.parser')

    # Every thumbnail href starts with "/boardgame/", so splitting on "/"
    # yields ['', 'boardgame', '<id>', ...] and the id sits at index 2.
    # The id is read straight from the href for every page number; no
    # per-game request is needed.
    links = soup.select('td.collection_thumbnail a[href^="/boardgame/"]')
    return [link['href'].split('/')[2] for link in links]

In [None]:
# def pickle_here(games_list, rank): # for ranks
#     with open(f'data/pickle/games_{rank}.pickle', 'wb') as f:
#         pickle.dump(games_list, f)
def pickle_here(seq_list, idx): # for sequels
    """Pickle ``seq_list`` into ``data/seq_list_<idx>.pkl``.

    Args:
        seq_list: Object to serialise.
        idx: Value interpolated into the file name.
    """
    target_path = f'data/seq_list_{idx}.pkl'
    with open(target_path, 'wb') as sink:
        pickle.Pickler(sink).dump(seq_list)

In [None]:
def get_links_from_page(page_num):
    """Collect the ``href`` of every anchor on one BGG browse page.

    Args:
        page_num: Page number of ``/browse/boardgame/page/<page_num>``.

    Returns:
        A list with the ``href`` attribute of each ``<a href=...>`` element,
        in document order.
    """
    page = requests.get(f'https://boardgamegeek.com/browse/boardgame/page/{page_num}')
    parsed = BeautifulSoup(page.content, 'html.parser')

    # Only anchors that actually carry an href are selected.
    hrefs = []
    for anchor in parsed.select('a[href]'):
        hrefs.append(anchor['href'])
    return hrefs

## Implementation

## Creating the CSV and writing the header

In [None]:
# Mode 'w' truncates the file, so this cell leaves it holding only the header
# row.
with open('data/basic_data_2023.csv', 'w', newline='', encoding='utf-8') as csvfile:

    # Column order of the CSV; `fields` stays defined at module level after
    # this cell.
    fields = [
        'rank', 'bgg_url', 'game_id', 'name',
        'min_players', 'max_players',
        'avg_time', 'min_time', 'max_time',
        'weight', 'year', 'age',
        'avg_rating', 'geek_rating', 'num_votes', 'owned',
        'designer', 'categories', 'mechanics',
    ]

    writer = csv.DictWriter(csvfile, fieldnames=fields)
    writer.writeheader()

### Scraping website

In [None]:
# get top game ids
top_game_ids = []

# Walk browse pages 1 through 21, collecting the ids found on each.
with tqdm(total=21) as pbar:
    for page_num in range(1, 22):
        try:
            top_game_ids.extend(get_top_game_ids(page_num))
        except Exception as e:
            # Best effort: report the page and carry on with the next one.
            tqdm.write(f'Error {e} occurred on page number {page_num}')

        # Pause and advance the bar for failed pages too, so a failing
        # server is not hit back-to-back and the bar still reaches 21/21.
        time.sleep(1)
        pbar.update()

In [None]:
# Spot-check a single browse page (22); the bare expression on the last line
# displays the list when this runs as a notebook cell.
top_games_pg = get_top_game_ids(22)
top_games_pg

In [None]:
games = []

#open csv and scrape website
# Mode 'a' appends below whatever the file already holds; `fields` must
# already be defined by the header-writing cell.
with open('data/basic_data_2023.csv', 'a', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fields)
    with tqdm(total=len(top_game_ids)) as pbar:
        # The position in top_game_ids (1-based) is recorded as the rank.
        for rank, game_id in enumerate(top_game_ids, start=1):
            try:
                game = get_game_data(game_id)
                if game is None:
                    pbar.set_description(f'No data discovered for game ID: {game_id}')
                else:
                    game['rank'] = rank
                    games.append(game)
                    pbar.set_description(f'Successfully fetched game ID: {game_id}, Rank: {rank}')
                    writer.writerow(game)

                # Checkpoint every 500 ranks, even when this particular game
                # returned no data.
                # NOTE(review): pickle_here writes data/seq_list_<rank>.pkl,
                # a file name meant for the sequel data - confirm that is
                # the intended checkpoint location.
                if rank % 500==0:
                    pickle_here(games, rank)
            except Exception as e:
                # Best effort: report the game and continue with the next.
                tqdm.write(f'Error {e} occurred fetching game ID: {game_id}')

            # Advance the bar for failures as well so it ends at 100%.
            pbar.update()
            # Random 1-3 s pause between games.
            time.sleep(randint(1,3))

## For subdomains:

In [None]:
# Load the rank/id tables for both snapshots; the scraping cell below reads
# the 'game_id' column of each frame in df_list and appends one result frame
# per input to sub_df_list (same order: 2017 first, then 2023).
df_17 = pd.read_csv('data/2017/rank_id.csv')
df_23 = pd.read_csv('data/2023/rank_id.csv')
df_list = [df_17, df_23]
sub_df_list = []

In [None]:

# Marker string searched for in each game page -> output column it switches on.
# NOTE(review): presumably these are BGG's ids for the subdomain families -
# confirm against the site.
object_ids = {
    '4666': 'Abstract Games',
    '4665': 'Children\'s Games',
    '4667': 'Customizable Games',
    '5499': 'Family Games',
    '5498': 'Party Games',
    '5497': 'Strategy Games',
    '5496': 'Thematic Games',
    '4664': 'Wargames'
}

for df in df_list:
    game_ids = df['game_id']

    records = []

    # scrape
    for game_id in tqdm(game_ids):
        url = f"https://boardgamegeek.com/boardgame/{game_id}"
        for attempt in range(5):
            try:
                # The timeout turns a stalled connection into a retryable error.
                response = requests.get(url, timeout=30)

                if response.status_code != 200:
                    raise Exception(f"Failed to fetch XML: status code {response.status_code}")

                soup = BeautifulSoup(response.content, 'html.parser')

                raw_html = str(soup)

                record = {'game_id': game_id}

                # One 0/1 column per subdomain.
                # NOTE(review): this is a plain substring test on the whole
                # page, so any unrelated occurrence of the same digits also
                # sets the flag - verify this is precise enough.
                for object_id, column_name in object_ids.items():
                    record[column_name] = 1 if object_id in raw_html else 0

                records.append(record)

            except Exception as e:
                time.sleep(1)
                print(f"Attempt {attempt+1}: {e}")

            else:
                # Sleep to avoid rate limiting.  This has to happen before the
                # break, otherwise successful requests run back-to-back.
                time.sleep(1)
                # Successful, so leave the retry loop.
                break

            # Extra back-off before retrying a failed attempt.
            time.sleep(1)

        else:
            # Reached only when all 5 attempts failed (no break).
            print(f"Failed to fetch XML for game_id {game_id} after 5 attempts.")

    # Create a DataFrame from the list of records
    df_new = pd.DataFrame(records)
    sub_df_list.append(df_new)


# df_list is [2017, 2023], so the results come back in the same order.
sub_df_17 = sub_df_list[0]
sub_df_23 = sub_df_list[1]

sub_df_17.to_csv('data/2017/subdomains_2017.csv', index=False)
sub_df_23.to_csv('data/2023/subdomains_2023.csv', index=False)

## For expansions/sequels

In [None]:
# Reload the rank/id tables for both snapshots; the sequel-scraping cell below
# reads the 'game_id' column of each frame in df_list and appends one result
# frame per input to seq_list (same order: 2017 first, then 2023).
df_17 = pd.read_csv('data/2017/rank_id.csv')
df_23 = pd.read_csv('data/2023/rank_id.csv')
df_list = [df_17, df_23]
seq_list = []

In [None]:
import warnings
warnings.filterwarnings("ignore", category=FutureWarning, module="pandas")

# Column order of the sequel tables written at the end of this cell.
seq_columns = ['game_id', 'parent_id', 'year', 'iteration', 'reimplementation', 'expansion', 'compilation']

for df in df_list:
    game_ids = df['game_id']
    # Rows are collected in a list and turned into a DataFrame once per
    # snapshot, instead of concatenating a one-row frame per game.
    seq_rows = []
    for game_id in tqdm(game_ids):
        url = f'https://www.boardgamegeek.com/xmlapi2/thing?id={game_id}'
        parent_id = None
        for attempt in range(5):
            time.sleep(.5)
            try:
                # The timeout turns a stalled connection into a retryable error.
                response = requests.get(url, timeout=30)
                if response.status_code != 200:
                    raise Exception(f"Failed to fetch XML: status code {response.status_code}")
                soup = BeautifulSoup(response.content, 'lxml-xml')

                # Extract data
                item = soup.find('item', {'id': str(game_id)})
                year = item.find('yearpublished')['value']

                # Only links marked inbound="true" are considered.
                # NOTE(review): presumably inbound links point at the game
                # this one derives from - confirm against the API docs.
                reimplementations = item.find_all('link', {'type': 'boardgameimplementation', 'inbound': 'true'})
                expansion_link = item.find('link', {'type': 'boardgameexpansion', 'inbound': 'true'})
                compilation_link = item.find('link', {'type': 'boardgamecompilation', 'inbound': 'true'})

                reimplementation = 1 if reimplementations else 0
                iteration = len(reimplementations) + 1
                expansion = 1 if expansion_link is not None else 0
                compilation = 1 if compilation_link is not None else 0

                # The parent comes from the same inbound link that set the
                # flag, with reimplementation taking precedence, then
                # expansion, then compilation.
                if reimplementations:
                    parent_id = min(int(ri['id']) for ri in reimplementations)
                elif expansion_link is not None:
                    parent_id = int(expansion_link['id'])
                elif compilation_link is not None:
                    parent_id = int(compilation_link['id'])

                # Only games that have a parent are recorded.
                if parent_id is not None:
                    seq_rows.append({
                        'game_id': game_id,
                        'parent_id': parent_id,
                        'year': year,
                        'iteration': iteration,
                        'reimplementation': reimplementation,
                        'expansion': expansion,
                        'compilation': compilation,
                    })

                # The fetch succeeded, so stop retrying whether or not a
                # parent was found.
                break

            except Exception as e:
                time.sleep(2)
                print(f"Attempt {attempt+1}: {e}")
        else:
            # Reached only when all 5 attempts raised (no break).
            print(f"Failed to fetch data for game_id {game_id} after 5 attempts.")

    seq_df = pd.DataFrame(seq_rows, columns=seq_columns)
    seq_list.append(seq_df)

# df_list is [2017, 2023], so the results come back in the same order.
seq_df_17 = seq_list[0]
seq_df_23 = seq_list[1]

seq_df_17.to_csv('data/2017/sequels_2017.csv', index=False)
seq_df_23.to_csv('data/2023/sequels_2023.csv', index=False)