In [1]:
# Imports
from splinter import Browser
from bs4 import BeautifulSoup as soup
import pandas as pd

In [2]:
# Start the splinter browser session used for every page visit in this notebook
BROWSER_DRIVER = "chrome"
browser = Browser(BROWSER_DRIVER)

In [3]:
# Walk ranking pages 1 through 10 and collect one record per table row
table_list = []
for page_number in range(1, 11):
    # Load the page in the browser and parse the html it rendered
    browser.visit(f"https://boardgamegeek.com/browse/boardgame/page/{page_number}")
    page = soup(browser.html, 'html.parser')

    # Rows are selected as <tr> elements whose id is 'row_'
    for row in page.find_all('tr', id='row_'):
        # The first anchor's `name` attribute is stored as the rank;
        # the anchor with class 'primary' supplies both the title text and the link
        rank_anchor = row.find('a')
        primary_anchor = row.find('a', class_='primary')

        table_list.append({
            'rank': rank_anchor['name'],
            'title': primary_anchor.text,
            'link': primary_anchor['href'],
        })

In [4]:
# Close browser
# The scraping loop has already copied everything it needs into table_list,
# so the browser session can be shut down before the pandas work starts.
browser.quit()

In [5]:
# Build a frame with one row per scraped record
df = pd.DataFrame(table_list)

# Pull the numeric path segment out of each link and store it as the game ID#
game_ids = df['link'].str.extract(r"/(\d+)/", expand=False)
df['game id'] = game_ids

# Keep only the columns needed downstream (the raw link is left behind in df)
new_df = df.loc[:, ['rank', 'title', 'game id']]
new_df.head()

Unnamed: 0,rank,title,game id
0,1,Brass: Birmingham,224517
1,2,Pandemic Legacy: Season 1,161936
2,3,Ark Nova,342942
3,4,Gloomhaven,174430
4,5,Twilight Imperium: Fourth Edition,233078


In [None]:
# Make sure 1000 rows came through and check for duplicates
# Reading the summary: `count` is the number of non-null values in each column and
# `unique` is the number of distinct values, so a `unique` below `count` means
# that column contains repeated values.
new_df.describe()

Unnamed: 0,rank,title,game id
count,1000,1000,1000
unique,1000,995,999
top,1000,Love Letter,351913
freq,1,2,2


In [12]:
# Show every row whose game id appears more than once (all occurrences kept)
id_is_repeated = new_df['game id'].duplicated(keep=False)
non_unique_ids = new_df.loc[id_is_repeated]
print(non_unique_ids)

# Show every row whose title appears more than once (all occurrences kept)
title_is_repeated = new_df['title'].duplicated(keep=False)
non_unique_titles = new_df.loc[title_is_repeated]
print(non_unique_titles)

    rank    title game id
299  300  Tiletum  351913
300  301  Tiletum  351913
    rank                                 title game id
202  203                                  Dune  283355
299  300                               Tiletum  351913
300  301                               Tiletum  351913
329  330                           Love Letter  277085
351  352                           Love Letter  129622
517  518  Lord of the Rings: The Confrontation   18833
527  528                                  Dune     121
539  540                              Citadels  205398
557  558                              Citadels     478
756  757  Lord of the Rings: The Confrontation    3201


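Repeated titles are mostly re-implementations or variant editions that share a name but have different game IDs, so they are kept. Only one entry is a true duplicate: Tiletum (game id 351913) appears twice, at ranks 300 and 301.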

In [14]:
# Remove duplicate id
# Drop repeated game ids by value instead of by a hard-coded index label: the
# position of a duplicate depends on the state of the site when it was scraped,
# so a fixed label could remove a valid game or raise KeyError on a re-run.
# keep='first' retains the earliest-listed occurrence and leaves the original
# index labels of the surviving rows unchanged.
final_df = new_df.drop_duplicates(subset='game id', keep='first')
final_df.describe()

Unnamed: 0,rank,title,game id
count,999,999,999
unique,999,995,999
top,1000,Dune,177590
freq,1,2,1


In [15]:
# Write the de-duplicated table to disk as csv text, leaving the index out
output_path = 'final_scrape_dataframe'
final_df.to_csv(output_path, index=False)