In [1]:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup, NavigableString
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# MongoDB connection, database handle and the three collection handles
client = MongoClient(host='192.168.0.209', port=27017)
db = client['capstone_1']
cards_coll, decks_coll, events_coll = (
    db[coll_name] for coll_name in ('cards', 'deck_lists', 'events')
)

# URL templates; the `{}` placeholder is filled in with the path to request
scryfall_api_url = 'https://api.scryfall.com/{}'  # API docs: https://scryfall.com/docs/api
mtgtop8_url = 'https://www.mtgtop8.com/{}'

# Format name -> code that mtgtop8.com expects in its `f` query parameter
mt8_format_keys = dict(
    vintage='VI',
    legacy='LE',
    modern='MO',
    pioneer='PI',
    historic='HI',
    standard='ST',
    commander='EDH',
    limited='LI',
    pauper='PAU',
    peasant='PEA',
    block='BL',
    extended='EX',
    highlander='HIGH',
    canadian_highlander='CHL',
)

def query(link, payload=None, timeout=30):
    """Send a GET request and return the response, warning on a non-200 status.

    Args:
        link: URL to request.
        payload: Optional mapping of query-string parameters. ``None`` sends
            no parameters.
        timeout: Seconds to wait for the server before ``requests`` raises a
            timeout error. Pass ``None`` to wait indefinitely.

    Returns:
        The response object. It is returned even when the status code is not
        200, so callers can inspect it themselves.
    """
    # `payload` defaults to None rather than `{}`: a dict default is created
    # once and shared by every call.
    response = requests.get(link, params=payload, timeout=timeout)
    if response.status_code != 200:
        print('WARNING', response.status_code)
        print(response.content)
    return response

def get_card(name_str):
    """Look up a single card on the Scryfall API by (fuzzy) name.

    Args:
        name_str: Card name. Leading/trailing whitespace is dropped and runs
            of whitespace are collapsed to single spaces.

    Returns:
        The decoded JSON body of the ``cards/named`` response.
    """
    # Send plain spaces and let `requests` URL-encode them. Joining the words
    # with '+' here would get percent-encoded again (as %2B), so the API
    # would receive literal plus signs inside the card name.
    payload = {'fuzzy': ' '.join(name_str.split())}
    response = query(scryfall_api_url.format('cards/named'), payload)
    return response.json()

def hot_soup(url, payload=None):
    """Fetch a page and parse it into a BeautifulSoup tree.

    Args:
        url: Address of the page to fetch.
        payload: Optional mapping of query-string parameters, handed to
            ``query`` unchanged.

    Returns:
        A ``BeautifulSoup`` object built from the response body with the
        'html.parser' backend.
    """
    # `payload` defaults to None rather than `{}` so no dict is shared
    # between calls.
    response = query(url, payload)
    return BeautifulSoup(response.content, 'html.parser')

def gather_archtypes(url):
    """Collect the archetypes listed under each strategy on a meta page.

    Args:
        url: Address of the page to scrape (fetched with ``hot_soup``).

    Returns:
        Dict with the keys 'aggro', 'control' and 'combo', each holding a
        list of ``(archetype_name, num_decks)`` tuples in page order.
    """
    soup = hot_soup(url)
    archtypes = {strat: [] for strat in ['aggro', 'control', 'combo']}
    # In this table, only the strategy header cells use 'rowspan'
    for strat in soup.find_all(class_='Stable')[0].find_all(rowspan=True):
        strat_str = strat.contents[0].lower()  # corresponding key in `archtypes`
        # The loop below collects rowspan - 1 archetype rows for this strategy
        expected = int(strat['rowspan']) - 1
        item = strat.parent

        # Walk the rows that follow the header until enough have been found
        while len(archtypes[strat_str]) < expected:
            item = item.next_sibling
            if item is None:
                # The table ended before 'rowspan' rows were seen. Keep what
                # was collected instead of failing on `None.a`.
                print('WARNING: expected', expected, strat_str,
                      'archetypes but found', len(archtypes[strat_str]))
                break
            if isinstance(item, NavigableString):
                continue  # bare text node between elements, not a row
            if item.a:  # a sibling with a link is an archetype row
                text = item.a.text
                num_decks = int(item.contents[3].text)  # fourth child holds the deck count
                archtypes[strat_str].append((text, num_decks))

    return archtypes

def scrape_events(meta_url, meta_name=None):
    """Scrape every page of the events table on a meta page.

    This lives in its own function because it is the only step in the
    scraping process that requires Selenium: further pages of the table are
    reached by clicking a button in a (headless Firefox) browser.

    Args:
        meta_url: URL of the meta page to open.
        meta_name: Label stored under each event's 'meta' key. When omitted,
            the text of the notebook-level ``selected_meta`` element is used,
            so that global must already be defined.

    Returns:
        Dict mapping event id (str) to a dict with the keys 'id', 'name',
        'date', 'level', 'link' and 'meta'.
    """
    if meta_name is None:
        # Resolve the global before a browser is launched, so a missing
        # `selected_meta` fails immediately and leaves nothing to clean up.
        meta_name = selected_meta.text

    events_agg = {}
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    def get_next(driver, class_name):
        """Return the element with `class_name`, or False when lookup fails."""
        try:
            # 'class name' is the locator strategy string behind
            # By.CLASS_NAME. `find_element(by, value)` is used because the
            # `find_element_by_*` helpers were removed in Selenium 4.3.
            return driver.find_element('class name', class_name)
        except Exception:
            # Best effort: any lookup failure is treated as "no next page".
            return False

    try:
        driver.get(meta_url)
        sleep(1)  # pause after loading before reading the page source

        while True:
            # Look up the button before parsing; it is clicked after this
            # page's rows have been collected.
            next_btn = get_next(driver, 'Nav_PN')
            soup = BeautifulSoup(driver.page_source, 'html.parser')  # make some soup

            # Iterate through the event table rows (10 events list table),
            # pulling out an ID number, the star rating and the date.
            for event in soup.find_all(class_='Stable')[2].find_all(class_='hover_tr'):
                link = event.a  # associated hyperlink
                eid = re.search(r"e=(\d+)&", link['href']).group(1)  # unique id number
                stars = event.find(class_='O16').find_all('img')  # star rating / level
                # A 'bigstar' image means level 4; otherwise the level is the
                # number of star images, which is 0 when there are none.
                if stars and 'bigstar' in stars[0]['src']:
                    level = 4
                else:
                    level = len(stars)
                events_agg[eid] = {
                    'id': eid,
                    'name': link.text,
                    'date': event.find(class_='S10').text,
                    'level': level,
                    'link': mtgtop8_url.format(link['href']),
                    'meta': meta_name
                }

            if not next_btn:
                break
            next_btn.click()
            sleep(1)
    finally:
        # quit() ends the whole browser session, and runs even when scraping
        # raises, so no headless Firefox is left behind.
        driver.quit()

    return events_agg


I originally intended to scrape the official WotC site, magic.wizards.com, for tournament results, but unfortunately they make little available beyond very recent events. After some research, I decided to scrape tournament data from mtgtop8.com instead, which offers *years'* worth of historical data. Some individual card data could be scraped there as well, but scryfall.com offers much more robust card data via a convenient API.

An interesting note on data gathering: https://www.dailyesports.gg/wizards-of-the-coast-mtg-frank-karsten-stop-publishing-gp-results-win-rates/

In [2]:
# Which format and which meta to work with
form = 'standard'
meta = 'History - All Worlds'

# Load the format page and locate its <select name="meta"> drop-down
soup = hot_soup(mtgtop8_url.format('format'), {'f': mt8_format_keys[form]})
meta_select = soup.find('select', {'name': 'meta'})
selected_meta = meta_select.find('option', selected=True)  # the option currently selected

# Map each option's display text to its full URL
metas = dict(
    (opt.text, mtgtop8_url.format(opt['value']))
    for opt in meta_select.find_all('option')
)

# This meta URL will be fed into gather_archtypes() and scrape_events()
chosen_meta = metas[meta]

In [14]:
# Fetch a single event page and parse it with hot_soup().
# NOTE: this rebinds the notebook-level name `soup`, which the format-page
# cell also assigns.
event_url = 'https://www.mtgtop8.com/event?e=29323&f=ST'
soup = hot_soup(event_url)

<a class="player" href="search?player=Eliott_dragon">Eliott_dragon</a>

In [19]:
# The first 'Stable' table on the event page; its text includes the player count
top_table = soup.find_all(class_='Stable')[0]
num_players = int(re.search(r"(\d+) players -", top_table.text).group(1))

# Keep only the element siblings after the table that contain a link;
# bare text nodes and link-less elements are skipped.
deck_rows = [
    node for node in top_table.next_siblings
    if not isinstance(node, NavigableString) and node.a
]

decks_placed = []
for row in deck_rows:
    # Dropping the first and last newline-split pieces must leave exactly
    # three lines of text: placement, deck title, pilot.
    placement, title, pilot = row.text.split('\n')[1:-1]
    decks_placed.append({
        'title': title,
        'pilot': pilot,
        'placement': placement,
        'link': mtgtop8_url.format(row.a['href']),
    })
decks_placed

[{'title': 'Dimir Control',
  'pilot': 'Eliott_dragon',
  'placement': '1',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430627&f=ST'},
 {'title': 'Izzet Control',
  'pilot': 'Tyler Anderson',
  'placement': '2',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430628&f=ST'},
 {'title': 'Boros Cycling Aggro',
  'pilot': 'Nicholas Demichele',
  'placement': '3-4',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430630&f=ST'},
 {'title': 'Red Deck Wins',
  'pilot': 'Drew Christensen',
  'placement': '3-4',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430629&f=ST'},
 {'title': 'Gruul Aggro',
  'pilot': 'Ptarts2win',
  'placement': '5-8',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430633&f=ST'},
 {'title': 'Gruul Aggro',
  'pilot': 'Maher Samuel',
  'placement': '5-8',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430634&f=ST'},
 {'title': 'Red Deck Wins',
  'pilot': 'Barry Riddell',
  'placement': '5-8',
  'link': 'https://www.mtgtop8.com/?e=29323&d=430632&f=ST'},
 {'title': 'Sultai Yorio