In [4]:
import re
import requests
from time import sleep
from bs4 import BeautifulSoup, NavigableString
from pymongo import MongoClient
from selenium import webdriver
from selenium.webdriver.firefox.options import Options

# Connect to MongoDB (host address and port are hardcoded here)
client = MongoClient('192.168.0.209', 27017)
db = client['capstone_1']  # database used by this project
# Handles to the individual collections.
# NOTE(review): none of these are written to in the cells visible here —
# presumably later cells store the scraped data in them; confirm.
cards_coll = db['cards']
decks_coll = db['deck_lists']
events_coll = db['events']

# URL templates: the '{}' placeholder is filled in with the resource path
scryfall_api_url = 'https://api.scryfall.com/{}'  # API docs: https://scryfall.com/docs/api
mtgtop8_url = 'https://www.mtgtop8.com/{}'

# Lookup table from a readable format name to the short code mtgtop8.com
# expects when a request is built
mt8_format_keys = dict(
    vintage='VI',
    legacy='LE',
    modern='MO',
    pioneer='PI',
    historic='HI',
    standard='ST',
    commander='EDH',
    limited='LI',
    pauper='PAU',
    peasant='PEA',
    block='BL',
    extended='EX',
    highlander='HIGH',
    canadian_highlander='CHL',
)

def query(link, payload=None, timeout=30):
    """
    A requests wrapper function: GET ``link`` and warn on a non-200 status.

    Parameters
    ----------
    link : str
        URL to request.
    payload : dict, optional
        Query-string parameters, passed to ``requests.get`` as ``params``.
        Defaults to no parameters. (``None`` is used as the default instead
        of a shared mutable ``{}``.)
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` waits indefinitely, which can hang the notebook.

    Returns
    -------
    requests.Response
        The response object, returned whether or not the status was 200.
    """
    params = {} if payload is None else payload
    response = requests.get(link, params=params, timeout=timeout)
    if response.status_code != 200:
        # Report the problem but still hand the response back to the caller
        print('WARNING', response.status_code)
        print(response.content)
    return response

def get_card(name_str):
    """
    Returns data from Scryfall API on an individual card by name.

    Parameters
    ----------
    name_str : str
        Card name; it is sent as the ``fuzzy`` parameter of ``cards/named``.

    Returns
    -------
    The decoded JSON body of the response (``response.json()``).
    """
    # Collapse runs of whitespace to single spaces. The words are joined with
    # a plain space, not '+': the value goes through `params`, so requests
    # URL-encodes it itself, and a literal '+' would be sent as '%2B'.
    payload = {'fuzzy': ' '.join(name_str.split())}
    response = query(scryfall_api_url.format('cards/named'), payload)
    return response.json()

def soup_to_xpath(child):
    """
    Convert BeautifulSoup object to XPath address.

    Walks from ``child`` up through its parents, emitting one path step per
    element. A step gets a positional predicate (``name[n]``) only when the
    parent has more than one direct child with the same tag name.

    Parameters
    ----------
    child : element exposing ``.name`` and ``.parents``, whose parents can be
        called as ``parent(name, recursive=False)`` to list direct children
        with that tag name.

    Returns
    -------
    str
        Absolute path such as ``/html/body/div[2]/p``; an empty string when
        ``child`` has no parents.
    """
    steps = []
    node = child
    for parent in child.parents:
        same_name = parent(node.name, recursive=False)
        if len(same_name) > 1:
            # XPath positions are 1-based. Match by identity rather than
            # equality so that look-alike siblings are told apart.
            position = next(
                idx for idx, sibling in enumerate(same_name, start=1)
                if sibling is node
            )
            steps.append(f'{node.name}[{position}]')
        else:
            steps.append(node.name)
        node = parent
    if not steps:
        return ''
    steps.reverse()  # collected leaf-to-root; the path reads root-to-leaf
    return '/' + '/'.join(steps)

I originally intended to scrape the official WotC site (magic.wizards.com) for tournament results, but unfortunately they don't make much available beyond very recent events. After some research, I decided to scrape tournament data from mtgtop8.com instead, which offers *years'* worth of back data. Some individual card data could be scraped there as well, but scryfall.com offers much more robust card data via a convenient API.

Interesting note on data gathering: https://www.dailyesports.gg/wizards-of-the-coast-mtg-frank-karsten-stop-publishing-gp-results-win-rates/

In [5]:
# Request the mtgtop8 'format' page for Standard, then parse the HTML body
q = query(
    link=mtgtop8_url.format('format'),
    payload={'f': mt8_format_keys['standard']},
)
soup = BeautifulSoup(markup=q.content, features='html.parser')

'/html/body/div[2]/div/table/tr/td[1]/form/table/tr/td/table/tr/td/td/td/td/td/td/td/td/td/td/td/div'

In [13]:
meta_select = soup.find('select', {'name': 'meta'})  # drop-down selector for the meta
selected_meta = meta_select.find('option', selected=True)  # the <option> currently selected
# Option label -> URL built from that option's 'value' attribute
metas = {opt.text: mtgtop8_url.format(opt['value']) for opt in meta_select.find_all('option')}

# Deck style -> list of (archetype name, number of decks) tuples
archtypes = {
    'aggro': [],
    'control': [],
    'combo': []
}

# Gather the archetypes listed under each deck style for this meta
for strat in soup.find_all(class_='Stable')[0].find_all(rowspan=True):  # In this table, only the style type headers use 'rowspan'
    strat_str = strat.contents[0].lower()  # Corresponding key in the archtypes dict
    # An unexpected style header gets its own list instead of raising KeyError
    bucket = archtypes.setdefault(strat_str, [])
    # The loop collects rowspan - 1 rows per style (presumably the header row
    # itself accounts for the remaining one — confirm against the page)
    expected = int(strat['rowspan']) - 1
    item = strat.parent

    # Walk the rows that follow the header row until enough archetypes are found
    while len(bucket) < expected:
        item = item.next_sibling
        if item is None:
            # Ran out of rows before reaching the expected count; stop here
            # rather than failing on `None.a`
            print('WARNING', f'only {len(bucket)} of {expected} archetypes found for {strat_str!r}')
            break
        if isinstance(item, NavigableString):
            continue  # text between rows, not a row
        if item.a:  # If this sibling has a link, we know it's what we're looking for
            text = item.a.text
            num_decks = int(item.contents[3].text)
            bucket.append((text, num_decks))


In [28]:
def get_next(driver, class_name):
    """
    Find the first element with the given CSS class on the driver's page.

    Parameters
    ----------
    driver : Selenium WebDriver
        Browser session to search in.
    class_name : str
        Class name of the element to find (e.g. the paging button).

    Returns
    -------
    The matching element, or ``False`` when the lookup raises (the exception
    is printed, not re-raised).
    """
    try:
        # 'class name' is the locator strategy string (selenium's
        # By.CLASS_NAME). The generic find_element call is used because the
        # find_element_by_* helpers are not available in newer Selenium.
        return driver.find_element('class name', class_name)
    except Exception as e:
        print(e)
        return False

def scrape_events(url):
    """
    Scrape every page of the event list at ``url`` and return the events.

    This is done inside its own function because it's the only step in the
    scraping process that requires Selenium: the next page is reached by
    clicking the 'Nav_PN' element in a headless Firefox session.

    Parameters
    ----------
    url : str
        Address of the page holding the event table.

    Returns
    -------
    dict
        Maps each event id (str) to a dict with the keys 'id', 'name',
        'date', 'level', 'link' and 'meta'.

    Notes
    -----
    Reads the module-level names ``selected_meta`` and ``mtgtop8_url``, so
    the cell defining them must have been run first.
    """
    options = Options()
    options.add_argument('--headless')
    driver = webdriver.Firefox(options=options)

    events_agg = {}

    try:
        driver.get(url)
        sleep(1)  # pause before reading the page source

        while True:
            # Look the button up first; it is clicked once this page is processed
            next_btn = get_next(driver, 'Nav_PN')
            soup = BeautifulSoup(driver.page_source, 'html.parser')  # make some soup

            # The third 'Stable' table is the events list; each 'hover_tr'
            # row yields an ID number, the star rating and the event date
            for event in soup.find_all(class_='Stable')[2].find_all(class_='hover_tr'):
                link = event.a  # associated hyperlink
                if link is None:
                    continue  # row without a link: nothing to identify it by
                eid = re.search(r"e=(\d+)&", link['href'])  # unique id number
                if eid is None:
                    continue  # href does not carry an event id
                stars = event.find(class_='O16').find_all('img')  # star rating / level
                events_agg[eid.group(1)] = {
                    'id': eid.group(1),
                    'name': link.text,
                    'date': event.find(class_='S10').text,
                    # A 'bigstar' image means level 4; otherwise the level is
                    # the number of star images (0 when there are none)
                    'level': 4 if stars and 'bigstar' in stars[0]['src'] else len(stars),
                    'link': mtgtop8_url.format(link['href']),
                    'meta': selected_meta.text
                }

            if not next_btn:
                break
            next_btn.click()
            sleep(1)
    finally:
        # Always end the browser session, even if scraping failed part-way
        driver.quit()

    return events_agg

In [29]:
# Run the event scraper on the URL of the format-page response `q`
scrape_events(q.url)

{'29350': {'id': '29350', 'name': 'Event @ On Curve Events', 'date': '23/02/21', 'level': 1, 'link': 'https://www.mtgtop8.com/event?e=29350&f=ST', 'meta': 'Last 2 Months'}, '29352': {'id': '29352', 'name': 'Weekly Event @ The Mythic Society', 'date': '23/02/21', 'level': 1, 'link': 'https://www.mtgtop8.com/event?e=29352&f=ST', 'meta': 'Last 2 Months'}, '29337': {'id': '29337', 'name': 'Challenge @ The Monkey Planet', 'date': '22/02/21', 'level': 1, 'link': 'https://www.mtgtop8.com/event?e=29337&f=ST', 'meta': 'Last 2 Months'}, '29343': {'id': '29343', 'name': 'Season 1 Daily Trial @ BIG Magic', 'date': '22/02/21', 'level': 2, 'link': 'https://www.mtgtop8.com/event?e=29343&f=ST', 'meta': 'Last 2 Months'}, '29344': {'id': '29344', 'name': 'Ligue #4 @ Torino Tournament', 'date': '22/02/21', 'level': 1, 'link': 'https://www.mtgtop8.com/event?e=29344&f=ST', 'meta': 'Last 2 Months'}, '29351': {'id': '29351', 'name': 'Free Daily Bo1 @ Owl Central Games', 'date': '22/02/21', 'level': 1, 'link'