# Scraping MTG Cards


For this project, I scrape the card information that the Card Kingdom website publishes for Magic: The Gathering cards.

In [1]:
from bs4 import BeautifulSoup as bs
from selenium import webdriver
import re
import time
import pandas as pd
import mtgutilities as mtgu
import json

In [2]:
# Parse config.json into `contents`; the scraping cell further down reads the
# "chrome-driver-location" entry from it.
with open('config.json', 'r') as config_file:
    contents = json.load(config_file)

### Getting all of the expansion URLs

In [7]:
# Build `expansions` from the links on Card Kingdom's A-Z catalog page:
#     expansions[<key derived from the link text>] = {"url": <absolute URL>}
soup = mtgu.get_soup_from_website("https://www.cardkingdom.com/catalog/magic_the_gathering/by_az")

# An href containing any of these fragments is skipped
# (presumably product listings that are not regular card expansions).
excluded_fragments = (
    'art-series', 'graded-magic', 'complete-sets', 'commander-deck',
    'card-kingdom-tokens', 'variants', 'starter-kits', 'jpn',
)

# Ordered (old, new) substitutions that turn link text into a dictionary key.
# They are applied one after another, so the order is significant: e.g. the
# '--' collapse runs after ':' / '&' / '.' / "'" have been stripped.
key_substitutions = (
    (' ', '-'), ('---', '-'), (':', ''), ('&', ''), ('.', ''),
    ("'", ''), ('--', '-'), ('/', '-'),
)

expansions = dict()
for a in soup.find_all('a', href=True):
    href = a['href']
    if 'mtg/' not in href or any(fragment in href for fragment in excluded_fragments):
        continue

    # Tag.string is None when the anchor has more than one child node, so
    # there is no single text to derive a key from; skip instead of crashing
    # with AttributeError on None.replace(...).
    link_text = a.string
    if link_text is None:
        continue

    key = link_text
    for old, new in key_substitutions:
        key = key.replace(old, new)

    urls = {"url": f"https://www.cardkingdom.com{href}"}
    expansions[key.lower()] = urls

In [13]:
# Hard-coded set of expansions, 'promotional' through
# 'zendikar-rising-expeditions'. Assigning `expansions` here replaces whatever
# value the name held before this cell ran. Every key doubles as the last path
# segment of its Card Kingdom URL, so the mapping is generated from the slugs.
expansions = {
    slug: {'url': f'https://www.cardkingdom.com/mtg/{slug}'}
    for slug in (
        'promotional',
        'prophecy',
        'ravnica',
        'ravnica-allegiance',
        'ravnica-allegiance-guild-kits',
        'return-to-ravnica',
        'rise-of-the-eldrazi',
        'rivals-of-ixalan',
        'saviors-of-kamigawa',
        'scars-of-mirrodin',
        'scourge',
        'secret-lair',
        'shadowmoor',
        'shadows-over-innistrad',
        'shards-of-alara',
        'signature-spellbook-chandra',
        'signature-spellbook-gideon',
        'signature-spellbook-jace',
        'starter-1999',
        'starter-2000',
        'streets-of-new-capenna',
        'strixhaven-mystical-archive',
        'strixhaven-school-of-mages',
        'stronghold',
        'tempest',
        'the-dark',
        'theros',
        'theros-beyond-death',
        'throne-of-eldraine',
        'time-spiral',
        'time-spiral-remastered',
        'timeshifted',
        'torment',
        'ultimate-box-topper',
        'ultimate-masters',
        'unglued',
        'unhinged',
        'unlimited',
        'unsanctioned',
        'unstable',
        'urzas-destiny',
        'urzas-legacy',
        'urzas-saga',
        'vanguard',
        'visions',
        'war-of-the-spark',
        'weatherlight',
        'world-championships',
        'worldwake',
        'zendikar',
        'zendikar-rising',
        'zendikar-rising-expeditions',
    )
}

In [14]:
# Scrape every listing page of every entry in `expansions` and accumulate the
# results in `expansions_df`, with columns: expansion, url, name, cost,
# converted_cost, type, rarity, pt, text.
expansions_df = pd.DataFrame()

# One browser session is shared by all expansions. Previously a new Chrome was
# started per expansion and only the last one was closed, leaking a browser
# process per iteration (and all of them on an interrupted run).
# NOTE(review): the driver path is passed positionally as in the original;
# confirm the installed Selenium version still accepts that form.
driver_pages = webdriver.Chrome(contents["chrome-driver-location"])
try:
    for expansion_key, expansion_values in expansions.items():
        expansion_url = expansion_values.get("url")

        # Obtaining number of pages to scrape per expansion
        soup = mtgu.get_soup_from_website(expansion_url)
        max_pages = mtgu.get_max_pages(soup)

        # Per-expansion accumulators, extended page by page.
        card_names, card_costs, card_converted_costs, card_type, \
            card_urls, card_rarity, card_pt, card_text = ([] for _ in range(8))

        # Scraping page by page (pages are numbered from 1)
        for i in range(1, max_pages + 1):
            driver_pages.get(f"{expansion_url}/?page={i}")
            soup_pages = bs(driver_pages.page_source, 'html.parser')

            # Getting card URLs
            card_urls += mtgu.get_card_urls(soup_pages, expansion_key)
            # Getting card names
            card_names += mtgu.get_card_names(soup_pages, expansion_key)
            # Getting mana costs
            card_costs += mtgu.get_mana_cost(soup_pages)
            # Getting converted mana cost. Plain assignment (not +=) is
            # deliberate: it is recomputed from the cumulative card_costs
            # list, so it already covers every page scraped so far.
            card_converted_costs = mtgu.get_converted_mana_cost(card_costs)
            # Getting card type
            card_type += mtgu.get_card_type(soup_pages)
            # Getting card rarity
            card_rarity += mtgu.get_card_rarity(soup_pages)
            # Getting card P/T
            card_pt += mtgu.get_card_pt(soup_pages)
            # Getting card text
            card_text += mtgu.get_card_text(soup_pages)
            # Pause between page loads (presumably to throttle requests).
            time.sleep(3)

        # pandas raises ValueError here if the per-field lists ended up with
        # different lengths.
        df = pd.DataFrame(
            {'expansion': expansion_key,
             'url': card_urls,
             'name': card_names,
             'cost': card_costs,
             'converted_cost': card_converted_costs,
             'type': card_type,
             'rarity': card_rarity,
             'pt': card_pt,
             'text': card_text
             })

        # Appended per expansion so that expansions_df keeps every completed
        # expansion even if a later one fails or the run is interrupted.
        expansions_df = pd.concat([expansions_df, df], axis=0)
finally:
    # quit() ends the whole WebDriver session; close() would only close the
    # current window. Runs on errors and KeyboardInterrupt as well.
    driver_pages.quit()

KeyboardInterrupt: 

In [12]:
# Write the accumulated card table to expansions2.csv; index=False leaves the
# DataFrame's row index out of the file.
expansions_df.to_csv('expansions2.csv', index=False)