In [69]:
# Import Dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import time

In [70]:
# Function to remove formatting from gatherer
def extract_name(card, card_long):
    length = len(card_long)
    good_string = card_long[length-len(card):-1]
    good_string += card_long[length-1]
    return(good_string) 

In [71]:
# Create browser to navigate oracle
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
url = 'https://gatherer.wizards.com/Pages/Default.aspx'
browser.visit(url)

In [72]:
# Here will be the list with all cards which will be converted from a csv.
# Temporarily, these cards are chosen so each mana symbol is represented at least once.
cards_df = pd.read_csv("raw_count.csv")

In [73]:
# Cards dataframe is used to scrape oracle text and then relevant data is crafted into dataframe
cards_df = pd.DataFrame({"Name": ["Delver of Secrets", "Goblin Ringleader", "Mountain"]})
# Empty list for list of mismatched links is crafted
bad_link = []

# Empty lists are initialized
card_list = []
gen_list = []
snow_list = []
c_list = []
v_list = []
two_b_list = []
phy_list = []
w_list = []
u_list = []
b_list = []
r_list = []
g_list = []
cmc_list = []
type_line_list = []
text_list = []
power_list = []
toughness_list = []
loyalty_list = []

# For loop loops through list of cards
for row in cards_df.iterrows():
    
    # Initialize all mana values which may be duplicated at 0
    gen = 0
    snow = 0
    c = 0
    v = 0
    two_b = 0
    phy = 0
    w = 0
    u = 0
    b = 0
    r = 0
    g = 0
    
    # Navigates browser to card page.  Note that this picks the first option in the dropdown when a card name is entered.
    # It is possible that the wrong card will be grabbed, so this is verified below.
    browser.fill("ctl00$ctl00$MainContent$Content$SearchControls$CardSearchBoxParent$CardSearchBox", row[1]["Name"])
    time.sleep(3)
    browser.links.find_by_partial_text(row[1]["Name"]).click()
    
    # Soup object is crafted at card page so html can be read
    html = browser.html
    hot_soup = BeautifulSoup(html, "html.parser")
    
    
    ############# NAME ############# 
    
    
    # Check to see if card navigation was successful.
    # Attempt is made to find string containing name. This only fails if card is double-faced (HTML IDs change).
    # Try and except do same thing except the list of HTML ID's differ.
    # In each, name must be extracted from formatting to if check strings are equal.
    # If strings are not equal, the entry is entered into a list.  Otherwise, the program continues.
    try:
        card_long = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow").text.split("\n")[1]
        card_name = extract_name(row[1]["Name"], card_long)
        html_ids = ["ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_typeRow",
                    "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_textRow",
                    "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow"]
    except:
        card_long = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_nameRow")\
            .text.split("\n")[1]
        card_name = extract_name(row[1]["Name"], card_long)
        html_ids = ["ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_typeRow",
            "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_textRow",
            "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_ptRow"]
   
    if row[1]["Name"] != card_name:
            bad_link.append(row[1]["Name"])
            
    # Card name is correct,so data is scraped
    else:
        
        
        ############# MANA #############
        
        
        # Check for a mana cost, if one does not exist, skip ahead.
        try:
            mana_row = hot_soup.find("td", class_="rightCol").find_all("div", class_="row")[1].find_all("img")
            
            # Loop through all mana symbols in the mana row, in each case the alt text will be used.
            # Mana is saved as [gen]eric, [snow], [c]olorless, [v]ariable, (two_b)rid, (phy)rexian, and
            #    [w]hite, bl[u]e, [b]lack, [r]ed, and [g]reen (hybrid, phyrexian, and 2brid mana are counted twice).
            for cost in mana_row:
                
                # Attempt to turn symbol into a number, only possible if it is generic
                try:
                    gen = int(cost["alt"])
                
                # The symbol is not generic mana
                except:
                    
                    # These three qualities are unique, and thus mutually exclusive
                    if "Snow" == cost["alt"]:
                        snow += 1
                    elif "Colorless" == cost["alt"]:
                        c += 1
                    elif "Variable Colorless" == cost["alt"]:
                        v += 1
                    
                    # The following mana types may appear in combination in the same symbol, thus are checked if string is
                    #    in the alt text. The intent is that mana which may be paid for in multiple ways is counted as such.
                    else:
                        if "Two or " in cost["alt"]:
                            two_b += 1
                        if "Phyrexian" in cost["alt"]:
                            phy += 1
                        if "White" in cost["alt"]:
                            w += 1
                        if "Blue" in cost["alt"]:
                            u += 1
                        if "Black" in cost["alt"]:
                            b += 1
                        if "Red" in cost["alt"]:
                            r += 1
                        if "Green" in cost["alt"]:
                            g += 1    

        
        # If no mana cost, return all mana entries as None ("None" is different from "0", see Mountain and Ornithopter)
        except:
            gen = None
            snow = None
            c = None
            v = None
            two_b = None
            phy = None
            w = None
            u = None
            b = None
            r = None
            g = None
        
        
        ############# CONVERTED MANA COST #############
        
        
        # Scrape a converted mana cost. If one does not exist, skip ahead
        # In order to ensure that the cmc is a number and usable, it must be altered from that on oracle.
        # Here, we abuse the fact that the highest cmc in pauper is 9, so all numbers will be 1 character.
        try:
            ugly_number = hot_soup.find("td", class_="rightCol").find_all("div", class_="row")[2].\
                find("div", class_="value").text
            cmc = int(ugly_number[len(ugly_number) - 1])
        except:
            cmc = None 

        
        
        ############# TYPE-LINE #############
        
        
        # Scrape the type-line
        type_line = browser.find_by_id(html_ids[0]).text.split("\n")[1]
        
        
        ############# TEXT BOX #############
        
        
        # Check for a text box, if one does not exist, skip ahead
        try:
            text = browser.find_by_id(html_ids[1]).\
                text.replace("Card Text:\n", "")
        except:
            text = None
        
        
        ############# POWER, TOUGHNESS, LOYALTY #############
        
        
        # Check for power and toughness.  If both are valid, set loyalty to zero.
        try:
            pt = browser.find_by_id(html_ids[2]).text.split("\n")[1].\
                    split(" / ")
            power = int(pt[0])
            toughness = int(pt[1])
            loyalty = None
        # Something failed so card has no card has no pt. Next, check if card is a planeswalker.
        except:
            power = None
            toughness = None
            try:
                loyalty = int(browser.find_by_id(html_ids[2]).text.\
                              split("\n")[1])
            except:
                loyalty = None
    
    # Appends the card data to the relevant lists
    card_list.append(row[1]["Name"])
    gen_list.append(gen)
    snow_list.append(snow)
    c_list.append(c)
    v_list.append(v)
    two_b_list.append(two_b)
    phy_list.append(phy)
    w_list.append(w)
    u_list.append(u)
    b_list.append(b)
    r_list.append(r)
    g_list.append(g)
    cmc_list.append(cmc)
    type_line_list.append(type_line)
    text_list.append(text)
    power_list.append(power)
    toughness_list.append(toughness)
    loyalty_list.append(loyalty)
    
    # Navigates back to search page
    browser.back()

In [74]:
card_df= pd.DataFrame({"Name": card_list,
                       "Cost Generic": gen_list,
                       "Cost Snow": snow_list,
                       "Cost Colorless": c_list,
                       "Cost Variable": v_list,
                       "Cost 2-brid": two_b_list,
                       "Cost Phyrexian": phy_list,
                       "Cost White": w_list,
                       "Cost Blue": u_list,
                       "Cost Black": b_list,
                       "Cost Red": r_list,
                       "Cost Green": g_list,
                       "Converted Mana Cost": cmc_list,
                       "Type Line": type_line_list,
                       "Text Box": text_list,
                       "Power": power_list,
                       "Toughness": toughness_list,
                       "Loyalty": loyalty_list
                      })
card_df

Unnamed: 0,Name,Cost Generic,Cost Snow,Cost Colorless,Cost Variable,Cost 2-brid,Cost Phyrexian,Cost White,Cost Blue,Cost Black,Cost Red,Cost Green,Converted Mana Cost,Type Line,Text Box,Power,Toughness,Loyalty
0,Delver of Secrets,0,0,0,0,0,0,0,1,0,0,0,1,Creature — Human Wizard,"At the beginning of your upkeep, look at the t...",1,1,
1,Goblin Ringleader,3,0,0,0,0,0,0,0,0,1,0,4,Creature — Goblin,Haste (This creature can attack and as soon as...,2,2,
2,Mountain,0,0,0,0,0,0,0,0,0,0,0,4,Creature — Goblin,Haste (This creature can attack and as soon as...,2,2,


In [83]:
# Generate a list of card names with bad links

# Empty lists initialized
bad_names = []

# For loop floops through the links appending relevant entries to relevant list
for link in bad_link:
    bad_names.append(link)
    
# Turns lists into a dataframe for later merging
bad_df = pd.DataFrame({"Name": bad_names,
                       })
bad_df

Unnamed: 0,Name
0,Mountain


In [84]:
# Drop all cards whose link did not go to the correct card
card_df.drop(card_df.loc[card_df["Name"].isin(bad_names)].index, inplace=True)

In [85]:
# Generate list of failed links to ensure each is contained in below bad_ids dataframe
bad_link

['Mountain']

In [86]:
# A dataframe of cards known to cause issues when the above process is run is manually entered then merged with the bad_df.
bad_ids = pd.DataFrame({"Name": ["Island", "Swamp", "Mountain", "Forest", "Ponder", "Snap"],
                       "Oracle ID": [491574, 491576, 491578, 491580, 451051, 426582]})
card_id_df = bad_df.merge(bad_ids, how="inner", on="Name")
card_id_df

Unnamed: 0,Name,Oracle ID
0,Mountain,491578


In [87]:
# Dataframe of cards with bad links has data scraped using card url's

# Base url for manual entry and navigation
base_url = "https://gatherer.wizards.com/Pages/Card/Details.aspx?multiverseid="

# Empty lists are initialized
card_list = []
gen_list = []
snow_list = []
c_list = []
v_list = []
two_b_list = []
phy_list = []
w_list = []
u_list = []
b_list = []
r_list = []
g_list = []
cmc_list = []
type_line_list = []
text_list = []
power_list = []
toughness_list = []
loyalty_list = []

# For loop loops through list of cards
for row in card_id_df.iterrows():
    
    url = base_url + str(row[1]["Oracle ID"])
    
    # Initialize all mana values which may be duplicated at 0
    gen = 0
    snow = 0
    c = 0
    v = 0
    two_b = 0
    phy = 0
    w = 0
    u = 0
    b = 0
    r = 0
    g = 0
    
    # Navigates browser to card page.  Note that this picks the first option in the dropdown when a card name is entered.
    # It is possible that the wrong card will be grabbed, so this is verified below.
    browser.visit(url)
    time.sleep(3)
    
    # Soup object is crafted at card page so html can be read
    html = browser.html
    hot_soup = BeautifulSoup(html, "html.parser")
    
    
    ############# NAME (is already right by definition, but we use it here) ############# 
    
    
    # Determine as in first run if the card is double-faced.
    try:
        card_long = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow").text.split("\n")[1]
        card_name = extract_name(row[1]["Name"], card_long)
        html_ids = ["ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_typeRow",
                    "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_textRow",
                    "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow"]
    except:
        card_long = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_nameRow")\
            .text.split("\n")[1]
        card_name = extract_name(row[1]["Name"], card_long)
        html_ids = ["ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_typeRow",
            "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_textRow",
            "ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ctl02_ptRow"]
    
    
    ############# MANA ############# 
    
    
    # Check for a mana cost, if one does not exist, skip ahead
    try:
        mana_row = hot_soup.find("td", class_="rightCol").find_all("div", class_="row")[1].find_all("img")

        # Loop through all mana symbols in the mana row, in each case the alt text will be used.
        # Mana is saved as [gen]eric, [snow], [c]olorless, [v]ariable, (two_b)rid, (phy)rexian, and
        #    [w]hite, bl[u]e, [b]lack, [r]ed, and [g]reen (hybrid, phyrexian, and 2brid mana are counted twice).
        for cost in mana_row:

            # Attempt to turn symbol into a number, only possible if it is generic
            try:
                gen = int(cost["alt"])

            # The symbol is not generic mana
            except:

                # These three qualities are unique, and thus mutually exclusive
                if "Snow" == cost["alt"]:
                    snow += 1
                elif "Colorless" == cost["alt"]:
                    c += 1
                elif "Variable Colorless" == cost["alt"]:
                    v += 1

                # The following mana types may appear in combination in the same symbol, thus are checked if the string is
                #    in the alt text.  The intent is that mana which may be paid for in multiple ways is counted as such.
                else:
                    if "Two or " in cost["alt"]:
                        two_b += 1
                    if "Phyrexian" in cost["alt"]:
                        phy += 1
                    if "White" in cost["alt"]:
                        w += 1
                    if "Blue" in cost["alt"]:
                        u += 1
                    if "Black" in cost["alt"]:
                        b += 1
                    if "Red" in cost["alt"]:
                        r += 1
                    if "Green" in cost["alt"]:
                        g += 1    


    # If no mana cost, return all mana entries as None ("None" is different from "0", see Mountain and Ornithopter)
    except:
        gen = None
        snow = None
        c = None
        v = None
        two_b = None
        phy = None
        w = None
        u = None
        b = None
        r = None
        g = None

    
    ############# CONVERTED MANA COST ############# 

    
    # Scrape a converted mana cost. If one does not exist, skip ahead
    # In order to ensure that the cmc is a number and usable, it must be altered from that on oracle.
    # Here, we abuse the fact that the highest cmc in pauper is 9, so all numbers will be 1 character.
    try:
        ugly_number = hot_soup.find("td", class_="rightCol").find_all("div", class_="row")[2].\
                find("div", class_="value").text
        cmc = int(ugly_number[len(ugly_number) - 1])
    except:
        cmc = None

        
    ############# TYPE-LINE ############# 
    
    
    # Scrape the typeline
    type_line = browser.find_by_id(html_ids[0]).text.split("\n")[1]
    
    
    ############# TEXT BOX ############# 
    

    # Check for a text box, if one does not exist, skip ahead
    try:
        text = browser.find_by_id(html_ids[1]).\
            text.replace("Card Text:\n", "")
    except:
        text = None

        
    ############# POWER, TOUGHNESS, AND LOYALTY ############# 
    
    
    # Check for power and toughness.  If both are valid, set loyalty to zero.
    try:
        pt = browser.find_by_id(html_ids[2]).text.split("\n")[1].\
                split(" / ")
        power = int(pt[0])
        toughness = int(pt[1])
        loyalty = None
    # Something failed so card has no card has no pt. Next, check if card is a planeswalker.
    except:
        power = None
        toughness = None
        try:
            loyalty = int(browser.find_by_id(html_ids[2]).text.\
                          split("\n")[1])
        except:
            loyalty = None
    
    
    # Appends the card data to the relevant lists
    card_list.append(row[1]["Name"])
    gen_list.append(gen)
    snow_list.append(snow)
    c_list.append(c)
    v_list.append(v)
    two_b_list.append(two_b)
    phy_list.append(phy)
    w_list.append(w)
    u_list.append(u)
    b_list.append(b)
    r_list.append(r)
    g_list.append(g)
    cmc_list.append(cmc)
    type_line_list.append(type_line)
    text_list.append(text)
    power_list.append(power)
    toughness_list.append(toughness)
    loyalty_list.append(loyalty)

In [88]:
# The dataframe containing all cards whose links did not properly navigate is generated
card_manual_df= pd.DataFrame({"Name": card_list,
                              "Cost Generic": gen_list,
                              "Cost Snow": snow_list,
                              "Cost Colorless": c_list,
                              "Cost Variable": v_list,
                              "Cost 2-brid": two_b_list,
                              "Cost Phyrexian": phy_list,
                              "Cost White": w_list,
                              "Cost Blue": u_list,
                              "Cost Black": b_list,
                              "Cost Red": r_list,
                              "Cost Green": g_list,
                              "Converted Mana Cost": cmc_list,
                              "Type Line": type_line_list,
                              "Text Box": text_list,
                              "Power": power_list,
                              "Toughness": toughness_list,
                              "Loyalty": loyalty_list
                      })
card_manual_df

Unnamed: 0,Name,Cost Generic,Cost Snow,Cost Colorless,Cost Variable,Cost 2-brid,Cost Phyrexian,Cost White,Cost Blue,Cost Black,Cost Red,Cost Green,Converted Mana Cost,Type Line,Text Box,Power,Toughness,Loyalty
0,Mountain,0,0,0,0,0,0,0,0,0,0,0,,Basic Land — Mountain,R,,,


In [89]:
# Dataframes of cards which did and did not navigate are appended to form one another to generate a complete dataframe
card_final_df = card_df.append(card_manual_df).reset_index(drop=True)
card_final_df.to_csv("mtg_merged_df.csv")
card_final_df

Unnamed: 0,Name,Cost Generic,Cost Snow,Cost Colorless,Cost Variable,Cost 2-brid,Cost Phyrexian,Cost White,Cost Blue,Cost Black,Cost Red,Cost Green,Converted Mana Cost,Type Line,Text Box,Power,Toughness,Loyalty
0,Delver of Secrets,0,0,0,0,0,0,0,1,0,0,0,1.0,Creature — Human Wizard,"At the beginning of your upkeep, look at the t...",1.0,1.0,
1,Goblin Ringleader,3,0,0,0,0,0,0,0,0,1,0,4.0,Creature — Goblin,Haste (This creature can attack and as soon as...,2.0,2.0,
2,Mountain,0,0,0,0,0,0,0,0,0,0,0,,Basic Land — Mountain,R,,,
