In [30]:
# Import Dependencies
import pandas as pd
from splinter import Browser
from bs4 import BeautifulSoup
import time

In [31]:
# Function to remove formatting from gatherer
def extract_name(card, card_long):
    length = len(card_long)
    good_string = card_long[length-len(card):-1]
    good_string += card_long[length-1]
    return(good_string) 

In [32]:
# Create browser to navigate oracle
executable_path = {'executable_path': 'chromedriver.exe'}
browser = Browser('chrome', **executable_path, headless=False)
url = 'https://gatherer.wizards.com/Pages/Default.aspx'
browser.visit(url)

In [33]:
# Here will be the list with all cards which will be converted from a csv.
# Temporarily, these cards are chosen so each mana symbol is represented at least once.
cards_df = pd.DataFrame({"Name": ["Arcum's Astrolabe", "Plains", "Island", "Swamp", "Mountain", "Forest", "Ponder"],
                         "Copies": [0,1,2,3,4,5,6]
                        })

In [34]:
# Cards dataframe is used to check oracle text and then relevant data is crafted into dataframe

# Empty list for list of mismatched links is crafted
bad_link = []

# Empty lists are initialized
card_list = []
copies_list = []
gen_list = []
snow_list = []
c_list = []
v_list = []
two_b_list = []
phy_list = []
w_list = []
u_list = []
b_list = []
r_list = []
g_list = []
cmc_list = []
type_line_list = []
text_list = []
power_list = []
toughness_list = []
loyalty_list = []

# For loop loops through list of cards
for row in cards_df.iterrows():
    
    # Initialize all mana values which may be duplicated at 0
    gen = 0
    snow = 0
    c = 0
    v = 0
    two_b = 0
    phy = 0
    w = 0
    u = 0
    b = 0
    r = 0
    g = 0
    
    # Navigates browser to card page.  Note that this picks the first option in the dropdown when a card name is entered.
    # It is possible that the wrong card will be grabbed, so this is verified below.
    browser.fill("ctl00$ctl00$MainContent$Content$SearchControls$CardSearchBoxParent$CardSearchBox", row[1]["Name"])
    time.sleep(1)
    browser.links.find_by_partial_text(row[1]["Name"]).click()
    
    # Soup object is crafted at card page so html can be read
    html = browser.html
    hot_soup = BeautifulSoup(html, "html.parser")
    
    # Check to see if card navigation was successful.  Name must be extracted from formatting to check strings are equal.
    # If strings are not equal, the entry is entered into a list.  Otherwise, the program continues.
    card_long = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_nameRow").text.split("\n")[1]
    card_name = extract_name(row[1]["Name"], card_long)
    if row[1]["Name"] != card_name:
        bad_link.append([row[1]["Name"],row[1]["Copies"]])
    
    # Card name is correct,so data is scraped
    else:
        
        # Check for a mana cost, if one does not exist, skip ahead
        try:
            mana_row = hot_soup.find("div", class_="row manaRow").find_all("img")
            
            # Loop through all mana symbols in the mana row, in each case the alt text will be used.
            # Mana is saved as [gen]eric, [snow], [c]olorless, [v]ariable, (two_b)rid, (phy)rexian, and
            #    [w]hite, bl[u]e, [b]lack, [r]ed, and [g]reen (hybrid, phyrexian, and 2brid mana are counted twice).
            for cost in mana_row:
                
                # Attempt to turn symbol into a number, only possible if it is generic
                try:
                    gen = int(cost["alt"])
                
                # The symbol is not generic mana
                except:
                    
                    # These three qualities are unique, and thus mutually exclusive
                    if "Snow" == cost["alt"]:
                        snow += 1
                    elif "Colorless" == cost["alt"]:
                        c += 1
                    elif "Variable Colorless" == cost["alt"]:
                        v += 1
                    
                    # The following mana types may appear in combination in the same symbol, thus are checked if the string is
                    #    in the alt text.  The intent is that mana which may be paid for in multiple ways is counted as such.
                    else:
                        if "Two or " in cost["alt"]:
                            two_b += 1
                        if "Phyrexian" in cost["alt"]:
                            phy += 1
                        if "White" in cost["alt"]:
                            w += 1
                        if "Blue" in cost["alt"]:
                            u += 1
                        if "Black" in cost["alt"]:
                            b += 1
                        if "Red" in cost["alt"]:
                            r += 1
                        if "Green" in cost["alt"]:
                            g += 1    

        
        # If no mana cost, return all mana entries as None ("None" is different from "0", see Mountain and Ornithopter)
        except:
            gen = None
            snow = None
            c = None
            v = None
            two_b = None
            phy = None
            w = None
            u = None
            b = None
            r = None
            g = None

        
        
        # Scrape a converted mana cost. If one does not exist, skip ahead
        # In order to ensure that the cmc is a number and usable, it must be altered from that on oracle.
        # Here, we abuse the fact that the highest cmc in pauper is 9, so all numbers will be 1 character.
        try:
            ugly_number = hot_soup.find_all("div", class_="row")[2].find("div", class_="value").text
            cmc = int(ugly_number[len(ugly_number) - 1])
        except:
            cmc = None
        
        # Scrape the typeline
        type_line = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_typeRow").text.split("\n")[1]
        
        # Check for a text box, if one does not exist, skip ahead
        try:
            text = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_textRow").\
                text.replace("Card Text:\n", "")
        except:
            text = None
        
        # Check for power and toughness.  If both are valid, set loyalty to zero.
        try:
            pt = browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow").text.split("\n")[1].\
                    split(" / ")
            power = int(pt[0])
            toughness = int(pt[1])
            loyalty = None
        # Something failed so card has no card has no pt. Next, check if card is a planeswalker.
        except:
            power = None
            toughness = None
            try:
                loyalty = int(browser.find_by_id("ctl00_ctl00_ctl00_MainContent_SubContent_SubContent_ptRow").text.\
                              split("\n")[1])
            except:
                loyalty = None
    
    # Appends the card data to the relevant lists
    card_list.append(row[1]["Name"])
    copies_list.append(row[1]["Copies"])
    gen_list.append(gen)
    snow_list.append(snow)
    c_list.append(c)
    v_list.append(v)
    two_b_list.append(two_b)
    phy_list.append(phy)
    w_list.append(w)
    u_list.append(u)
    b_list.append(b)
    r_list.append(r)
    g_list.append(g)
    cmc_list.append(cmc)
    type_line_list.append(type_line)
    text_list.append(text)
    power_list.append(power)
    toughness_list.append(toughness)
    loyalty_list.append(loyalty)
    
    # Navigates back to search page
    browser.back()

In [35]:
card_df= pd.DataFrame({"Name": card_list,
                       "Copies": copies_list,
                       "Cost Generic": gen_list,
                       "Cost Snow": snow_list,
                       "Cost Colorless": c_list,
                       "Cost Variable": v_list,
                       "Cost 2-brid": two_b_list,
                       "Cost Phyrexian": phy_list,
                       "Cost White": w_list,
                       "Cost Blue": u_list,
                       "Cost Black": b_list,
                       "Cost Red": r_list,
                       "Cost Green": g_list,
                       "Converted Mana Cost": cmc_list,
                       "Type Line": type_line_list,
                       "Text Box": text_list,
                       "Power": power_list,
                       "Toughness": toughness_list,
                       "Loyalty": loyalty_list
                      })
card_df

Unnamed: 0,Name,Copies,Cost Generic,Cost Snow,Cost Colorless,Cost Variable,Cost 2-brid,Cost Phyrexian,Cost White,Cost Blue,Cost Black,Cost Red,Cost Green,Converted Mana Cost,Type Line,Text Box,Power,Toughness,Loyalty
0,Arcum's Astrolabe,0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,Snow Artifact,( can be paid with one mana from a snow perman...,,,
1,Plains,1,,,,,,,,,,,,,Basic Land — Plains,W,,,
2,Island,2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Basic Land — Plains,W,,,
3,Swamp,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Basic Land — Plains,W,,,
4,Mountain,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Basic Land — Plains,W,,,
5,Forest,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,,Basic Land — Plains,W,,,
6,Ponder,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,Sorcery,"Look at the top three cards of your library, t...",,,


In [36]:
# Generate a list of card names with bad links, and a list of copies of cards with bad links

# Empty lists initialized
bad_names = []
bad_copies = []

# For loop floops through the links appending relevant entries to relevant list
for link in bad_link:
    bad_names.append(link[0])
    bad_copies.append(link[1])
    
# Turns lists into a dataframe for later merging
bad_df = pd.DataFrame({"Name": bad_names,
                       "Copies": bad_copies})
bad_df

Unnamed: 0,Name,Copies
0,Island,2
1,Swamp,3
2,Mountain,4
3,Forest,5


In [37]:
# Drop all cards whose link did not go to the correct card
card_df.drop(card_df.loc[card_df["Name"].isin(bad_names)].index, inplace=True)

In [38]:
# A dataframe of cards known to cause issues when the above process is run is manually entered then merged with the bad_df.
bad_ids = pd.DataFrame({"Name": ["Island", "Swamp", "Mountain", "Forest", "Ponder"],
                       "Oracle ID": [491574, 491576, 491578, 491580, 451051]})
card_id_df = bad_df.merge(bad_ids, how="inner", on="Name")
card_id_df

Unnamed: 0,Name,Copies,Oracle ID
0,Island,2,491574
1,Swamp,3,491576
2,Mountain,4,491578
3,Forest,5,491580
