In [10]:
import re
from time import sleep
from urllib.parse import quote_plus

import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Date window for the sets we want in our dataset: the first nine months of 2022.
# Dates are zero-padded "YYYY/MM/DD" strings so they can be compared as plain text.
# The filtering loop keeps a set when start_date <= release date <= end_date, so both
# bounds are written as the first and the last day we actually want to include
# (the previous values, 2021/12/31 and 2022/10/01, each let one extra day through).
start_date = "2022/01/01"
end_date = "2022/09/30"

# Download the page that lists every released set, then parse its HTML.
# The timeout keeps the notebook from hanging forever if the server never answers.
# NOTE(review): no parser is named, so BeautifulSoup picks the best one installed;
# pin one explicitly if results must be identical across machines.
set_releases_response = requests.get(
  "https://www.db.yugioh-card.com/yugiohdb/card_list.action", timeout=30
)
set_releases_page = BeautifulSoup(set_releases_response.content)

In [3]:
# From exploring the page HTML: every <div> whose class includes "t_row" is treated
# below as one released set (its date and card-list link are read from it)
packs = set_releases_page.find_all("div", attrs={"class": "t_row"})

In [4]:
def get_date(row):
  """Return the release date shown in a set row as a "YYYY/MM/DD" string.

  row: parsed element for one set; the text of its <div class="time"> child is searched.
  Returns the first date-shaped substring (4 digits / 2 digits / 2 digits) of that text.
  Raises IndexError when the text contains no such date.
  """
  # raw string: in a plain literal "\d" is an invalid escape sequence,
  # which current Python versions warn about
  date_exp = r'\d{4}/\d{2}/\d{2}'
  time_text = row.find("div", class_="time").text
  return re.findall(date_exp, time_text)[0]

def get_card_names(row, timeout=30):
  """Download the card list of one set and return the card names found on it.

  row: parsed element for one set; its first <input> tag carries, in the "value"
       attribute, the path of the set's card-list page relative to the site root.
  timeout: seconds to wait for the server before requests gives up, so a stalled
           connection cannot hang the notebook indefinitely.
  Returns a list with the text of every <span class="card_name"> on that page
  (empty when the page has none).
  """
  base_url = "https://www.db.yugioh-card.com"
  relative_path = row.find("input")['value']
  set_response = requests.get(base_url + relative_path, timeout=timeout)
  # NOTE(review): no parser is named, so BeautifulSoup picks the best one installed
  set_page = BeautifulSoup(set_response.content)
  return [name_tag.text for name_tag in set_page.find_all("span", class_="card_name")]


In [5]:
# Gather the names of all cards from sets released between start_date and end_date
# (both inclusive). A set is used so that a card appearing in several packs is only
# recorded once.
all_names = set()
for card_set in packs:
  release_date = get_date(card_set)
  # released after our range: don't scrape it, but keep scanning
  if release_date > end_date:
    continue
  # the list is sorted, so the first set released before our range ends the scan
  if release_date < start_date:
    break
  all_names.update(get_card_names(card_set))

In [7]:
# check how many distinct card names we'll be looking up
len(all_names)
# next we want to get, for each card: name, set code, rarity, lowest price, average price

1572

In [9]:
def get_codes_and_rarities(price_page):
  """Read the set code and rarity of every printing header on a price page.

  price_page: parsed page; every element matching ".print-variant-header" is one printing.
  Returns a tuple (codes, rarities) of two equally long lists, in page order.
  Raises IndexError when a header's text contains no "--" separator.
  """
  codes = []
  rarities = []
  for header in price_page.select(".print-variant-header"):
    # work on the segment that follows the first "--" in the header text
    details = header.text.split('--')[1]
    # the code is the four characters after the segment's first character
    codes.append(details[1:5])
    # the rarity is whatever sits between the first "(" and the first ")"
    open_paren = details.find("(")
    close_paren = details.find(")")
    rarities.append(details[open_paren + 1:close_paren])
  return codes, rarities

def get_min_avg_prices(price_page):
  """Read the lowest and the middle listed price of every printing on a price page.

  price_page: parsed page containing <table id="item_stats"> elements.
  Returns a tuple (mins, avgs) of two equally long lists of floats, in page order.
  Raises IndexError when a price table lists fewer than two prices, and ValueError
  when a price cannot be parsed as a number.
  """
  # there are two unneeded data tables at the beginning of each page
  # additionally for each price table, we get a table explaining
  # change over time which we also don't need
  price_tables = price_page.find_all("table", id="item_stats")[2::2]
  mins = []
  avgs = []
  for printing in price_tables:
    # each <p> holds one price: trim newlines, drop the first character
    # (presumably the currency symbol - TODO confirm) and remove thousands
    # separators, so a price like "$1,250.00" parses instead of raising
    # ValueError and getting the whole card skipped by the caller
    vals = sorted(
      float(price.text.strip("\n")[1:].replace(",", ""))
      for price in printing.find_all("p")
    )
    # the min price will be the lowest of the three prices listed
    # the avg price will be the middle of the three prices listed
    mins.append(vals[0])
    avgs.append(vals[1])
  return (mins, avgs)


In [13]:
# Look up every collected card name on yugiohprices.com and build one table with a
# row per printing: name, set code, rarity, lowest price and average price.
result_columns = ['name', 'code', 'rarity', 'min_price', 'avg_price']
price_base_url = "https://yugiohprices.com/card_price?name="

# One small DataFrame per card, combined once after the loop: DataFrame.append was
# removed in pandas 2.0, and growing a frame inside a loop copies it every time.
card_frames = []
for iteration, name in enumerate(all_names, start=1):
  # keep ourselves updated on progress
  if iteration % 50 == 0:
    print(iteration)
  try:
    # quote_plus turns spaces into "+" and percent-escapes every other reserved
    # character (&, #, /, quotes, ...), not just the two handled by hand before
    response = requests.get(price_base_url + quote_plus(name), timeout=30)
    # report HTTP errors for this card instead of parsing an error page as "no printings"
    response.raise_for_status()
    # NOTE(review): no parser is named, so BeautifulSoup picks the best one installed
    page = BeautifulSoup(response.content)
    codes, rarities = get_codes_and_rarities(page)
    min_prices, avg_prices = get_min_avg_prices(page)
    # the constructor raises ValueError if the four lists differ in length,
    # which sends a card with inconsistent page data to the error branch below
    card_frame = pd.DataFrame(data={
      'name': [name] * len(codes),
      'code': codes,
      'rarity': rarities,
      'min_price': min_prices,
      'avg_price': avg_prices,
    })
    if not card_frame.empty:
      card_frames.append(card_frame)
  except Exception as error:
    # best effort: report the card and the reason, then carry on with the next one
    # (Exception rather than a bare except, so Ctrl-C still stops the run)
    print(f"{name}: {error!r}")
  # DOSing people isn't cool, make sure to add a time delay if you need to make a lot of requests!
  # (placed after the try/except so failed lookups are throttled as well)
  sleep(1)

# index values restart at 0 for every card, exactly as the appended frames did before
result = pd.concat(card_frames) if card_frames else pd.DataFrame(columns=result_columns)

Zero Gravity
Fabled Ashenveil
Hamon, Lord of Striking Thunder
Spirit Message "N"
50
Aluber the Jester of Despia
Champion's Vigilance
The Legendary Fisherman II
Keeper of Dragon Magic
Red-Eyes Darkness Metal Dragon
100
Pre-Preparation of Rites
Elemental HERO Avian
D.D. Warrior Lady
Michizure
150
Torrential Tribute
Morphtronic Earfon
Ebon High Magician
Bashing Shield
Senju of the Thousand Hands
200
Gadget Gamer
Jack's Knight
Ally of Justice Thunder Armor
250
Jowgen the Spiritualist
Kunai with Chain
Blue-Eyes White Dragon
Reinforcement of the Army
300
Mask of Darkness
350
400
Elemental HERO Burstinatrix
Cyber Dragon
Insect Imitation
Book of Life
Creature Swap
Naturia Guardian
Spirit Message "L"
450
Ghoti of the Deep Beyond
Queen's Knight
500
Cost Down
Anchamoufrite
The Legendary Fisherman
550
Go! - D/D/D Divine Zero King Rage
Kuriboh
Big Koala
Ancient Gear Golem
600
Winged Kuriboh
Elemental HERO Stratos
650
700
Breaker the Magical Warrior
Mystical Elf - White Lightning
750
The Warrior Ret

In [15]:
# save the scraped table next to the notebook; with to_csv's defaults the
# DataFrame index is written too, as the first (unnamed) column
result.to_csv("yugioh-prices.csv")