In [1]:
import requests 
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import numpy as np   
import pandas as pd       
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

In [2]:
# Download the Scryfall set index and pick out the table cells that the
# following cell reads the set links from.
page_link ='https://scryfall.com/sets'
response = requests.get(
    page_link,
    headers={'User-Agent': UserAgent().chrome},
    timeout=30,  # without a timeout, requests can wait on a stalled server forever
)
# Fail loudly on an HTTP error instead of parsing an error page into zero sets.
response.raise_for_status()
html = response.content
soup = BeautifulSoup(html, 'html.parser')
obj = soup.find_all('td', attrs = {'class':'flexbox'})

In [3]:
# Collect a link for every set listed on the index page
sets_page = []
for single_set in obj:
    set_page = single_set.find('a').get('href')
    # hrefs that already contain "http" are kept as-is; the rest get the site prefix
    set_page = set_page if 'http' in set_page else 'https://scryfall.com' + set_page
    sets_page.append(set_page)
sets_page[:3]

['https://scryfall.com/sets/gn3',
 'https://scryfall.com/sets/unf',
 'https://scryfall.com/sets/40k']

In [4]:
# Shared HTTP session whose adapter carries a retry policy
# (connect=5, backoff_factor=1.5) for both URL schemes
retry = Retry(connect=5, backoff_factor=1.5)
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [5]:
def getPageLinks(sets_page: list) -> list:
    '''
    Collect the card-page links found on each set page.

    Parameters
    ----------
    sets_page : list
        URLs of the set pages to visit.

    Returns
    -------
    list
        The ``href`` of every element whose class list is exactly
        ``['card-grid-item-card']``, in the order the set pages were visited.
        A set page answered with an error status contributes no links; links
        gathered from the other pages are kept.
    '''
    # Nothing to fetch: return before touching the network or the progress bar.
    if not sets_page:
        return []

    cards_page = []
    for set_page in tqdm(sets_page, desc = 'Sets'):
        response = session.get(set_page, headers={'User-Agent': UserAgent().chrome})
        if not response.ok:
            # The server refused this set page: skip it, keep what was collected.
            continue
        soup = BeautifulSoup(response.content, 'html.parser')
        # Exact class-list comparison, so elements with extra classes are excluded.
        cards = soup.find_all(lambda tag: tag.get('class') == ['card-grid-item-card'])
        cards_page.extend(link.attrs['href'] for link in cards)

    return cards_page

In [6]:
# Visit every set page and gather the card-page links.
# The recorded run below covered 761 set pages in about four minutes.
mtg_links = getPageLinks(sets_page)

Sets: 100%|██████████████████████████████████████████████████████████████████████████| 761/761 [03:58<00:00,  3.19it/s]


In [7]:
# mtg_links.append('https://scryfall.com/card/rna/122/biogenic-ooze')

In [8]:
# Fields to read from a card page, as [tag name, CSS class] pairs
stats = [
    ['span', 'card-text-card-name'],
    ['span', 'card-text-mana-cost'],
    ['p', 'card-text-type-line'],
    ['div', 'card-text-oracle'],
    ['div', 'card-text-stats'],
    ['p', 'card-text-artist'],
    ['div', 'card-legality-item'],
]

In [9]:
def getStats(soup: 'BeautifulSoup', stats1: str, stats2: str) -> 'str | list | None':
    """
    Extract one cleaned field from a parsed card page.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed page; only its ``find`` and ``find_all`` methods are used.
    stats1 : str
        Tag name to look for.
    stats2 : str
        CSS class the tag must carry.

    Returns
    -------
    str, list of str, or None
        * For ``'card-legality-item'``: one entry per matching element, each
          being the second line of that element's stripped text.
        * For any other class: the stripped text of the first matching element.
        * ``None`` when extraction fails (no matching element, or a legality
          element whose stripped text has fewer than two lines).
    """
    try:
        if stats2 == 'card-legality-item':
            items = soup.find_all(stats1, attrs={'class': stats2})
            return [item.get_text().strip().split('\n')[1] for item in items]
        return soup.find(stats1, attrs={'class': stats2}).get_text().strip()
    except Exception:
        # Best-effort scraping: a missing or malformed field becomes None.
        # Catching Exception rather than using a bare except lets
        # KeyboardInterrupt / SystemExit stop a long-running scrape.
        return None

In [10]:
def getProperties(link):
    """
    Download one card page and return its scraped fields as a flat list.

    Parameters
    ----------
    link : str
        URL of the card page.

    Returns
    -------
    list
        The first six entries are the results of ``getStats`` for the first
        six ``stats`` pairs. They are followed by the legality entries, one
        list item each. When the page yields no usable legality data, twelve
        empty lists are used as placeholders instead.
    """
    # Number of placeholders emitted for a card without legality information.
    n_legality_placeholders = 12

    response = session.get(link, headers={'User-Agent': UserAgent().chrome})
    soup = BeautifulSoup(response.content, 'html.parser')
    result = [getStats(soup, tag, css_class) for tag, css_class in stats]

    # Entry 6 holds the legality list; lift it out so it can be flattened.
    legality = result.pop(6)
    # getStats returns None when the legality lookup fails; treat that the
    # same as "no legality shown" rather than crashing on len(None).
    if legality is None or len(legality) <= 1:
        legality = [[] for _ in range(n_legality_placeholders)]
    result.extend(legality)
    return result

In [11]:
# Try the scraper on a single card page. In the recorded output below the
# mana cost is None and all twelve legality slots are empty placeholders.
rows = getProperties('https://scryfall.com/card/t40k/10/vanguard-suppressor')
rows

['Vanguard Suppressor',
 None,
 'Token Creature — Astartes Warrior',
 'Flying\nWhenever Vanguard Suppressor deals combat damage to a player, draw a card.',
 '3/2',
 'Illustrated by\n            Fajareka Setiawan',
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 []]

In [12]:
# Scrape every card page and assemble the results into a single DataFrame.
final_columns = ['Name', 'Cost', 'Type', 'Feature', 'Stats', 'Illustrator',
                 'Standard', 'Alchemy', 'Pioneer', 'Explorer', 'Modern',
                 'Brawl', 'Legacy', 'Historic', 'Vintage', 'Pauper', 'Commander',
                 'Penny']

# Rows are collected in a plain list and the frame is built once at the end;
# enlarging a DataFrame one .loc row at a time re-allocates it on every insert.
parsed_rows = []
for page in tqdm(mtg_links, desc='Pages'):
    parsed_row = getProperties(page)
    # Validate while scraping so a malformed page fails immediately instead of
    # after the whole (hours-long) crawl has finished.
    if len(parsed_row) != len(final_columns):
        raise ValueError(
            f'{page}: expected {len(final_columns)} fields, got {len(parsed_row)}'
        )
    parsed_rows.append(parsed_row)

final_df = pd.DataFrame(parsed_rows, columns=final_columns)
# Kept for compatibility with the original cell: number of rows written.
df_row = len(final_df)

Pages: 100%|███████████████████████████████████████████████████████████████████| 76641/76641 [6:51:01<00:00,  3.11it/s]


In [13]:
def power_strength_split(row: str) -> [str, str]:
    '''
    Split a "power/strength" stats string into its two halves.

    Returns ``[power, strength]``. Anything that cannot be split into exactly
    two parts on ``/`` (including non-string input) gives ``[None, None]``.
    '''
    try:
        first, second = row.split('/')
    except Exception:
        return [None, None]
    return [first, second]

In [14]:
 # Split every 'Stats' value into a pair, then transpose the pairs into two columns.
 final_df['Power'], final_df['Strength'] = zip(*final_df['Stats'].map(power_strength_split))

In [15]:
def get_cost(price: str) -> int:
    '''
    Convert a mana-cost string into a single number.

    Every run of digits counts as the number it spells (``{15}`` is 15, not
    1 + 5); every other word character counts as 1. Punctuation such as
    braces and slashes is ignored, so each letter of a hybrid symbol such as
    ``{W/U}`` is counted separately.

    Parameters
    ----------
    price : str
        Cost text, e.g. ``'{2}{W}{W}'``.

    Returns
    -------
    int
        The summed cost; 0 for empty or non-string input (e.g. ``None``/NaN).
    '''
    if not isinstance(price, str):
        return 0
    cost = 0
    # Tokenise the untouched string so that adjacent symbols like {1}{5}
    # stay separate numbers instead of merging into "15".
    for token in re.findall(r'\d+|\w', price):
        cost += int(token) if token.isdecimal() else 1
    return cost

In [16]:
# First line of a multi-line rules text; text without a line break yields ''
final_df['Main feature'] = (
    final_df['Feature']
    .fillna('')
    .apply(lambda text: text.partition('\n')[0] if '\n' in text else '')
)

In [17]:
# Numeric cost derived from the raw cost text
final_df['Cost_qty'] = final_df['Cost'].apply(get_cost)

In [18]:
# Keep only the last line of the credit text, trimmed of surrounding whitespace
final_df['Illustrator'] = (
    final_df['Illustrator']
    .fillna('')
    .apply(lambda credit: credit.rsplit('\n', 1)[-1].strip())
)

In [19]:
# 1 when the type line mentions 'Legendary', otherwise 0
final_df['Legendary'] = (
    final_df['Type']
    .fillna('')
    .apply(lambda type_line: int('Legendary' in type_line))
)

In [20]:
def get_main_type(card_type:str) -> str:
    '''
    Reduce a type line to its base class (creature, spell, land, etc.).

    Drops every 'Legendary ' prefix and returns the text that precedes the
    first ' — ' separator (the whole remaining text when there is none).
    '''
    without_legendary = card_type.replace('Legendary ', '')
    main_type, _, _ = without_legendary.partition(' — ')
    return main_type

In [21]:
# Base card class taken from the type line (missing type lines become '')
final_df['Main type'] = final_df['Type'].fillna('').apply(get_main_type)

In [22]:
# Look at the rules text stored for the row labelled 5
final_df.loc[5, 'Feature']

'Vigilance\nAssembled Ensemble’s power is equal to the number of Robots you control.\nWhenever you cast a spell with an artifact creature in its art, create a 1/1 white Clown Robot artifact creature token.'

In [24]:
# Save the parsed table to a CSV file in the current working directory.
final_df.to_csv('parsed_scryfall.csv') 