In [1]:
import requests 
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import numpy as np   
import pandas as pd  
import time          
from fake_useragent import UserAgent
from bs4 import BeautifulSoup
import re
from tqdm import tqdm

In [2]:
# Scryfall sets index page; the set links are scraped from it in the cells below.
# Previous scraping target (mtgsale.ru search results), kept for reference:
# page_link = 'https://mtgsale.ru/home/search-results?Name=&Lang=Any&Type=Any&Color=Any&Rarity=Any&Foil=Any&minP=3&maxP=34400&Page=5&Sort=name&CardSet='
page_link ='https://scryfall.com/sets'

In [3]:
# Preview the Chrome User-Agent string that fake_useragent hands out for request headers.
UserAgent().chrome

'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/27.0.1453.93 Safari/537.36'

In [4]:
# Fetch the sets index with a browser-like User-Agent header.
# requests applies no timeout by default, so without one this cell could block
# forever on an unresponsive server; 30 s is a generous upper bound.
response = requests.get(page_link, headers={'User-Agent': UserAgent().chrome}, timeout=30)
response

<Response [200]>

In [5]:
# Raw body of the index response; it is handed to BeautifulSoup in the next cell.
html = response.content

In [10]:
# Parse the index page with Python's built-in 'html.parser' backend.
soup = BeautifulSoup(html, 'html.parser')

In [15]:
# Every <td class="flexbox"> cell of the index page; the following cell reads
# the first <a href> out of each of them to get the set page links.
obj = soup.find_all('td', attrs = {'class':'flexbox'})

In [18]:
# Turn each table cell into an absolute set-page URL.
sets_page = []
for single_set in obj:
    set_page = single_set.find('a').get('href')
    # hrefs that already contain 'http' are kept; the rest get the site root prepended
    set_page = set_page if 'http' in set_page else 'https://scryfall.com' + set_page
    sets_page.append(set_page)
sets_page[:2]

['https://scryfall.com/sets/gn3', 'https://scryfall.com/sets/unf']

In [19]:
# Shared HTTP session: the same retrying adapter (connect=5, backoff_factor=1.5)
# is mounted for both URL schemes.
retry = Retry(connect=5, backoff_factor=1.5)
adapter = HTTPAdapter(max_retries=retry)
session = requests.Session()
for scheme in ('http://', 'https://'):
    session.mount(scheme, adapter)

In [20]:
def getPageLinks(sets_page, *, max_sets=2, timeout=30):
    """
    Return the card page links found on one or more Scryfall set pages.

    sets_page: str / iterable of str / None
        URL of a single set page, or several set page URLs. When it is
        empty or None, the set pages are discovered from the index at the
        module-level `page_link` and only the first `max_sets` are used.
    max_sets: int or None (keyword-only)
        Cap on the number of sets taken from the index; None means no cap.
        Ignored when `sets_page` names the pages explicitly.
    timeout: float (keyword-only)
        Seconds to wait for each request before giving up on that page.

    Returns a list of href strings. A page that cannot be fetched
    (network error, timeout or non-2xx/3xx answer) is skipped; if the index
    itself cannot be fetched the result is an empty list.
    """

    def fetch_soup(url):
        # Single place for "request + status check", so that every response is
        # validated before it is parsed, not only the last one. Going through
        # the shared `session` also gives the index request the retry policy.
        try:
            response = session.get(url, headers={'User-Agent': UserAgent().chrome},
                                   timeout=timeout)
        except requests.RequestException:
            return None
        if not response.ok:
            # the server refused us: treat the page as having no links
            return None
        return BeautifulSoup(response.content, 'html.parser')

    # Normalise the argument to a list of set page URLs.
    if isinstance(sets_page, str):
        set_urls = [sets_page] if sets_page else []
    else:
        set_urls = list(sets_page or [])

    if not set_urls:
        # Nothing requested explicitly: read the set links from the index page.
        index_soup = fetch_soup(page_link)
        if index_soup is None:
            return []
        for cell in index_soup.find_all('td', attrs={'class': 'flexbox'}):
            anchor = cell.find('a')
            href = anchor.get('href') if anchor is not None else None
            if not href:
                continue
            if 'http' not in href:
                href = 'https://scryfall.com' + href
            set_urls.append(href)
        set_urls = set_urls[:max_sets]

    cards_page = []
    for set_url in set_urls:
        set_soup = fetch_soup(set_url)
        if set_soup is None:
            continue
        # exact match on the class list, so tags carrying extra classes are excluded
        cards = set_soup.find_all(lambda tag: tag.get('class') == ['card-grid-item-card'])
        # tags without an href are skipped instead of raising KeyError
        cards_page.extend(tag.get('href') for tag in cards if tag.get('href'))

    return cards_page

In [21]:
# Collect card page links and preview the first two of them.
mtg_links = getPageLinks('https://scryfall.com/sets/gn3')
mtg_links[:2]

['https://scryfall.com/card/gn3/1/zamriel-seraph-of-steel',
 'https://scryfall.com/card/gn3/2/maeve-insidious-singer']

In [73]:
# [tag name, CSS class] pairs; getProperties() looks each pair up on a card
# page, in this order, so the order also defines the layout of its result.
stats = [
    ['span', 'card-text-card-name'],
    ['span', 'card-text-mana-cost'],
    ['p', 'card-text-type-line'],
    ['div', 'card-text-oracle'],
    ['abbr', 'card-symbol card-symbol-2'],
    ['abbr', 'card-symbol card-symbol-U'],
    ['div', 'card-text-stats'],
    ['p', 'card-text-artist'],
    ['div', 'card-legality-item'],
]

In [126]:
def getStats(soup, stats1, stats2):
    """
    Return the cleaned text of one field of a parsed card page.

    soup: parsed page exposing find() / find_all() (e.g. BeautifulSoup)
    stats1: str
        tag name to look for
    stats2: str
        value of the tag's class attribute

    For every class except 'card-legality-item' the stripped text of the
    first matching tag is returned. For 'card-legality-item' all matching
    tags are collected and each one's stripped text is split on newlines,
    giving a list of lists.

    Returns None when the element is missing. Only that expected failure is
    absorbed; anything else (KeyboardInterrupt included) propagates instead
    of being silently turned into None.
    """
    try:
        if stats2 != 'card-legality-item':
            return soup.find(stats1, attrs={'class': stats2}).get_text().strip()
        return [item.get_text().strip().split('\n')
                for item in soup.find_all(stats1, attrs={'class': stats2})]
    except AttributeError:
        # find() yields None when nothing matches, so .get_text() has no target
        return None

In [129]:
def getProperties(link, *, timeout=30):
    """
    Return the scraped fields of one card page.

    link: str
        URL of the card page; it is downloaded through the shared `session`.
    timeout: float (keyword-only)
        Seconds to wait for the server. Without a timeout requests can block
        indefinitely; with one it raises a requests timeout error instead.

    Returns a list holding one getStats() result per entry of the
    module-level `stats` list, in the same order.
    """
    response = session.get(link, headers={'User-Agent': UserAgent().chrome}, timeout=timeout)
    soup = BeautifulSoup(response.content, 'html.parser')
    return [getStats(soup, stat[0], stat[1]) for stat in stats]

In [130]:
# Smoke-test the scraper on a single card page and show what comes back.
rows = getProperties('https://scryfall.com/card/gn3/1/zamriel-seraph-of-steel')
rows

['Zamriel, Seraph of Steel',
 '{2}{W}{W}',
 'Legendary Creature — Angel',
 'Flying\nAs long as it’s your turn, equipped creatures you control have indestructible.',
 '{2}',
 None,
 '3/4',
 'Illustrated by\n            Chris Rallis',
 [['Standard', 'Not Legal'],
  ['Alchemy', 'Not Legal'],
  ['Pioneer', 'Not Legal'],
  ['Explorer', 'Not Legal'],
  ['Modern', 'Not Legal'],
  ['Brawl', 'Not Legal'],
  ['Legacy', 'Legal'],
  ['Historic', 'Not Legal'],
  ['Vintage', 'Legal'],
  ['Pauper', 'Not Legal'],
  ['Commander', 'Legal'],
  ['Penny', 'Not Legal']]]

In [64]:
# NOTE(review): getProperties() returns a plain list, and a list has no
# get_text(); if `rows` still comes from the getProperties() call above, this
# line raises AttributeError. Looks like a leftover experiment — confirm and remove.
rows.get_text().strip()

In [20]:
# final_df = pd.DataFrame(columns=['Language', 'Set', 'Name1', 'Name2', 'Foil',
#                                  'Quality', 'Rarity', 'Price', 'Qty', 'Special1', 'Special2'])
# df_row = 0
# final_df.loc[df_row] = rows[0]

In [21]:
# Empty result table; rows are written into it one at a time at index `df_row`.
final_df = pd.DataFrame(columns=['Language', 'Set', 'Name1', 'Name2', 'Foil',
                                 'Quality', 'Rarity', 'Price', 'Qty', 'Special1', 'Special2'])
df_row = 0
# Rebinds the global `session` with a lighter retry policy (connect=3,
# backoff_factor=0.5); functions that read `session` use this one from now on.
session = requests.Session()
retry = Retry(connect=3, backoff_factor=0.5)
adapter = HTTPAdapter(max_retries=retry)
session.mount('http://', adapter)
session.mount('https://', adapter)


# NOTE(review): getProperties() as defined in this notebook expects a URL and
# downloads the page itself, yet it is given an already parsed soup here, and
# the 11 columns above do not correspond to the entries of `stats`. Presumably
# this cell was written for an earlier version of the helpers — confirm before
# re-running on a fresh kernel.
for page in tqdm(mtg_links):
    response = session.get(page, headers={'User-Agent': UserAgent().chrome})
    html = response.content
    soup = BeautifulSoup(html,'html.parser')
    rows = getProperties(soup)
    for parsed_row in rows:
        # rows with fewer than 11 values get an empty string inserted at
        # position 9 (the 'Special1' column) so they line up with the table
        if len(parsed_row) < 11:
            parsed_row = np.insert(parsed_row, 9, '')
        final_df.loc[df_row] = parsed_row
        df_row += 1
final_df

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:01<00:00,  7.41it/s]


Unnamed: 0,Language,Set,Name1,Name2,Foil,Quality,Rarity,Price,Qty,Special1,Special2
0,Английский,Unhinged,"""Ach! Hans, Run!""",,,NM,Редкая,370 ₽,1 шт.,,
1,Английский,Unstable,"""Rumors of My Death . . .""",,,NM,Необычная,7 ₽,13 шт.,,
2,Английский,Adventures in the Forgotten Realms,+2 Mace,,,NM,Обычная,8 ₽,0 шт.,,Equipped creature gets +2/+2.
3,Английский,Adventures in the Forgotten Realms,+2 Mace,,Фойл,NM,Обычная,18 ₽,2 шт.,,Equipped creature gets +2/+2.
4,Русский,Adventures in the Forgotten Realms,+2 Mace [RUS],+2 Mace,,NM,Обычная,8 ₽,9 шт.,,Equipped creature gets +2/+2.
5,Русский,Adventures in the Forgotten Realms,+2 Mace [RUS],+2 Mace,Фойл,NM,Обычная,22 ₽,0 шт.,,Equipped creature gets +2/+2.
6,Русский,Eldritch Moon,Променять Мудрость,Abandon Reason,,NM,Необычная,14 ₽,3 шт.,Не более двух целевых существ получают по +1/+...,Up to two target creatures each get +1/+0 and ...
7,Русский,Eldritch Moon,Променять Мудрость,Abandon Reason,,SP,Необычная,13 ₽,0 шт.,Не более двух целевых существ получают по +1/+...,Up to two target creatures each get +1/+0 and ...
8,Русский,Eldritch Moon,Променять Мудрость,Abandon Reason,,MP,Необычная,11 ₽,0 шт.,Не более двух целевых существ получают по +1/+...,Up to two target creatures each get +1/+0 and ...
9,Русский,Eldritch Moon,Променять Мудрость,Abandon Reason,Фойл,NM,Необычная,45 ₽,0 шт.,Не более двух целевых существ получают по +1/+...,Up to two target creatures each get +1/+0 and ...


In [23]:
# Start over with an empty table and a zeroed row counter before the full crawl.
df_row = 0
final_df = pd.DataFrame(
    columns=[
        'Language', 'Set', 'Name1', 'Name2', 'Foil', 'Quality',
        'Rarity', 'Price', 'Qty', 'Special1', 'Special2',
    ]
)

In [30]:
# Crawl result pages 3000..3733 and append every parsed row to `final_df`.
#
# NOTE(review): getPageLinks() as defined in this notebook accepts a single
# positional argument and never fills in the '{}' placeholder, and
# getProperties() downloads a URL itself instead of taking a parsed soup.
# Presumably this cell targets an earlier mtgsale.ru version of those
# helpers — confirm before re-running on a fresh kernel.
for page_number in tqdm(range(3000, 3734), desc='Pages'):
    # collect the hrefs found on the current page
    mtg_links = getPageLinks('https://mtgsale.ru/home/search-results?Page={}', page_number)

    # a distinct loop variable: reusing `mtg_links` here would rebind the list
    # that is being iterated and leave a string behind under that name
    for mtg_link in mtg_links:
        # a page sometimes fails to parse on the first try, so allow 3 attempts
        for attempt in range(3):
            try:
                response = session.get(mtg_link, headers={'User-Agent': UserAgent().chrome})
                html = response.content
                soup = BeautifulSoup(html,'html.parser')
                rows = getProperties(soup)
                for parsed_row in rows:
                    # pad short rows at position 9 so they match the 11 columns
                    if len(parsed_row) < 11:
                        parsed_row = np.insert(parsed_row, 9, '')
                    final_df.loc[df_row] = parsed_row
                    df_row += 1
                # everything worked: leave the retry loop
                break
            except Exception as exc:
                # Exception (not a bare except) keeps Ctrl-C able to stop the
                # crawl; the error itself is shown so that failures are diagnosable
                print('AHTUNG! parsing once again:', mtg_link, repr(exc))
        else:
            # all attempts failed: say so instead of dropping the link silently
            print('Giving up after 3 attempts:', mtg_link)
final_df

Pages: 100%|███████████████████████████████████████████████████████████████████████| 734/734 [1:55:30<00:00,  9.44s/it]


Unnamed: 0,Language,Set,Name1,Name2,Foil,Quality,Rarity,Price,Qty,Special1,Special2
0,Английский,Unhinged,"""Ach! Hans, Run!""",,,NM,Редкая,370 ₽,1 шт.,,
1,Английский,Unstable,"""Rumors of My Death . . .""",,,NM,Необычная,7 ₽,13 шт.,,
2,Английский,Adventures in the Forgotten Realms,+2 Mace,,,NM,Обычная,8 ₽,0 шт.,,Equipped creature gets +2/+2.
3,Английский,Adventures in the Forgotten Realms,+2 Mace,,Фойл,NM,Обычная,18 ₽,2 шт.,,Equipped creature gets +2/+2.
4,Русский,Adventures in the Forgotten Realms,+2 Mace [RUS],+2 Mace,,NM,Обычная,8 ₽,9 шт.,,Equipped creature gets +2/+2.
...,...,...,...,...,...,...,...,...,...,...,...
219610,Английский,Kaladesh,Æther Tradewinds,Эфирный Пассат,Фойл,NM,Обычная,19 ₽,0 шт.,,
219611,Английский,Darksteel,Æther Vial,,,NM,Необычная,1620 ₽,0 шт.,,"At the beginning of your upkeep, you may put a..."
219612,Английский,Darksteel,Æther Vial,,,SP,Необычная,1470 ₽,0 шт.,,"At the beginning of your upkeep, you may put a..."
219613,Английский,Darksteel,Æther Vial,,,MP,Необычная,1310 ₽,0 шт.,,"At the beginning of your upkeep, you may put a..."


In [31]:
# Persist the scraped table next to the notebook. to_csv is called with its
# defaults, so the DataFrame index is written out as the first, unnamed column.
final_df.to_csv('parsed_data3.csv') 