# Этап 1
Получение ссылок на первые 300 игр в рейтинге https://gg.deals/games/?sort=metascore&type=1&page=1. На одной странице по дефолту находится 24 игры.

In [1]:
from bs4 import BeautifulSoup
import requests
from tqdm.notebook import tqdm

In [2]:
# Collect absolute URLs of the top-rated games from the paginated listing.
# The listing is walked page by page until `max_games` links are gathered
# or `npages` pages have been read, whichever comes first.
game_urls = []
npages = 13
max_games = 300
for n in tqdm(range(1, npages+1)):
    page = requests.get(f'https://gg.deals/games/?sort=metascore&type=1&page={n}', timeout=30)
    # Fail loudly on an HTTP error instead of silently parsing an error page.
    page.raise_for_status()
    soup = BeautifulSoup(page.content, 'html.parser')
    links = soup.find_all('a', class_='full-link')
    # Take only as many links as are still needed so the cap is never exceeded.
    remaining = max_games - len(game_urls)
    game_urls.extend('https://gg.deals' + link.get('href') for link in links[:remaining])
    # Leave the page loop as well: a `break` inside a nested link loop would
    # only stop the current page and later pages would still be requested.
    if len(game_urls) >= max_games:
        break

  0%|          | 0/13 [00:00<?, ?it/s]

In [3]:
# Display the last five collected URLs as a quick sanity check.
game_urls[-5:]

['https://gg.deals/game/europa-universalis/',
 'https://gg.deals/game/oddworld-new-n-tasty/',
 'https://gg.deals/game/homeworld-remastered-collection/',
 'https://gg.deals/game/dirt-3-complete-edition/',
 'https://gg.deals/game/the-binding-of-isaac-rebirth/']

# Этап 2
Получение информации об игре.

Функция process_page(url, driver) возвращает словарь с данными. Некоторые атрибуты карточки обязательны, и если их не удастся получить, скачивание прервется. Другие атрибуты получаются внутри suppress(AttributeError), их может не быть. Чтобы достучаться до графика Highcharts, используется Selenium. Затраченное время на обработку страницы достигало 40 секунд (высокая нагрузка на сеть), в среднем 15 секунд.

In [9]:
%%time
from urllib.parse import urlparse
import urllib
import re
import json
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from contextlib import suppress

# Site root; prepended to the relative hrefs of DLC and pack links.
BASE = 'https://gg.deals'

def resolve(url):
    """Return the final URL reported by the response after opening *url*.

    The response is closed before returning so the underlying connection
    (or file handle) is not leaked.
    """
    # `import urllib` alone does not guarantee that the `request` submodule
    # is loaded, so import it explicitly where it is used.
    import urllib.request

    with urllib.request.urlopen(url) as response:
        return response.geturl()

def _user_count(soup, css_class):
    """Return the integer counter of the ``game-info-actions`` widget tagged *css_class*.

    The lookups are chained without checks, so a missing element surfaces
    as an AttributeError.
    """
    widget = soup.find('div', class_='game-info-actions').find('div', class_=css_class)
    return int(widget.find('span', class_='user-count').find('span', class_='count').text)


def _details_section(infocontent, suffix):
    """Return the ``game-info-details-section-<suffix>`` div inside *infocontent*."""
    return infocontent.find(
        'div', class_=f'game-info-details-section game-info-details-section-{suffix}')


def _review_score(infocontent, score_class):
    """Return the numeric score shown in the reviews section circle tagged *score_class*."""
    return float(_details_section(infocontent, 'reviews')
                 .find('a', class_=f'score-circle {score_class}')
                 .find('span', class_='overlay').text)


def _badge_texts(infocontent, container_id):
    """Return the text of every ``a.badge`` link inside the div with id *container_id*."""
    return [badge.text
            for badge in infocontent.find('div', id=container_id).find_all('a', class_='badge')]


def _section_links(soup, section_id):
    """Return absolute URLs of the ``a.full-link`` anchors inside section *section_id*."""
    return [BASE + link.get('href')
            for link in soup.find('section', id=section_id).find_all('a', class_='full-link')]


def process_page(url, driver, pause=3):
    """Scrape a single game page into a dictionary.

    Args:
        url: Address of the game page.
        driver: Selenium WebDriver used to load the page and to read the
            price-history chart data through JavaScript.
        pause: Seconds to sleep before loading, after loading, and after
            each click, giving the page time to render.

    Returns:
        A dict with the mandatory keys ``url``, ``status``, ``name``,
        ``image``, ``wishlist_count``, ``alert_count`` and ``owners_count``,
        plus whichever optional keys could be extracted. Optional keys whose
        value would be an empty list are omitted.

    Raises:
        AttributeError: a mandatory element is missing from the page (the
            chained lookups are deliberately unguarded).
        Other exceptions raised by Selenium, ``resolve`` or number parsing
        propagate unchanged.
    """
    time.sleep(pause)
    driver.get(url)
    time.sleep(pause)
    # open dlcs and packs tabs
    # NOTE(review): both clicks target the same link inside #game-packs; the
    # comment suggests one of them was meant for #game-dlcs -- confirm
    # against the page markup before changing.
    try:
        driver.find_element_by_xpath("//section[@id='game-packs']/div[2]/div[2]/a[1]").click()
        driver.find_element_by_xpath("//section[@id='game-packs']/div[2]/div[2]/a[1]").click()
        time.sleep(pause)
    except Exception:
        # Best effort: the link may be absent. `Exception` (not a bare
        # except) keeps KeyboardInterrupt/SystemExit working.
        pass
    soup = BeautifulSoup(driver.page_source, 'html.parser')

    # get required data
    url_parsed = urlparse(url)
    game_card = {
        'url': url,
        'status': True,
        'name': soup.find('a', href=url_parsed.path).find('span', itemprop='name').text,
        'image': soup.find('div', class_='game-info-image').find('img').get('srcset').split(' ')[0],
        'wishlist_count': _user_count(soup, 'wishlisted-game'),
        'alert_count': _user_count(soup, 'alerted-game'),
        'owners_count': _user_count(soup, 'owned-game'),
    }

    # everything wrapped in suppress may or may not be in game card
    infocontent = soup.find('div', class_='game-info-content')
    with suppress(AttributeError):
        game_card['market_url'] = resolve(
            soup.find('div', class_='game-info-image')
                .find('a', class_='game-link-widget').get('href'))

    with suppress(AttributeError):
        game_card['release_date'] = (_details_section(infocontent, 'release')
                                     .find('p', class_='game-info-details-content').text)

    with suppress(AttributeError):
        game_card['developer'] = (_details_section(infocontent, 'developer')
                                  .find('p', class_='game-info-details-content').text)

    with suppress(AttributeError):
        game_card['metacritic_score'] = _review_score(infocontent, 'score-metascore')

    with suppress(AttributeError):
        game_card['user_score'] = _review_score(infocontent, 'score-userscore')

    # The reviews label feeds three fields; look it up once. It stays None
    # when absent, which makes each dependent block skip itself.
    reviews_label = None
    with suppress(AttributeError):
        reviews_label = infocontent.find('span', class_='reviews-label')

    with suppress(AttributeError):
        game_card['review_label'] = reviews_label.text.split('(')[0]

    with suppress(AttributeError):
        game_card['review_positive_pctg'] = float(reviews_label.get('title').split('%')[0])

    with suppress(AttributeError):
        # The review count is the parenthesised part of the label text.
        game_card['review_count'] = int(
            re.search(r'\((.*?)\)', reviews_label.text).group(1).replace(',', ''))

    with suppress(AttributeError):
        game_card['genres'] = _badge_texts(infocontent, 'game-info-genres')

    with suppress(AttributeError):
        game_card['tags'] = _badge_texts(infocontent, 'game-info-tags')

    with suppress(AttributeError):
        game_card['features'] = _badge_texts(infocontent, 'game-info-features')

    with suppress(AttributeError):
        game_card['dlcs'] = _section_links(soup, 'game-dlcs')

    with suppress(AttributeError):
        game_card['packs'] = _section_links(soup, 'game-packs')

    with suppress(AttributeError):
        game_card['pc_systems'] = [
            system.text
            for system in soup.find('div', class_='game-requirements-tabs')
                              .find_all('a', id=lambda s: s and s.startswith('requirement-tab-trigger-'))]

    with suppress(AttributeError):
        try:
            element = driver.find_element_by_id('historical-chart-container')
        except Exception:
            # No chart on this page: signal "optional field absent" to the
            # enclosing suppress block.
            raise AttributeError('historical chart container not found') from None
        # Click the chart container before reading its data via JavaScript.
        actions = webdriver.ActionChains(driver)
        actions.move_to_element(element)
        actions.click()
        actions.perform()
        time.sleep(pause)
        driver.switch_to_active_element()
        chart_data = driver.execute_script('return Highcharts.charts[0].series[0].options.data')
        # `x` is divided by 1000 -- presumably milliseconds to seconds; confirm.
        game_card['price_history'] = [
            {'ts': point['x']//1000, 'price': point['y'], 'shop': point['shop']}
            for point in chart_data]

    # Drop optional list fields that ended up empty.
    return {key: value for key, value in game_card.items() if value != []}
    
# Smoke-test the scraper on the first collected game and pretty-print the
# resulting card. The driver path is a raw string so the backslashes are
# taken literally instead of being parsed as (invalid) escape sequences.
with webdriver.Chrome(r'C:\Bin\chromedriver.exe') as driver:
    print(json.dumps(process_page(game_urls[0], driver), indent=4))



{
    "url": "https://gg.deals/game/grand-theft-auto-v/",
    "status": true,
    "name": "Grand Theft Auto V",
    "image": "https://img.gg.deals/79/83/9db226294b1e8632c5f235d18e49fa899bf2_307xt176.jpg",
    "wishlist_count": 3447,
    "alert_count": 664,
    "owners_count": 10110,
    "market_url": "https://store.steampowered.com/app/271590/",
    "release_date": "14 Apr 2015",
    "developer": "Rockstar North",
    "metacritic_score": 96.0,
    "user_score": 7.8,
    "genres": [
        "Action",
        "Adventure"
    ],
    "tags": [
        "Action",
        "Adventure",
        "Atmospheric",
        "Automobile Sim",
        "Co-op",
        "Comedy",
        "Crime",
        "First-Person",
        "Funny",
        "Great Soundtrack",
        "Mature",
        "Moddable",
        "Multiplayer",
        "Open World",
        "Racing",
        "Sandbox",
        "Shooter",
        "Singleplayer",
        "Third Person",
        "Third-Person Shooter"
    ],
    "features": [
  

Wall time: 16.3 s


Если страница не скачалась, нужно попробовать скачать еще несколько раз.

In [6]:
import sys
from multiprocessing import Lock
# Guards writes to stderr so error messages from concurrent workers do not
# interleave.
lock = Lock()


def get_page(url, driver, n_attempts=5, t_sleep=1):
    """Scrape *url* with ``process_page``, retrying on any failure.

    Args:
        url: Address of the game page.
        driver: Selenium WebDriver passed through to ``process_page``.
        n_attempts: Maximum number of attempts.
        t_sleep: Seconds to wait between consecutive attempts.

    Returns:
        The record returned by ``process_page``, or ``None`` when every
        attempt raised.
    """
    for attempt in range(1, n_attempts + 1):
        try:
            return process_page(url, driver)
        except Exception as e:
            # `with` releases the lock even if printing itself fails.
            with lock:
                print(url, e, file=sys.stderr)
            # No point in sleeping after the final attempt.
            if attempt < n_attempts:
                time.sleep(t_sleep)
    return None

Теперь обкачку нужно распараллелить с помощью модуля multiprocessing (здесь используется multiprocessing.dummy, то есть пул потоков).

In [7]:
import gzip
import json
import codecs

from multiprocessing.dummy import Pool, Queue

# Shared work queue: every game URL is enqueued once and the workers pull
# from it until it is drained.
queue = Queue()
for url in game_urls:
    queue.put(url)

def process_page_wrapper(i):
    """Worker body: drain the shared URL queue into ``data/pt{i}.jsonl.gz``.

    Each worker owns one headless Chrome instance and one gzip-compressed
    JSON Lines file. One line is written per URL taken from the queue and
    the shared progress bar is advanced after each one.

    Args:
        i: Worker index, used only to name the output file.
    """
    import os
    # The stdlib module, not the module-level `queue` object used below.
    from queue import Empty

    chrome_options = Options()
    chrome_options.add_argument("--headless")
    # Make sure the output directory exists; safe to call from every worker.
    os.makedirs('data', exist_ok=True)
    # newline='\n' keeps the output identical across platforms.
    with gzip.open(f'data/pt{i}.jsonl.gz', 'wt', encoding='utf8', newline='\n') as f_json, \
            webdriver.Chrome(r'C:\Bin\chromedriver.exe', options=chrome_options) as driver:
        while True:
            # A separate `empty()` check followed by a blocking `get()` can
            # hang forever when another worker takes the last item in
            # between; a non-blocking get has no such window.
            try:
                url = queue.get_nowait()
            except Empty:
                break
            # get_page returns None once its retries are exhausted; that is
            # written as a JSON `null` line, so every URL yields one line.
            record = get_page(url, driver=driver)
            print(json.dumps(record, ensure_ascii=False), file=f_json)

            with lock:
                pbar.update(1)
            
# Number of workers, which is also the number of output part files. Keep it
# in sync with the part count used when the files are read back.
n_workers = 8
with Pool(processes=n_workers) as pool, tqdm(total=queue.qsize()) as pbar:
    # Reuse the progress bar's own lock for the module-level `lock` so bar
    # updates and stderr messages are serialized together.
    lock = pbar.get_lock()
    # One task per worker; each task drains the shared queue until it is
    # empty. The explicit count avoids the private `pool._processes`.
    pool.map(process_page_wrapper, range(n_workers))

  0%|          | 0/300 [00:00<?, ?it/s]



Убедимся, что все 300 карточек скачаны.

In [8]:
# Count the successfully scraped cards across all part files. A page that
# failed every retry is stored as a JSON `null` line, so counting raw lines
# would always give the number of URLs; only non-null records are counted.
count = 0
for i in range(8):
    with gzip.open(f'data/pt{i}.jsonl.gz', 'rb') as f_json:
        count += sum(1 for line in f_json if json.loads(line) is not None)

count

300

Готово!