In [4]:
import gzip
import json
import codecs

from multiprocessing.dummy import Pool, Queue
from bs4 import BeautifulSoup
from lxml import etree,html as lhtml
import re
import requests
import functools

In [5]:
url = 'https://gg.deals/games/?sort=metascore&type=1'

In [6]:
def get_urls_in_page(pages_text):
    soup = BeautifulSoup(pages_text, 'html.parser')
    l = []
    for link in soup.find_all('a', class_="ellipsis title"):
        l.append(link['href'])
    return l

In [7]:
def next_page(page_id, t_sleep=1):
    pages = requests.get('https://gg.deals/games/?sort=metascore&type=1&page={}'.format(page_id))
    if pages.status_code == 200:
            return my_url.json()
    else:
        time.sleep(t_sleep)
        print("ERROR: data doesn't load", url)
        return -1
    return pages

In [8]:
def reviews_label(): 
    dlcs = soup.find('a', class_="score-grade")
    dlcs = dlcs.find('span', class_="reviews-label").text
    s = re.findall(r'[^ ,\(\)0123456789]',dlcs)
    joined = ''.join([str(x) for x in s])
    return joined

In [9]:
def get_game_id(soup):
     return soup.find('div', class_=lambda s: s and s.startswith('game-collection-actions'))['data-game-id']

In [10]:
import time

def price_history(url, t_sleep=0.2):
    headers = {'x-requested-with': 'XMLHttpRequest'}
    my_url = requests.get(url, headers=headers)
    if my_url.status_code == 200:
            return my_url.json()
    time.sleep(t_sleep)
    print("ERROR: data doesn't load", url)
    return -1

In [11]:
import urllib3
urllib3.disable_warnings()

def game_info(url):
    url_html = requests.get('https://gg.deals' + url, verify=False).text
    soup = BeautifulSoup(url_html, 'lxml')
    res = {}
    res['url'] = 'https://gg.deals' + url
    res['name'] = soup.find('a', href=url).find('span').text
    
    image = soup.find('div', class_=lambda s: s and s.startswith('game-info-image')).find('img')
    image_url = image['src']
    res['image'] = image_url
    
    market_url = soup.find('a', rel="nofollow noopener external")
    res['market_url'] = market_url['href']
    
    # wishlist, alert, owners - counters
    counter = soup.find('div', class_=lambda s: s and s.startswith('game-collection-actions'))
    counter = counter.find_all('span', class_='count')
    cnt_list = re.findall(r'>\d+<',str(counter))
    for i in range(0,6):
        cnt_list[i] = cnt_list[i][1:-1]
    cnt_list = cnt_list[0:-1:2]# wish,alert,own
    wish_cnt = int(cnt_list[0])
    alert_cnt = int(cnt_list[1])
    own_cnt = int(cnt_list[2])
    res['wishlist_count'] = wish_cnt
    res['alert_count'] =  alert_cnt
    res['owners_count'] = own_cnt

    #release and developer
    release_developer = soup.find('div', class_='game-info-content')
    
    release = release_developer.find('p', class_='game-info-details-content')
    res['release_date'] = release.text
    
    developer = release_developer.find('p', class_='game-info-details-content')
    res['developer'] = developer.text
    
    # metacritic_score and user_score
    score = soup.find_all('div', class_=r'score-col')
    if score:
        for block in score:
            if len(block['class']) > 1:
                break
            scores = block.find('span', class_='overlay').text
            title = block.find('div', class_='score-label').text
            res[title] = scores

    # tags and features, and genres        
    tags = soup.find('div', id="game-info-tags")
    if tags:
        t_arr = tags.find_all('a')
        res['tags'] = [el.text for el in t_arr]

    genres = soup.find('div', id='game-info-genres')
    if genres:
        t_arr = genres.find_all('a')
        res['genres'] = [el.text for el in t_arr]
    
    feature = soup.find('div', id='game-info-features')
    if feature:
        t_arr = feature.find_all('a')
        res['feature'] = [el.text for el in t_arr]
    
    
    #dlcs
    dlcs= soup.find('section', id='game-dlcs')
    if dlcs:
        dlcs = dlcs.find_all('a', class_='full-link')
        local_list = []
        for i in dlcs:
            local_list.append('https://gg.deals/' + i['href'])
        res['dlcs'] = local_list
    else:
        res['dlcs'] = []
        
    #packs
    packs = soup.find('section', id='game-packs')
    if packs:
        packs = packs.find_all('a', class_='full-link')
        local_list = []
        for i in packs:
            local_list.append('https://gg.deals/' + i['href'])
        res['packs'] = local_list
    else:
        res['packs'] = []
    
    # review_label, review_positive_pctg, review_count
    review_label = soup.find('div', class_='score-heading')
    if review_label:
        review_label = review_label.find('span', class_=lambda s: s and s.startswith('reviews'))
        s = re.findall(r'[^ ,\(\)0123456789]', review_label.text)
        res['review_label'] = ''.join([str(x) for x in s])
    
        review_positive_pctg = review_label['title']
        s = re.findall(r'[0123456789]+%', review_positive_pctg)[0]
        s1 = re.findall(r'[^%]', s)
        res['review_positive_pctg'] = ''.join([str(x) for x in s1])
    
        review_count = review_label.text
        s = re.findall(r'[0123456789]', review_count)
        res['review_count'] = ''.join([str(x) for x in s])

    res['price_history'] = []
    id = get_game_id(soup)
    url = 'https://gg.deals/ru/games/chartHistoricalData/{}/?hideKeyshops=0'.format(id)
    tree = price_history(url)
    if tree == -1:
        return -1
    
    if ('chartData') in tree:
        tree = tree['chartData']
        if tree['deals']:
            prev = tree['deals'][0]['y']
        for i in tree['deals']:
            next = i['y']
            local_dict = {}
            if next != prev:
                prev = next
                local_dict['ts'] = i['x'] / 1000
                local_dict['price'] = next
                local_dict['shop'] = i['shop']
                res['price_history'].append(local_dict)
    else:
        res['price_history'] = []
    return res

In [12]:
def queue_pages(cnt):
    my_list= []
    N = cnt
    for page_id in range(N):
        page = requests.get('https://gg.deals/games/?sort=metascore&type=1&page={}'.format(page_id))
        if page.status_code != 200:
            time.sleep(t_sleep)
            print("ERROR: data doesn't load", url)
            return 
        my_list.append(page)
    return my_list

In [13]:
import gzip
import json
import codecs
from tqdm.notebook import tqdm
from multiprocessing import Pool, Lock, Value
from time import sleep
import queue

N = 300
que = queue_pages(13)
my_list= []
for i in que:
    local_list = []
    local_list = get_urls_in_page(i.text) # 1-24, 2-24, 3-24, ... 10-24
    my_list += local_list # 300

q = queue.Queue()
[q.put(my_list[i]) for i in range(len(my_list) - 12)]
print(q.get())

/game/grand-theft-auto-v/


In [14]:
from multiprocessing.dummy import Pool as ThreadPool

def process_page_wrapper(i):
    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)
        
        while not q.empty():
            record = game_info(q.get())
            if record == -1:
                print('ERROR: can not download', file=sys.stderr)
            record_str = json.dumps(record, ensure_ascii=False)
            print(record_str, file=f_json)

            with lock:
                pbar.update(1)

with ThreadPool(processes=300) as pool, tqdm(total=q.qsize()) as pbar:
    lock = pbar.get_lock()
    pool.map(process_page_wrapper, range(pool._processes))

  0%|          | 0/299 [00:00<?, ?it/s]

In [2]:
import json
!gzcat data/part_00075.jsonl.gz | python -m json.tool

{
    "url": "https://gg.deals/game/mark-of-the-ninja/",
    "name": "Mark of the Ninja",
    "image": "https://img.gg.deals/b8/7e/fb48f49b6eb1cec8df589be3d843ea420d2d_307xt176.jpg",
    "market_url": "https://gg.deals/redirect/b43ccc2a68159044bb62d656f279af741dfa4159/?utm_source=games%2Fsingle",
    "wishlist_count": 922,
    "alert_count": 74,
    "owners_count": 9347,
    "release_date": "16 Oct 2012",
    "developer": "16 Oct 2012",
    "Metascore": "91",
    "Userscore": "8.0",
    "tags": [
        "Stealth",
        "Platformer",
        "Ninja",
        "Indie",
        "2D",
        "Action",
        "Singleplayer",
        "Side Scroller",
        "Adventure",
        "Assassin",
        "Atmospheric",
        "Controller",
        "Replay Value",
        "Cartoon",
        "Puzzle",
        "Dark",
        "Short",
        "Story Rich",
        "Strategy",
        "Casual"
    ],
    "genres": [
        "Action",
        "Adventure",
    

In [3]:
import json
!gzcat data/part_00075.jsonl.gz | python -m json.tool

{
    "url": "https://gg.deals/game/mark-of-the-ninja/",
    "name": "Mark of the Ninja",
    "image": "https://img.gg.deals/b8/7e/fb48f49b6eb1cec8df589be3d843ea420d2d_307xt176.jpg",
    "market_url": "https://gg.deals/redirect/b43ccc2a68159044bb62d656f279af741dfa4159/?utm_source=games%2Fsingle",
    "wishlist_count": 922,
    "alert_count": 74,
    "owners_count": 9347,
    "release_date": "16 Oct 2012",
    "developer": "16 Oct 2012",
    "Metascore": "91",
    "Userscore": "8.0",
    "tags": [
        "Stealth",
        "Platformer",
        "Ninja",
        "Indie",
        "2D",
        "Action",
        "Singleplayer",
        "Side Scroller",
        "Adventure",
        "Assassin",
        "Atmospheric",
        "Controller",
        "Replay Value",
        "Cartoon",
        "Puzzle",
        "Dark",
        "Short",
        "Story Rich",
        "Strategy",
        "Casual"
    ],
    "genres": [
        "Action",
        "Adventure",
    