In [None]:
import requests
import time
from bs4 import BeautifulSoup

In [None]:
import sys

def get_page(url, n_attempts=5, t_sleep=1, lock=None, timeout=30):
    """Download ``url`` with retries; return the response or ``None`` on failure.

    Parameters
    ----------
    url : str
        Address to fetch with ``requests.get``.
    n_attempts : int
        Maximum number of requests to make.
    t_sleep : float
        Seconds to sleep after every attempt, including a successful one.
    lock : context manager, optional
        When given, it is held while the failure message is written to
        stderr so that messages from concurrent workers do not interleave.
    timeout : float or tuple, optional
        Forwarded to ``requests.get``. Without a timeout a stalled
        connection would block the calling worker indefinitely.

    Returns
    -------
    The response object if some attempt answered with status 200,
    otherwise ``None`` (a diagnostic is printed to stderr in that case).
    """

    def report(message):
        # Single place that formats and emits a failure message.
        text = f'{message}\nURL: {url}\n'
        if lock is not None:
            with lock:
                print(text, file=sys.stderr)
        else:
            print(text, file=sys.stderr)

    response = None
    error = None
    for _ in range(n_attempts):
        try:
            response = requests.get(url, timeout=timeout)
            if response.status_code == 200:
                break
        except Exception as e:
            # Deliberately broad: any failure of an attempt is remembered and
            # retried; only the most recent error is reported.
            error = e
        finally:
            # Runs after every attempt (also on ``break``), throttling requests.
            time.sleep(t_sleep)

    if response is None:
        # No attempt produced a response at all.
        report(f'Error: {error}')
        return None
    if response.status_code != 200:
        # The last response received was not a success.
        report(f'CODE: {response.status_code}')
        return None
    return response

In [None]:
def get_cards_urls(page_url):
    """Return the absolute URLs of the product cards linked from one listing page.

    Parameters
    ----------
    page_url : str
        URL of a catalogue listing page.

    Returns
    -------
    list of str
        ``base_url`` prepended to the ``href`` of every card link found.
        An empty list is returned when the page could not be downloaded or
        when it contains no cards container, so that a single odd page does
        not raise inside the worker pool and abort the whole crawl.
    """
    page = get_page(page_url)
    if page is None:
        return []

    soup = BeautifulSoup(page.content, 'html.parser')
    # NOTE(review): the trailing space in 'row ' looks intentional (it is
    # presumably how the site spells the class attribute) - confirm.
    cards_row = soup.find('div', class_='row ')
    if cards_row is None:
        # Page was served but has no cards container (e.g. past the last page).
        return []

    cards = cards_row.find_all('a',
                               class_='bxr-font-color bxr-font-color-hover')
    return [base_url + a['href'] for a in cards]

In [None]:
import re

# Number of characters from the start of the '"PRICE_FORMAT"' key to the first
# character of its value (the key, the colon, a space and the opening quote).
FORMAT_PRICE_OFFSET = len('"PRICE_FORMAT": "')
# Sanity check on the value itself; calling len() on the integer would raise
# TypeError and stop the rest of the cell from being defined.
assert FORMAT_PRICE_OFFSET == 17

def process_text(text):
    """Collapse every run of whitespace in ``text`` to one space and trim the ends.

    Parameters
    ----------
    text : str
        Raw text, possibly containing newlines, tabs or repeated spaces.

    Returns
    -------
    str
        The normalised text.
    """
    # Raw string: '\s' in a plain literal is an invalid escape sequence and
    # triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text).strip()

def get_format_price(page):
    """Extract the number that follows the ``"PRICE_FORMAT"`` key in the page text.

    Parameters
    ----------
    page : object with a ``text`` attribute
        The downloaded page.

    Returns
    -------
    float or None
        The value between the opening quote after ``"PRICE_FORMAT": `` and
        the next space, converted to ``float``. ``None`` when the key is
        absent, when no space follows the value, or when the extracted text
        is not a number.
    """
    # Read the attribute once: it is used several times below.
    text = page.text
    key_pos = text.find('"PRICE_FORMAT"')
    if key_pos < 0:
        return None

    start = key_pos + FORMAT_PRICE_OFFSET
    end = text.find(' ', start)
    if end < 0:
        # Value is not terminated by a space: nothing reliable to parse.
        return None

    try:
        return float(text[start:end])
    except ValueError:
        # Unexpected (non-numeric) value; treat it like a missing price.
        return None

def get_rating_value(soup):
    """Look up the ``ratingValue`` meta tag and return its ``content`` attribute.

    The attribute is returned unchanged; the integer ``0`` is returned when
    the lookup yields nothing.
    """
    rating_tag = soup.find('meta', itemprop='ratingValue')
    if not rating_tag:
        return 0
    return rating_tag['content']

def get_rating_count(soup):
    """Look up the ``ratingCount`` meta tag and return its ``content`` attribute.

    The attribute is returned unchanged; the integer ``0`` is returned when
    the lookup yields nothing.
    """
    count_tag = soup.find('meta', itemprop='ratingCount')
    if not count_tag:
        return 0
    return count_tag['content']

def get_pictures(soup):
    """Return the picture URLs found on the page, or ``None`` if there are none.

    Every ``img`` tag with class ``bxr-zoom-img lazy`` contributes its
    ``data-src`` attribute prefixed with ``https:``.
    """
    images = soup.find_all('img', class_='bxr-zoom-img lazy')
    links = ['https:' + image['data-src'] for image in images]
    if not links:
        return None
    return links

# def get_marks(soup):
#     marks = []
#     for elem in soup.find('div', class_='bxr-ribbon-marker-vertical').contents:
#         if hasattr(elem, 'text'):
#             marks.append(elem.text)
#     return marks or None

def get_marks(soup):
    """Return the texts of the ribbon markers on the page, or ``None``.

    The markers are the children of the ``div`` with class
    ``bxr-ribbon-marker-vertical`` that expose a ``text`` attribute.

    Returns
    -------
    list of str or None
        ``None`` when the page has no such ``div`` or when it holds no
        markers, matching ``get_pictures`` so that the caller's
        "drop ``None`` values" filter omits the field.
    """
    ribbon = soup.find('div', class_='bxr-ribbon-marker-vertical')
    if ribbon is None:
        # Cards without a ribbon are normal; do not fail the whole record.
        return None
    marks = [elem.text for elem in ribbon.contents if hasattr(elem, 'text')]
    return marks or None

In [None]:
def process_page(url, lock=None):
    """Scrape one product card and return its fields as a flat dict.

    Parameters
    ----------
    url : str
        Address of the product card.
    lock : context manager, optional
        Forwarded to ``get_page`` for its stderr reporting.

    Returns
    -------
    dict or None
        ``None`` when the page could not be downloaded. Otherwise a dict
        holding the fixed fields below plus one entry per
        ``additionalProperty`` table row; entries whose value is ``None``
        are left out. A missing mandatory element raises (the lookups are
        not guarded here).
    """
    response = get_page(url, lock=lock)
    if response is None:
        return None

    soup = BeautifulSoup(response.content, 'html.parser')

    # Fixed fields, filled in the order they should appear in the record.
    record = {}
    record['url'] = response.url
    record['Название'] = soup.find('h1', itemprop='name').text
    record['Иллюстрации'] = get_pictures(soup)
    record['Метки'] = get_marks(soup)
    record['Оценка'] = get_rating_value(soup)
    record['Число голосов'] = get_rating_count(soup)
    record['Наличие'] = soup.find('div', itemprop='availability').text
    record['Цена'] = float(soup.find('meta', itemprop='price')['content'])
    record['Цена (скидка)'] = get_format_price(response)
    record['Описание'] = process_text(soup.find('div', class_='bxr-detail').text)

    # Free-form properties table: children 1 and 3 of each row are used as
    # the property name and its value (presumably the even positions are
    # whitespace nodes - confirm against the page markup).
    for row in soup.find_all('tr', itemprop='additionalProperty'):
        children = row.contents
        prop_name = children[1].text
        prop_value = children[3].text
        record[prop_name] = process_text(prop_value)

    # Optional fields that were not found are omitted from the result.
    return {key: item for key, item in record.items() if item is not None}

In [None]:
def process_page_wrapper(i):
    """Worker body: drain the shared URL queue into ``data/part_<i>.jsonl.gz``.

    Each URL taken from the global ``queue`` is scraped with
    ``process_page``; every successful record is written as one JSON line.
    Failures are reported to stderr and skipped. The shared progress bar
    ``pbar`` is advanced once per URL under the global ``lock``.

    Parameters
    ----------
    i : int
        Index of this worker; used only to name its output file.
    """
    # Function-scope imports: the module-level name ``queue`` is the shared
    # Queue instance, so the stdlib module is not imported under that name.
    import os
    from queue import Empty

    # gzip.open does not create missing directories.
    os.makedirs('data', exist_ok=True)

    with gzip.open('data/part_{:05d}.jsonl.gz'.format(i), mode='wb') as f_json:
        f_json = codecs.getwriter('utf8')(f_json)

        while True:
            # A separate "is it empty?" check followed by a blocking get()
            # is racy: another worker may take the last item in between and
            # this worker would then wait forever.
            try:
                url = queue.get_nowait()
            except Empty:
                break

            try:
                record = process_page(url, lock=lock)
            except Exception as e:
                # Best effort: one broken page must not stop the worker.
                record = None
                with lock:
                    print(f'Exception has occured:\n{str(e)}', file=sys.stderr)
            if record is not None:
                record_str = json.dumps(record, ensure_ascii=False)
                print(record_str, file=f_json)

            # The counter must be updated atomically.
            with lock:
                pbar.update(1)

In [None]:
import pandas as pd

from itertools import chain         # рекомендуется использовать
from contextlib import ExitStack    # рекомендуется использовать

from typing import Generator, Dict, Any


def records_reader(dirname: str) -> Generator[Dict[str, Any], None, None]:
    """Yield every record stored in the ``part_*.jsonl.gz`` files of ``dirname``.

    Files ``part_00000`` .. ``part_<PROCESS_COUNT - 1>`` are read in order,
    one JSON document per line.

    Parameters
    ----------
    dirname : str
        Directory that holds the part files.

    Yields
    ------
    dict
        One decoded record per non-empty line.
    """
    for i in range(PROCESS_COUNT):
        path = dirname + '/part_{:05d}.jsonl.gz'.format(i)
        # Text mode breaks lines only at \n, \r and \r\n. A codecs stream
        # reader also breaks at characters such as U+2028 or U+0085, which
        # are written unescaped when records are dumped with
        # ensure_ascii=False and would cut such a record in two.
        with gzip.open(path, mode='rt', encoding='utf8') as f_json:
            for line in f_json:
                line = line.strip()
                if not line:
                    # Tolerate blank lines instead of failing in json.loads.
                    continue
                yield json.loads(line)

In [None]:
# Site root; product links found on listing pages are appended to it.
base_url = 'https://shop.relod.ru'
# Listing-page template; ``{}`` is filled with the page number via str.format.
catalog_url = (
    base_url
    + '/catalog-products/4577/'
    + '?sort=PROPERTY_RATING&order=desc&PAGEN_1={}/'
)

In [None]:
# Crawl sizing.
CARDS_ON_PAGE = 20      # cards per listing page used in the page-count estimate
CARDS_COUNT = 5000      # number of cards the crawl aims for
PROCESS_COUNT = 16      # pool size; also the number of output part files

# Whole pages needed for CARDS_COUNT cards plus two extra pages.
PAGE_COUNT = 2 + CARDS_COUNT // CARDS_ON_PAGE

In [None]:
from itertools import chain
from tqdm.notebook import tqdm

import gzip
import json
import codecs

from multiprocessing.dummy import Pool, Queue

In [None]:
%%time

# Listing pages are numbered from 1.
# Alternative: map(catalog_url.format, range(1, 1 + PAGE_COUNT)); generally
# speaking, materialising the list is probably unnecessary as well.
pages_urls = [catalog_url.format(page_id) for page_id in range(1, PAGE_COUNT + 1)]

with Pool(processes=PROCESS_COUNT) as pool:
    # imap yields one list of card URLs per listing page; tqdm shows progress.
    urls_per_page = tqdm(pool.imap(get_cards_urls, pages_urls), total=PAGE_COUNT)
    # Flatten the per-page lists and drop duplicate card URLs.
    cards_urls = list(set(chain.from_iterable(urls_per_page)))

pool.join()

print(len(cards_urls))

In [None]:
# %%time

# with Pool(processes=PROCESS_COUNT) as pool:
#     cards_urls = list(tqdm(pool.imap(get_cards_urls, 
#                                      [catalog_url.format(page_id) 
#                                       for page_id 
#                                       in range(1, 1 + PAGE_COUNT, 1)]), 
#                            total=PAGE_COUNT))

# pool.join()
# cards_urls = list(chain(*cards_urls))
# cards_urls = list(set(cards_urls))
# print(len(cards_urls))

HBox(children=(FloatProgress(value=0.0, max=252.0), HTML(value='')))


5040
CPU times: user 1min 43s, sys: 2.13 s, total: 1min 45s
Wall time: 1min 47s


In [None]:
%%time

# Shared work queue: every worker pulls card URLs from it until it is empty.
queue = Queue()
for url in cards_urls:
    queue.put(url)


with Pool(processes=PROCESS_COUNT) as pool, tqdm(total=queue.qsize()) as pbar:
    # The progress bar's lock doubles as the lock for stderr reporting.
    lock = pbar.get_lock()
    # One task per worker. PROCESS_COUNT is the public constant the pool was
    # sized with (and the one records_reader uses to find the part files),
    # so the private ``pool._processes`` attribute is not needed.
    pool.map(process_page_wrapper, range(PROCESS_COUNT))

HBox(children=(FloatProgress(value=0.0, max=5040.0), HTML(value='')))

CODE: 404
URL: https://shop.relod.ru/catalog-products/the_oversight/

CODE: 404
URL: https://shop.relod.ru/catalog-products/the_angel_esmeralda_nine_stories/

CODE: 404
URL: https://shop.relod.ru/catalog-products/the_two_week_wait/

CODE: 404
URL: https://shop.relod.ru/catalog-products/woes_of_the_true_policeman/




CPU times: user 52min 22s, sys: 53.7 s, total: 53min 16s
Wall time: 51min 41s


In [None]:
# Load every scraped record into one table and export it without the index.
df = pd.DataFrame(data=records_reader(dirname='data'))
df.to_csv(path_or_buf='hw_3.csv', index=False)

In [None]:
import numpy as np

def check_random(i=None):
    """Return row ``i`` of the global ``df`` as a dict.

    When ``i`` is omitted, a row position is drawn with
    ``np.random.randint`` from ``0 .. len(df) - 1``.
    """
    row_position = np.random.randint(df.shape[0]) if i is None else i
    row = df.iloc[row_position]
    return dict(row)

In [None]:
# Spot-check: display one randomly chosen record from the table.
check_random()

{'ISBN': '9781841495545',
 'url': 'https://shop.relod.ru/catalog-products/the_talismans_of_shannara/',
 'Автор': 'Brooks Terry',
 'Вес (грамм)': '354',
 'Возрастные ограничения': '16+',
 'Год издания': '2006',
 'Издание': nan,
 'Издатель': 'Orbit (сайт издательства)',
 'Иллюстрации': ['https://opt-1458870.ssl.1c-bitrix-cdn.ru/upload/iblock/8d2/8d28e757babcf43fa4592a33dae3076a.jpg?152812868616496'],
 'Метки': nan,
 'Название': 'The Talismans of Shannara',
 'Наличие': 'Под заказ',
 'Носитель': nan,
 'Обзор/Ролик': nan,
 'Описание': "The descendants of the Elven house of Shannara have all completed their quests: Paranor, the Druid's Keep, has been restored; the Elves have been returned to the Four Lands; and Par Ohmsford has found what he believes to be the legendary Sword of Shannara. But their work is not yet done. The Shadowen still swarm over the Four Lands, poisoning all with their dark magic, and their leader is determined to prevent the scions of Shannara from sharing the knowledge