# Imports

In [1]:
import os
import sys
import glob
import gzip
import json
import pickle
import urllib
import traceback
import datetime as dt
import concurrent.futures

import bs4
from bs4 import BeautifulSoup
from tqdm.auto import tqdm

from typing import Any, Dict, List, Optional, Tuple

# Helpers to Get Content

In [2]:
def get_object_pickled_gzip(filepath: str) -> Any:
    """Load a Python object from a gzip-compressed pickle file.

    Args:
        filepath: Path of the gzip file that holds the pickled object.

    Returns:
        The object obtained by un-pickling the decompressed file content.
    """
    assert isinstance(filepath, str)

    # NOTE: un-pickling can execute arbitrary code; only use on trusted files.
    with gzip.open(filepath, 'rb') as gz_file:
        return pickle.loads(gz_file.read())  # decompress, then un-pickle

In [3]:
def get_website_as_string(url: str) -> str:
    """Download `url` and return the response body decoded to text.

    Args:
        url: Address to fetch.

    Returns:
        The response body as a string (HTML with embedded script source code).
        The body is decoded with the charset declared in the response's
        Content-Type header, falling back to UTF-8 when none is declared or
        the declared name is unknown to Python.

    Raises:
        urllib.error.URLError: If the request fails (includes HTTP errors).
        UnicodeDecodeError: If the body is not valid in the chosen encoding.
    """
    # Imported here because a bare `import urllib` does not guarantee that
    # the `urllib.request` submodule has been loaded.
    import urllib.request

    # Header is required so we don't get 403 error
    hdr = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
       'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
       'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
       'Accept-Encoding': 'none',
       'Accept-Language': 'en-US,en;q=0.8',
       'Connection': 'keep-alive'}
    req = urllib.request.Request(url, headers=hdr)

    # The context manager closes the connection even if reading fails.
    with urllib.request.urlopen(req) as response:
        content_raw = response.read()
        charset = response.headers.get_content_charset()

    try:
        return content_raw.decode(charset or 'utf-8')
    except LookupError:
        # Server declared an encoding name Python does not know.
        return content_raw.decode('utf-8')

# Helpers to Extract EBuyer

In [4]:
class CheckerEBuyer:
    """Extract product availability from the HTML of an eBuyer listing page.

    The class is a namespace of static helpers; `check_request` is the entry
    point and the `_check_*` methods each inspect one product tile.
    """

    # Parser is named explicitly so results do not depend on which optional
    # parsers happen to be installed, and so bs4 does not emit its
    # "no parser was explicitly specified" warning.
    _PARSER = 'html.parser'

    @staticmethod
    def _check_title_and_link(product_tag: bs4.element.Tag) -> Tuple[str, str]:
        """Return the product title and its absolute URL.

        Both are read from the `<a>` directly inside `h3.grid-item__title`.
        """
        assert isinstance(product_tag, bs4.element.Tag)
        link_a = product_tag.select_one('h3.grid-item__title > a')

        title = link_a.text.strip()

        # The href is joined onto the site root to form the full link.
        link_a_href = link_a['href']
        full_link = 'https://www.ebuyer.com' + link_a_href
        return title, full_link

    @staticmethod
    def _check_available(product_tag: bs4.element.Tag) -> bool:
        """Return True if the tile contains a mini-basket button.

        The button text is additionally asserted to be 'add to basket'
        (case-insensitive, surrounding whitespace ignored).
        """
        assert isinstance(product_tag, bs4.element.Tag)

        button_basket = product_tag.select_one('div.grid-item__buttons > button.button--mini-basket')

        if button_basket is None:
            return False  # check failed

        assert button_basket.text.strip().lower() == 'add to basket', \
            'unexpected text on the basket button'
        return True  # check succeeded, product is available

    @staticmethod
    def _check_not_available(product_tag: bs4.element.Tag) -> bool:
        """Return True if the tile contains a 'coming soon' paragraph.

        The paragraph text is additionally asserted to be 'coming soon'
        (case-insensitive, surrounding whitespace ignored).
        """
        assert isinstance(product_tag, bs4.element.Tag)

        p_coming_soon = product_tag.select_one('p.grid-item__coming-soon')

        if p_coming_soon is None:
            return False  # check failed

        assert p_coming_soon.text.strip().lower() == 'coming soon', \
            'unexpected text in the coming-soon paragraph'
        return True  # check succeeded, product IS NOT available!

    @staticmethod
    def _check_product(product_tag: bs4.element.Tag) -> Dict[str, Any]:
        """Summarise one product tile.

        Returns:
            A dict with keys 'title' (str), 'link' (str) and 'available'
            (bool).

        Raises:
            AssertionError: If the tile matches neither or both of the
                available / not-available markers.
        """
        assert isinstance(product_tag, bs4.element.Tag)

        title, link = CheckerEBuyer._check_title_and_link(product_tag)
        is_available = CheckerEBuyer._check_available(product_tag)
        is_not_available = CheckerEBuyer._check_not_available(product_tag)

        # Exactly one of the two markers must be present on a tile.
        assert is_available or is_not_available, \
            f'no availability marker found for: {title}'
        assert not (is_available and is_not_available), \
            f'conflicting availability markers for: {title}'

        if is_available:
            print('AVAILABLE  AVAILABLE  AVAILABLE')

        return {
            'title': title,
            'link': link,
            'available': is_available,
        }

    @staticmethod
    def check_request(content_html: str) -> List[Dict[str, Any]]:
        """Parse a listing page and summarise every `div.grid-item` tile.

        Args:
            content_html: HTML source of the listing page.

        Returns:
            One dict per product tile, in document order (see
            `_check_product` for the keys). Empty if no tile is found.
        """
        assert isinstance(content_html, str)

        soup = bs4.BeautifulSoup(content_html, features=CheckerEBuyer._PARSER)
        product_list = soup.select('div.grid-item')

        return [CheckerEBuyer._check_product(product_tag) for product_tag in product_list]

In [17]:
def process_request(request_dict: Dict[str, Optional[str]]) -> Dict[str, Any]:
    """Validate one request record and summarise the products on its page.

    Args:
        request_dict: Record with exactly the keys 'content', 'datetime',
            'model', 'page', 'supplier' and 'traceback'; every value is a
            string or None.

    Returns:
        A dict with 'datetime', 'model', 'page' and 'supplier' copied from
        the input, plus 'num_products', 'num_available' and 'items_list'
        (the per-product dicts produced by the supplier's checker).

    Raises:
        AssertionError: If the record is malformed, carries a traceback,
            uses an unsupported model/page/supplier, or yields no products.
    """
    expected_keys = {'content', 'datetime', 'model', 'page', 'supplier', 'traceback'}

    # Structural checks on the record itself.
    assert isinstance(request_dict, dict)
    assert request_dict.keys() == expected_keys
    for key in request_dict:
        assert isinstance(key, str)
    for value in request_dict.values():
        assert value is None or isinstance(value, str)

    # Field-by-field checks, in the same order as the fields are used below.
    html = request_dict['content']
    assert html is not None

    timestamp = request_dict['datetime']
    assert isinstance(timestamp, str)
    assert dt.datetime.fromisoformat(timestamp)

    gpu_model = request_dict['model']
    assert gpu_model in {'3060ti', '3070'}

    page_id = request_dict['page']
    assert page_id in {'p1'}

    shop = request_dict['supplier']
    assert shop in {'ebuyer'}

    # A stored traceback means the original download failed.
    error_text = request_dict['traceback']
    assert error_text is None

    if shop == 'ebuyer':
        products = CheckerEBuyer.check_request(html)

    product_count = len(products)
    available_count = sum(entry['available'] for entry in products)
    assert product_count > 0

    summary = {
        'datetime': timestamp,
        'model': gpu_model,
        'page': page_id,
        'supplier': shop,
        'num_products': product_count,
        'num_available': available_count,
        'items_list': products,
    }
    return summary

# Get Content URL

In [18]:
# Download the live listing page and wrap it in a record with the six keys
# that process_request() requires.
request_dict = dict(
    content=get_website_as_string('https://www.ebuyer.com/store/Components/cat/Graphics-Cards-Nvidia?page=1'),
    datetime=dt.datetime.utcnow().isoformat(),
    model='3060ti',
    page='p1',
    supplier='ebuyer',
    traceback=None,
)

In [19]:
# Parse the downloaded page; the returned summary dict is shown as the cell output.
process_request(request_dict)

AVAILABLE  AVAILABLE  AVAILABLE
AVAILABLE  AVAILABLE  AVAILABLE
AVAILABLE  AVAILABLE  AVAILABLE
AVAILABLE  AVAILABLE  AVAILABLE
AVAILABLE  AVAILABLE  AVAILABLE


{'datetime': '2021-02-27T13:57:03.033854',
 'model': '3060ti',
 'page': 'p1',
 'supplier': 'ebuyer',
 'num_products': 24,
 'num_available': 5,
 'items_list': [{'title': 'MSI GeForce RTX 3060 12GB GAMING X TRIO Ampere Graphics Card',
   'link': 'https://www.ebuyer.com/1140149-msi-geforce-rtx-3060-12gb-gaming-x-trio-ampere-graphics-card-rtx-3060-gaming-x-trio-12g',
   'available': False},
  {'title': 'ASUS GeForce RTX 3060 12GB ROG STRIX OC Ampere Graphics Card',
   'link': 'https://www.ebuyer.com/1142162-asus-geforce-rtx-3060-12gb-rog-strix-oc-ampere-graphics-card-rog-strix-rtx3060-o12g-gaming',
   'available': False},
  {'title': 'MSI GeForce RTX 3070 8GB GAMING X TRIO Ampere Graphics Card',
   'link': 'https://www.ebuyer.com/1126975-msi-geforce-rtx-3070-8gb-gaming-x-trio-ampere-graphics-card-rtx-3070-gaming-x-trio',
   'available': False},
  {'title': 'Palit GeForce RTX 3060 12GB Dual OC Ampere Graphics Card',
   'link': 'https://www.ebuyer.com/1141626-palit-geforce-rtx-3060-12gb-dual

# Get Content GZIP

In [20]:
# Sample raw capture file. process_requests_from_gzip() reads the last three
# directory names of the path as <supplier>/<model>/<page>.
filepath = '/mnt/data/raw/ebuyer/3060ti/p1/2021-02-23_10-48-01.pkl.gzip'

In [21]:
def process_requests_from_gzip(filepath: str) -> List[Dict[str, Any]]:
    """Process every request record stored in one gzip-pickled capture file.

    Supplier, model and page are taken from the last three directory names
    of `filepath` (.../<supplier>/<model>/<page>/<file>) and written into
    each record before it is handed to `process_request`.

    Args:
        filepath: '/'-separated path of the capture file.

    Returns:
        One result dict per stored record, in file order.

    Raises:
        IndexError: If the path has too few components.
        AssertionError: If supplier or model is not supported, or a record
            fails validation in `process_request`.
    """
    path_parts = filepath.split('/')

    supplier = path_parts[-4]
    assert supplier in {'ebuyer'}

    model = path_parts[-3]
    assert model in {'3060ti', '3070'}

    page = path_parts[-2]

    results_collection = []
    for i, request_dict in enumerate(get_object_pickled_gzip(filepath)):
        request_dict['model'] = model
        request_dict['page'] = page
        request_dict['supplier'] = supplier
        # The stored 'datetime' value is a callable; calling it yields the
        # value that process_request() validates as an ISO-format string.
        stored_datetime = request_dict['datetime']
        request_dict['datetime'] = stored_datetime()

        result_dict = process_request(request_dict)
        results_collection.append(result_dict)

        # Report only records in which at least one product was available.
        if result_dict['num_available'] > 0:
            print(f'{filepath}   {i:3d}   {result_dict["num_products"]}   {result_dict["num_available"]}')

    return results_collection

In [22]:
def _process_one(filepath):
    try:
        assert '/raw/' in filepath
        target_filepath = filepath.replace('/raw/', '/processed/')
        target_filepath = target_filepath.replace('.gzip', '.json')
        if not os.path.exists(target_filepath):
            results_collection = process_requests_from_gzip(filepath)
            os.makedirs(os.path.split(target_filepath)[0], exist_ok=True)
            json.dump(results_collection, open(target_filepath, 'w'))
    except:
        print(f'ERROR: {filepath}')
        print(traceback.format_exc())

In [23]:
# Trial run on the single sample file before processing everything.
_process_one(filepath)

In [24]:
# All raw 3070 capture files, found recursively and sorted by path.
filepaths = sorted(glob.glob('/mnt/data/raw/ebuyer/3070/**/*.gzip', recursive=True))

In [25]:
# for fp in tqdm(filepaths):
#     _process_one(fp)

In [26]:
# Process the files in a pool of 10 worker processes; tqdm shows progress.
# _process_one prints its own errors and returns None, so `results` is a
# list of None with one entry per file.
with concurrent.futures.ProcessPoolExecutor(max_workers=10) as executor:
        results = list(tqdm(executor.map(_process_one, filepaths), total=len(filepaths)))

  0%|          | 0/5460 [00:00<?, ?it/s]

ERROR: /mnt/data/raw/ebuyer/3070/p1/2021-02-25_17-01-01.pkl.gzip
Traceback (most recent call last):
  File "<ipython-input-22-90b9d724ede4>", line 7, in _process_one
    results_collection = process_requests_from_gzip(filepath)
  File "<ipython-input-21-681198b549b4>", line 20, in process_requests_from_gzip
    result_dict = process_request(request_dict)
  File "<ipython-input-17-6d87782dd4e7>", line 31, in process_request
    assert num_products > 0
AssertionError

ERROR: /mnt/data/raw/ebuyer/3070/p1/2021-02-25_17-04-00.pkl.gzip
Traceback (most recent call last):
  File "<ipython-input-22-90b9d724ede4>", line 7, in _process_one
    results_collection = process_requests_from_gzip(filepath)
  File "<ipython-input-21-681198b549b4>", line 20, in process_requests_from_gzip
    result_dict = process_request(request_dict)
  File "<ipython-input-17-6d87782dd4e7>", line 31, in process_request
    assert num_products > 0
AssertionError

ERROR: /mnt/data/raw/ebuyer/3070/p1/2021-02-25_17-06-00.pk