# Imports

In [1]:
import datetime as dt
import gzip
import os
import pickle
import sys
import urllib
import urllib.request
from typing import Any

import bs4
from bs4 import BeautifulSoup

# Helpers to Get Content

In [2]:
def get_object_pickled_gzip(filepath: str) -> Any:
    """Load a Python object from a gzip-compressed pickle file.

    Args:
        filepath: Location of the gzip file, given as a ``str``.

    Returns:
        The object reconstructed from the decompressed pickle bytes.

    Raises:
        AssertionError: If ``filepath`` is not a ``str``.

    Note:
        Unpickling can execute arbitrary code, so only open files you trust.
    """
    assert isinstance(filepath, str)

    with gzip.open(filepath, 'rb') as archive:
        raw_bytes = archive.read()  # whole decompressed payload, held in memory

    # The bytes are already in memory, so deserialising after the handle is
    # closed yields the same object.
    return pickle.loads(raw_bytes)

In [3]:
def get_website_as_string(url: str) -> str:
    """Download ``url`` and return the response body as text.

    Args:
        url: Address to fetch; anything ``urllib.request.urlopen`` accepts.

    Returns:
        The response body decoded with the charset declared in the response
        headers, or with UTF-8 when no charset is declared.

    Raises:
        urllib.error.URLError: If the request fails (includes ``HTTPError``).
        UnicodeDecodeError: If the body is not valid in the chosen charset.
    """
    # Header is required so we don't get 403 error
    hdr = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        'Accept-Encoding': 'none',
        'Accept-Language': 'en-US,en;q=0.8',
        'Connection': 'keep-alive',
    }
    req = urllib.request.Request(url, headers=hdr)

    # The context manager closes the response even if reading fails.
    with urllib.request.urlopen(req) as response:
        content_raw = response.read()
        # 'Accept-Charset' above permits ISO-8859-1, so honour whatever charset
        # the server declares instead of always assuming UTF-8.
        charset = response.headers.get_content_charset() or 'utf-8'

    return content_raw.decode(charset)   # HTML with embedded script source code

# Helpers to Extract Ebuyer Product Data

In [4]:
def check_title_and_link(product_tag: bs4.element.Tag):
    """Extract the title and absolute URL from one product tile.

    Args:
        product_tag: Tag to search. The title anchor is located with the
            selector ``h3.grid-item__title > a``.

    Returns:
        A ``(title, full_link)`` tuple: the anchor text with surrounding
        whitespace stripped, and the anchor's ``href`` prefixed with
        ``https://www.ebuyer.com``.

    Raises:
        AssertionError: If ``product_tag`` is not a ``bs4.element.Tag``.
    """
    assert isinstance(product_tag, bs4.element.Tag)

    # Query the tag that was passed in. Reading a notebook-level variable here
    # would make every call return the data of whichever tile that variable
    # currently holds, or fail on a fresh kernel.
    link_a = product_tag.select_one('h3.grid-item__title > a')

    title = link_a.text.strip()

    # presumably the href is site-relative; the host is prepended unconditionally
    link_a_href = link_a['href']
    full_link = 'https://www.ebuyer.com' + link_a_href
    return title, full_link

In [5]:
def check_available(product_tag: bs4.element.Tag) -> bool:
    """Report whether a product tile shows an "Add to Basket" button.

    Args:
        product_tag: Tag to search. The button is located with the selector
            ``div.grid-item__buttons > button.button--mini-basket``.

    Returns:
        ``True`` when the button exists, ``False`` when it does not.

    Raises:
        AssertionError: If ``product_tag`` is not a ``bs4.element.Tag``, or if
            the button exists but its text is not "add to basket"
            (case-insensitive, surrounding whitespace ignored).
    """
    assert isinstance(product_tag, bs4.element.Tag)

    # Inspect the argument rather than a notebook-level variable, so the answer
    # belongs to the tile the caller asked about.
    button_basket = product_tag.select_one('div.grid-item__buttons > button.button--mini-basket')

    if button_basket is None:
        return False  # check failed

    # Guard against the selector matching a button with a different meaning.
    assert button_basket.text.strip().lower() == 'add to basket'
    return True  # check succeeded, product is available

In [6]:
def check_not_available(product_tag: bs4.element.Tag) -> bool:
    """Report whether a product tile shows a "Coming Soon" notice.

    Args:
        product_tag: Tag to search. The notice is located with the selector
            ``p.grid-item__coming-soon``.

    Returns:
        ``True`` when the notice exists (the product is NOT available),
        ``False`` when it does not.

    Raises:
        AssertionError: If ``product_tag`` is not a ``bs4.element.Tag``, or if
            the notice exists but its text is not "coming soon"
            (case-insensitive, surrounding whitespace ignored).
    """
    assert isinstance(product_tag, bs4.element.Tag)

    # Inspect the argument rather than a notebook-level variable, so the answer
    # belongs to the tile the caller asked about.
    p_coming_soon = product_tag.select_one('p.grid-item__coming-soon')

    if p_coming_soon is None:
        return False  # check failed

    # Guard against the selector matching a paragraph with a different meaning.
    assert p_coming_soon.text.strip().lower() == 'coming soon'
    return True  # check succeeded, product IS NOT available!

In [7]:
def check_product(datetime_str: str, product_tag: bs4.element.Tag):
    """Build an availability record for one product tile.

    Args:
        datetime_str: Timestamp in a format accepted by
            ``datetime.fromisoformat``; copied unchanged into the result.
        product_tag: Tag of the product tile to inspect.

    Returns:
        A dict with the keys ``'supplier'`` (always ``'ebuyer'``),
        ``'datetime'``, ``'title'``, ``'link'`` and ``'available'``.

    Raises:
        AssertionError: If an argument has the wrong type, or if the tile is
            not in exactly one of the two states "available" / "not available".
        ValueError: If ``datetime_str`` cannot be parsed by
            ``datetime.fromisoformat``.
    """
    assert isinstance(datetime_str, str)
    # fromisoformat raises ValueError on malformed input; that is the validation.
    assert dt.datetime.fromisoformat(datetime_str)
    assert isinstance(product_tag, bs4.element.Tag)

    # Pass this function's own argument to the helpers. A notebook-level
    # variable would tie every call to whichever tile it currently holds.
    title, link = check_title_and_link(product_tag)

    is_available = check_available(product_tag)
    is_not_available = check_not_available(product_tag)

    # The two checks must disagree: exactly one state may hold for a tile.
    assert is_available or is_not_available
    assert not (is_available and is_not_available)

    return {
        'supplier': 'ebuyer',
        'datetime': datetime_str,
        'title': title,
        'link': link,
        'available': is_available,
    }

# Get Content from GZIP File

In [8]:
# Absolute, machine-specific path of the gzip-compressed pickle loaded in the next cell.
filepath = '/mnt/data/ebuyer_3060ti/2021-02-23_10-48-01.pkl.gzip'

In [9]:
# Decompress and unpickle the file; the result is indexed by position below.
requests_collection = get_object_pickled_gzip(filepath)

In [10]:
# Only the first stored entry is examined in this notebook.
request_dict = requests_collection[0]

In [11]:
# NOTE(review): the value stored under 'datetime' is *called* here, so it must
# be a callable; the displayed result is an ISO-8601 string. Confirm this
# matches how the file was written — a plain string would raise TypeError.
datetime_str = request_dict['datetime']()
datetime_str

'2021-02-23T10:47:38.858159'

In [12]:
# Value stored under 'traceback' for this entry, displayed for inspection.
# NOTE(review): this name matches the stdlib `traceback` module (not imported
# here) and would shadow it if that module were imported later.
traceback = request_dict['traceback']
traceback

In [13]:
# Page source stored for this entry; it is parsed with BeautifulSoup further down.
content = request_dict['content']

# Get Content from URL

In [14]:
# Optional: uncomment to fetch the live page instead of using the stored capture.
# content = get_website_as_string('https://www.ebuyer.com/store/Components/cat/Graphics-Cards-Nvidia?page=9')

# Check Content

In [15]:
# Name the parser explicitly. Without it BeautifulSoup picks whichever parser
# is installed, which can change the tree between machines and emits a
# warning. 'html.parser' ships with Python, so it is always available.
soup = bs4.BeautifulSoup(content, 'html.parser')

In [16]:
# Each <div class="grid-item"> is treated as one product tile.
product_list = soup.select('div.grid-item')

In [17]:
# Number of product tiles matched on the page.
len(product_list)

24

In [18]:
# Use the first tile as the sample for the check below.
product = product_list[0]

In [19]:
# Build and display the availability record for the sample tile.
check_product(datetime_str, product)

{'supplier': 'ebuyer',
 'datetime': '2021-02-23T10:47:38.858159',
 'title': 'Gigabyte GeForce RTX 3060 Ti 8GB GAMING OC PRO Ampere Graphics Card',
 'link': 'https://www.ebuyer.com/1133708-gigabyte-geforce-rtx-3060-ti-8gb-gaming-oc-pro-ampere-graphics-card-gv-n306tgamingoc-pro-8gd-2-0',
 'available': False}