In [101]:
from bs4 import BeautifulSoup
import requests
import time
import os
from tqdm import tqdm
import html
import re
from datetime import datetime
from multiprocessing import Manager, Pool
import pickle
import pandas as pd

In [49]:
# Cookies passed to every requests.get() call in this notebook.
# Keep real session values out of version control.
cookies = {
    # Replace this section with your own cookies etc.
}

# HTTP headers passed to every requests.get() call in this notebook.
headers = {
    # Replace this section with your own headers
}

def params_for_page(n, navigation='790 231 52 6437'):
    """Build the query parameters for one search-results request.

    Args:
        n: Value sent as the ``No`` query parameter. The caller steps it
            through ``range(0, 9598, 204)``, so it is presumably the result
            offset of the page -- confirm against the site.
        navigation: Value sent as the ``N`` query parameter. Defaults to the
            value that used to be hard-coded here; presumably it encodes the
            search filters -- confirm against the site.

    Returns:
        A tuple of ``(name, value)`` pairs, usable as ``params=`` for
        ``requests.get``.
    """
    return (
        ('No', n),
        ('N', navigation),
    )

# Search pages are requested with No = 0, 204, 408, ... up to (not including) 9598.
SEARCH_OFFSET_STOP = 9598
SEARCH_OFFSET_STEP = 204

# open(..., 'w') does not create missing directories, so make sure the dump
# directory exists before the first page is written.
os.makedirs('./dumps', exist_ok=True)

for n in range(0, SEARCH_OFFSET_STOP, SEARCH_OFFSET_STEP):
    params = params_for_page(n)
    response = requests.get('https://comics.ha.com/c/search-results.zx', headers=headers, params=params, cookies=cookies)

    # Fail loudly instead of saving an error page that the link-extraction
    # step would later try to parse as a results page.
    if response.status_code != 200:
        raise Exception('Non-200 response', response.status_code, n)

    with open(f'./dumps/{n}.html', 'w') as file:
        file.write(response.text)

    # Pause between requests so the site is not hammered.
    time.sleep(2)

In [54]:
# hrefs of every "Click to view amount" anchor found in the saved search pages.
links = []

# sorted() makes the resulting link order reproducible; os.listdir returns
# entries in arbitrary order.
for filename in sorted(os.listdir('./dumps')):
    page_path = os.path.join('dumps', filename)

    # The dump directory can also contain sub-directories (the item pages are
    # saved below it); open() on a directory would raise.
    if not os.path.isfile(page_path):
        continue

    with open(page_path, 'r') as file:
        soup = BeautifulSoup(file.read(), 'html.parser')

    auction_items = soup.find('ul', {'class': 'auction-items'})
    if auction_items is None:
        # Not a results page (or the markup changed): report it and move on
        # instead of dying with an AttributeError on None.
        print(f'Skipping {page_path}: no auction-items list found')
        continue

    for div in auction_items.find_all('div', {'class': 'current-amount'}):
        for link in div.find_all('a'):
            href = link.attrs.get('href')
            # An anchor without an href would put None into `links` and break
            # the later `base_url + link` concatenation.
            if link.text == 'Click to view amount' and href:
                links.append(href)

In [72]:
base_url = 'https://comics.ha.com'
base_dir = './dumps'
for link in tqdm(links):
    # Only the item URL is requested. The search-page `params` used to be sent
    # here as well, but that was just a variable left over from the
    # search-results loop and does not belong to this request.
    response = requests.get(base_url + link, headers=headers, cookies=cookies)
    if response.status_code != 200:
        raise Exception('Non-200 response', response.status_code, link)

    # os.path.join drops base_dir entirely when the next component starts with
    # a slash, so strip leading slashes to keep the file below base_dir.
    relative_link = link.lstrip('/')
    target_path = os.path.join(base_dir, f'{relative_link}.html')

    # The link may contain directory components; create them before writing.
    os.makedirs(os.path.dirname(target_path), exist_ok=True)
    with open(target_path, 'w') as file:
        file.write(response.text)

    # Pause between requests so the site is not hammered.
    time.sleep(0.5)

100%|██████████| 9637/9637 [5:02:06<00:00,  1.88s/it]


In [227]:
# Directory whose files are listed and handed to process_file.
base_dir = './dumps/c/'

# Manager-backed list proxy: process_file appends one dict per parsed page to
# it, and it is read back after the Pool has finished.
manager = Manager()
sales = manager.list()

# Patterns used by process_file, compiled once rather than on every call.
# Text between "Sold on " and " for" in the section headline.
_SOLD_ON_RX = re.compile(r"(?<=Sold on )([A-Za-z 0-9,])+(?= for)")
# Run of HTML character references between single quotes inside the <script>.
_ENCODED_PRICE_RX = re.compile(r"(?<=')([0-9;&#]+)(?=')")
# Stand-alone numbers in the file name.
_FILENAME_NUMBER_RX = re.compile(r'\b[0-9]+\b')


def process_file(filepath):
    """Parse one saved item page and append the sale to the shared ``sales`` list.

    Args:
        filepath: File name relative to ``base_dir``. It must contain exactly
            two stand-alone numbers, which are taken as the sale id and the
            lot id, in that order.

    Returns:
        None. The result is appended to ``sales`` as a dict with the keys
        ``sale_date``, ``sale_price``, ``description``, ``sale_id`` and
        ``lot_id``.

    Raises:
        AttributeError: An expected element is missing from the page. The
            original error and ``filepath`` are passed as arguments.
        ValueError: The page or file name was found but could not be parsed
            (no regex match, empty element list, bad date or price, wrong
            number of ids in the file name). The message names ``filepath``.
    """
    try:
        with open(os.path.join(base_dir, filepath)) as file:
            soup = BeautifulSoup(file.read(), 'html.parser')

        # Should be fine to take the first element, the pages seem fairly unchanging.
        description = soup.find('h1', {'itemprop': 'name'}).text.replace('\n', ' ')

        # The headline reads "... Sold on <date> for ..."; the date is
        # formatted as MMM d, yyyy (abbreviated month name first).
        item_info = soup.find('div', {'class': 'item-info'})
        headline = item_info.find_all('div', {'class': 'section-headline'})[0].text
        sale_date = datetime.strptime(_SOLD_ON_RX.search(headline)[0], '%b %d, %Y')

        # The page writes the bid amount from a <script> as HTML character
        # references instead of plain text (possibly to hinder scraping).
        # Unescaping them gives the displayed amount, e.g. with '$' and ','.
        price_script = soup.find('strong', {'class': 'opening-bid'}).find('script')
        encoded_price = _ENCODED_PRICE_RX.search(str(price_script.contents[0]))[0]
        sale_price = float(re.sub(r'[,\$]', '', html.unescape(encoded_price)))

        sale_id, lot_id = tuple(_FILENAME_NUMBER_RX.findall(filepath))
    except AttributeError as exception:
        # Same exception type and arguments as before; the file name is
        # attached because the pool otherwise gives no hint which page failed.
        raise AttributeError(exception, filepath) from exception
    except (IndexError, TypeError, ValueError) as exception:
        # A failed regex search surfaces as TypeError (None[0]), an empty
        # find_all()/contents as IndexError, and bad dates, prices or id
        # counts as ValueError. Attach the file name to all of them too.
        raise ValueError(f'Could not parse {filepath!r}: {exception!r}') from exception

    sales.append({
        'sale_date': sale_date,
        'sale_price': sale_price,
        'description': description,
        'sale_id': sale_id,
        'lot_id': lot_id
    })

# Parse every file in base_dir in parallel. process_file returns nothing; it
# stores its result in the shared `sales` list.
with Pool() as pool:
    files = os.listdir(base_dir)
    pending = pool.imap_unordered(process_file, files)
    # The iterator has to be consumed for the work to complete (and for the
    # progress bar to advance) before the pool is torn down.
    for _result in tqdm(pending, total=len(files)):
        pass

# Persist a plain-list copy of the shared list so later cells can reload it
# without re-parsing the pages.
with open('sales.pickle', 'wb') as file:
    pickle.dump(list(sales), file)

100%|██████████| 9637/9637 [00:51<00:00, 186.38it/s]


In [7]:
# Reload the sales written to sales.pickle by the parsing step.
with open('sales.pickle', 'rb') as pickle_file:
    sales = pickle.load(pickle_file)


def _split_on_wata(description):
    """Split a description around 'Wata' (' - Wata' counts as 'Wata'); parts are stripped."""
    normalised = description.replace(' - Wata', 'Wata')
    return tuple(part.strip() for part in normalised.split('Wata'))


# Entries are (sale, text before 'Wata', text after 'Wata').
wata = []

for sale in sales:
    parts = _split_on_wata(sale['description'])
    # Anything other than exactly two parts means 'Wata' is absent or appears
    # more than once; those sales are left out.
    if len(parts) != 2:
        continue
    before_wata, after_wata = parts
    wata.append((sale, before_wata, after_wata))

# We only care about Wata graded games, so this means we can quickly filter out the non-Wata games which leaves us 9286/9637 sales.
print(len(wata))

9286


In [124]:
sales = []

# Numeric grade at the very start of the grading text, e.g. '9.4'. The dot is
# escaped so that only a literal '.' separates the two digits.
grade_rx = re.compile(r'^[0-9]\.[0-9]')
# Stand-alone seal grade: A, B or C followed by any number of '+', not glued to
# another letter. The previous pattern was a character class, so it matched a
# single character such as the 'C' of 'CIB' and never the full 'A++'.
seal_grade_rx = re.compile(r'(?<![A-Z])[ABC]\+*(?![A-Z])')
seal_type_rx = re.compile(r'SEALED|GLUE SEAL|NO SEAL|CIB|LOOSE CART')
variant_rx = re.compile(r'VARIANT: ')
# {'9.4', '7.0', '2.5', '9.2', '3.0', '9.6', '6.0', '6.5', '5.0', '7.5', '4.5', '5.5', '8.0', '9.0', '4.0', None, '9.8', '8.5', '3.5'}

# Bracketed / parenthesised remarks. The match is greedy, so it runs from the
# first opening bracket to the last closing one.
comment_rx = re.compile(r'(?:[\[\(])(.+)(?:[\]\)])')
# Noise stripped from the grading text: ellipses, a trailing dot, bracketed
# remarks and commas.
clean_up_rx = re.compile(r'\.{3,}|\.$|[\[\(].+[\]\)]|,')

seal_comments = set()


def _first_match(pattern, text):
    """Return the text of the first match of ``pattern`` in ``text``, or None."""
    match = pattern.search(text)
    return match[0] if match else None


def _normalise_grading(grading):
    """Strip noise from the grading text, collapse spaces and upper-case it."""
    cleaned = clean_up_rx.sub('', grading)
    cleaned = re.sub(r'[ ]{1,}', ' ', cleaned)
    cleaned = re.sub(r'\. ', '', cleaned)
    return cleaned.strip().upper()


for scraped_sale, title, grading in wata:
    # The titles have a bunch of random comments sometimes; collect them
    # separately. str.replace returns a new string, so the result has to be
    # kept for the quotes to actually be removed.
    comments = [
        comment.replace('"', '')
        for comment in comment_rx.findall(scraped_sale.get('description'))
    ]

    cleaned = _normalise_grading(grading)

    sales.append({
        **scraped_sale,
        'comments': ",".join(comments),
        'title': re.sub(r'[\[\(].+[\]\)]', '', title).strip(),
        'grade': _first_match(grade_rx, cleaned),
        'seal_grade': _first_match(seal_grade_rx, cleaned),
        'seal_type': _first_match(seal_type_rx, cleaned),
        'variant': variant_rx.search(cleaned) is not None
    })

# One record per Wata-graded sale; build the frame in a single step.
df = pd.DataFrame(sales)
df.to_csv('heritage_sales.csv', index=False)


<re.Match object; span=(4, 7), match='A++'>
<re.Match object; span=(4, 7), match='A++'>
<re.Match object; span=(4, 6), match='A+'>
None
None
None
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
None
None
None
None
<re.Match object; span=(4, 7), match='A++'>
None
None
None
<re.Match object; span=(4, 7), match='A++'>
None
None
None
None
None
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 7), match='A++'>
None
None
<re.Match object; span=(4, 6), match='A+'>
None
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
<re.Match object; span=(4, 6), match='A+'>
None
None
<re.Match object; span=(4, 6), match='A+'>
None
None
<re.Match object; span=(4, 7), match='A++'>
None
None
None
None
<re.Match object; span=(4, 6), match='A+'>
<re.