In [1]:
import re
import jsonlines
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import random
from time import sleep
from os import path
from typing import Union

import comicvision.webscraper as webscraper

In [2]:
def get_brackets(title: str) -> Union[str, None]:
    regex_brackets = re.search(r"\[(.*?)\]", title)
    if regex_brackets is None:
        return None
    else:
        return regex_brackets.group()
    

def strip_brackets_from_title(title: str) -> str:
    brackets = get_brackets(title)
    if brackets is None:
        if '--' in title:
            return title.split('--')[0].strip()
        else:
            return title
    else:
        debracketed_title = title.split(brackets)[0].strip()
        if '--' in debracketed_title:
            return debracketed_title.split('--')[0].strip()
        else:
            return debracketed_title


def check_if_issue_is_reprinting(title: str) -> bool:
    is_second_printing = 'Second Printing' in title
    is_2nd_printing = '2nd Printing' in title
    is_3rd_printing = '3rd Printing' in title
    is_4th_printing = '4th Printing' in title
    is_5th_printing = '5th Printing' in title
    is_6th_printing = '6th Printing' in title
    is_7th_printing = '7th Printing' in title
    is_8th_printing = '8th Printing' in title
    is_9th_printing = '9th Printing' in title
    is_10th_printing = '10th Printing' in title
    return (is_second_printing | is_2nd_printing | is_3rd_printing | is_4th_printing | is_5th_printing | is_6th_printing | is_7th_printing |
            is_8th_printing | is_9th_printing | is_10th_printing)


def check_if_issue_is_duplicate(title: str, on_sale_date: str, metadata_path: str) -> bool: 
    logged_metadata = []

    with jsonlines.open(metadata_path, mode='r') as reader:
        for item in reader:
            logged_metadata.append(item)

    df = pd.DataFrame(logged_metadata)

    titles = list(df['title'].unique())
    title_is_duplicate = reduce(lambda x, y: x | y, 
                                [strip_brackets_from_title(title) in strip_brackets_from_title(x) for x in titles])
    
    if title_is_duplicate:
        metadata_on_sale_dates = df[df['title'].apply(strip_brackets_from_title) == strip_brackets_from_title(title)]['on_sale_date'].values
        on_sale_date_is_duplicate = on_sale_date in metadata_on_sale_dates
        is_reprinting = check_if_issue_is_reprinting(title)
        is_duplicate_newsstand = check_if_issue_is_duplicate_newsstand(title, df)
        return on_sale_date_is_duplicate | is_reprinting | is_duplicate_newsstand
    else:
        return title_is_duplicate
    
    
def check_if_issue_is_duplicate_newsstand(title, df) -> bool:
    is_newsstand = "Newsstand" in title
    
    is_canadian = "Canadian" in title
    
    titles = list(df['title'].unique())
    have_a_direct_sale_no_brackets = reduce(lambda x, y: x | y, 
                                [str(strip_brackets_from_title(title).split['--'][0].strip() + ' -- Direct Sales') in x for x in titles])
    
    have_a_direct_sale_no_brackets_2 = reduce(lambda x, y: x | y, 
                                [str(strip_brackets_from_title(title).split['--'][0].strip() + ' -- Direct') in x for x in titles])
    
    have_a_direct_sale_brackets = reduce(lambda x, y: x | y, 
                            [str(strip_brackets_from_title(title) + ' [Direct Sales]') in x for x in titles])
    
    have_a_direct_sale_brackets_2 = reduce(lambda x, y: x | y, 
                            [str(strip_brackets_from_title(title) + ' [Direct]') in x for x in titles])
    
    return ((is_newsstand | is_canadian) & (have_a_direct_sale_no_brackets | have_a_direct_sale_no_brackets_2 | have_a_direct_sale_brackets | have_a_direct_sale_brackets_2))

In [3]:
# title = 'Batgirl #1 -- poopy fart'
# on_sale_date = '2000-02-02'
# metadata_path = './metadata/covers.jsonl'


# strip_brackets_from_title(title)
# # check_if_issue_is_duplicate(title, on_sale_date, metadata_path)

In [4]:
# TODO: refactor this method to make it more functional and easier to test
# TODO: add tests!

def get_all_from_publisher_page(publisher_url: str, page: int):
    """
    Do a thing...
    """
    # global val
    URL = 'https://www.comics.org'

    # get publisher page
    publisher_html = webscraper.simple_get(publisher_url + '?page={}'.format(page))
    publisher_soup = webscraper.transform_simple_get_html(publisher_html)
    
    # parse series table from publisher page
    series_name = [result.find('a').contents[0] for result in publisher_soup.find_all('td', {'class': 'name'})]
    series_href = [result.find('a')['href'] for result in  publisher_soup.find_all('td', {'class': 'name'})]
    series_year = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'year'})]
    series_issue_count = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'issue_count'})]
    series_published = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'published'})]
    
    # create dataframe of publisher series (on page)
    series_df = pd.DataFrame(list(zip(series_name, series_href, series_year, series_issue_count, series_published)),
                             columns=['name', 'href', 'year', 'issue_count', 'published'])

    # parse issue count as int from issue_count column
    series_df['issue_count_int'] = series_df['issue_count'].apply(lambda x: int(re.search(r'\d+', x).group()))

    # iterate over series dataframe and get issue covers and metadata
    for series_name, series_page_href, issue_count in zip(series_df['name'], series_df['href'], series_df['issue_count_int']):
        
        if issue_count < 12:
            pass
        else:
            print(series_name, series_page_href, issue_count)
            
            # construct series page url
            series_page_url  = URL + series_page_href

            # get series page
            series_page_html = webscraper.simple_get(series_page_url)
            series_page_soup = webscraper.transform_simple_get_html(series_page_html)

            # get cover gallery url for series
            if series_page_soup.find('a',  href=True, text='Cover Gallery') is None:
                pass
            else:
                cover_gallery_href = series_page_soup.find('a',  href=True, text='Cover Gallery')['href']
                cover_gallery_base_url = URL + cover_gallery_href

                # get cover gallery page
                cover_gallery_html = webscraper.simple_get(cover_gallery_base_url)
                cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

                if len(cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})) == 0:

                    # get issue hrefs from all linked issues on cover gallery
                    cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
                    issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

                    #  construct issue urls from issue hrefs
                    issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

                    # scrape issues
                    for issue_url in issue_urls:

                        # get issue page
                        issue_html = webscraper.simple_get(issue_url)
                        issue_soup = webscraper.transform_simple_get_html(issue_html)

                        # metadata
                        metadata = {}
                        metadata['series_name'] = series_name.replace('/', '|')

                        # scrape metadata from issue page
                        # title, price, pages, color, dimension, paper_stock, binding, publishing_format
                        def get_issue_metadata(soup, name):
                            if len(soup.find_all('dd', id=name)) > 0:
                                if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
                                    return soup.find_all('dd', id=name)[0].contents[0].strip()
                                else:
                                    try:
                                        return soup.find_all('dd', id=name)[0].find('a').contents[0]
                                    except:
                                        return ""
                            else:
                                return ""

                        # post process the issue title removing extraneous characters
                        metadata['title'] = issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')
                        metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')

                        # check if issue is redundant to an issue  already we pulled (variant)
                        if path.exists('./metadata/covers.jsonl'):
                            is_duplicate = check_if_issue_is_duplicate(title=metadata['title'], 
                                                                       on_sale_date=metadata['on_sale_date'], 
                                                                       metadata_path='./metadata/covers.jsonl')

                            if is_duplicate:
                                pass
                            else:
                                metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                all_issue_credits = list(zip(
                                    issue_soup.find_all('span', {'class': 'credit_label'}),  
                                    issue_soup.find_all('span', {'class': 'credit_value'})))

                                metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                # get cover section
                                cover = issue_soup.find("div", {"class": "cover"})

                                # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                cover_credits = list(zip(
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                ))

                                metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                metadata.pop('cover_reprints', None)

                                # get the cover url
                                cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                cover_img_url = URL + cover_img_href

                                # get cover page
                                cover_img_html = webscraper.simple_get(cover_img_url)
                                cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                # get image urls from cover page
                                cover_img_soup.find_all('img')

                                cover_divs = cover_img_soup.find_all('div', {'class': 'issue_covers'})[0].find_all('div')

                                def get_variant_cover_name(cover_name: str):
                                    if get_brackets(cover_name) is None:
                                        return 'Original Cover'
                                    else:
                                        return get_brackets(cover_name).replace('[','').replace(']','')

                                # go into variant url and pull metadata
                                cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
                                cover_name = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
                                cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

                                covers = list((zip(cover_names, cover_urls,  cover_images)))

                                covers_dict = {}
                                for cover in covers:
                                    name = cover[0]
                                    url = cover[1]
                                    image = cover[2]

                                    covers_dict[name] = {}
                                    covers_dict[name]['cover_url'] = url
                                    covers_dict[name]['image_url'] = image

                                metadata['variant_covers'] = {}

                                for variant_name in covers_dict:
                                    if 'Second Printing' in variant_name:
                                        pass
                                    elif ('Newsstand' in variant_name) & ('Direct Sales' in covers_dict.keys()):
                                        pass
                                    else:
                                        issue_url = covers_dict[variant_name]['cover_url']

                                        # get issue page
                                        issue_html = webscraper.simple_get(issue_url)
                                        issue_soup = webscraper.transform_simple_get_html(issue_html)

                                        cover = issue_soup.find("div", {"class": "cover"})

                                        cover_credits = list(zip(
                                                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                                        ))

                                        cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
                                        cover_credits.pop('cover_reprints', None)

                                        save_as = "{} -- {} -- {} -- {}".format(metadata['series_name'], strip_brackets_from_title(metadata['title']), variant_name, metadata['on_sale_date'], )
                                        save_to = './covers/' + save_as + '.jpg'

                                        cover_credits['cover_image_file_name'] = save_as

                                        metadata['variant_covers'][variant_name] = cover_credits

                                        # save cover image
                                        urllib.request.urlretrieve(covers_dict[variant_name]['image_url'], save_to)


                                # TODO: reason about response of save; if successful, save metadata, else contine
                                # save metadata
                                with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                    writer.write(metadata)

                                # TODO: write to log... timestamp/publisher/series/issue/
                                now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                publisher_int = publisher_url.split('/')[-2]

                                log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                    writer.write(log)

                                # slow down the requests so we don't take too many resources and get blocked
                                sleep(random.uniform(5, 10))


                        else:
                            metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                            metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                            metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                            metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                            metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                            metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                            metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                            metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                            metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                            metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                            metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                            metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                            metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                            all_issue_credits = list(zip(
                                issue_soup.find_all('span', {'class': 'credit_label'}), 
                                issue_soup.find_all('span', {'class': 'credit_value'})))

                            metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                            # get cover section
                            cover = issue_soup.find("div", {"class": "cover"})

                            # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                            cover_credits = list(zip(
                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                            ))

                            metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                            metadata.pop('cover_reprints', None)

                            # get the cover url
                            cover_img_href = cover.find("div", {'coverImage'}).a['href']
                            cover_img_url = URL + cover_img_href

                            # get cover page
                            cover_img_html = webscraper.simple_get(cover_img_url)
                            cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                            # get image urls from cover page
                            cover_img_soup.find_all('img')

                            cover_divs = cover_img_soup.find_all('div', {'class': 'issue_covers'})[0].find_all('div')

                            def get_variant_cover_name(cover_name: str):
                                if get_brackets(cover_name) is None:
                                    return 'Original Cover'
                                else:
                                    return get_brackets(cover_name).replace('[','').replace(']','')

                            # go into variant url and pull metadata
                            cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
                            cover_name = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
                            cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

                            covers = list((zip(cover_names, cover_urls,  cover_images)))

                            covers_dict = {}
                            for cover in covers:
                                name = cover[0]
                                url = cover[1]
                                image = cover[2]

                                covers_dict[name] = {}
                                covers_dict[name]['cover_url'] = url
                                covers_dict[name]['image_url'] = image

                            metadata['variant_covers'] = {}

                            for variant_name in covers_dict:
                                if 'Second Printing' in variant_name:
                                    pass
                                elif ('Newsstand' in variant_name) & ('Direct Sales' in covers_dict.keys()):
                                    pass
                                else:
                                    issue_url = covers_dict[variant_name]['cover_url']

                                    # get issue page
                                    issue_html = webscraper.simple_get(issue_url)
                                    issue_soup = webscraper.transform_simple_get_html(issue_html)

                                    cover = issue_soup.find("div", {"class": "cover"})

                                    cover_credits = list(zip(
                                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                                    ))

                                    cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
                                    cover_credits.pop('cover_reprints', None)

                                    save_as = "{} -- {} -- {} -- {}".format(metadata['series_name'], strip_brackets_from_title(metadata['title']), variant_name, metadata['on_sale_date'], )
                                    save_to = './covers/' + save_as + '.jpg'

                                    cover_credits['cover_image_file_name'] = save_as

                                    metadata['variant_covers'][variant_name] = cover_credits

                                    # save cover image
                                    urllib.request.urlretrieve(covers_dict[variant_name]['image_url'], save_to)


                            # TODO: reason about response of save; if successful, save metadata, else contine
                            # save metadata
                            with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                writer.write(metadata)

                            # TODO: write to log... timestamp/publisher/series/issue/
                            now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                            publisher_int = publisher_url.split('/')[-2]

                            log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                            with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                writer.write(log)

                            # slow down the requests so we don't take too many resources and get blocked
                            sleep(random.uniform(5, 10))

                else:
                    cover_gallery_pages = list(filter(lambda x: x.isdigit(), [x.contents[0] for x in cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})]))
                    cover_gallery_range = max([int(x) for x in cover_gallery_pages])

                    for i in range(1, cover_gallery_range + 1):
                        cover_gallery_url = str(cover_gallery_base_url + '/?page={}').format(i)
                        cover_gallery_html = webscraper.simple_get(cover_gallery_url)
                        cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

                        # get issue hrefs from all linked issues on cover gallery
                        cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
                        issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

                        #  construct issue urls from issue hrefs
                        issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

                        # scrape issues
                        for issue_url in issue_urls:

                            # get issue page
                            issue_html = webscraper.simple_get(issue_url)
                            issue_soup = webscraper.transform_simple_get_html(issue_html)

                            # metadata
                            metadata = {}
                            metadata['series_name'] = series_name.replace('/', '|')

                            # scrape metadata from issue page
                            # title, price, pages, color, dimension, paper_stock, binding, publishing_format
                            def get_issue_metadata(soup, name):
                                if len(soup.find_all('dd', id=name)) > 0:
                                    if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
                                        return soup.find_all('dd', id=name)[0].contents[0].strip()
                                    else:
                                        try:
                                            return soup.find_all('dd', id=name)[0].find('a').contents[0]
                                        except:
                                            return ""
                                else:
                                    return ""

                            metadata['title'] = issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')
                            metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')

                            # check if issue is redundant to an issue  already we pulled (variant)
                            if path.exists('./metadata/covers.jsonl'):
                                is_duplicate = check_if_issue_is_duplicate(title=metadata['title'], 
                                                                           on_sale_date=metadata['on_sale_date'], 
                                                                           metadata_path='./metadata/covers.jsonl')

                                if is_duplicate:
                                    pass
                                else:
                                    metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                    metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                    metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                    metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                    metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                    metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                    metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                    metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                    metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                    metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                    metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                    metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                    metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                    all_issue_credits = list(zip(
                                        issue_soup.find_all('span', {'class': 'credit_label'}),  
                                        issue_soup.find_all('span', {'class': 'credit_value'})))

                                    metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                    # get cover section
                                    cover = issue_soup.find("div", {"class": "cover"})

                                    # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                    cover_credits = list(zip(
                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                    ))

                                    metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                    metadata.pop('cover_reprints', None)

                                    # get the cover url
                                    cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                    cover_img_url = URL + cover_img_href

                                    # get cover page
                                    cover_img_html = webscraper.simple_get(cover_img_url)
                                    cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                    # get image urls from cover page
                                    cover_img_soup.find_all('img')

                                    cover_divs = cover_img_soup.find_all('div', {'class': 'issue_covers'})[0].find_all('div')

                                    def get_variant_cover_name(cover_name: str):
                                        if get_brackets(cover_name) is None:
                                            return 'Original Cover'
                                        else:
                                            return get_brackets(cover_name).replace('[','').replace(']','')

                                    # go into variant url and pull metadata
                                    cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
                                    cover_name = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
                                    cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

                                    covers = list((zip(cover_names, cover_urls,  cover_images)))

                                    covers_dict = {}
                                    for cover in covers:
                                        name = cover[0]
                                        url = cover[1]
                                        image = cover[2]

                                        covers_dict[name] = {}
                                        covers_dict[name]['cover_url'] = url
                                        covers_dict[name]['image_url'] = image

                                    metadata['variant_covers'] = {}

                                    for variant_name in covers_dict:
                                        if 'Second Printing' in variant_name:
                                            pass
                                        elif ('Newsstand' in variant_name) & ('Direct Sales' in covers_dict.keys()):
                                            pass
                                        else:
                                            issue_url = covers_dict[variant_name]['cover_url']

                                            # get issue page
                                            issue_html = webscraper.simple_get(issue_url)
                                            issue_soup = webscraper.transform_simple_get_html(issue_html)

                                            cover = issue_soup.find("div", {"class": "cover"})

                                            cover_credits = list(zip(
                                                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                                            ))

                                            cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
                                            cover_credits.pop('cover_reprints', None)

                                            save_as = "{} -- {} -- {} -- {}".format(metadata['series_name'], strip_brackets_from_title(metadata['title']), variant_name, metadata['on_sale_date'], )
                                            save_to = './covers/' + save_as + '.jpg'

                                            cover_credits['cover_image_file_name'] = save_as

                                            metadata['variant_covers'][variant_name] = cover_credits

                                            # save cover image
                                            urllib.request.urlretrieve(covers_dict[variant_name]['image_url'], save_to)


                                    # TODO: reason about response of save; if successful, save metadata, else contine
                                    # save metadata
                                    with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                        writer.write(metadata)

                                    # TODO: write to log... timestamp/publisher/series/issue/
                                    now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                    publisher_int = publisher_url.split('/')[-2]

                                    log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                    with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                        writer.write(log)

                                    # slow down the requests so we don't take too many resources and get blocked
                                    sleep(random.uniform(5, 10))


                            else:
                                metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                all_issue_credits = list(zip(
                                    issue_soup.find_all('span', {'class': 'credit_label'}), 
                                    issue_soup.find_all('span', {'class': 'credit_value'})))

                                metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                # get cover section
                                cover = issue_soup.find("div", {"class": "cover"})

                                # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                cover_credits = list(zip(
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                ))

                                metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                metadata.pop('cover_reprints', None)

                                # get the cover url
                                cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                cover_img_url = URL + cover_img_href

                                # get cover page
                                cover_img_html = webscraper.simple_get(cover_img_url)
                                cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                # get image urls from cover page
                                cover_img_soup.find_all('img')

                                cover_divs = cover_img_soup.find_all('div', {'class': 'issue_covers'})[0].find_all('div')

                                def get_variant_cover_name(cover_name: str):
                                    if get_brackets(cover_name) is None:
                                        return 'Original Cover'
                                    else:
                                        return get_brackets(cover_name).replace('[','').replace(']','')

                                # go into variant url and pull metadata
                                cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
                                cover_name = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
                                cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

                                covers = list((zip(cover_names, cover_urls,  cover_images)))

                                covers_dict = {}
                                for cover in covers:
                                    name = cover[0]
                                    url = cover[1]
                                    image = cover[2]

                                    covers_dict[name] = {}
                                    covers_dict[name]['cover_url'] = url
                                    covers_dict[name]['image_url'] = image

                                metadata['variant_covers'] = {}

                                for variant_name in covers_dict:
                                    if 'Second Printing' in variant_name:
                                        pass
                                    elif ('Newsstand' in variant_name) & ('Direct Sales' in covers_dict.keys()):
                                        pass
                                    else:
                                        issue_url = covers_dict[variant_name]['cover_url']

                                        # get issue page
                                        issue_html = webscraper.simple_get(issue_url)
                                        issue_soup = webscraper.transform_simple_get_html(issue_html)

                                        cover = issue_soup.find("div", {"class": "cover"})

                                        cover_credits = list(zip(
                                                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                                        ))

                                        cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
                                        cover_credits.pop('cover_reprints', None)

                                        save_as = "{} -- {} -- {} -- {}".format(metadata['series_name'], strip_brackets_from_title(metadata['title']), variant_name, metadata['on_sale_date'], )
                                        save_to = './covers/' + save_as + '.jpg'

                                        cover_credits['cover_image_file_name'] = save_as

                                        metadata['variant_covers'][variant_name] = cover_credits

                                        # save cover image
                                        urllib.request.urlretrieve(covers_dict[variant_name]['image_url'], save_to)


                                # TODO: reason about response of save; if successful, save metadata, else contine
                                # save metadata
                                with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                    writer.write(metadata)

                                # TODO: write to log... timestamp/publisher/series/issue/
                                now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                publisher_int = publisher_url.split('/')[-2]

                                log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                    writer.write(log)

                                # slow down the requests so we don't take too many resources and get blocked
                                sleep(random.uniform(5, 10))



In [5]:
#  ran page publisher/54/ page=1,2, 6

# get_all_from_publisher_page(publisher_url='https://www.comics.org/publisher/54/', page=26)

# TODO: finish running page 26

In [61]:
# # TODO: load metadata and return aggregate / summary statistics
# # TODO: write method to display cover image w/ cover metadata and add annotations to image
# # TODO: consider ideate/innotater for annotating directly in Jupyter notebooks

logged_metadata = []

with jsonlines.open('./metadata/covers.jsonl', mode='r') as reader:
    for item in reader:
        logged_metadata.append(item)
        
df = pd.DataFrame(logged_metadata)

In [62]:
# df

In [63]:
import numpy as np

def get_issue_number_from_title(title):
    issue = re.search(r"([#?])(\d+)\b", title.replace(',', ''))
    if issue is None:
        return np.nan
    else:
        return np.int(issue.group().replace('#', ''))

df['issue_number'] = df['title'].apply(get_issue_number_from_title)

df.drop(['issue_number', 'variant_covers'], axis=1).describe().T

Unnamed: 0,count,unique,top,freq
cover_characters,4268,2083,Superman,266
cover_colors,4718,292,?,2694
cover_editing,90,5,Curtis King,42
cover_first line of dialogue or text,428,347,"Ta-ta, Bats! Looks like this will be our final...",3
cover_genre,4683,30,superhero,3837
cover_inks,4733,702,Stan Kaye,253
cover_job number,635,461,C-422,4
cover_keywords,1153,792,celebrity,27
cover_letters,3978,51,?,2707
cover_pencils,4735,689,Curt Swan,373


In [64]:
# get list of unique characters across all  covers
def get_value_counts(df, column):
    """
    """
    return pd.Series("; ".join(df[column].dropna()).split('; ')).value_counts()

get_value_counts(df, 'cover_characters')[:20]

Batman [Bruce Wayne]        928
Superman                    757
Robin [Dick Grayson]        464
Kal-El]                     328
Superman [Clark Kent        320
Batman                      318
Superman [Clark Kent]       204
Lois Lane                   167
Superboy                    125
Bob Hope                    102
Lex Luthor                   97
Batgirl [Barbara Gordon]     81
Joker                        78
Supergirl                    77
Batgirl [Cassandra Cain]     73
Jimmy Olsen                  63
Jerry Lewis                  60
Superboy [Clark Kent]        52
Doiby Dickles                49
Robin                        47
dtype: int64

In [65]:
get_value_counts(df, 'cover_pencils')[:20]

Curt Swan                384
Win Mortimer             210
Sheldon Moldoff          137
Bob Oksner               113
Gil Kane                 103
Dave Johnson (signed)     99
Ross Andru (signed)       86
Owen Fitzgerald           78
Jim Aparo (signed)        76
Wayne Boring              68
Neal Adams                65
Dick Sprang               59
Jerry Grandenetti         59
Jack Burnley              58
Kelley Jones (signed)     51
Ed Hannigan (signed)      50
Al Plastino               49
Jerry Ordway (signed)     49
J. G. Jones (signed)      49
Tom Grummett (signed)     47
dtype: int64

In [66]:
get_value_counts(df, 'series_name')[:20]

Action Comics                                  1136
Batman                                         1089
Adventure Comics                                528
Detective Comics                                330
Adventures of Superman                          306
Batgirl                                         190
All-American Men of War                         116
100 Bullets                                     115
The Adventures of Bob Hope                      109
All-American Comics                             103
All Star Western                                 97
The Adventures of Jerry Lewis                    84
80 Page Giant Magazine                           56
52                                               52
Advanced Dungeons & Dragons Comic Book           50
The Adventures of Rex the Wonder Dog             46
Action Comics Weekly                             42
The Adventures of Dean Martin & Jerry Lewis      40
Batman '66                                       31
Adventures o

In [67]:
def concat_values_by_series(df, concat_column, series_name):    

    list_of_synopsis = df[
        (df['series_name'] == series_name) & df['issue_number'] != 0.0].sort_values('issue_number')[concat_column]

    return " | ".join(list_of_synopsis.values)

print(concat_values_by_series(df, concat_column='synopsis', series_name="Detective Comics")[:8000], '...')

Speed investigates the murder of several Chinamen along the wharf area and uncovers a human smuggling operation. | A millionaire that has just bought the fabulously valuable Rhangwa Pearls receives a letter from someone threatening to steal them. | Confronted by a series of murders, Bret forages into the Peruvian jungles to ascertain why a hole appears in each victim's throat, but they were not shot! | Venturing into downtown San Francisco, Bruce finds it odd to see a Chinese restaurant among the buildings, and no one to wait upon him when he enters desiring a meal. Stranger still, another couple enters and is served anything they desire. When he decides to stick around and see what develops, he and the couple are suddenly grabbed and blindfolded! | Gus is sent to a rich woman's home to watch over her pearls during a party and nabs a man he sees pocketing them. Unfortunately for Gus, it is the Chief! | Bart is called on to volunteer for a secret spy case, but in doing so, he must forgo

In [72]:
# dc US comics publisher page
url = 'https://www.comics.org/publisher/54/?page=1'

html = webscraper.simple_get(url)
soup = webscraper.transform_simple_get_html(html)

In [73]:
type(html)

bytes

In [74]:
# f = open('../comicvision/resources/publisher_54_page_1', 'wb')
# f.write(html)
# f.close()

In [75]:
# parse series' metadata

name = soup.find_all('td', {'class': 'name'})
year = soup.find_all('td', {'class': 'year'})
issue_count = soup.find_all('td', {'class': 'issue_count'})
covers = soup.find_all('td', {'class': 'covers'})
published = soup.find_all('td', {'class': 'published'})

In [76]:
# TODO: get list of series names and urls from a publisher page (e.g. page=1)

n = [result.find('a').contents[0] for result in name]
href = [result.find('a')['href'] for result in  name]
y = [result.contents[0] for result in year]
i = [result.contents[0] for result in issue_count]
# c = [result.find('a').contents[0] for result in covers]
p = [result.contents[0] for result in published]

In [77]:
# TODO: reason about a series' metadata (# of issues, #  of covers)

# create series dataframe
series_df = pd.DataFrame(list(zip(n, href,  y, i, p)), columns=['name', 'href', 'year', 'issue_count', 'published'])

# parse issue count as int from issue_count  column
series_df['issue_count_int'] = series_df['issue_count'].apply(lambda x: int(re.search(r'\d+', x).group()))

In [78]:
series_df[series_df['issue_count_int'] > 12]

Unnamed: 0,name,href,year,issue_count,published,issue_count_int
3,100 Bullets,/series/6133/,1999,100 issues (100 indexed),August 1999 - April 2009,100
4,100 Bullets,/series/24535/,2000,13 issues (8 indexed),[January] 2000 - [July] 2009,13
16,1st Issue Special,/series/2212/,1975,13 issues (13 indexed),April 1975 - April 1976,13
22,52,/series/16626/,2006,52 issues (52 indexed),July 2006 - July 2007,52
34,80 Page Giant Magazine,/series/1620/,1964,56 issues (15 indexed),August 1964 - February-March 1969,56
98,Action Comics,/series/97/,1938,866 issues (866 indexed),June 1938 - October 2011,866
99,Action Comics,/series/59922/,2011,117 issues (117 indexed),November 2011 - Present,117


In [79]:
url = 'https://www.comics.org'
series_urls = url + series_df['href']

# take a series  with many issues...
series_page_url = series_urls[98]

series_page_html = webscraper.simple_get(series_page_url)
series_page_soup = webscraper.transform_simple_get_html(series_page_html)

In [80]:
series_page_url

'https://www.comics.org/series/97/'

In [81]:
# f = open('../comicvision/resources/series_97_page_1', 'wb')
# f.write(html)
# f.close()

In [82]:
URL = 'https://www.comics.org'

# get 'series details cover gallery' url
cover_gallery_url = url + series_page_soup.find('a',  href=True, text='Cover Gallery')['href']
cover_gallery_url = cover_gallery_url  + '/?page=22'

cover_gallery_html = webscraper.simple_get(cover_gallery_url)
cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

if len(cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})) > 0:

    # get issue hrefs from all linked issues on cover gallery
    cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
    issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

    #  construct issue urls from issue hrefs
    issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

In [84]:
cover_gallery_url

'https://www.comics.org/series/97/covers/?page=22'

In [121]:
def get_brackets(title: str) -> Union[str, None]:
    """
    Return the substring of the first instance of bracketed text.
    """
    regex_brackets = re.search(r"\[(.*?)\]", title)
    if regex_brackets is None:
        return None
    else:
        return regex_brackets.group()
    

def is_reprinting(title: str) -> bool:
    """
    Check if a string contains some substrings.
    """
    is_second_printing = "Second Printing" in title
    is_2nd_printing = "2nd Printing" in title
    is_3rd_printing = "3rd Printing" in title
    is_4th_printing = "4th Printing" in title
    is_5th_printing = "5th Printing" in title
    is_6th_printing = "6th Printing" in title
    is_7th_printing = "7th Printing" in title
    is_8th_printing = "8th Printing" in title
    is_9th_printing = "9th Printing" in title
    is_10th_printing = "10th Printing" in title
    return (
        is_second_printing
        | is_2nd_printing
        | is_3rd_printing
        | is_4th_printing
        | is_5th_printing
        | is_6th_printing
        | is_7th_printing
        | is_8th_printing
        | is_9th_printing
        | is_10th_printing
    )


def is_newsstand_or_canadian(title) -> bool:
    """
    Check if an issue is a duplicate Newsstand issue.
    """
    return ("newsstand" in title.lower()) | ("canadian" in title.lower())


def is_variant(title) -> bool:
    """
    Check if an issue is variant cover.
    """
    return ("variant" in title.lower())


def is_redundant(title: str) -> bool:
    """
    Check if an issue is a redundant to a direct sale issue.
    """
    if title is None:
        return False
    else:
        return (is_reprinting(title) | is_newsstand_or_canadian(title) | is_variant(title)) | (('cover' in title.lower()) & ('direct' not in  title.lower()))


cover_refs = [(x.get_text(), x['href']) for x in cover_gallery_soup.find_all('a',  href=True)]

cover_refs = list(filter(lambda x: '/issue/' in x[1] and '/cover/' not in x[1], cover_refs))

cover_refs = [(get_brackets(x[0]), x[1]) for x in cover_refs]

cover_refs = list(filter(lambda x: not is_redundant(x[0]), cover_refs))

issue_hrefs = [x[1] for x  in cover_refs]

issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

# cover_refs

['/issue/370657/',
 '/issue/370658/',
 '/issue/370659/',
 '/issue/370660/',
 '/issue/374477/',
 '/issue/374478/',
 '/issue/374479/',
 '/issue/374480/',
 '/issue/503361/',
 '/issue/512403/',
 '/issue/524135/',
 '/issue/524136/',
 '/issue/524137/',
 '/issue/524138/',
 '/issue/524139/',
 '/issue/524140/',
 '/issue/873707/',
 '/issue/524141/',
 '/issue/535174/',
 '/issue/545599/',
 '/issue/562837/',
 '/issue/562838/',
 '/issue/562839/',
 '/issue/606084/',
 '/issue/606085/',
 '/issue/606086/',
 '/issue/669657/',
 '/issue/669658/',
 '/issue/669659/',
 '/issue/682127/',
 '/issue/682128/',
 '/issue/702643/',
 '/issue/715100/',
 '/issue/732936/',
 '/issue/745730/',
 '/issue/750694/',
 '/issue/765369/']

In [35]:
issue_urls[0]

'https://www.comics.org/issue/370657/'

In [None]:
# f = open('../comicvision/resources/issue_370657', 'wb')
# f.write(html)
# f.close()

In [9]:
# take an issue url from the covers gallery..
# issue_url = issue_urls[0]
issue_url = "https://www.comics.org/issue/21497/cover/4/"

issue_html = webscraper.simple_get(issue_url)
issue_soup = webscraper.transform_simple_get_html(issue_html)

In [10]:
# issue_soup.find_all("a", {"class": "btn btn-default btn-sm"})

# cover_gallery_pages = list(
#                         filter(
#                             lambda x: x.isdigit(),
#                             [
#                                 x.contents[0]
#                                 for x in issue_soup.find_all(
#                                     "a", {"class": "btn btn-default btn-sm"}
#                                 )
#                             ],
#                         )
#                     )

# cover_gallery_range = max([int(x) for x in cover_gallery_pages])

# cover_gallery_range

In [11]:
f = open('../comicvision/resources/issue_21497_cover_4', 'wb')
f.write(issue_html)
f.close()

In [45]:
# issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')

In [46]:
def get_issue_metadata(soup, name):
    if len(soup.find_all('dd', id=name)) > 0:
        if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
            return soup.find_all('dd', id=name)[0].contents[0].strip()
        else:
            return soup.find_all('dd', id=name)[0].find('a').contents[0]
    else:
        return ""

In [49]:
# get metadata from issue url

# title, price, pages, color, dimension, paper_stock, binding, publishing_format
metadata = {}

metadata['title'] = issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')
metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')

metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

all_issue_credits = list(zip(
    issue_soup.find_all('span', {'class': 'credit_label'}), 
    issue_soup.find_all('span', {'class': 'credit_value'})))

metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))


# get cover section
cover = issue_soup.find("div", {"class": "cover"})

# cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
cover_credits = list(zip(
    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
))

metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
metadata.pop('cover_reprints', None)

# get the cover url
cover_img_href = cover.find("div", {'coverImage'}).a['href']
cover_img_url = URL + cover_img_href

# get cover page
cover_img_html = webscraper.simple_get(cover_img_url)
cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

# get image urls from cover page
cover_img_soup.find_all('img')

cover_divs = cover_img_soup.find_all('div', {'class': 'issue_covers'})[0].find_all('div')

def get_variant_cover_name(cover_name: str):
    if get_brackets(cover_name) is None:
        return 'Original Cover'
    else:
        return get_brackets(cover_name).replace('[','').replace(']','')

# go into variant url and pull metadata
cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
cover_names = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

covers = list((zip(cover_names, cover_urls,  cover_images)))

covers_dict = {}
for cover in covers:
    name = cover[0]
    url = cover[1]
    image = cover[2]

    covers_dict[name] = {}
    covers_dict[name]['cover_url'] = url
    covers_dict[name]['image_url'] = image

metadata['variant_covers'] = {}
    
for variant_name in covers_dict:
    if 'Second Printing' in variant_name:
        pass
    elif ('Newsstand' in variant_name) & ('Direct Sales' in covers_dict.keys()):
        pass
    else:
        issue_url = covers_dict[variant_name]['cover_url']

        # get issue page
        issue_html = webscraper.simple_get(issue_url)
        issue_soup = webscraper.transform_simple_get_html(issue_html)

        cover = issue_soup.find("div", {"class": "cover"})

        cover_credits = list(zip(
                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                            [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                        ))

        cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
        cover_credits.pop('cover_reprints', None)
        
#         save_as = "{} -- {} -- {} -- {}".format(metadata['series_name'], strip_brackets_from_title(metadata['title']), variant_name, metadata['on_sale_date'], )
#         save_to = './covers/' + save_as + '.jpg'
        
#         cover_credits['cover_image_file_name'] = save_as

#         metadata['variant_covers'][variant_name] = cover_credits

#         # save cover image
#         urllib.request.urlretrieve(covers[variant_name], save_to)
    
    
# # TODO: reason about response of save; if successful, save metadata, else contine
# # save metadata
# with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
#     writer.write(metadata)

# # TODO: write to log... timestamp/publisher/series/issue/
# now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
# publisher_int = publisher_url.split('/')[-2]

# log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
# with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
#     writer.write(log)

# # slow down the requests so we don't take too many resources and get blocked
# sleep(random.uniform(5, 10))


In [52]:
for variant_name in covers_dict:
    print(covers_dict[variant_name]['cover_url'])

https://www.comics.org/issue/36858/
https://www.comics.org/issue/1248505/
https://www.comics.org/issue/1837315/


In [55]:
issue_url = "https://www.comics.org/issue/1837315/"

issue_html = webscraper.simple_get(issue_url)
issue_soup = webscraper.transform_simple_get_html(issue_html)

f = open('../comicvision/resources/issue_1837315', 'wb')
f.write(issue_html)
f.close()

In [1235]:
# go into variant url and pull metadata
cover_images = [x.find_all('a')[0].contents[0]['src'] for x in cover_divs]
cover_names = [get_variant_cover_name(x.find_all('a')[1].contents[0]) for x in cover_divs]
cover_urls = [URL + x.find_all('a')[0]['href'] for x in cover_divs]

covers = list((zip(cover_names, cover_urls,  cover_images)))

covers_dict = {}
for cover in covers:
    name = cover[0]
    url = cover[1]
    image = cover[2]

    covers_dict[name] = {}
    covers_dict[name]['cover_url'] = url
    covers_dict[name]['image_url'] = image
    
    
issue_url = covers_dict[variant_name]['cover_url']

# get issue page
issue_html = webscraper.simple_get(issue_url)
issue_soup = webscraper.transform_simple_get_html(issue_html)

cover = issue_soup.find("div", {"class": "cover"})

cover_credits = list(zip(
                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                ))

cover_credits = {"cover_{}".format(x[0].lower()): x[1] for x in cover_credits}
cover_credits.pop('cover_reprints', None)

In [1256]:
cover_credits

{'cover_pencils': 'Gary Frank (signed)',
 'cover_inks': 'Gary Frank (signed)',
 'cover_colors': '?',
 'cover_letters': 'typeset',
 'cover_genre': 'superhero',
 'cover_characters': 'Superman',
 'cover_keywords': 'Legion of Super-Heroes flight ring'}