In [26]:
import os
import re
import urllib.request

import jsonlines
import pandas as pd
from bs4 import BeautifulSoup
from requests.exceptions import RequestException

import comicvision.webscraper as webscraper

In [27]:
# TODO: refactor this method to make it more functional and easier to test
# TODO: add tests!

# TODO: add a random wait between requests so the scraper is throttled and looks less like a bot to the server
# TODO: add log to track which publisher/series/issue has been completed

# ids of the <dd> elements read from every issue page, in the order they are
# written into the metadata record
_ISSUE_METADATA_FIELDS = (
    'issue_price',
    'issue_pages',
    'format_color',
    'format_dimensions',
    'format_paper_stock',
    'format_binding',
    'format_publishing_format',
)


def _parse_issue_count(text):
    """Return the first run of digits in ``text`` as an int, or 0 if there is none."""
    match = re.search(r'\d+', str(text))
    return int(match.group()) if match else 0


def _get_issue_metadata(soup, name):
    """
    Return the stripped first child of the first ``<dd id=name>`` in ``soup``.

    Returns an empty string when the element is absent or has no contents.
    """
    field = soup.find('dd', id=name)
    if field is None or not field.contents:
        return ""
    return field.contents[0].strip()


def _find_issue_hrefs(cover_gallery_soup, issue_count):
    """
    Collect the hrefs of the issue links found on a cover gallery page.

    For every issue number in 1..issue_count the link whose text is exactly
    the number is used, falling back to "<number> [Direct]". Numbers without
    a matching link are skipped. Only hrefs containing 'issue' are returned.
    """
    issue_hrefs = []
    for number in range(1, issue_count + 1):
        for label in (str(number), '{} [Direct]'.format(number)):
            issue_tag = cover_gallery_soup.find('a', href=True, text=label)
            if issue_tag is not None:
                issue_hrefs.append(issue_tag['href'])
                break

    # filter out junk hrefs: anything empty or not pointing at an issue page
    return [href for href in issue_hrefs if href and 'issue' in href]


def _scrape_issue(issue_url, series_name, base_url, covers_dir, metadata_path):
    """
    Scrape one issue page: download its cover image and append its metadata.

    Returns True when the cover was saved and the metadata record written,
    False when the issue was skipped (no cover section / link / image on the
    page, or the image download failed). A skipped issue writes no metadata.
    """
    # get issue page
    issue_html = webscraper.simple_get(issue_url)
    issue_soup = webscraper.transform_simple_get_html(issue_html)

    # '/' is replaced because these values become part of the image file name
    metadata = {'series_name': series_name.replace('/', '|')}
    page_title = issue_soup.find('title').contents[0]
    metadata['title'] = page_title.replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')

    # title, price, pages, color, dimension, paper_stock, binding, publishing_format
    for field in _ISSUE_METADATA_FIELDS:
        metadata[field] = _get_issue_metadata(issue_soup, name=field)

    # get cover section and credits from issue page
    cover_section = issue_soup.find("div", {"class": "cover"})
    if cover_section is None:
        print('  skipping {}: no cover section found'.format(issue_url))
        return False

    # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
    credit_labels = [tag.contents[0] for tag in cover_section.find_all('span', {'class': 'credit_label'})]
    credit_values = [tag.contents[0] for tag in cover_section.find_all('span', {'class': 'credit_value'})]
    metadata.update({'cover_{}'.format(label.lower()): value
                     for label, value in zip(credit_labels, credit_values)})
    metadata.pop('cover_reprints', None)

    # get the cover url
    cover_image_div = cover_section.find("div", {'class': 'coverImage'})
    cover_link = cover_image_div.a if cover_image_div is not None else None
    if cover_link is None or not cover_link.get('href'):
        print('  skipping {}: no cover image link found'.format(issue_url))
        return False
    cover_img_url = base_url + cover_link['href']

    # get cover page
    cover_img_html = webscraper.simple_get(cover_img_url)
    cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

    # keep only the raw, highest res image(s) served from the files1 host
    cover_image_srcs = [img['src'] for img in cover_img_soup.select('img')
                        if 'files1.comics.org//img/' in img.get('src', '')]
    if not cover_image_srcs:
        print('  skipping {}: no high resolution cover image found'.format(issue_url))
        return False

    # construct where to save the cover image
    save_as = "{} -- {}".format(metadata['series_name'], metadata['title'])
    save_to = os.path.join(covers_dir, save_as + '.jpg')
    metadata["cover_image_file_name"] = save_as

    # save cover image; only record metadata for covers that were actually saved
    try:
        urllib.request.urlretrieve(cover_image_srcs[0], save_to)
    except OSError as error:  # urllib's URLError/HTTPError are OSError subclasses
        print('  skipping {}: cover download failed ({})'.format(issue_url, error))
        return False

    # save metadata
    with jsonlines.open(metadata_path, mode='a') as writer:
        writer.write(metadata)

    # TODO: write to log... timestamp/publisher/series/issue/
    return True


def get_all_from_publisher_page(publisher_url: str, page: int, min_issue_count: int = 12,
                                covers_dir: str = './covers',
                                metadata_path: str = './metadata/covers.jsonl'):
    """
    Scrape cover images and issue metadata for every series listed on one
    page of a comics.org publisher listing.

    For each series on the page with at least ``min_issue_count`` issues, the
    series page and its "Cover Gallery" page are fetched, every numbered issue
    linked from the gallery is visited, its cover image is saved as
    ``<covers_dir>/<series> -- <title>.jpg`` and one metadata record is
    appended to ``metadata_path`` (JSON Lines). The name, href and issue count
    of each scraped series are printed as progress output.

    Series without a "Cover Gallery" link, and issues whose cover cannot be
    located or downloaded, are skipped with a printed message instead of
    aborting the whole run.

    Parameters
    ----------
    publisher_url : str
        Publisher listing url, e.g. 'https://www.comics.org/publisher/78/'.
    page : int
        Page number, appended to ``publisher_url`` as '?page=<page>'.
    min_issue_count : int, optional
        Series with fewer issues than this are ignored (default 12).
    covers_dir : str, optional
        Directory the cover images are saved into; created if missing.
    metadata_path : str, optional
        JSON Lines file the metadata records are appended to; its parent
        directory is created if missing.

    Returns
    -------
    None

    Notes
    -----
    All pages are fetched via ``webscraper.simple_get`` and parsed via
    ``webscraper.transform_simple_get_html``; failures inside those helpers
    are not handled here (NOTE(review): confirm what they return on a failed
    request).
    """
    base_url = 'https://www.comics.org'

    # make sure the output locations exist before any download is attempted
    os.makedirs(covers_dir, exist_ok=True)
    metadata_dir = os.path.dirname(metadata_path)
    if metadata_dir:
        os.makedirs(metadata_dir, exist_ok=True)

    # get publisher page
    publisher_html = webscraper.simple_get(publisher_url + '?page={}'.format(page))
    publisher_soup = webscraper.transform_simple_get_html(publisher_html)

    # parse series table from publisher page
    name_cells = publisher_soup.find_all('td', {'class': 'name'})
    series_names = [cell.find('a').contents[0] for cell in name_cells]
    series_hrefs = [cell.find('a')['href'] for cell in name_cells]
    series_years = [cell.contents[0] for cell in publisher_soup.find_all('td', {'class': 'year'})]
    series_issue_counts = [cell.contents[0] for cell in publisher_soup.find_all('td', {'class': 'issue_count'})]
    series_published = [cell.contents[0] for cell in publisher_soup.find_all('td', {'class': 'published'})]

    # create dataframe of publisher series (on page)
    series_df = pd.DataFrame(
        list(zip(series_names, series_hrefs, series_years, series_issue_counts, series_published)),
        columns=['name', 'href', 'year', 'issue_count', 'published'])

    # parse issue count as int from issue_count column
    series_df['issue_count_int'] = series_df['issue_count'].apply(_parse_issue_count)

    # iterate over series dataframe and get issue covers and metadata
    for series_name, series_page_href, issue_count in zip(
            series_df['name'], series_df['href'], series_df['issue_count_int']):

        if issue_count < min_issue_count:
            continue

        print(series_name, series_page_href, issue_count)

        # get series page
        series_page_html = webscraper.simple_get(base_url + series_page_href)
        series_page_soup = webscraper.transform_simple_get_html(series_page_html)

        # get cover gallery url for series
        cover_gallery_link = series_page_soup.find('a', href=True, text='Cover Gallery')
        if cover_gallery_link is None:
            print('  skipping series {}: no cover gallery link found'.format(series_page_href))
            continue

        # get cover gallery page
        cover_gallery_html = webscraper.simple_get(base_url + cover_gallery_link['href'])
        cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

        # scrape every issue linked from the cover gallery
        for issue_href in _find_issue_hrefs(cover_gallery_soup, issue_count):
            _scrape_issue(base_url + issue_href, series_name, base_url, covers_dir, metadata_path)

In [28]:
# Scrape page 2 of the comics.org listing for publisher 78. The function prints
# one "name href issue_count" line for each series it scrapes (series with at
# least 12 issues), saves cover images under ./covers/ and appends metadata
# records to ./metadata/covers.jsonl.
get_all_from_publisher_page(publisher_url='https://www.comics.org/publisher/78/', page=2)

The Adventures of Spider-Man /series/17311/ 12
Adventures of the Big Boy /series/11758/ 20
The Adventures of the X-Men /series/11999/ 12
The Adventures of the X-Men / The Adventures of Spider-Man /series/52528/ 12
Age of Apocalypse /series/63931/ 14
Agent X /series/11848/ 15
Akira /series/3636/ 38
ALF /series/3637/ 50
Alias /series/10240/ 28
Alien Legion /series/2858/ 20
Alien Legion /series/3403/ 18
The All New Exiles /series/9833/ 12
All Surprise Comics /series/325/ 12
All True Crime /series/626/ 16
All-New Ghost Rider /series/79792/ 12
All-New Guardians of the Galaxy /series/113811/ 12


In [None]:
# TODO: load metadata and return aggregate / summary statistics


In [None]:
# TODO: write method to display cover image w/ cover metadata and add annotations to image
# TODO: consider ideate/innotater for annotating directly in Jupyter notebooks

In [29]:
# # marvel US comics publisher page
# url = 'https://www.comics.org/publisher/78/?page=1'

# html = webscraper.simple_get(url)
# soup = webscraper.transform_simple_get_html(html)

In [30]:
# # parse series' metadata

# name = soup.find_all('td', {'class': 'name'})
# year = soup.find_all('td', {'class': 'year'})
# issue_count = soup.find_all('td', {'class': 'issue_count'})
# covers = soup.find_all('td', {'class': 'covers'})
# published = soup.find_all('td', {'class': 'published'})

In [31]:
# # TODO: get list of series names and urls from a publisher page (e.g. page=1)

# n = [result.find('a').contents[0] for result in name]
# href = [result.find('a')['href'] for result in  name]
# y = [result.contents[0] for result in year]
# i = [result.contents[0] for result in issue_count]
# # c = [result.find('a').contents[0] for result in covers]
# p = [result.contents[0] for result in published]

In [32]:
# # TODO: reason about a series' metadata (# of issues, #  of covers)

# # create series dataframe
# series_df = pd.DataFrame(list(zip(n, href,  y, i, p)), columns=['name', 'href', 'year', 'issue_count', 'published'])

# # parse issue count as int from issue_count  column
# series_df['issue_count_int'] = series_df['issue_count'].apply(lambda x: int(re.search(r'\d+', x).group()))

In [33]:
# series_df[series_df['issue_count_int'] > 12]

In [34]:
# url = 'https://www.comics.org'
# series_urls = url + series_df['href']

# # take a series  with many issues...
# series_page_url = series_urls[6]

# series_page_html = webscraper.simple_get(series_page_url)
# series_page_soup = webscraper.transform_simple_get_html(series_page_html)

In [35]:
# # get 'series details cover gallery' url
# cover_gallery_url = url + series_page_soup.find('a',  href=True, text='Cover Gallery')['href']

# cover_gallery_html = webscraper.simple_get(cover_gallery_url)
# cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

In [36]:
# cover_gallery_url

In [37]:
# # get issue hrefs from all linked issues on cover gallery
# issue_hrefs = list()
# for i in range(1, (84 + 1)):
#     if cover_gallery_soup.find('a',  href=True,  text=i) is not None:
#         issue_tag = cover_gallery_soup.find('a',  href=True,  text=i)
#         issue_hrefs.append(issue_hrefs.append(issue_tag['href']))
#         continue
#     elif cover_gallery_soup.find('a', href=True,  text='{} [Direct]'.format(i)) is not None:
#         issue_tag = cover_gallery_soup.find('a', href=True,  text='{} [Direct]'.format(i))
#         issue_hrefs.append(issue_hrefs.append(issue_tag['href']))
#         continue
#     else:
#         continue

# # filter out junk hrefs (I dunno... this is scraping... <shrug>)
# issue_hrefs = list(filter(lambda x: 'issue' in x, [i for i in issue_hrefs if i]))

# #  construct issue urls from issue hrefs
# issue_urls = [url + issue_href for issue_href in  issue_hrefs]

In [38]:
# # take an issue url from the covers gallery..
# issue_url = issue_urls[0]

# issue_html = webscraper.simple_get(issue_url)
# issue_soup = webscraper.transform_simple_get_html(issue_html)

In [39]:
# issue_url

In [40]:
# # get metadata from issue url

# # title, price, pages, color, dimension, paper_stock, binding, publishing_format
# metadata = {}

# metadata['title'] = soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1]
# metadata['issue_price'] = soup.find_all('dd', id='issue_price')[0].contents[0].strip()
# metadata['issue_pages'] = soup.find_all('dd', id='issue_pages')[0].contents[0].strip()
# metadata['format_color'] = soup.find_all('dd', id='format_color')[0].contents[0].strip()
# metadata['format_dimensions'] = soup.find_all('dd', id='format_dimensions')[0].contents[0].strip()
# metadata['format_paper_stock'] = soup.find_all('dd', id='format_paper_stock')[0].contents[0].strip()
# metadata['format_binding'] = soup.find_all('dd', id='format_binding')[0].contents[0].strip()
# metadata['format_publishing_format'] = soup.find_all('dd', id='format_publishing_format')[0].contents[0].strip()


# # get cover section
# cover = soup.find("div", {"class": "cover"})

# # editing, script, pencils, inks, colors, letters, characters, etc...
# cover_credits = list(zip(
#     [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
#     [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
# ))

# metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
# metadata

In [41]:
# href = cover.find("div", {'coverImage'}).a['href']
# cover_img_url = 'https://www.comics.org' + href
# cover_img_url

In [42]:
# raw_html = webscraper.simple_get(cover_img_url)
# html = BeautifulSoup(raw_html, 'html.parser')
# images = html.select('img')

# cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], images))

# cover_image = cover_images[0]['src']
# cover_image

In [43]:
# issue_title = metadata['title']
# issue_title

In [44]:
# save_to = './' + issue_title + '.jpg'

# urllib.request.urlretrieve(cover_image, save_to)