In [73]:
import requests
from collections import namedtuple
# import validators
from bs4 import BeautifulSoup, NavigableString
from halo import Halo
def get_soup(url):
    '''
    Returns a BeautifulSoup version of a web page.
    '''
    response = requests.get(url)
    content = response.text
    return BeautifulSoup(content)

def get_links(table):
    '''
    Retrieves links from an embedded tabular structure.
    '''
    links = set()
    for child in table.children:
        for link in child.find_all('a', href=True):
            links.add(link['href'])

    return links



In [0]:
url = 'http://christmas-specials.wikia.com/'
    #data='xmas'
    # if not validators.url(url):
    #     print('Invalid URL')
    # path = pathlib.Path(data)

spinner = Halo(text='Fetching article list . . .')
spinner.start()
    
soup = get_soup(url + '/wiki/Special:AllPages')
spinner.succeed(text='Fetching list ===> Done')

'''Process the articles'''

articles = set()

spinner = Halo(text='Processing articles . . .')
spinner.start()

table_chunks = soup.find('table', 
        {'class': ['allpageslist', 'mw-allpages-table-chunk']})
    
chunks = set(get_links(table_chunks))

chunks_done = 0
chunks_left = len(chunks)

In [0]:
# proof of concept

article = '/wiki/Mr._Monk_and_the_Man_Who_Shot_Santa_Claus'

def get_article_data(article, url='http://christmas-specials.wikia.com/'):
    Article = namedtuple('Article', 'title contents categories related')
    article_soup = get_soup(url + article)
    article_title = article_soup.find('h1').text
    # article_contents = article_soup.find_all('div', {'class': ['mw-content-ltr', 'mw-content-text', 'mw-collapsible', 'mw-made-collapsible']})
    related = set()
    for tag in soup.select('h2 ~ ul > li'):
        related.add((tag.text, tag.a['href']))

    article_contents = ' '.join([p.text for p in article_soup.find_all('p')])
    article_categories = [li['data-name'] for li in article_soup.find_all('li', {'class': 'category normal', 'data-type': 'normal'})]
    article = Article(title=article_title, contents=article_contents, categories=article_categories, related=related)
    return article

monk = get_article_data(article)

In [220]:
print(monk.title)
print(monk.categories)
print(monk.related)
print(monk.contents)

Mr. Monk and the Man Who Shot Santa Claus
['Episodes', 'Originally aired on the USA Network', '2007 releases', 'Universal Studios', 'ABC Studios']
[('"Mr. Monk and the Secret Santa"\n', '/wiki/Mr._Monk_and_the_Secret_Santa'), ('"Mr. Monk Meets His Dad"\n', '/wiki/Mr._Monk_Meets_His_Dad'), ('"Mr. Monk and the Miracle"\n', '/wiki/Mr._Monk_and_the_Miracle'), ('"Mr. Monk and the Man Who Shot Santa Claus" at the Internet Movie Database\n', 'http://www.imdb.com/title/tttt1111043/'), ('"Mr. Monk and the Man Who Shot Santa Claus" at The Daily.WAV\n', 'http://www.dailywav.com/program.php?Program=Monk')]
He's a... bad Santa.
Very bad, bad, Santa.
Bad, bad, bad Santa.
 — Adrian Monk "Mr. Monk and the Man Who Shot Santa Claus" is the third Christmas episode of the USA Network original series Monk, produced and aired as the tenth episode of the its sixth season.
 Another Christmas shopping season, which fills Adrian Monk's heart with dread. Natalie Teeger is fighting a Christmastime traffic jam in 

In [0]:
spinner.text = f'Processing chunks: {chunks_done}/{chunks_left}'

'''Process the chunks'''
while len(chunks) > 0:
    current_link = chunks.pop()
    soup = get_soup(url + current_link)
    current_table = soup.find('table', 
    {'class': ['allpageslist', 'mw-allpages-table-chunk']})
    if 'allpageslist' in current_table.get('class'):
        child_chunks = get_links(current_table)
        chunks = chunks.union(child_chunks)
        chunks_left += len(child_chunks)
    if 'mw-allpages-table-chunk' in current_table.get('class'):
        for child in current_table.children:
            if isinstance(child, NavigableString):
                continue
            else:
                for link in child.find_all('a', href=True):
                    article_link = link['href']
                    # article = article_link.split('/')[-1]
                    # articles.add((article, article.replace('_', ' ')))
                    try:
                        articles.add(get_article_data(article_link))
                    except Exception as e:
                        print(e, article_link)
      
    chunks_done += 1
    spinner.text = f'Processing chunks: {chunks_done}/{chunks_left}'

spinner.succeed(text='Processing chunks ===> Done')

In [0]:
len(articles)


In [100]:
spinner = Halo(text=f'Printing {len(articles)} articles')
spinner.start()

# for article in sorted(articles):
#    with open(f'{article}.txt', 'w') as outfile:
#        outfile.write(article)
#        outfile.write('\n')

spinner.succeed(text=f'Printing {len(articles)} articles. ===> Done')

⠋ Printing 8684 articles⠙ Printing 8684 articles✔ Printing 8684 articles. ===> Done


<halo.halo.Halo at 0x10a5c2940>