## Imports

In [1]:
import sys
import urllib.request
import re
import pandas as pd
from bs4 import BeautifulSoup

## Helper Functions

In [141]:
def preprocess_page(content):
    ''' Trim every line of the page and glue the lines back together with no separator. '''
    # Stripping each line drops the indentation / trailing blanks that sit
    # between tags; joining on '' removes the newlines themselves.
    trimmed_lines = map(str.strip, content.split('\n'))
    return ''.join(trimmed_lines)

def find_all_topic_page_links():
    '''Collect the link targets listed on the joke-categories index page.

    Downloads http://jokes.cc.com/joke-categories, locates the first
    <ul class="list_horiz"> element and, for each of its direct children,
    records link.get('href') for every <a> tag found inside that child.

    Returns:
        list: the collected href values, in document order.

    If the page cannot be opened, prints "Unable to open URL" and terminates
    with exit status 1 (SystemExit).
    '''
    try:
        # The context manager closes the HTTP response even when read/decode fails.
        with urllib.request.urlopen('http://jokes.cc.com/joke-categories') as response:
            content = response.read().decode('utf8')
    except IOError:
        print("Unable to open URL")
        # sys.exit rather than the site-provided exit(), which is meant for
        # the interactive shell and is not guaranteed to exist in programs.
        sys.exit(1)

    # *content* becomes a single-line html string, ready for BeautifulSoup.
    soup = BeautifulSoup(preprocess_page(content), "lxml")

    topic_page_urls = []
    # Iterating a tag walks its direct children (one per category entry).
    for category in soup.find("ul", class_='list_horiz'):
        for link in category.find_all('a'):
            topic_page_urls.append(link.get('href'))
    return topic_page_urls

def find_joke_page_links_from_topic_pages(url_list):
    '''Find and return all the individual joke page links from the given topic pages.

    Args:
        url_list: iterable of topic page URLs to download.

    Returns:
        list: [topic_category, href] pairs, one per <a> tag found. The
        category is the text of the <span class="bgb"> element of the section
        the link was found in.

    If any page cannot be opened, prints "Unable to open URL" and terminates
    with exit status 1 (SystemExit).
    '''
    joke_page_urls = []
    for topic_url in url_list:
        try:
            # The context manager closes the HTTP response even when read/decode fails.
            with urllib.request.urlopen(topic_url) as response:
                content = response.read().decode('utf8')
        except IOError:
            print("Unable to open URL")
            # sys.exit rather than the site-provided exit(), which is meant
            # for the interactive shell and is not guaranteed to exist.
            sys.exit(1)

        # *content* becomes a single-line html string, ready for BeautifulSoup.
        soup = BeautifulSoup(preprocess_page(content), "lxml")
        for section in soup.find("div", class_='module_content'):
            # The greater topic/category of joke, saved alongside each link.
            topic_category = section.find('span', class_='bgb').get_text()
            for item in section.find('ul'):
                # Named joke_link so it no longer shadows the topic page URL
                # of the outer loop.
                for joke_link in item.find_all('a'):
                    joke_page_urls.append([topic_category, joke_link.get('href')])
    return joke_page_urls

def soupify_jokes(url_list):
    '''Reads in the list of topics and URLs, visits each one, and then saves each joke
    (and topic) to the list of jokes.

    Args:
        url_list: sequence of entries where index 0 is the category and
            index 1 is the URL of the joke page to download.

    Returns:
        list: [category, url, joke] lists, one per entry of url_list. The joke
        is the text of every <p> that is a direct child of the first
        <div class="content_wrap">, each followed by a single space, with
        <br> tags turned into newlines.

    If any page cannot be opened, prints "Unable to open URL" and terminates
    with exit status 1 (SystemExit).
    '''
    jokes = []
    for entry in url_list:
        category, url = entry[0], entry[1]
        try:
            # The context manager closes the HTTP response even when read/decode fails.
            with urllib.request.urlopen(url) as response:
                content = response.read().decode('utf8')
        except IOError:
            print("Unable to open URL")
            # sys.exit rather than the site-provided exit(), which is meant
            # for the interactive shell and is not guaranteed to exist.
            sys.exit(1)

        # *content* becomes a single-line html string, ready for BeautifulSoup.
        soup = BeautifulSoup(preprocess_page(content), "lxml")
        joke = ""
        for child in soup.find("div", class_='content_wrap'):
            if child.name == 'p':
                # Keep the line breaks of the joke in the extracted text.
                for br in child.find_all("br"):
                    br.replace_with("\n")
                joke += child.get_text() + " "
        jokes.append([category, url, joke])
    return jokes
    
def soupify_jokes_from_csv(filename, start_idx=11412, end_idx=None):
    '''Reads in the list of topics/categories and URLs from a csv, visits each one, and then
    saves each joke (and topic) to the list of jokes.

    Args:
        filename: path of a csv whose rows are "category,url" (so the URLs do
            not have to be re-scraped each time).
        start_idx: index of the first row to process. Defaults to 11412, the
            resume point that used to be hard-coded in the loop.
        end_idx: index one past the last row to process; None means "through
            the last row".

    Returns:
        list: [category, url, joke] lists for the rows processed. If a page
        cannot be opened, the failing URL is printed and the jokes gathered so
        far are returned, so a long run is not lost.
    '''
    import csv  # function-scope import: the file's import cell does not load csv

    # csv.reader understands quoted fields (e.g. a category containing a
    # comma), which a plain split(',') would cut in the wrong place.
    with open(filename, newline='') as f:
        url_list = [[field.strip() for field in row] for row in csv.reader(f)]

    # Browser-like User-Agent sent with every request.
    # (Fetching the Google cached copy was considered instead, to avoid
    # inconveniencing the Comedy Central site, but is not used.)
    headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'}

    stop = len(url_list) if end_idx is None else min(end_idx, len(url_list))
    jokes = []
    for idx in range(start_idx, stop):
        category, url = url_list[idx][0], url_list[idx][1]
        req = urllib.request.Request(url, data=None, headers=headers)
        try:
            # The context manager closes the HTTP response even when read/decode fails.
            with urllib.request.urlopen(req) as response:
                content = response.read().decode('utf8')
        except IOError:
            print("Unable to open URL: " + url)
            return jokes

        # *content* becomes a single-line html string, ready for BeautifulSoup.
        soup = BeautifulSoup(preprocess_page(content), "lxml")
        joke = ""
        for child in soup.find("div", class_='content_wrap'):
            if child.name == 'p':
                # Keep the line breaks of the joke in the extracted text.
                for br in child.find_all("br"):
                    br.replace_with("\n ")
                joke += child.get_text() + " "
        jokes.append([category, url, joke])
    return jokes
    
def write_to_file(jokes_list, filename='all_cc_jokes.csv'):
    '''Write the scraped jokes to a comma-separated file.

    Args:
        jokes_list: rows of [category, url, joke] (anything pd.DataFrame
            accepts that yields three columns), or an empty list.
        filename: output path; an existing file is overwritten.

    The file has no header row; each line starts with the row index followed
    by the three fields. An empty jokes_list produces an empty file.
    '''
    column_names = ['Category', 'URL', 'Joke']
    final_result = pd.DataFrame(jokes_list)
    if final_result.shape[1] == 0:
        # An empty list gives a 0-column frame, and assigning three column
        # names to it raises ValueError. That case is reachable when the
        # scraper fails on its first URL and returns [].
        final_result = pd.DataFrame(columns=column_names)
    else:
        final_result.columns = column_names
    final_result.to_csv(filename, sep=',', header=False, index_label=False)

In [20]:
# commenting this out now that I already ran it and saved out the URL file (so I don't have to scrape it again)

# topic_page_urls = find_all_topic_page_links()
# print(len(topic_page_urls))
# output = 33

In [67]:
# commenting this out now that I already ran it and saved out the URL file (so I don't have to scrape it again)

# joke_page_urls = find_joke_page_links_from_topic_pages(topic_page_urls)

# print(len(joke_page_urls))
# output = 14553

In [68]:
# commenting this out now that I already ran it and saved out the URL file (so I don't have to scrape it again)

# jokes_df = pd.DataFrame(joke_page_urls)
# jokes_df.to_csv('joke_urls.csv', index=False, header=False)

In [53]:
# did this for first batch, just modified the range of the soupify function.

write_to_file(soupify_jokes_from_csv('joke_urls.csv'))
%time

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 5.96 µs


In [69]:
# did this for the second batch, again modifying the range for the soupify function
second_batch = soupify_jokes_from_csv('joke_urls.csv')
write_to_file(second_batch)
# starting this call at 7:58pm on Friday, finished at 1:18am Saturday

Unable to open URL: http://jokes.cc.com/funny-pop-culture---celebrity/tsud16/jeff-cesario--greatest-democracy-in-the-world


In [71]:
# did this for the third batch, again, modifying the range for the soupify function
third_batch = soupify_jokes_from_csv(filename='joke_urls.csv')
write_to_file(jokes_list=third_batch)
# starting this call at 7:01am on Saturday, ended at 8:35am Saturday
# this file got overwritten when I was saving out the whole long list of jokes...hopefully I don't need it again

Jokes seem to be missing in the range from http://jokes.cc.com/funny-miscellaneous/wfnyak/jeff-caldwell--birth-control
to http://jokes.cc.com/funny-pick-up-lines/c4uzmn/zombie-booty-call----catch

* Not sure if this is actually happening: some jokes are cross-listed and appear under multiple URLs, one for each topic they could fit into. When that happens, all of those URLs share the same final segment (the part after the last slash).

In [113]:
def read_jokes_back_in(filename):
    '''Load a saved jokes csv into a DataFrame with columns Index, Category, URL, Joke.'''
    column_names = ['Index', 'Category', 'URL', 'Joke']
    # NOTE(review): skiprows=1 discards the first line of the file. If the file
    # was written without a header row, that line is a joke -- confirm.
    return pd.read_csv(filename, sep=',', skiprows=1, names=column_names)

# Load the three batches that were saved out separately, in batch order.
jokes_1, jokes_2, jokes_3 = (
    read_jokes_back_in(result_file)
    for result_file in ('jokes_result.csv', 'jokes_result_2.csv', 'jokes_result_3.csv')
)

# all_jokes = []
# with open('jokes_result.csv') as f1:
#     jokes_1 = f1.readlines()
# for each in jokes_1[:5]:
#     all_jokes.append(each.split(','))
# for each in all_jokes:
#     print(each)
# with open('jokes_result_2.csv') as f2:
#     jokes_2 = f2.readlines.strip().split(',')
# with open('jokes_result_3.csv') as f3:
#     jokes_3 = f3.readlines.strip().split(',')

In [117]:
# Total number of rows across the three batches.
print(sum(len(batch) for batch in (jokes_1, jokes_2, jokes_3)))

15054


In [127]:
# dropping the index column because it had reset with each new file I started
for batch in (jokes_1, jokes_2, jokes_3):
    batch.drop('Index', axis=1, inplace=True)

In [137]:
# Stack the three batches into one frame with a fresh 0..n-1 index.
all_frames = [
    jokes_1,
    jokes_2,
    jokes_3,
]
all_jokes = pd.concat(objs=all_frames, ignore_index=True)

In [139]:
# Look up the row labelled 15050 in the combined frame; as a bare expression
# at the end of the cell, its value is what the notebook displays.
all_jokes.loc[15050]

Category                                             Yo' Mama
URL         http://jokes.cc.com/funny-yo--mama/rx4x0c/yo--...
Joke        Yo' sister is so ugly, I thought she was Yo' M...
Name: 15050, dtype: object

In [142]:
# Save the combined jokes using write_to_file's default output filename.
write_to_file(jokes_list=all_jokes)