In [28]:
from lxml import html
import requests
import pickle
from IPython.display import clear_output
from time import sleep
import json
import numpy as np

# base URL for the jokes website; the topic index page URL is built from it
page_root_url = 'http://jokes.cc.com/'
# file name for the topics dictionary (topic_name -> topic_URL), stored as JSON
topics_filename = 'topicsCC.json'
# file name for the jokes dictionary (topic_name -> list of jokes)
jokes_filename = 'jokesCC.json'

In [19]:
# gather topics and save them to file

# sub-URL where the joke categories (topics) are listed
page_topic_base_url = page_root_url + 'joke-categories'
# request HTML page; fail loudly on an HTTP error so that an error page is not
# parsed as "zero topics" and written over a previously saved topics file
page_response = requests.get(page_topic_base_url)
page_response.raise_for_status()
# convert to tree for searching for elements with xpath
tree = html.fromstring(page_response.content)
# get all topics (but they come with many empty strings and \n strings)
raw_topics = tree.xpath('//div[@class="middle"]//ul[@class="list_horiz"]//text()')
# get the clean topics' urls
topics_links = tree.xpath('//div[@class="middle"]//ul[@class="list_horiz"]//li/a/@href')
# keep only the actual topic names: strip surrounding whitespace first and use
# the stripped text as the key, so stray spaces/newlines never end up in a name
topic_names = [name for name in (raw.strip() for raw in raw_topics) if len(name) > 1]
# names and links are matched by position; report a mismatch instead of
# failing with an IndexError or silently mis-pairing without any notice
if len(topic_names) != len(topics_links):
    print('Warning: found ' + str(len(topic_names)) + ' topic names but '
          + str(len(topics_links)) + ' topic links; pairing them by position')
topics = dict(zip(topic_names, topics_links))
# save the file as JSON
with open(topics_filename, 'w') as fp:
    json.dump(topics, fp)
# print information on gathered topics
print('Gathered a total of ' + str(len(topics)) + ' topics')
print(topics)

Gathered a total of 34 topics
{'Animal': 'http://jokes.cc.com/funny-animal', 'Blonde': 'http://jokes.cc.com/funny-blonde', 'Blue Collar': 'http://jokes.cc.com/funny-blue-collar', 'Cross the Road': 'http://jokes.cc.com/funny-cross-the-road', 'Dark Humor': 'http://jokes.cc.com/funny-dark-humor', 'Dirty': 'http://jokes.cc.com/funny-dirty', 'Doctor': 'http://jokes.cc.com/funny-doctor', 'Fat': 'http://jokes.cc.com/funny-fat', 'Food': 'http://jokes.cc.com/funny-food', 'God': 'http://jokes.cc.com/funny-god', 'Gross': 'http://jokes.cc.com/funny-gross', 'Insults': 'http://jokes.cc.com/funny-insults', 'Kids': 'http://jokes.cc.com/funny-kids', 'Lawyer': 'http://jokes.cc.com/funny-lawyer', 'Little Johnny': 'http://jokes.cc.com/funny-little-johnny', "Lookin' Good": 'http://jokes.cc.com/funny-lookin--good', 'Marriage': 'http://jokes.cc.com/funny-marriage', 'Men/Women': 'http://jokes.cc.com/funny-men-women', 'Miscellaneous': 'http://jokes.cc.com/funny-miscellaneous', 'Money': 'http://jokes.cc.com/fun

In [3]:
# function for gathering jokes given a topic
# the function updates the total number of jokes and the jokes dictionary
def get_topic_jokes(topics, topic, jokes, tot_n_jokes = 0, MAX_N_JOKES = 1000000):
    """Download the jokes linked from one topic page and store them in ``jokes``.

    Parameters
    ----------
    topics : dict
        Maps topic name -> URL of the topic page.
    topic : str
        Key of ``topics`` whose jokes are gathered.
    jokes : dict
        Maps topic name -> list of joke strings. ``jokes[topic]`` is replaced
        with the freshly gathered list (the dictionary is modified in place).
    tot_n_jokes : int
        Running total of jokes carried over from previously processed topics.
    MAX_N_JOKES : int
        Upper bound on the jokes gathered for this topic. It is checked after
        a joke has been stored, so a topic with jokes always yields at least one.

    Returns
    -------
    tuple
        ``(jokes, tot_n_jokes)`` where ``tot_n_jokes`` is the incoming total
        plus the number of jokes accounted for in this topic.
    """
    page_response = requests.get(topics[topic])
    tree = html.fromstring(page_response.content)
    # NOTE(review): the first matched link is dropped, presumably because it
    # is not a joke link - confirm against the page markup
    joke_urls = tree.xpath('//div[@class="middle"]//ul//li//a/@href')[1:]
    n_listed_jokes = len(joke_urls)
    tot_n_jokes += n_listed_jokes
    jokes[topic] = []
    for ix_joke, joke_url in enumerate(joke_urls, start=1):
        page_response = requests.get(joke_url, stream=True)
        tree = html.fromstring(page_response.content)
        joke_lines = tree.xpath('//div[@class="content_wrap"]//p//text()')
        # text nodes of at most one character are skipped; every kept node is
        # stripped and prefixed with a single space
        joke = ''.join(' ' + joke_line.strip() for joke_line in joke_lines if len(joke_line) > 1)
        jokes[topic].append(joke)
        if ix_joke >= MAX_N_JOKES:
            print('Maximum number of jokes reached!')
            clear_output(wait=True)
            # only ix_joke of the listed jokes were downloaded: take back the
            # rest while keeping the total accumulated from earlier topics
            tot_n_jokes += ix_joke - n_listed_jokes
            return jokes, tot_n_jokes
        print('Total number of topics: ' + str(len(topics.keys())))
        print('Total number of jokes: ' + str(tot_n_jokes))
        print('Total number of jokes for topic ' + topic + ': ' + str(ix_joke) +'/' + str(tot_n_jokes))
        print('Topic ' + topic + ': ' + joke)
        clear_output(wait=True)
    return jokes, tot_n_jokes

In [4]:
# this cell will continue from the last processed topic
# it reads the jokes file and resumes at the last topic (in topics order)
# that already has an entry in the jokes dictionary

# load topics file; it is written with json.dump, so it must be read as JSON
with open(topics_filename, 'r') as fp:
    topics = json.load(fp)
# get total number of jokes and current topic as last processed one
jokes = {}
tot_n_jokes = 0
topics_without_jokes = []
topics_keys = list(topics.keys())
# default: start from the very first topic
curr_topic = topics_keys[0] if topics_keys else None
try:
    # the jokes file is kept as JSON as well, matching how the stats cell reads it
    with open(jokes_filename, 'r') as fp:
        jokes = json.load(fp)
except FileNotFoundError:
    # only a missing file means "start from scratch"; any other error (e.g. a
    # file that is not valid JSON) is left to propagate so that existing data
    # is not silently overwritten
    print('Could not find file ' + jokes_filename + '. Starting from scratch.')
else:
    topics_without_jokes = [topic for topic in topics_keys if topic not in jokes]
    processed_topics = [topic for topic in topics_keys if topic in jokes]
    if processed_topics:
        # topics are processed in topics order, so the last one with an entry
        # is the last processed topic
        curr_topic = processed_topics[-1]
        print('Last processed topic before interruption was: ' + curr_topic)
    # curr_topic is gathered again below and get_topic_jokes adds its jokes to
    # the total, so its stored jokes are left out here to avoid double counting
    tot_n_jokes = sum(len(jokes[topic]) for topic in processed_topics if topic != curr_topic)

# run starting from last processed topic until last topic, gathering all jokes
start_ix = topics_keys.index(curr_topic) if topics_keys else 0
for topic in topics_keys[start_ix:]:
    jokes, tot_n_jokes = get_topic_jokes(topics, topic, jokes, tot_n_jokes)
    # save after every topic so an interruption loses at most the topic in progress
    with open(jokes_filename, 'w') as fp:
        json.dump(jokes, fp)

Total number of topics: 34
Total number of jokes: 20945
Total number of jokes for topic Yo' Mama: 824/20945
Topic Yo' Mama: 


In [44]:
# print stats on topics and jokes

with open(topics_filename, 'r') as fp:
    topics = json.load(fp)
with open(jokes_filename, 'r') as fp:
    jokes = json.load(fp)

tot_n_jokes = 0
topics_without_jokes = []
jokes_per_topic = []
for topic in topics.keys():
    if topic in jokes:
        n_jokes_topic = len(jokes[topic])
        # uncomment to list the number of jokes of every topic
        #print(topic + ': ' + str(n_jokes_topic))
    else:
        # a topic without an entry counts as zero jokes; the count must be
        # reset here, otherwise the previous topic's count would be reused
        n_jokes_topic = 0
        topics_without_jokes.append(topic)
    tot_n_jokes += n_jokes_topic
    jokes_per_topic.append(n_jokes_topic)

print('Total number of topics: ' + str(len(topics.keys())))
print('Total number of jokes: ' + str(tot_n_jokes))
print('Jokes per topic = ' + str(round(np.mean(jokes_per_topic))) + ' +- ' + str(round(np.std(jokes_per_topic))))

Animal: 819
Blonde: 400
Blue Collar: 370
Cross the Road: 47
Dark Humor: 474
Dirty: 1034
Doctor: 825
Fat: 403
Food: 472
God: 566
Gross: 679
Insults: 1034
Kids: 926
Lawyer: 149
Little Johnny: 68
Lookin' Good: 1034
Marriage: 821
Men/Women: 1031
Miscellaneous: 1033
Money: 701
NSFW: 36
Nationality: 773
News & Politics: 849
Partying & Bad Behavior: 909
Pick-Up Lines: 593
Police & Military: 575
Pop Culture & Celebrity: 984
School: 388
Sports & Athletes: 494
Technology: 384
Travel & Car: 575
Walks into a Bar: 78
Work: 597
Yo' Mama: 824

Total number of topics: 34
Total number of jokes: 20945
Jokes per topic = 616.0 +- 309.0
