In [1]:
import json, requests 

def get_sentiment(joke_text, timeout=10):
    """Score the sentiment of ``joke_text`` with the sentiment.vivekn.com text API.

    Parameters
    ----------
    joke_text : str
        Text to classify; it is posted as the ``txt`` form field.
    timeout : float or tuple, optional
        Seconds to wait for the service (default 10). Without a timeout a
        stalled server would block the notebook cell indefinitely.

    Returns
    -------
    tuple
        ``(s, c)``: ``s`` is the sentiment sign (1 for 'Positive', -1 for
        'Negative', 0 for any other label such as neutral) and ``c`` is the
        reported confidence divided by 100 (the API appears to report a
        percentage, so ``c`` should fall between 0 and 1 -- TODO confirm).

    Raises
    ------
    requests.RequestException
        On connection failures, timeouts, or a non-2xx HTTP status.
    KeyError, ValueError
        If the response body is not JSON of the expected shape.
    """
    # post to sentiment search API
    url = 'http://sentiment.vivekn.com/api/text/'
    response = requests.post(url=url, data={'txt': joke_text}, timeout=timeout)

    # fail loudly on an HTTP error instead of trying to parse an error page
    response.raise_for_status()

    # parse JSON response
    result = response.json()['result']
    sentiment, confidence = result['sentiment'], result['confidence']

    # convert string result into numerical values
    # (a length-3 one-hot vector for positive/neutral/negative is a possible alternative)
    if sentiment == 'Positive':
        s = 1
    elif sentiment == 'Negative':
        s = -1
    else:
        s = 0
    c = float(confidence) / 100
    return s, c

In [2]:
# smoke test: score one sample joke
joke_1 = (
    'Today a man knocked on my door and asked for a small '
    'donation towards the local swimming pool. I gave him a glass of water.'
)
get_sentiment(joke_1)

(0, 0.561001)

In [3]:
# second sample joke; double quotes avoid escaping the apostrophe
joke_2 = "Two wrongs don't make a right, take your parents as an example."
get_sentiment(joke_2)

(-1, 0.814667)

***

In [4]:
import pandas as pd
import numpy as np
import nltk, re, pprint
"""
Helper functions to load data and tokenize sentence
"""
# set to False to silence the progress messages emitted through cprint
PRINT_VERBOSE = True


def cprint(s):
    """Print ``s`` only while the module-level ``PRINT_VERBOSE`` flag is truthy."""
    if not PRINT_VERBOSE:
        return
    print(s)

# aggregate jokes and write to combined_jokes file
def aggregate_jokes(cc_path='../all_cc_jokes.csv',
                    one_liner_path='../onelinefun.csv',
                    out_path='../combined_jokes.csv'):
    """Merge the Comedy Central and One Liner Fun joke files into one CSV.

    Both inputs are read with their first column as the row index; that index
    is discarded when the frames are stacked (``ignore_index=True``), and the
    fresh 0..n-1 index is written out under the label ``ID``.

    Parameters
    ----------
    cc_path : str, optional
        Comedy Central CSV, read with column names ``type, link, text``.
    one_liner_path : str, optional
        One Liner Fun CSV (ISO-8859-1 encoded), read with column names
        ``text, rating, num_ratings, tags`` after skipping its first line
        (presumably a header row -- TODO confirm).
    out_path : str, optional
        Destination of the combined CSV.

    Returns
    -------
    bool
        Always True once the combined file has been written.
    """
    # comedy central jokes
    cc_jokes = pd.read_csv(cc_path, sep=',', index_col=0,
                           names=['type', 'link', 'text'])
    cprint('Number of jokes from Comedy Central: {}'.format(cc_jokes.shape[0]))
    cprint('There are {} types of jokes on Comedy Central'.format(cc_jokes['type'].nunique()))

    # one line fun jokes
    one_liner_jokes = pd.read_csv(one_liner_path, encoding="ISO-8859-1", sep=',',
                                  index_col=0, skiprows=1,
                                  names=['text', 'rating', 'num_ratings', 'tags'])
    cprint('Number of jokes from One Liner Fun: {}'.format(one_liner_jokes.shape[0]))

    # stack both sources; columns missing from one source are filled with NaN
    jokes_data = pd.concat([cc_jokes, one_liner_jokes], axis=0, ignore_index=True)
    cprint('Total number of jokes: {}'.format(jokes_data.shape[0]))

    # fix the column order of the output file
    column_order = ['text', 'type', 'tags', 'rating', 'num_ratings', 'link']
    jokes_data[column_order].to_csv(out_path, index_label='ID')

    return True

In [6]:
# run the aggregation: prints the joke counts (while PRINT_VERBOSE is on) and writes ../combined_jokes.csv
aggregate_jokes()

Number of jokes from Comedy Central: 15054
There are 33 types of jokes on Comendy Central
Number of jokes from One Liner Fun: 2800
Total number of jokes: 17854


True

In [11]:
# load the combined file back in, using its first column (the saved ID) as the index
jokes = pd.read_csv('../combined_jokes.csv', index_col=0)
jokes.shape

(17854, 6)

In [12]:
# inspect the column labels of the reloaded frame
jokes.columns

Index(['text', 'type', 'tags', 'rating', 'num_ratings', 'link'], dtype='object')

In [24]:
# small working sample: the first ten rows, selected by position
tiny_jokes = jokes.iloc[:10]

In [18]:
import os.path
# check that the combined jokes file is present on disk
os.path.isfile('../combined_jokes.csv') 


True

In [19]:
# same check on another path; the recorded output shows it was not present
os.path.isfile('hello_world.csv') 


False

In [27]:
# row labels of the sample as a plain Python list
tiny_jokes.index.tolist()

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [28]:
# joke texts of the sample as a plain Python list
tiny_jokes['text'].tolist()

['A bear walks into a bar and says to the bartender, "I\'ll have a pint of beer and a.......... packet of peanuts." The bartender asks, "Why the big pause?"  ',
 'A grasshopper walks into a bar, and the bartender says, "Hey, we have a drink named after you!" The grasshopper looks surprised and asks, "You have a drink named Steve?" ',
 "Q: Why did the chewing gum cross the road? A: He was stuck to the chicken's foot.  ",
 'A kangaroo walks into a bar and tells the bartender, "Blood is the lipstick of all wounds." The bartender does not know how he said this or why.  ',
 'A man and his pet giraffe walk into a bar and start drinking. As the night goes on, they get  drunk, and the giraffe finally passes out. The  man decides to go home. As he\'s leaving, the man is approached by the barkeeper who says, "Hey, you\'re not gonna leave that lyin\' here, are ya?" "Hmph," says the man. "That\'s not a lion -- it\'s a giraffe."  ',
 'Never play leapfrog with a unicorn. ',
 'Two rednecks walk down 