# Jokes

In [1]:
# Path to ml-jokes folder
import os
os.chdir('..')

In [2]:
import pandas as pd
from bs4 import BeautifulSoup
import re
from nltk import word_tokenize
from nltk.corpus import stopwords, wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer

# Run the first time
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('wordnet')

stopwd = stopwords.words('english')

## Utils

In [3]:
def extract_joke(path):
    with open(path) as fp:
        soup = BeautifulSoup(fp, "html.parser")
        text = soup.find('table').get_text()
        text = re.sub('\n', ' ', text)
        text = re.sub('[\s]+', ' ', text)
        return text.strip()

def clean_text(text):
    lemmas = []
    tokens = word_tokenize(text)
    lemmas = [WordNetLemmatizer().lemmatize(token).lower() for token in tokens if token.lower() not in stopwd and re.match('^[\w\s]+$', token)]
    return ' '.join(lemmas)



## Read jokes

In [9]:
def read_jokes(path_jokes='./data/jokes'):
    jokes = []
    jokes_clean = []
    jokes_len = []


    for file in os.listdir(path_jokes):
        text = extract_joke(path_jokes+'/'+file)
        clean = clean_text(text) 
        jokes.append(text)
        jokes_clean.append(clean)
        jokes_len.append(len(text.split()))

    return pd.DataFrame({'text': jokes, 
                         'clean_text': jokes_clean, 
                         'len': jokes_len})

# Example
read_jokes()

## Read ratings

In [12]:
def read_ratings(path_ratings='./data/ratings'):
    ratings = pd.DataFrame()
    
    for file in os.listdir(path_ratings):
        filename = path_ratings+'/' + file
        print(filename)
        temp = pd.read_excel(filename,header=None)
        ratings = pd.concat([ratings, temp], ignore_index=True)

    ratings.rename(columns= {0: 'count_rated'}, inplace=True)
    
    return ratings

read_ratings()

./data/ratings/jester-data-3.xls
./data/ratings/jester-data-2.xls
./data/ratings/jester-data-1.xls


Unnamed: 0,count_rated,1,2,3,4,5,6,7,8,9,...,91,92,93,94,95,96,97,98,99,100
0,26,99.00,99.00,99.00,99.00,-1.65,99.00,-0.78,6.89,99.00,...,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00
1,33,99.00,99.00,99.00,99.00,-9.27,99.00,-9.17,-8.59,99.00,...,99.00,99.00,-2.77,99.00,99.00,99.00,99.00,99.00,99.00,99.00
2,16,99.00,99.00,99.00,99.00,-6.12,99.00,-7.48,-7.77,99.00,...,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00
3,24,99.00,0.05,99.00,99.00,-2.82,99.00,-4.85,-0.87,99.00,...,99.00,99.00,99.00,99.00,1.84,99.00,99.00,99.00,-4.08,99.00
4,22,99.00,99.00,99.00,99.00,-4.95,99.00,6.21,2.72,99.00,...,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73416,100,0.44,7.43,9.08,2.33,3.20,6.75,-8.79,-0.53,-8.74,...,8.83,-1.21,9.22,-6.70,8.45,9.03,6.55,8.69,8.79,7.43
73417,91,9.13,-8.16,8.59,9.08,0.87,-8.93,-3.50,5.78,-8.11,...,-1.17,-5.73,-1.46,0.24,9.22,-8.20,-7.23,-8.59,9.13,8.45
73418,39,99.00,99.00,99.00,99.00,-7.77,99.00,6.70,-6.75,99.00,...,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00
73419,37,99.00,99.00,99.00,99.00,-9.71,99.00,4.56,-8.30,99.00,...,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00,99.00
