In [1]:
import json
import csv
import pandas as pd
import math

# Creating datasets for finetuning scenarios

In [13]:
from os import listdir
from os.path import isfile, join
path = 'data/jokes_delimited/'

## Scenario 1: all jokes

In [15]:
files = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith(".txt")]
outfile = 'data/training_data/s1_all_jokes.txt'
merge_files(files,outfile)

In [16]:
files

['stupidstuff_delimited.txt',
 'reddit_delimited.txt',
 'wocka_delimited.txt',
 'joke-db_delimited.txt',
 'QA_jokes_delimited.txt',
 'funjokes_delimited.txt']

## Scenario 2: QA-jokes

In [18]:
files = ['QA_jokes_delimited.txt']
outfile = 'data/training_data/s2_qa_jokes.txt'
merge_files(files,outfile)
get_sample_size(outfile)

'Number of jokes in data/training_data/s2_qa_jokes.txt: 38270'

## Scenario 3: Curated jokes

In [19]:
files = ['stupidstuff_delimited.txt','wocka_delimited.txt', 'joke-db_delimited.txt', 'funjokes_delimited.txt' ]
outfile = 'data/training_data/s3_curated_jokes.txt'
merge_files(files,outfile)
get_sample_size(outfile)

'Number of jokes in data/training_data/s3_curated_jokes.txt: 28472'

In [10]:
def merge_files(file_list,outfile):
    all_jokes = ""
    for file in files: 
        with open(path+file, 'r') as file:
            data = file.read()
            all_jokes = all_jokes + data
    with open(outfile,'w') as file:
        file.write(all_jokes) 

In [11]:
def get_sample_size(filename):
    with open(filename, 'r') as file:
        data = file.read()
    jokes = data.split('$$$$$')
    msg = 'Number of jokes in ' + filename + ': ' + str(len(jokes))
    return msg
    

# Creating sub-collections

## Reddit jokes (Pungas 2017)

In [26]:
file = 'data/raw_data/reddit_jokes_taivop.json'
name = 'reddit'
json_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/reddit_delimited.txt


194553

## Wocka.com (Pungas 2017)

In [28]:
file = 'data/raw_data/wocka_taivop.json'
name = 'wocka'
json_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/wocka_delimited.txt


10019

## stupidstuff.org (Pungas 2017)

In [30]:
file = 'data/raw_data/stupidstuff_taivop.json'
name = 'stupidstuff'
json_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/stupidstuff_delimited.txt


3773

## funjokes (Moudgil 2017)

In [31]:
file = 'data/raw_data/funjokes_amoudgl.csv'
name = 'funjokes'
csv_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/funjokes_delimited.txt


9985

## jokedb (Moudgil 2017)

In [32]:
file = 'data/raw_data/joke-db_amoudgl.csv'
name = 'joke-db'
csv_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/joke-db_delimited.txt


4694

## Q&A Jokes (Roznovjak 2017)

### Preprocess Q&A-jokes

In [33]:
file = 'data/raw_data/QA_jokes.csv' 
df = pd.read_csv(file, index_col='ID')
df["body"] = df["Question"].map(str) + ' ' + df["Answer"]
df = df.drop(['Question', 'Answer'], axis =1)
json_file = 'data/raw_data/QA_jokes.json'
df.to_json(json_file, orient="records")

In [34]:
file = 'data/raw_data/QA_jokes.json'
name = 'QA_jokes'
json_to_txt(file,name,True)

Jokes saved as data/jokes_delimited/QA_jokes_delimited.txt


38269

## Joke length statistics

In [35]:
from os import listdir
from os.path import isfile, join
path = 'data/jokes_delimited/'
files = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith(".txt")]

In [36]:
for file in files:
    avg_joke_len = get_avg_joke_length(path + file)
    print(file + ": " + str(avg_joke_len))

stupidstuff_delimited.txt: 582
reddit_delimited.txt: 211
wocka_delimited.txt: 616
joke-db_delimited.txt: 531
QA_jokes_delimited.txt: 90
funjokes_delimited.txt: 200


In [25]:
def get_avg_joke_length(file):
    with open(file, 'r') as file:
        data = file.read()
    total_len = len(data)    
    jokes = data.split("$$$$$")
    num_jokes = len(jokes)
    avg_joke_len = math.floor(total_len/num_jokes)  
    return avg_joke_len   
        

In [37]:
get_avg_joke_length('data/jokes_delimited/funjokes_delimited.txt')

200

In [23]:
# returns number of jokes

def json_to_txt(file,name,put_delimiter):
    with open(file, "r") as file:
        raw_json = json.load(file)
    all_jokes= ''
    joke_count = 0
    for j in range(len(raw_json)):
        body = raw_json[j]['body']
        if put_delimiter == True:
            body = body + ' $$$$$ '
        else: 
            body = body + " "
        joke_count = joke_count + 1   
        all_jokes = all_jokes + body
    if put_delimiter == True:
        outfile = 'data/jokes_delimited/' + name + '_delimited.txt'
    else:
        outfile = 'data/jokes_not_delimited/' + name + '_not_delimited.txt'  
    try: 
        with open(outfile, "w") as text_file:
            text_file.write(all_jokes)
        print("Jokes saved as " + outfile)
    except:
        print("Could not save jokes.")    
    return joke_count  

In [24]:
def csv_to_txt(file,name,put_delimiter):
    df = pd.read_csv(file, index_col='ID')
    df.columns=['body']
    json_file = 'data/raw_data/' + name + '.json'
    #with open(json_file, "w+") as output_file:
        #output_file.write(df.toJSON())
    df.to_json('data/raw_data/' + name + '.json', orient="records")
    joke_count = json_to_txt(json_file,name,put_delimiter)
    return joke_count

In [97]:
for i in range(len(files)):
    df = pd.read_csv(files[i], index_col='ID')
    df.columns=['body'] 
    df.to_json(prefix + filenames[i] +'.json', orient="records")
    json_list.append(prefix + filenames[i] +'.json')


In [98]:
taivop_data = ["stupidstuff", "reddit_jokes", "wocka"]
for item in taivop_data:
    json_list.append(prefix + item + '.json')
filenames = filenames + taivop_data    

In [101]:
json_list

['../data/funjokes.json',
 '../data/funnytweeter.json',
 '../data/funtweets.json',
 '../data/joke-db.json',
 '../data/stupidstuff.json',
 '../data/reddit_jokes.json',
 '../data/wocka.json']

In [99]:
all_jokes= ''
joke_count = 0
for i in range(len(json_list)):
    with open(json_list[i], "r") as file:
        rawa_json = json.load(file)
        for j in range(len(raw_json)):
            body = raw_json[j]['body']
            body = body + ' $$$$$ '
            joke_count = joke_count + 1
            all_jokes = all_jokes + body
            

In [100]:
joke_count

70133

In [102]:
with open("all_jokes.txt", "w") as text_file:
    text_file.write(all_jokes)

In [14]:
all_jokes= ''
for i in range(len(reddit_jokes)):
    body = reddit_jokes[i]['body']
    body = body + ' $$$$$ '
    all_jokes = all_jokes + body
for i in range(len(stupidstuff)):    
    body = stupidstuff[i]['body']
    body = body + ' $$$$$ '
    all_jokes = all_jokes + body
for i in range(len(wocka)):
    body = wocka[i]['body']
    body = body + ' $$$$$ '
    all_jokes = all_jokes + body    
    