In [1]:
import pandas as pd
import numpy as np
import os
from os.path import exists

# Root folder of the humor datasets, and the sub-folder that receives the
# processed copies written further down in this notebook.
path = './Data/new_humor_datasets/'
output_path = f'{path}temp_run/'
# Names of the dataset folders that live under `path`.
datasets = [
    'amazon',
    'yelp_reviews',
    'sarcasm_headlines',
]

In [5]:
# Read every raw dataset CSV from its folder under `path`.
df_amazon = pd.read_csv(f'{path}amazon/data.csv')
df_yelp = pd.read_csv(f'{path}yelp_reviews/data.csv')
# Second Yelp file, stored next to the full one as `data_short.csv`.
df_yelp_short = pd.read_csv(f'{path}yelp_reviews/data_short.csv')
df_headlines = pd.read_csv(f'{path}sarcasm_headlines/data.csv')

In [3]:
from sklearn.model_selection import train_test_split

def split_train_test_val(df, path):
    """Write train/test/val CSV splits of ``df`` into the directory ``path``.

    30% of the rows are held out from the training set, and that held-out
    part is then split in half into the test and validation sets. Both
    splits shuffle with a fixed ``random_state=0``.

    Args:
        df: DataFrame to split.
        path: Output directory; created if it does not exist. Receives
            ``test.csv``, ``train.csv`` and ``val.csv`` (written without the index).
    """
    os.makedirs(path, exist_ok=True)

    train_part, held_out = train_test_split(df, test_size=0.3, shuffle=True, random_state=0)
    test_part, val_part = train_test_split(held_out, test_size=0.5, shuffle=True, random_state=0)

    # Same write order as before: test, train, val.
    outputs = [
        (test_part, f'{path}/test.csv'),
        (train_part, f'{path}/train.csv'),
        (val_part, f'{path}/val.csv'),
    ]
    for frame, csv_path in outputs:
        frame.to_csv(csv_path, index=False)

In [4]:
def process_and_save_dataset(df_new, path):
    """Shuffle a prepared dataset, save it, and save its train/test/val splits.

    Args:
        df_new: DataFrame that contains at least the columns ``id``,
            ``bert_sentence``, ``t5_sentence``, ``target`` and ``label``.
        path: Output directory; created if it does not exist. Receives
            ``data.csv`` and a ``with_val`` sub-directory holding the splits.
    """
    ordered_columns = ['id', 'bert_sentence', 't5_sentence', 'target', 'label']

    # Shuffle all rows with a fixed seed, then keep only the expected columns
    # in a fixed order.
    shuffled = df_new.sample(frac=1, random_state=0)[ordered_columns]

    os.makedirs(path, exist_ok=True)
    shuffled.to_csv(f'{path}/data.csv', index=False)

    split_train_test_val(shuffled, f'{path}/with_val')

### Process the Amazon dataset & save it to the temp_run dir

In [10]:
# The Amazon frame is passed to the save/split helper as loaded.
output_path_amzaon = f'{output_path}amazon'
process_and_save_dataset(df_new=df_amazon, path=output_path_amzaon)

### Process Yelp Dataset & save

In [None]:
# Build the model-ready Yelp frame from the raw `sentence` / `label` columns.
# Rows without a sentence are dropped once, up front: both text columns are
# copies of the same source column, so a second NaN filter could never remove
# anything. The original row index of the surviving rows is kept.
yelp_rows = df_yelp[df_yelp['sentence'].notna()]
df_yelp_new = pd.DataFrame({
    'bert_sentence': yelp_rows['sentence'],
    't5_sentence': yelp_rows['sentence'],
    'label': yelp_rows['label'],
})
# Text target: label 1 -> 'funny', anything else -> 'not funny' (vectorized
# instead of a per-row lambda).
df_yelp_new['target'] = np.where(df_yelp_new['label'].eq(1), 'funny', 'not funny')
# Sequential 0-based ids over the rows that survived the filter.
df_yelp_new['id'] = range(len(df_yelp_new))

In [12]:
# Shuffle, save and split the prepared Yelp frame under `<output_path>yelp`.
output_path_yelp = f'{output_path}yelp'
process_and_save_dataset(df_new=df_yelp_new, path=output_path_yelp)

### Process YelpShort Dataset (shorter sentences)

In [6]:
# Build the model-ready Yelp-short frame from the raw `sentence` / `label` columns.
# Rows without a sentence are dropped once, up front: both text columns are
# copies of the same source column, so a second NaN filter could never remove
# anything. The original row index of the surviving rows is kept.
yelp_short_rows = df_yelp_short[df_yelp_short['sentence'].notna()]
df_yelp_short_new = pd.DataFrame({
    'bert_sentence': yelp_short_rows['sentence'],
    't5_sentence': yelp_short_rows['sentence'],
    'label': yelp_short_rows['label'],
})
# Text target: label 1 -> 'funny', anything else -> 'not funny' (vectorized
# instead of a per-row lambda).
df_yelp_short_new['target'] = np.where(df_yelp_short_new['label'].eq(1), 'funny', 'not funny')
# Sequential 0-based ids over the rows that survived the filter.
df_yelp_short_new['id'] = range(len(df_yelp_short_new))

In [7]:
# Shuffle, save and split the prepared Yelp-short frame under `<output_path>yelp_short`.
output_path_yelp_short = f'{output_path}yelp_short'
process_and_save_dataset(df_new=df_yelp_short_new, path=output_path_yelp_short)

### Process Sarcasm Headlines Dataset & Save

In [13]:
# Build the model-ready headlines frame from the raw `sentence` / `label` columns.
# Rows without a sentence are dropped once, up front: both text columns are
# copies of the same source column, so a second NaN filter could never remove
# anything. The original row index of the surviving rows is kept.
headline_rows = df_headlines[df_headlines['sentence'].notna()]
df_headlines_new = pd.DataFrame({
    'bert_sentence': headline_rows['sentence'],
    't5_sentence': headline_rows['sentence'],
    'label': headline_rows['label'],
})
# Text target: label 1 -> 'funny', anything else -> 'not funny' (vectorized
# instead of a per-row lambda).
df_headlines_new['target'] = np.where(df_headlines_new['label'].eq(1), 'funny', 'not funny')
# Sequential 0-based ids over the rows that survived the filter.
df_headlines_new['id'] = range(len(df_headlines_new))

In [14]:
# Shuffle, save and split the prepared headlines frame under `<output_path>sarcasm_headlines`.
output_path_headlines = f'{output_path}sarcasm_headlines'
process_and_save_dataset(df_new=df_headlines_new, path=output_path_headlines)

### Clean Reddit Dad Jokes (as much as possible)

In [3]:
dadjokes_path = './Data/new_humor_datasets/reddit_dadjokes/'
# load Reddit Dad Jokes dataset
df_dadjokes_raw = pd.read_csv(dadjokes_path + 'reddit_dadjokes.csv')

# clean from duplicates and reposts
# The repost check is vectorized and NaN-safe: a per-row `joke.lower()` raises
# AttributeError as soon as one joke is missing. Rows with no joke text are
# dropped here as well. The original row index is kept (no reset).
raw_jokes = df_dadjokes_raw['joke']
mentions_repost = raw_jokes.str.lower().str.contains('reposted', regex=False, na=False)
# Rebind instead of `drop_duplicates(inplace=True)` so the raw frame stays
# untouched and the cleaned frame is an independent object.
df_dadjokes = df_dadjokes_raw[raw_jokes.notna() & ~mentions_repost].drop_duplicates()


In [None]:
# NOTE(review): disabled id assignment. The same statement runs in the last
# cell, after the problematic samples are dropped — presumably so the saved
# ids stay contiguous; confirm before deleting this cell.
# df_dadjokes['id'] = range(1, len(df_dadjokes) + 1)

#### Remove problematic samples

In [13]:
# Index labels of the rows to drop from `df_dadjokes`; filled in by the next cell.
indices_to_remove = list()

In [14]:
# Queue the first row whose joke is exactly 'test test'.
is_test_joke = df_dadjokes.joke == 'test test'
indices_to_remove.append(df_dadjokes.index[is_test_joke][0])

# Queue the first row whose joke contains the game-log marker text.
has_game_log = df_dadjokes['joke'].apply(lambda joke: 'Game Log Output Begins Here' in joke)
indices_to_remove.append(df_dadjokes.index[has_game_log][0])

In [15]:
# Drop the queued rows. Rebinding (instead of `inplace=True`) together with
# `errors='ignore'` keeps this cell re-runnable: labels already removed by a
# previous run are skipped instead of raising KeyError.
df_dadjokes = df_dadjokes.drop(index=indices_to_remove, errors='ignore')

In [17]:
# Number the remaining jokes 1..N, then write the frame (without its index)
# next to the source CSV.
n_jokes = len(df_dadjokes)
df_dadjokes['id'] = range(1, n_jokes + 1)
df_dadjokes.to_csv(f'{dadjokes_path}reddit_dadjokes_with_id.csv', index=False)