In [152]:
import pandas as pd
import os, json

## 1. Formality Dataset

In [153]:
# Formality dataset (GYAFC): corpus location and output folders
data_dir = '../data/GYAFC_Corpus'
output_dir = '../data/processed/formality'
output_dir_toy = output_dir + '_toy'  # sibling folder for the small "toy" subset

# Per-domain folders under data_dir
entertainment, family = (
    f"{data_dir}/{domain}"
    for domain in ("Entertainment_Music", "Family_Relationships")
)

# Accumulators for sentences and their integer labels, one pair per split
train_sent, train_labels = [], []
dev_sent, dev_labels = [], []


In [154]:
# Read every (domain, label) file of the train and test splits.
# Label ids follow the enumeration order: 0 = informal, 1 = formal.
# The accumulators are reset here so that re-running this cell does not
# append a second copy of the corpus to lists filled by an earlier run.
train_sent, train_labels = [], []
dev_sent, dev_labels = [], []

for dir_ in [entertainment, family]:
    for label_id, label in enumerate(['informal', 'formal']):
        # The "test" split is what ends up in the dev_* lists.
        for split, sentences, labels in (
            ('train', train_sent, train_labels),
            ('test', dev_sent, dev_labels),
        ):
            # Explicit encoding: without it open() decodes with the platform's
            # locale default, which is not guaranteed to be UTF-8.
            with open(f"{dir_}/{split}/{label}", "r", encoding="utf-8") as fob:
                # Strip surrounding whitespace (including the newline) per line.
                lines = [line.strip() for line in fob]
            # In-place extension, so the train_*/dev_* lists themselves grow.
            sentences += lines
            labels += [label_id] * len(lines)

In [155]:
# Pair each sentence with its integer label, one frame per split
train_df = pd.DataFrame(dict(sentence=train_sent, label=train_labels))
dev_df = pd.DataFrame(dict(sentence=dev_sent, label=dev_labels))

In [156]:
#Filter the dataset
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Filter out sentences with tokens less than 5 and greater than 64
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


#Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])

original train size : (209124, 3), original dev size : (4849, 3)
filtered train size : (207366, 3), filtered dev size : (4803, 3)


In [157]:
# Write the full filtered splits as CSV without index or header row.
# exist_ok=True replaces the separate exists() check and also covers the
# case where the folder appears between the check and the creation.
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

os.makedirs(output_dir_toy, exist_ok=True)

# Shuffle with a fixed seed so a fresh run always produces the same toy
# subset; an unseeded sample() would write different rows on every run.
# train_df / dev_df are left rebound to the shuffled frames, as before.
toy_seed = 42
train_df = train_df.sample(frac=1, random_state=toy_seed).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=toy_seed).reset_index(drop=True)

# Toy subset: the first 1000 shuffled train rows and 200 shuffled dev rows
train_df.head(1000).to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
dev_df.head(200).to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

In [158]:
#Data Config
config = {
    "name" : "formality",
    "description" : "Derived from the GYAFC Corpus",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "informal",
        1 : "formal",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)
    
config = {
    "name" : "formality_toy",
    "description" : "Derived from the GYAFC Corpus; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "informal",
        1 : "formal",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)

## 2. Short Jokes

In [159]:
# Short jokes dataset: input location and output folders
data_dir = '../data/ShortJokeKaggle/'
output_dir = '../data/processed/jokes'
output_dir_toy = output_dir + '_toy'

# Read both splits as tab-separated files without a header row
train_df, dev_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ("train", "dev")
)

# Give the four positional columns their names
for frame in (train_df, dev_df):
    frame.columns = ['idx', 'source', 'label', 'sentence']

In [160]:
#Filter the dataset
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Filter out sentences with tokens less than 5 and greater than 64
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


#Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])

original train size : (406682, 5), original dev size : (22512, 5)
filtered train size : (357062, 5), filtered dev size : (19797, 5)


In [161]:
# Write the full filtered splits as CSV without index or header row.
# exist_ok=True replaces the separate exists() check and also covers the
# case where the folder appears between the check and the creation.
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

os.makedirs(output_dir_toy, exist_ok=True)

# Shuffle with a fixed seed so a fresh run always produces the same toy
# subset; an unseeded sample() would write different rows on every run.
# train_df / dev_df are left rebound to the shuffled frames, as before.
toy_seed = 42
train_df = train_df.sample(frac=1, random_state=toy_seed).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=toy_seed).reset_index(drop=True)

# Toy subset: the first 1000 shuffled train rows and 200 shuffled dev rows
train_df.head(1000).to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
dev_df.head(200).to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

In [162]:
#Data Config
config = {
    "name" : "jokes",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)
    
config = {
    "name" : "formality_toy",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)