In [91]:
import pandas as pd
import os, json

## 1. Formality Dataset

In [92]:
# Formality dataset (GYAFC): location of the raw corpus and of the processed output
data_dir = '../data/GYAFC_Corpus'
output_dir = '../data/processed/formality'
output_dir_toy = output_dir + '_toy'

# The two corpus domains that are merged into a single dataset
entertainment, family = (
    f"{data_dir}/{domain}"
    for domain in ("Entertainment_Music", "Family_Relationships")
)

# Accumulators for the sentences and their integer labels of each split
train_sent, train_labels = [], []
dev_sent, dev_labels = [], []


In [93]:
# Gather the sentences of both domains. The position of a style name in the tuple
# is used as its label: 0 = informal, 1 = formal.
styles = ('informal', 'formal')
for domain_dir in (entertainment, family):
    for label_id, style in enumerate(styles):
        # Files under "train" feed the training lists ...
        with open(f"{domain_dir}/train/{style}", "r") as handle:
            lines = handle.readlines()
        train_sent.extend(lines)
        train_labels.extend([label_id] * len(lines))

        # ... while the files under "test" are collected as dev data.
        with open(f"{domain_dir}/test/{style}", "r") as handle:
            lines = handle.readlines()
        dev_sent.extend(lines)
        dev_labels.extend([label_id] * len(lines))

# Remove the trailing newline (and any other surrounding whitespace) of every line
train_sent = list(map(str.strip, train_sent))
dev_sent = list(map(str.strip, dev_sent))

In [94]:
# One frame per split, pairing every sentence with its label
train_df = pd.DataFrame(dict(sentence=train_sent, label=train_labels))
dev_df = pd.DataFrame(dict(sentence=dev_sent, label=dev_labels))

In [95]:
#Filter the dataset
# Count the space-separated tokens of every sentence. `assign` returns a new frame,
# so the frames built in earlier cells are not modified in place.
train_df = train_df.assign(words=train_df['sentence'].apply(lambda x: len(x.split(' '))))
dev_df = dev_df.assign(words=dev_df['sentence'].apply(lambda x: len(x.split(' '))))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep only sentences with more than 4 and fewer than 64 tokens.
# NOTE(review): the earlier comment read "greater than 64", yet 64-token sentences are
# dropped as well; the bounds are left as they were - confirm which one is intended.
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


#Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])

#mix train and dev, and reseparate them based on train: 80% and dev 20%
total_df = pd.concat([train_df,dev_df])
# Fixed seed so that a fresh "Restart & Run All" reproduces the very same split
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

# First 80% of the shuffled rows become train, the remaining rows become dev
dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")


original train size : (209124, 3), original dev size : (4849, 3)
filtered train size : (207366, 3), filtered dev size : (4803, 3)
shuffled train size : (169735, 2), shuffled dev size : (42434, 2)


In [96]:
# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,sentence,label
0,(IE: Seeing #2 without #1 knowing.),0
1,Yea its Elton The FAG John there ya go !,0
2,My Java teacher is dumb and crazy.,1
3,What the hell is wrong with you?!,0
4,Have fun finding out because I don't know the ...,1


In [97]:
# Write the full train/dev splits as header-less CSV files without the index column
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class balance of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: a small random subset (up to 1000 train / 200 dev rows)
os.makedirs(output_dir_toy, exist_ok=True)

# Seeded shuffle so that re-running the cell writes the same toy files
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train_df = train_df.head(1000)
toy_dev_df = dev_df.head(200)

# Divide by the real subset size: head() yields fewer rows when a split is smaller
# than the requested toy size, and a fixed divisor would then misreport the ratios.
print("Train Split Perc : ", toy_train_df.groupby('label').size()/len(toy_train_df),'\n')
print("Dev Split Perc : ", toy_dev_df.groupby('label').size()/len(toy_dev_df),'\n')

toy_train_df.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev_df.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.505335
1    0.494665
dtype: float64 

Dev Split Perc :  label
0    0.507282
1    0.492718
dtype: float64 

Train Split Perc :  label
0    0.496
1    0.504
dtype: float64 

Dev Split Perc :  label
0    0.54
1    0.46
dtype: float64 



In [98]:
#Data Config
# Parts shared by the config of the full and of the toy dataset
shared_inputs = {"train": "train.csv", "dev": "dev.csv"}
label_names = {0: "informal", 1: "formal"}

# (target directory, dataset name, description) for each config file to write
config_specs = (
    (output_dir, "formality", "Derived from the GYAFC Corpus"),
    (output_dir_toy, "formality_toy", "Derived from the GYAFC Corpus; Toy dataset"),
)

for target_dir, dataset_name, dataset_description in config_specs:
    config = {
        "name": dataset_name,
        "description": dataset_description,
        "input_files": dict(shared_inputs),
        "classes": dict(label_names),
    }
    # json.dump writes the integer class ids as string keys ("0", "1")
    with open(f"{target_dir}/config.json", "w") as fob:
        json.dump(config, fob)

## 2. Short Jokes

In [99]:
# Short Jokes: location of the raw TSV files and of the processed output
data_dir = '../data/ShortJokeKaggle/'
output_dir = '../data/processed/jokes'
output_dir_toy = output_dir + '_toy'

# Both splits are read without a header row; the column names are attached afterwards
train_df, dev_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ("train", "dev")
)

for frame in (train_df, dev_df):
    frame.columns = ['idx', 'source', 'label', 'sentence']

In [100]:
#Filter the dataset
# Count the space-separated tokens of every sentence. `assign` returns a new frame,
# so the frames loaded in the previous cell are not modified in place.
train_df = train_df.assign(words=train_df['sentence'].apply(lambda x: len(x.split(' '))))
dev_df = dev_df.assign(words=dev_df['sentence'].apply(lambda x: len(x.split(' '))))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep only sentences with more than 4 and fewer than 64 tokens.
# NOTE(review): the earlier comment read "greater than 64", yet 64-token sentences are
# dropped as well; the bounds are left as they were - confirm which one is intended.
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


#Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])


#mix train and dev, and reseparate them based on train: 80% and dev 20%
total_df = pd.concat([train_df,dev_df])
# Fixed seed so that a fresh "Restart & Run All" reproduces the very same split
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

# First 80% of the shuffled rows become train, the remaining rows become dev
dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (406682, 5), original dev size : (22512, 5)
filtered train size : (357062, 5), filtered dev size : (19797, 5)
shuffled train size : (301487, 2), shuffled dev size : (75372, 2)


In [101]:
# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,sentence,label
0,"haha, exactly what ive been thinking",0
1,usually security guards patrol the grounds at ...,0
2,"Insomnia sufferers, look on the bright side. o...",1
3,I have never once hit a drink or treated one b...,1
4,Outvoted 1-1 by my wife again.,1


In [102]:
# Write the full train/dev splits as header-less CSV files without the index column
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class balance of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: a small random subset (up to 1000 train / 200 dev rows)
os.makedirs(output_dir_toy, exist_ok=True)

# Seeded shuffle so that re-running the cell writes the same toy files
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train_df = train_df.head(1000)
toy_dev_df = dev_df.head(200)

# Divide by the real subset size: head() yields fewer rows when a split is smaller
# than the requested toy size, and a fixed divisor would then misreport the ratios.
print("Train Split Perc : ", toy_train_df.groupby('label').size()/len(toy_train_df),'\n')
print("Dev Split Perc : ", toy_dev_df.groupby('label').size()/len(toy_dev_df),'\n')

toy_train_df.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev_df.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.433488
1    0.566512
dtype: float64 

Dev Split Perc :  label
0    0.434339
1    0.565661
dtype: float64 

Train Split Perc :  label
0    0.427
1    0.573
dtype: float64 

Dev Split Perc :  label
0    0.435
1    0.565
dtype: float64 



In [103]:
#Data Config
# Config of the full jokes dataset; json.dump writes the integer class ids
# as string keys ("0", "1").
config = {
    "name" : "jokes",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)

# Config of the toy subset. The name used to be the copy-pasted "formality_toy",
# which mislabelled this dataset; it now matches the jokes data it describes.
config = {
    "name" : "jokes_toy",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)

## 3. Metaphor

In [104]:
# Metaphor (VUA): location of the raw TSV files and of the processed output
data_dir = '../data/VUA/'
output_dir = '../data/processed/metaphor'
output_dir_toy = output_dir + '_toy'

# All three splits are read as tab-separated files without a header row
train_df, dev_df, test_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ("train", "dev", "test")
)

In [105]:
# Attach the same three column names to every raw split
for frame in (train_df, dev_df, test_df):
    frame.columns = ['source', 'sentence', 'label']

In [106]:
# Append the rows of the test split to the dev split
dev_df = pd.concat((dev_df, test_df), axis=0)

In [107]:
#Filter the dataset
# Count the space-separated tokens of every sentence. `assign` returns a new frame,
# so the frames prepared in earlier cells are not modified in place.
train_df = train_df.assign(words=train_df['sentence'].apply(lambda x: len(x.split(' '))))
dev_df = dev_df.assign(words=dev_df['sentence'].apply(lambda x: len(x.split(' '))))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep only sentences with more than 4 and fewer than 64 tokens.
# NOTE(review): the earlier comment read "greater than 64", yet 64-token sentences are
# dropped as well; the bounds are left as they were - confirm which one is intended.
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


#Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])


#mix train and dev, and reseparate them based on train: 80% and dev 20%
total_df = pd.concat([train_df,dev_df])
# Fixed seed so that a fresh "Restart & Run All" reproduces the very same split
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

# First 80% of the shuffled rows become train, the remaining rows become dev
dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (15157, 4), original dev size : (7511, 4)
filtered train size : (14484, 4), filtered dev size : (7061, 4)
shuffled train size : (17236, 2), shuffled dev size : (4309, 2)


In [108]:
# Preview the first five training rows
train_df.head(n=5)

Unnamed: 0,sentence,label
0,As his eyes focused he realized he was looking...,1
1,The increase will not be matched by dividend r...,1
2,"If the complaint is proved , a nuisance order ...",0
3,"Let me chop you that much , you eat up that let",0
4,Workers in blue overalls drifted around us and...,1


In [109]:
# Write the full train/dev splits as header-less CSV files without the index column
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class balance of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: a small random subset (up to 1000 train / 200 dev rows)
os.makedirs(output_dir_toy, exist_ok=True)

# Seeded shuffle so that re-running the cell writes the same toy files
train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train_df = train_df.head(1000)
toy_dev_df = dev_df.head(200)

# Divide by the real subset size: head() yields fewer rows when a split is smaller
# than the requested toy size, and a fixed divisor would then misreport the ratios.
print("Train Split Perc : ", toy_train_df.groupby('label').size()/len(toy_train_df),'\n')
print("Dev Split Perc : ", toy_dev_df.groupby('label').size()/len(toy_dev_df),'\n')

toy_train_df.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev_df.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.715653
1    0.284347
dtype: float64 

Dev Split Perc :  label
0    0.707357
1    0.292643
dtype: float64 

Train Split Perc :  label
0    0.705
1    0.295
dtype: float64 

Dev Split Perc :  label
0    0.71
1    0.29
dtype: float64 



In [110]:
#Data Config
# Config of the full metaphor dataset. The name used to be the copy-pasted "jokes";
# it now matches the metaphor data written to this directory.
# json.dump writes the integer class ids as string keys ("0", "1").
config = {
    "name" : "metaphor",
    "description" : "Derived from VUA",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nometaphor",
        1 : "metaphor",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)

# Config of the toy subset. The name used to be the copy-pasted "formality_toy";
# the description now carries the same "; Toy dataset" suffix as the other toy configs.
config = {
    "name" : "metaphor_toy",
    "description" : "Derived from VUA; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nometaphor",
        1 : "metaphor",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)