In [1]:
import pandas as pd
import os, json

## 1. Formality Dataset

In [92]:
# Formality dataset (GYAFC): corpus location and output folders
data_dir = '../data/GYAFC_Corpus'
output_dir = '../data/processed/formality'
output_dir_toy = f'{output_dir}_toy'

# The two corpus sub-folders that are read in the next cell
entertainment = f"{data_dir}/Entertainment_Music"
family = f"{data_dir}/Family_Relationships"

# Accumulators for sentences and their integer labels, one pair per split
train_sent, train_labels = [], []
dev_sent, dev_labels = [], []


In [93]:
# Read both corpus folders. The label is the position in the list:
# 0 = informal, 1 = formal. The corpus "test" files feed the dev lists.
for dir_ in [entertainment, family]:
    for l, label in enumerate(['informal', 'formal']):
        # Explicit UTF-8 so decoding does not depend on the platform locale
        with open(f"{dir_}/train/{label}", "r", encoding="utf-8") as fob:
            lines = fob.readlines()
        train_sent += lines
        train_labels += [l] * len(lines)
        with open(f"{dir_}/test/{label}", "r", encoding="utf-8") as fob:
            lines = fob.readlines()
        dev_sent += lines
        dev_labels += [l] * len(lines)

# Drop the trailing newline (and surrounding whitespace) from every sentence
train_sent = [x.strip() for x in train_sent]
dev_sent = [x.strip() for x in dev_sent]

In [94]:
# One frame per split, pairing every sentence with its label
train_df = pd.DataFrame(dict(sentence=train_sent, label=train_labels))
dev_df = pd.DataFrame(dict(sentence=dev_sent, label=dev_labels))

In [95]:
# Filter the dataset by whitespace-token count
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (more than 4, fewer than 64)
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


# Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])

# Mix train and dev, then re-split them into 80% train and 20% dev.
# The fixed random_state makes the shuffle, and so the split, reproducible.
total_df = pd.concat([train_df,dev_df])
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")


original train size : (209124, 3), original dev size : (4849, 3)
filtered train size : (207366, 3), filtered dev size : (4803, 3)
shuffled train size : (169735, 2), shuffled dev size : (42434, 2)


In [96]:
# Preview the first rows of train_df
train_df.head()

Unnamed: 0,sentence,label
0,(IE: Seeing #2 without #1 knowing.),0
1,Yea its Elton The FAG John there ya go !,0
2,My Java teacher is dumb and crazy.,1
3,What the hell is wrong with you?!,0
4,Have fun finding out because I don't know the ...,1


In [97]:
# Write the full splits as header-less CSV files (columns: sentence, label)
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class shares of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first 1000 train / 200 dev rows after a seeded reshuffle
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train = train_df.head(1000)
toy_dev = dev_df.head(200)

# Divide by the actual toy size so the shares stay right for small datasets
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.505335
1    0.494665
dtype: float64 

Dev Split Perc :  label
0    0.507282
1    0.492718
dtype: float64 

Train Split Perc :  label
0    0.496
1    0.504
dtype: float64 

Dev Split Perc :  label
0    0.54
1    0.46
dtype: float64 



In [98]:
# Data Config: write one config.json next to each output folder's CSV files.
# The full and toy configs differ only in name and description.
for folder, name, description in [
    (output_dir, "formality", "Derived from the GYAFC Corpus"),
    (output_dir_toy, "formality_toy", "Derived from the GYAFC Corpus; Toy dataset"),
]:
    config = {
        "name" : name,
        "description" : description,
        "input_files" : {
            "train" : "train.csv",
            "dev" : "dev.csv"
        },
        # json.dump serialises the integer keys as the strings "0" and "1"
        "classes" : {
            0 : "informal",
            1 : "formal",
        }
    }
    with open(f"{folder}/config.json","w") as fob:
        json.dump(config, fob)

## 2. Short Jokes

In [99]:
# Short jokes dataset: input folder and output folders
data_dir = '../data/ShortJokeKaggle/'
output_dir = '../data/processed/jokes'
output_dir_toy = f'{output_dir}_toy'

# Both TSV files are read without a header row, then given column names
train_df, dev_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ("train", "dev")
)

joke_columns = ['idx', 'source', 'label', 'sentence']
train_df.columns = joke_columns
dev_df.columns = joke_columns

In [100]:
# Filter the dataset by whitespace-token count
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (more than 4, fewer than 64)
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


# Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])


# Mix train and dev, then re-split them into 80% train and 20% dev.
# The fixed random_state makes the shuffle, and so the split, reproducible.
total_df = pd.concat([train_df,dev_df])
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (406682, 5), original dev size : (22512, 5)
filtered train size : (357062, 5), filtered dev size : (19797, 5)
shuffled train size : (301487, 2), shuffled dev size : (75372, 2)


In [101]:
# Preview the first rows of train_df
train_df.head()

Unnamed: 0,sentence,label
0,"haha, exactly what ive been thinking",0
1,usually security guards patrol the grounds at ...,0
2,"Insomnia sufferers, look on the bright side. o...",1
3,I have never once hit a drink or treated one b...,1
4,Outvoted 1-1 by my wife again.,1


In [102]:
# Write the full splits as header-less CSV files (columns: sentence, label)
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class shares of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first 1000 train / 200 dev rows after a seeded reshuffle
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train = train_df.head(1000)
toy_dev = dev_df.head(200)

# Divide by the actual toy size so the shares stay right for small datasets
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.433488
1    0.566512
dtype: float64 

Dev Split Perc :  label
0    0.434339
1    0.565661
dtype: float64 

Train Split Perc :  label
0    0.427
1    0.573
dtype: float64 

Dev Split Perc :  label
0    0.435
1    0.565
dtype: float64 



In [103]:
# Data Config: one config.json per output folder (full set, then toy set)
config = {
    "name" : "jokes",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    # json.dump serialises the integer keys as the strings "0" and "1"
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)

config = {
    # Name the toy variant after this dataset, matching the "<name>_toy"
    # convention used for the formality toy config
    "name" : "jokes_toy",
    "description" : "Derived from SARC, shortjokes.csv, BiasSum; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nojoke",
        1 : "joke",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)

## 3. Metaphor

In [104]:
# Metaphor dataset (VUA): input folder and output folders
data_dir = '../data/VUA/'
output_dir = '../data/processed/metaphor'
output_dir_toy = f'{output_dir}_toy'

# Read the three TSV splits without a header row
train_df, dev_df, test_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ("train", "dev", "test")
)

In [105]:
# Give all three splits the same column names
for frame in (train_df, dev_df, test_df):
    frame.columns = ['source', 'sentence', 'label']

In [106]:
# Fold the test split into dev.
# NOTE: re-running this cell appends test_df to dev_df again.
dev_df = pd.concat([dev_df,test_df])

In [107]:
# Filter the dataset by whitespace-token count
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (more than 4, fewer than 64)
train_df = train_df[(train_df['words']>4) & (train_df['words']<64)]
dev_df = dev_df[(dev_df['words']>4) & (dev_df['words']<64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")


# Select necessary columns
train_df = train_df.filter(['sentence','label'])
dev_df = dev_df.filter(['sentence','label'])


# Mix train and dev, then re-split them into 80% train and 20% dev.
# The fixed random_state makes the shuffle, and so the split, reproducible.
total_df = pd.concat([train_df,dev_df])
total_df = total_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(total_df) *  0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (15157, 4), original dev size : (7511, 4)
filtered train size : (14484, 4), filtered dev size : (7061, 4)
shuffled train size : (17236, 2), shuffled dev size : (4309, 2)


In [108]:
# Preview the first rows of train_df
train_df.head()

Unnamed: 0,sentence,label
0,As his eyes focused he realized he was looking...,1
1,The increase will not be matched by dividend r...,1
2,"If the complaint is proved , a nuisance order ...",0
3,"Let me chop you that much , you eat up that let",0
4,Workers in blue overalls drifted around us and...,1


In [109]:
# Write the full splits as header-less CSV files (columns: sentence, label)
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class shares of the full splits
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first 1000 train / 200 dev rows after a seeded reshuffle
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=42).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=42).reset_index(drop=True)

toy_train = train_df.head(1000)
toy_dev = dev_df.head(200)

# Divide by the actual toy size so the shares stay right for small datasets
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.715653
1    0.284347
dtype: float64 

Dev Split Perc :  label
0    0.707357
1    0.292643
dtype: float64 

Train Split Perc :  label
0    0.705
1    0.295
dtype: float64 

Dev Split Perc :  label
0    0.71
1    0.29
dtype: float64 



In [110]:
# Data Config: one config.json per output folder (full set, then toy set)
config = {
    # Name matches the dataset written to ../data/processed/metaphor
    "name" : "metaphor",
    "description" : "Derived from VUA",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    # json.dump serialises the integer keys as the strings "0" and "1"
    "classes" : {
        0 : "nometaphor",
        1 : "metaphor",
    }
}

with open(f"{output_dir}/config.json","w") as fob:
    json.dump(config, fob)

config = {
    # Toy variant: "<name>_toy" plus the "; Toy dataset" description suffix,
    # following the convention of the formality configs
    "name" : "metaphor_toy",
    "description" : "Derived from VUA; Toy dataset",
    "input_files" : {
        "train" : "train.csv",
        "dev" : "dev.csv"
    },
    "classes" : {
        0 : "nometaphor",
        1 : "metaphor",
    }
}

with open(f"{output_dir_toy}/config.json","w") as fob:
    json.dump(config, fob)

## 4. Abstracts

In [32]:
# Abstracts dataset: raw sentence file and the processed output locations
output_dir = '../data/processed/abstract'
abstract_path = '../data/temp/abstract_sents.csv'
abstract_train_path = f'{output_dir}/train.csv'
abstract_dev_path = f'{output_dir}/dev.csv'
abstract_config_path = f'{output_dir}/config.json'

In [33]:
# Load the raw abstract sentences, one per line.
# Explicit UTF-8 so decoding does not depend on the platform locale.
with open(abstract_path, encoding="utf-8") as fob:
    abs_data = fob.readlines()
len(abs_data)

57116

In [34]:
# Build negative.csv from the filtered formality training sentences; the next
# cell reads it back as the class-0 rows of the abstract dataset.
# NOTE(review): the input file is written by the "Filter Data" section further
# down in this notebook, so that section has to be run before this cell.
formality_df = pd.read_csv('../data/processed_filtered/formality_full/train.csv', header=None)
formality_df[1] = 0  # overwrite the label column so every row is class 0
formality_df.to_csv('../data/processed_filtered/formality_full/negative.csv',header=False, index=False)

In [35]:
# Trim whitespace and surrounding double quotes; abstract sentences are class 1
abs_data = [x.strip().strip('\"').strip() for x in abs_data]
abs_df = pd.DataFrame({'abs':abs_data, 'class': 1})

# Negative examples (class 0): a 60000-row sample of the formality sentences.
# The fixed random_state makes the sample reproducible.
formality = pd.read_csv('../data/processed_filtered/formality_full/negative.csv', header=None)
formality.columns = ['abs','class']
formality = formality.sample(n=60000, replace=False, random_state=42)


In [36]:
# Stack the abstract rows (class 1) on top of the sampled formality rows (class 0).
# NOTE: re-running this cell appends the formality rows to abs_df again.
abs_df = pd.concat([abs_df,formality], axis=0)
abs_df

Unnamed: 0,abs,class
0,"38% are observed for OSIRIS and IriCore, respe...",1
1,A smartphone with a mobile app is connected to...,1
2,Good scalability is achieved through flexible ...,1
3,This problem is further compounded due to the ...,1
4,This deep learning based technique is shown to...,1
...,...,...
11295,I believe that if you really are in love with ...,0
98270,Final destination 3 unless u don't like really...,0
144365,YOu should be asking your self what kind of pe...,0
128053,just go for a tiny bit more each time.,0


In [37]:
# Shuffle, then split 80% train / 20% dev.
# The fixed random_state makes the shuffle, and so the split, reproducible.
abs_df = abs_df.sample(frac=1, random_state=42).reset_index(drop=True)

train_samples = int(len(abs_df) *  0.8)
dev_samples = len(abs_df) - train_samples

dev_df = abs_df.tail(dev_samples)
train_df = abs_df.head(train_samples)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")


shuffled train size : (93692, 2), shuffled dev size : (23424, 2)


In [38]:
# Write both splits as header-less CSV files (columns: abs, class)
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(abstract_train_path, index=False, header=False)
dev_df.to_csv(abstract_dev_path, index=False, header=False)

# Class shares of each split
print("Train Split Perc : ", train_df.groupby('class').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('class').size()/len(dev_df),'\n')

Train Split Perc :  class
0    0.512723
1    0.487277
dtype: float64 

Dev Split Perc :  class
0    0.510673
1    0.489327
dtype: float64 



In [39]:
# Data Config: describes the CSV files written to output_dir
config = dict(
    name="abstract",
    description="Derived from Abstracts of papers",
    input_files=dict(train="train.csv", dev="dev.csv"),
    # json.dump serialises the integer keys as the strings "0" and "1"
    classes={0: "noabstract", 1: "abstract"},
)

config_file = f"{output_dir}/config.json"
with open(config_file, "w") as fob:
    json.dump(config, fob)

## 5. Shakespeare

In [2]:
# Shakespeare dataset: the two raw text files and the processed output locations
output_dir = '../data/processed/shakespeare'
shake_path = '../data/temp/shake_original.txt'
shake_mod_path = '../data/temp/shake_modern.txt'
shake_train_path = f'{output_dir}/train.csv'
shake_dev_path = f'{output_dir}/dev.csv'
shake_config_path = f'{output_dir}/config.json'


In [3]:
# Load the original and the modern text, one line per entry.
# Explicit UTF-8 so decoding does not depend on the platform locale.
with open(shake_path, encoding="utf-8") as fob:
    shake = fob.readlines()
print(len(shake))

with open(shake_mod_path, encoding="utf-8") as fob:
    shake_mod = fob.readlines()
print(len(shake_mod))

28239
28239


In [4]:
# Trim whitespace, then any surrounding double quotes, from every line
clean = lambda line: line.strip().strip('\"')
shake = list(map(clean, shake))
shake_mod = list(map(clean, shake_mod))

# Lines from shake_original.txt are class 1, lines from shake_modern.txt class 0
shake_df = pd.DataFrame({'shake':shake, 'class': 1})
shake_mod_df = pd.DataFrame({'shake':shake_mod, 'class': 0})
shake_df

Unnamed: 0,shake,class
0,I have a mind to strike thee ere thou speak’st.,1
1,"Yet if thou say Antony lives, is well, Or frie...",1
2,"Madam, he’s well.",1
3,Well said.,1
4,And friends with Caesar.,1
...,...,...
28234,What a thrice-double ass Was I to take this dr...,1
28235,"Go to, away!",1
28236,"to Stephano and Trinculo] Hence, and bestow yo...",1
28237,"Or stole it, rather.",1


In [5]:
# The split sizes are computed from shake_df only and then applied to both
# frames. If the frames differed in length, head(train_samples) and
# tail(dev_samples) of the modern frame would overlap or skip rows, so fail
# loudly instead of leaking rows between train and dev.
assert len(shake_df) == len(shake_mod_df), "original and modern files must have the same number of lines"

train_samples = int(len(shake_df) *  0.8)
dev_samples = len(shake_df) - train_samples

# First 80% of each frame -> train, last 20% -> dev. No shuffle happens here,
# although the message below says "shuffled".
dev_df = pd.concat([shake_df.tail(dev_samples),shake_mod_df.tail(dev_samples)])
train_df = pd.concat([shake_df.head(train_samples),shake_mod_df.head(train_samples)])
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")


shuffled train size : (45182, 2), shuffled dev size : (11296, 2)


In [6]:
# Write both splits as header-less CSV files (columns: shake, class)
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(shake_train_path, index=False, header=False)
dev_df.to_csv(shake_dev_path, index=False, header=False)

# Class shares of each split
print("Train Split Perc : ", train_df.groupby('class').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('class').size()/len(dev_df),'\n')

Train Split Perc :  class
0    0.5
1    0.5
dtype: float64 

Dev Split Perc :  class
0    0.5
1    0.5
dtype: float64 



In [7]:
# Data Config: describes the CSV files written to output_dir
config = dict(
    name="shakespeare",
    description="Derived from Shakespeare Plays (https://github.com/cocoxu/Shakespeare)",
    input_files=dict(train="train.csv", dev="dev.csv"),
    # json.dump serialises the integer keys as the strings "0" and "1"
    classes={0: "noshakespeare", 1: "shakespeare"},
)

config_file = f"{output_dir}/config.json"
with open(config_file, "w") as fob:
    json.dump(config, fob)

# Filter Data

In [1]:
import pandas as pd
import os, json, shutil
from collections import Counter, OrderedDict

In [9]:
## formality: input folder plus two output folders (capped and full)
data_input = '../data/processed/formality'
data_output = '../data/processed_filtered/formality'
data_output_full = f'{data_output}_full'

# Header-less CSVs: column 0 holds the text, column 1 the label
formality_t, formality_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [10]:
# Character histogram over the train and dev text, most frequent first;
# used to pick out unusual characters by eye
all_text = ' '.join(formality_t[0]) + ' '.join(formality_d[0])
p = OrderedDict(Counter(all_text).most_common())
print(p)
print('\n', list(p))

OrderedDict([(' ', 2280015), ('e', 982719), ('t', 770665), ('o', 756069), ('a', 641948), ('i', 572556), ('n', 558029), ('s', 509686), ('h', 457329), ('r', 428444), ('l', 360810), ('u', 313756), ('d', 292019), ('y', 262229), ('m', 221332), ('.', 220916), ('w', 188720), ('g', 176637), ('c', 170854), ('f', 155649), ('b', 134281), ('p', 118800), ('k', 110853), ('I', 103104), ('v', 95533), (',', 84555), ('T', 47063), ("'", 45414), ('!', 40085), ('E', 33123), ('A', 32658), ('S', 30865), ('O', 29809), ('H', 26929), ('?', 26852), ('N', 24120), ('Y', 23070), ('D', 20951), ('L', 19131), ('M', 19070), ('W', 18745), ('j', 18628), ('R', 17674), ('B', 16991), ('C', 14101), ('G', 13595), ('x', 13413), ('U', 11484), ('P', 11439), ('F', 9829), ('-', 9403), ('"', 9262), ('q', 7274), ('z', 7037), ('K', 6774), (')', 6382), ('1', 6195), ('J', 6089), ('0', 5422), ('V', 5253), ('2', 5204), (':', 4589), ('(', 4524), ('3', 2791), ('5', 2565), ('*', 2404), ('4', 2355), (';', 2335), ('9', 2076), ('/', 1909), ('8

In [11]:
# Substrings that flag a sentence as "weird" (unusual symbols, non-English
# characters, brackets, URLs, repeated punctuation)
formality_weird = [ '_', '@', 'Q', '=', '#', '>', '^', '%', '~', '`', ']', '+', '[', '<', '’', '¨', '´', '}', '{', '“', '—', '…', '”', 'é', '|', '¡', 'Ü', '–', 'ı', '♥', '¿', 'ñ', '·', '\\', '\u200b', 'ü', '¢', 'ö', '§', 'á', 'è', '˝', '嘉', '義', '人', '因', '為', '綠', '豆', '加', '薏', '仁', '©', '™', '‘', '☺', 'ŕ', 'ā', 'ə', '®', 'š', '†', 'Æ', '恭', '喜', '發', '財', '♡', '½', 'í', 'ƒ', 'Ä', 'ù', 'س', 'ا', 'م', 'ه', 'º', '¹', 'œ', '•', 'ó', '►', 'λ', '◄', 'à', '»', 'ĕ', 'û', 'ï', '(',')',':','--', '....', '!!!', 'www', 'http']

for frame in (formality_t, formality_d):
    # True when the sentence contains at least one flagged substring
    frame['weird'] = frame[0].apply(lambda x: any(x.find(i) >= 0 for i in formality_weird))
    # Whitespace-token count
    frame['tokens'] = frame[0].apply(lambda x: len(x.split(' ')))


In [12]:
# Frame size before and after the filter (not weird, 5 to 30 tokens)
for frame in (formality_t, formality_d):
    n_tokens = frame['tokens']
    keep = ~frame['weird'] & (n_tokens >= 5) & (n_tokens <= 30)
    print(frame.shape, frame[keep].shape)

(169735, 4) (154938, 4)
(42434, 4) (38709, 4)


In [13]:
# Apply the filter (not weird, 5 to 30 tokens) and keep only the text (0)
# and label (1) columns.
# NOTE: this overwrites the frames and drops the 'weird'/'tokens' columns, so
# the cell cannot be re-run without re-running the cells above it.
formality_t = formality_t[(~formality_t['weird']) & (formality_t['tokens']>=5) & (formality_t['tokens']<=30)].filter([0,1])
formality_d = formality_d[(~formality_d['weird']) & (formality_d['tokens']>=5) & (formality_d['tokens']<=30)].filter([0,1])

# Seeded shuffle so the capped subsets below are reproducible
formality_t = formality_t.sample(frac=1, random_state=42).reset_index(drop=True)
formality_d = formality_d.sample(frac=1, random_state=42).reset_index(drop=True)

# Capped subsets: at most 100000 train and 25000 dev rows
formality_tt = formality_t[:100000]
formality_dd = formality_d[:25000]


In [15]:
# Number of rows per label (column 1) in the capped training subset
formality_tt.groupby(1).size()

1
0    47230
1    52770
dtype: int64

In [16]:
# Write the capped subsets and copy the unfiltered dataset's config next to them
os.makedirs(data_output, exist_ok=True)
formality_tt.to_csv(f'{data_output}/train.csv', header=False, index=False)
formality_dd.to_csv(f'{data_output}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/formality/config.json'

In [17]:
# Write the complete filtered frames (no row cap) and copy the config alongside
os.makedirs(data_output_full, exist_ok=True)
formality_t.to_csv(f'{data_output_full}/train.csv', header=False, index=False)
formality_d.to_csv(f'{data_output_full}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output_full}/config.json')

'../data/processed_filtered/formality_full/config.json'

In [52]:
## Arousal: input and output folders
data_input = '../data/processed/arousal'
data_output = '../data/processed_filtered/arousal'

# Header-less CSVs: column 0 holds the text, column 1 the label
arousal_t, arousal_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [53]:
# Character histogram over the train and dev text, most frequent first
all_text = ' '.join(arousal_t[0]) + ' '.join(arousal_d[0])
p = OrderedDict(Counter(all_text).most_common())
print(p)
print('\n', list(p))

OrderedDict([(' ', 144718), ('e', 79546), ('t', 57680), ('a', 53839), ('o', 52043), ('n', 46969), ('i', 46767), ('s', 43431), ('r', 40361), ('h', 31456), ('l', 28804), ('d', 24916), ('u', 19126), ('c', 18819), ('m', 15939), ('f', 14100), ('g', 13587), ('p', 13325), ('y', 11882), ('w', 11653), ('b', 9178), (',', 8470), ('.', 7980), ('v', 6690), ('k', 5292), ('I', 3115), ('T', 2694), ('A', 2191), ('S', 2020), ('-', 1960), ("'", 1796), ('C', 1693), ('"', 1616), ('0', 1516), ('M', 1386), ('x', 1325), ('1', 1148), ('H', 1070), ('N', 1056), ('W', 1051), ('B', 1012), ('P', 959), ('’', 881), ('E', 854), ('R', 848), ('F', 831), ('j', 767), ('2', 757), ('D', 720), ('9', 696), ('L', 691), ('z', 672), ('q', 638), (':', 638), ('O', 637), ('”', 605), ('“', 601), ('G', 577), ('5', 551), ('Y', 491), ('K', 470), (')', 470), ('3', 469), ('(', 465), ('U', 455), ('J', 412), ('?', 380), ('8', 344), ('4', 340), ('V', 320), ('7', 315), ('6', 307), ('$', 301), (';', 276), ('—', 153), ('!', 140), ('/', 119), (

In [54]:
# Substrings that flag a sentence as "weird" (some entries appear twice,
# which does not affect the result)
arousal_weird = [ '\u2002', '–', '%', '♭', 'Q', '…', '\xad', '♯', '_', '[', ']', 'X', 'ó', '#', '‘', '·', '=', '+', '\xa0', '|', '`', 'é', '<', '>', 'ç', '®', '*', 'ñ', 'ã', '@', 'è', 'õ', '♮', 'ü', 'í', '(',')',':','--', '....', '!!!', 'www', 'http','_',':',"”","“"]

for frame in (arousal_t, arousal_d):
    # True when the sentence contains at least one flagged substring
    frame['weird'] = frame[0].apply(lambda x: any(x.find(i) >= 0 for i in arousal_weird))
    # Whitespace-token count
    frame['tokens'] = frame[0].apply(lambda x: len(x.split(' ')))


In [55]:
# Frame size before and after the filter (not weird, 5 to 40 tokens)
for frame in (arousal_t, arousal_d):
    n_tokens = frame['tokens']
    keep = ~frame['weird'] & (n_tokens >= 5) & (n_tokens <= 40)
    print(frame.shape, frame[keep].shape)

(6901, 4) (5178, 4)
(1726, 4) (1320, 4)


In [56]:
# Number of training rows per label (column 1)
arousal_t.groupby(1).size()

1
0    2422
1    4479
dtype: int64

In [57]:
# Apply the filter (not weird, 5 to 40 tokens) and keep only the text (0)
# and label (1) columns
arousal_t =  arousal_t[(~arousal_t['weird']) & (arousal_t['tokens']>=5) & (arousal_t['tokens']<=40)].filter([0,1])
arousal_d = arousal_d[(~arousal_d['weird']) & (arousal_d['tokens']>=5) & (arousal_d['tokens']<=40)].filter([0,1])

# Write the filtered splits and copy the unfiltered dataset's config next to them
os.makedirs(data_output, exist_ok=True)
arousal_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
arousal_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/arousal/config.json'

In [10]:
## Emo: input and output folders
data_input = '../data/processed/emo'
data_output = '../data/processed_filtered/emo'

# Header-less CSVs: column 0 holds the text, column 1 the label
emo_t, emo_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [11]:
# Character histogram over the train and dev text, most frequent first
all_text = ' '.join(emo_t[0]) + ' '.join(emo_d[0])
p = OrderedDict(Counter(all_text).most_common())
print(p)
print('\n', list(p))

OrderedDict([(' ', 550295), ('e', 250830), ('t', 180123), ('o', 165422), ('a', 160428), ('i', 151709), ('n', 131936), ('s', 114983), ('l', 102090), ('r', 100713), ('h', 97975), ('d', 73543), ('u', 64931), ('m', 60146), ('y', 58064), ('g', 51959), ('f', 49950), ('c', 44549), ('w', 43190), ('p', 34100), ('b', 31669), ('.', 31654), ('k', 27177), ('v', 22213), ('I', 14849), (',', 9914), ("'", 9734), ('!', 9656), ('T', 6468), ('j', 4802), ('@', 4690), ('x', 3847), ('S', 3578), ('?', 3516), ('W', 3406), ('H', 3190), ('A', 3087), ('O', 2970), ('M', 2844), ('Y', 2767), ('L', 2042), ('’', 1973), ('D', 1930), ('N', 1921), ('z', 1911), ('B', 1907), ('G', 1905), ('C', 1834), ('E', 1704), ('q', 1656), ('/', 1500), ('-', 1358), ('P', 1256), ('R', 1202), (';', 1180), ('F', 1049), ('J', 1021), ('&', 993), ('0', 976), ('1', 956), (':', 875), ('2', 865), ('K', 805), ('3', 648), ('U', 639), ('V', 596), ('4', 555), ('_', 555), ('5', 435), ('6', 399), (')', 384), ('8', 366), ('7', 344), ('(', 330), ('9', 3

In [12]:
# Substrings that flag a sentence as "weird" (some entries appear more than
# once, which does not affect the result)
emo_weird = [ '\u2002', '–', '%', '♭', 'Q', '…', '\xad', '♯', '_', '[', ']', 'X', 'ó', '#', '‘', '·', '=', '+', '\xa0', '|', '`', 'é', '<', '>', 'ç', '®', '*', 'ñ', 'ã', '@', 'è', 'õ', '♮', 'ü', 'í', '(',')',':','--', '....', '!!!', 'www', 'http','_',':',"”","“",'ï', '¿', '½', '$', '~', '+', ']', '%', '[', '^', '`', '”', '“', '|', '‘', '\\', '—', '–', '。', 'Â', '¡', '′', '{', '}','=', 'ï', '¿', '½', '~', '+', ']']

for frame in (emo_t, emo_d):
    # True when the sentence contains at least one flagged substring
    frame['weird'] = frame[0].apply(lambda x: any(x.find(i) >= 0 for i in emo_weird))
    # Whitespace-token count
    frame['tokens'] = frame[0].apply(lambda x: len(x.split(' ')))


In [13]:
# Frame size before and after the filter (not weird, 5 to 40 tokens)
for frame in (emo_t, emo_d):
    n_tokens = frame['tokens']
    keep = ~frame['weird'] & (n_tokens >= 5) & (n_tokens <= 40)
    print(frame.shape, frame[keep].shape)

(27432, 4) (21866, 4)
(6858, 4) (5533, 4)


In [14]:
# Apply the filter (not weird, 5 to 40 tokens) and keep only the text (0)
# and label (1) columns
keep_t = ~emo_t['weird'] & (emo_t['tokens'] >= 5) & (emo_t['tokens'] <= 40)
keep_d = ~emo_d['weird'] & (emo_d['tokens'] >= 5) & (emo_d['tokens'] <= 40)
emo_t = emo_t[keep_t].filter([0, 1])
emo_d = emo_d[keep_d].filter([0, 1])

In [15]:
# Number of training rows per label (column 1)
emo_t.groupby(1).size()

1
0     6733
1    15133
dtype: int64

In [16]:
# Write the filtered splits and copy the unfiltered dataset's config next to them
os.makedirs(data_output, exist_ok=True)
emo_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
emo_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/emo/config.json'

In [2]:
## Abstract: input and output folders
data_input = '../data/processed/abstract'
data_output = '../data/processed_filtered/abstract'

# Header-less CSVs: column 0 holds the text, column 1 the label
abs_t, abs_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

# Character histogram over the train and dev text, most frequent first.
# Every cell goes through str() before joining.
all_text = ' '.join(map(str, abs_t[0])) + ' '.join(map(str, abs_d[0]))
p = OrderedDict(Counter(all_text).most_common())
print(p)
print('\n', list(p))


OrderedDict([(' ', 1851612), ('e', 1120534), ('t', 805990), ('a', 750895), ('i', 701984), ('o', 699165), ('n', 672654), ('s', 585102), ('r', 571847), ('l', 384240), ('h', 359264), ('d', 325966), ('c', 306539), ('u', 261465), ('m', 249750), ('p', 220992), ('g', 191552), ('f', 191140), ('y', 156985), ('w', 132771), ('b', 116400), ('v', 105537), (',', 78585), ('k', 64870), ('.', 55108), ('I', 42135), ('-', 37229), ('T', 31419), ('x', 24423), ('A', 19706), ('N', 19291), ('S', 19198), ('D', 17868), ('M', 16675), ('W', 14817), ('E', 14684), ('L', 14513), ('C', 14183), ("'", 14040), ('R', 13380), ('O', 13050), (')', 12756), ('(', 12422), ('q', 11982), ('z', 11921), ('H', 11374), ('j', 9638), ('P', 9290), ('B', 8623), ('F', 8351), ('G', 7810), ('0', 6971), ('!', 6851), ('Y', 6648), ('?', 6600), ('1', 6359), ('U', 6141), ('2', 5297), ('"', 4648), ('V', 3934), ('3', 3349), ('5', 2677), ('K', 2656), ('9', 2581), ('8', 2168), ('%', 2161), ('4', 2152), ('J', 2103), ('6', 2009), ('/', 1952), (':', 1

In [3]:
# 'weird' is True when the text (converted with str()) contains at least one
# of the listed substrings. The same list is repeated for the train and the
# dev frame and must be kept in sync by hand. Many entries are look-alike
# Unicode characters (different dashes, tildes, mu signs), so edit with care.
abs_t['weird']= abs_t[0].apply(lambda x: bool(sum([str(x).find(i)>=0 for i in [ '_', '–', '”', '“', '^', '\xa0', '\u2009', '`', '~', '—', '×', 'ı', '|', '@', '‘', '±', '#', '\u202f', '\u200b', 'ş', 'ü', 'μ', '‐', '°', 'Θ', 'ε', '∗', '∼', 'ö', '≈', 'ï', 'α', '©', 'ç', '−', 'Φ', 'ã', '≥', 'ğ', 'δ', 'θ', 'η', '®', '…', 'Å', 'Ö', '→', '‡', '√', '\x8c', '\x80', 'â', 'é', 'ä', 'ŷ', '≤', '′', 'σ', '•', 'ˆ', 'Š', 'Ω', '₅', '₀', 'β', '†', 'ν', 'Δ', 'æ', '£', 'à', 'ω', '″', '²', 'ê', 'á', '⊤', 'γ', 'ú', '̆', '║', 'ℜ', 'Æ', 'å', '˜', 'ϵ', '§', 'ξ', 'ĭ', '\u2005', 'ă', '¾', 'Ã', '‰', 'ﬀ', 'µ', 'ź', 'ℓ', '™', '\x83', '\x8a', 'ó', '∞', '✩', 'ø', 'λ', 'ë', '∈', 'κ', '\x93', 'ﬁ', 'ρ', 'ﬃ', 'Ă', 'Ť', '↔','http','www','--','!!']])) )
abs_d['weird']= abs_d[0].apply(lambda x: bool(sum([str(x).find(i)>=0 for i in [ '_', '–', '”', '“', '^', '\xa0', '\u2009', '`', '~', '—', '×', 'ı', '|', '@', '‘', '±', '#', '\u202f', '\u200b', 'ş', 'ü', 'μ', '‐', '°', 'Θ', 'ε', '∗', '∼', 'ö', '≈', 'ï', 'α', '©', 'ç', '−', 'Φ', 'ã', '≥', 'ğ', 'δ', 'θ', 'η', '®', '…', 'Å', 'Ö', '→', '‡', '√', '\x8c', '\x80', 'â', 'é', 'ä', 'ŷ', '≤', '′', 'σ', '•', 'ˆ', 'Š', 'Ω', '₅', '₀', 'β', '†', 'ν', 'Δ', 'æ', '£', 'à', 'ω', '″', '²', 'ê', 'á', '⊤', 'γ', 'ú', '̆', '║', 'ℜ', 'Æ', 'å', '˜', 'ϵ', '§', 'ξ', 'ĭ', '\u2005', 'ă', '¾', 'Ã', '‰', 'ﬀ', 'µ', 'ź', 'ℓ', '™', '\x83', '\x8a', 'ó', '∞', '✩', 'ø', 'λ', 'ë', '∈', 'κ', '\x93', 'ﬁ', 'ρ', 'ﬃ', 'Ă', 'Ť', '↔','http','www','--','!!']])) )

# Whitespace-token count of the str()-converted text
abs_t['tokens'] = abs_t[0].apply(lambda x: len(str(x).split(' ')))
abs_d['tokens'] = abs_d[0].apply(lambda x: len(str(x).split(' ')))


In [4]:
# Percentage of training rows per token count, for the 40 smallest token counts
abs_t.groupby('tokens').agg({'weird':'count'}).head(40)*100/len(abs_t)

Unnamed: 0_level_0,weird
tokens,Unnamed: 1_level_1
1,1.319216
2,0.372497
3,0.517654
4,0.331939
5,1.369381
6,6.242796
7,6.521368
8,6.394356
9,5.974896
10,5.632285


In [5]:
# Frame size before and after the filter (not weird, more than 5 and at most 30 tokens)
for frame in (abs_t, abs_d):
    n_tokens = frame['tokens']
    keep = ~frame['weird'] & (n_tokens > 5) & (n_tokens <= 30)
    print(frame.shape, frame[keep].shape)

(93692, 4) (81523, 4)
(23424, 4) (20306, 4)


In [6]:
# Apply the filter (not weird, more than 5 and at most 30 tokens) and keep
# only the text (0) and label (1) columns
keep_t = ~abs_t['weird'] & (abs_t['tokens'] > 5) & (abs_t['tokens'] <= 30)
keep_d = ~abs_d['weird'] & (abs_d['tokens'] > 5) & (abs_d['tokens'] <= 30)
abs_t = abs_t[keep_t].filter([0, 1])
abs_d = abs_d[keep_d].filter([0, 1])

# Number of training rows per label after filtering
abs_t.groupby(1).size()

1
0    46607
1    34916
dtype: int64

In [7]:
# Write the filtered splits and copy the unfiltered dataset's config next to them
os.makedirs(data_output, exist_ok=True)
abs_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
abs_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/abstract/config.json'

In [2]:
## Shakespeare: input and output folders
# NOTE: the frames below reuse the names abs_t / abs_d from the Abstract
# cells; from here on they hold the Shakespeare data.
data_input = '../data/processed/shakespeare'
data_output = '../data/processed_filtered/shakespeare'

# Header-less CSVs: column 0 holds the text, column 1 the label
abs_t, abs_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

# Character histogram over the train and dev text, most frequent first.
# Every cell goes through str() before joining.
all_text = ' '.join(map(str, abs_t[0])) + ' '.join(map(str, abs_d[0]))
p = OrderedDict(Counter(all_text).most_common())
print(p)
print('\n', list(p))



OrderedDict([(' ', 565826), ('e', 270473), ('o', 192783), ('t', 187990), ('a', 162903), ('s', 140861), ('n', 139712), ('h', 138712), ('r', 130948), ('i', 130668), ('l', 99223), ('d', 87066), ('u', 78010), ('m', 65292), ('y', 63586), (',', 52309), ('w', 49863), ('f', 42731), ('c', 40971), ('.', 40937), ('g', 40660), ('b', 30265), ('p', 29289), ('v', 23606), ('I', 23084), ('k', 21827), ('’', 14066), ('T', 11577), ('?', 10178), ('W', 9258), ('A', 9070), ('!', 6388), ('H', 5643), ('B', 4871), ('S', 4568), ('C', 4323), ("'", 4262), ('O', 3952), ('M', 3947), ('Y', 3429), (';', 3160), ('N', 3064), ('G', 3028), ('—', 2815), ('L', 2790), ('D', 2627), ('F', 2430), ('P', 1937), ('j', 1919), ('x', 1639), ('-', 1509), ('q', 1476), ('E', 1334), ('R', 1226), ('z', 894), ('”', 773), (':', 713), ('K', 609), ('“', 585), ('J', 556), ('\xa0', 497), ('U', 415), ('V', 373), ('è', 241), ('Q', 196), ('\u2003', 183), (']', 168), ('[', 75), ('(', 48), (')', 48), ('"', 36), ('‘', 32), ('…', 17), ('é', 15), ('–',

In [3]:
# Substrings that flag a line as "weird"
shake_weird = [ '-', '”', ':', '“', '\xa0', 'è', '\u2003', '…', 'é', '–', 'ç', 'à', 'Æ', 'ï','æ', '/', 'http','www','--','!!']

for frame in (abs_t, abs_d):
    # True when the str()-converted text contains at least one flagged substring
    frame['weird'] = frame[0].apply(lambda x: any(str(x).find(i) >= 0 for i in shake_weird))
    # Whitespace-token count of the str()-converted text
    frame['tokens'] = frame[0].apply(lambda x: len(str(x).split(' ')))


In [4]:
# Percentage of training rows per token count, for the 40 smallest token counts
abs_t.groupby('tokens').agg({'weird':'count'}).head(40)*100/len(abs_t)

Unnamed: 0_level_0,weird
tokens,Unnamed: 1_level_1
1,2.636005
2,5.06175
3,8.388296
4,12.443008
5,9.154088
6,8.051879
7,6.161746
8,6.70621
9,5.924926
10,4.273826


In [5]:
# Frame size before and after the filter (not weird, 4 to 25 tokens)
for frame in (abs_t, abs_d):
    n_tokens = frame['tokens']
    keep = ~frame['weird'] & (n_tokens >= 4) & (n_tokens <= 25)
    print(frame.shape, frame[keep].shape)

(45182, 4) (34302, 4)
(11296, 4) (7759, 4)


In [6]:
# Apply the filter (not weird, 4 to 25 tokens) and keep only the text (0)
# and label (1) columns
keep_t = ~abs_t['weird'] & (abs_t['tokens'] >= 4) & (abs_t['tokens'] <= 25)
keep_d = ~abs_d['weird'] & (abs_d['tokens'] >= 4) & (abs_d['tokens'] <= 25)
abs_t = abs_t[keep_t].filter([0, 1])
abs_d = abs_d[keep_d].filter([0, 1])

# Number of training rows per label after filtering
abs_t.groupby(1).size()

1
0    17345
1    16957
dtype: int64

In [7]:
# Write the filtered splits and copy the unfiltered dataset's config next to them
os.makedirs(data_output, exist_ok=True)
abs_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
abs_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/shakespeare/config.json'