In [1]:
import pandas as pd
import os, json

## 1. Formality Dataset

In [92]:
# Formality dataset (GYAFC): input/output locations and empty accumulators.
data_dir = '../data/GYAFC_Corpus'
output_dir = '../data/processed/formality'
output_dir_toy = f'{output_dir}_toy'

# The two corpus domains that are read by the loading cell.
entertainment = f"{data_dir}/Entertainment_Music"
family = f"{data_dir}/Family_Relationships"

# Sentences and their integer labels; both pairs start empty and are
# filled in by the loading cell.
train_sent, train_labels = [], []
dev_sent, dev_labels = [], []


In [93]:
# Read both GYAFC domains. The label id is the position in the list below:
# 0 = informal, 1 = formal.
for dir_ in [entertainment, family]:
    for label_id, label in enumerate(['informal', 'formal']):
        # The corpus "train" split feeds the train lists; its "test" split
        # feeds the dev lists.
        for split, sentences, labels in (('train', train_sent, train_labels),
                                         ('test', dev_sent, dev_labels)):
            # Explicit UTF-8 so decoding does not depend on the machine's
            # locale (the corpus contains non-ASCII characters).
            with open(f"{dir_}/{split}/{label}", "r", encoding="utf-8") as fob:
                lines = fob.readlines()
            # In-place extension keeps the module-level lists up to date.
            sentences += lines
            labels += [label_id] * len(lines)

# Remove the trailing newline (and any surrounding whitespace) of every line.
train_sent = [x.strip() for x in train_sent]
dev_sent = [x.strip() for x in dev_sent]

In [94]:
# Pair every sentence with its label in a two-column frame, one frame per split.
train_df, dev_df = (
    pd.DataFrame({'sentence': sentences, 'label': labels})
    for sentences, labels in ((train_sent, train_labels), (dev_sent, dev_labels))
)

In [95]:
# Filter the dataset by sentence length, then re-split it 80/20.
SEED = 42  # fixed so the shuffle, and therefore the train/dev split, is reproducible

# Token count = number of pieces obtained by splitting on single spaces.
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (drops fewer than 5 and 64 or more).
train_df = train_df[(train_df['words'] > 4) & (train_df['words'] < 64)]
dev_df = dev_df[(dev_df['words'] > 4) & (dev_df['words'] < 64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")

# Select necessary columns (the helper 'words' column is dropped here).
train_df = train_df.filter(['sentence', 'label'])
dev_df = dev_df.filter(['sentence', 'label'])

# Mix train and dev, then re-separate them: first 80% -> train, last 20% -> dev.
total_df = pd.concat([train_df, dev_df])
total_df = total_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

train_samples = int(len(total_df) * 0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
# The two slices must partition the shuffled frame.
assert len(train_df) + len(dev_df) == len(total_df)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")


original train size : (209124, 3), original dev size : (4849, 3)
filtered train size : (207366, 3), filtered dev size : (4803, 3)
shuffled train size : (169735, 2), shuffled dev size : (42434, 2)


In [96]:
# Show the first rows of the re-split training frame as the cell output.
train_df.head()

Unnamed: 0,sentence,label
0,(IE: Seeing #2 without #1 knowing.),0
1,Yea its Elton The FAG John there ya go !,0
2,My Java teacher is dumb and crazy.,1
3,What the hell is wrong with you?!,0
4,Have fun finding out because I don't know the ...,1


In [97]:
SEED = 42  # fixed so the toy subsets are reproducible
TOY_TRAIN_ROWS, TOY_DEV_ROWS = 1000, 200

# Full dataset: header-less, index-less CSVs (sentence, label).
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class proportions of the full splits.
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first rows of a reshuffled copy of each split.
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

toy_train = train_df.head(TOY_TRAIN_ROWS)
toy_dev = dev_df.head(TOY_DEV_ROWS)

# Divide by the actual subset size so the proportions stay correct even if a
# split has fewer rows than the requested toy size.
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.505335
1    0.494665
dtype: float64 

Dev Split Perc :  label
0    0.507282
1    0.492718
dtype: float64 

Train Split Perc :  label
0    0.496
1    0.504
dtype: float64 

Dev Split Perc :  label
0    0.54
1    0.46
dtype: float64 



In [98]:
#Data Config
# Entries that are identical in the full and the toy config.
# (json.dump serialises the integer class ids as string keys.)
shared_entries = {
    "input_files": {"train": "train.csv", "dev": "dev.csv"},
    "classes": {0: "informal", 1: "formal"},
}

# One config.json per output directory: the full dataset first, then the toy one.
for target_dir, dataset_name, dataset_description in (
    (output_dir, "formality", "Derived from the GYAFC Corpus"),
    (output_dir_toy, "formality_toy", "Derived from the GYAFC Corpus; Toy dataset"),
):
    config = {
        "name": dataset_name,
        "description": dataset_description,
        **shared_entries,
    }
    with open(f"{target_dir}/config.json", "w") as fob:
        json.dump(config, fob)

## 2. Short Jokes

In [99]:
# Short-jokes dataset: input/output locations.
data_dir = '../data/ShortJokeKaggle/'
output_dir = '../data/processed/jokes'
output_dir_toy = f'{output_dir}_toy'

# Both TSVs are read without a header row; the four columns are named afterwards.
train_df = pd.read_csv(f"{data_dir}/train.tsv", sep="\t", header=None)
dev_df = pd.read_csv(f"{data_dir}/dev.tsv", sep="\t", header=None)

for frame in (train_df, dev_df):
    frame.columns = ['idx', 'source', 'label', 'sentence']

In [100]:
# Filter the dataset by sentence length, then re-split it 80/20.
SEED = 42  # fixed so the shuffle, and therefore the train/dev split, is reproducible

# Token count = number of pieces obtained by splitting on single spaces.
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (drops fewer than 5 and 64 or more).
train_df = train_df[(train_df['words'] > 4) & (train_df['words'] < 64)]
dev_df = dev_df[(dev_df['words'] > 4) & (dev_df['words'] < 64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")

# Select necessary columns (drops idx, source and the helper 'words' column).
train_df = train_df.filter(['sentence', 'label'])
dev_df = dev_df.filter(['sentence', 'label'])

# Mix train and dev, then re-separate them: first 80% -> train, last 20% -> dev.
total_df = pd.concat([train_df, dev_df])
total_df = total_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

train_samples = int(len(total_df) * 0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
# The two slices must partition the shuffled frame.
assert len(train_df) + len(dev_df) == len(total_df)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (406682, 5), original dev size : (22512, 5)
filtered train size : (357062, 5), filtered dev size : (19797, 5)
shuffled train size : (301487, 2), shuffled dev size : (75372, 2)


In [101]:
# Show the first rows of the re-split training frame as the cell output.
train_df.head()

Unnamed: 0,sentence,label
0,"haha, exactly what ive been thinking",0
1,usually security guards patrol the grounds at ...,0
2,"Insomnia sufferers, look on the bright side. o...",1
3,I have never once hit a drink or treated one b...,1
4,Outvoted 1-1 by my wife again.,1


In [102]:
SEED = 42  # fixed so the toy subsets are reproducible
TOY_TRAIN_ROWS, TOY_DEV_ROWS = 1000, 200

# Full dataset: header-less, index-less CSVs (sentence, label).
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class proportions of the full splits.
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first rows of a reshuffled copy of each split.
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

toy_train = train_df.head(TOY_TRAIN_ROWS)
toy_dev = dev_df.head(TOY_DEV_ROWS)

# Divide by the actual subset size so the proportions stay correct even if a
# split has fewer rows than the requested toy size.
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.433488
1    0.566512
dtype: float64 

Dev Split Perc :  label
0    0.434339
1    0.565661
dtype: float64 

Train Split Perc :  label
0    0.427
1    0.573
dtype: float64 

Dev Split Perc :  label
0    0.435
1    0.565
dtype: float64 



In [103]:
#Data Config
# Entries that are identical in the full and the toy config.
# (json.dump serialises the integer class ids as string keys.)
shared_entries = {
    "input_files": {"train": "train.csv", "dev": "dev.csv"},
    "classes": {0: "nojoke", 1: "joke"},
}

# One config.json per output directory: the full dataset first, then the toy one.
# The toy config is named "jokes_toy"; it previously carried the name
# "formality_toy", left over from the formality section.
for target_dir, dataset_name, dataset_description in (
    (output_dir, "jokes", "Derived from SARC, shortjokes.csv, BiasSum"),
    (output_dir_toy, "jokes_toy", "Derived from SARC, shortjokes.csv, BiasSum; Toy dataset"),
):
    config = {
        "name": dataset_name,
        "description": dataset_description,
        **shared_entries,
    }
    with open(f"{target_dir}/config.json", "w") as fob:
        json.dump(config, fob)

## 3. Metaphor

In [104]:
# Metaphor dataset (VUA): input/output locations.
data_dir = '../data/VUA/'
output_dir = '../data/processed/metaphor'
output_dir_toy = f'{output_dir}_toy'

# Three tab-separated files, all read without a header row.
train_df, dev_df, test_df = (
    pd.read_csv(f"{data_dir}/{split}.tsv", sep="\t", header=None)
    for split in ('train', 'dev', 'test')
)

In [105]:
# Give the three header-less frames the same column names.
for frame in (train_df, dev_df, test_df):
    frame.columns = ['source', 'sentence', 'label']

In [106]:
# Fold the test rows into dev; row indices of both frames are kept as-is
# (no ignore_index), so index values may repeat in the result.
dev_df = pd.concat([dev_df,test_df])

In [107]:
# Filter the dataset by sentence length, then re-split it 80/20.
SEED = 42  # fixed so the shuffle, and therefore the train/dev split, is reproducible

# Token count = number of pieces obtained by splitting on single spaces.
train_df['words'] = train_df['sentence'].apply(lambda x: len(x.split(' ')))
dev_df['words'] = dev_df['sentence'].apply(lambda x: len(x.split(' ')))
print(f"original train size : {train_df.shape}, original dev size : {dev_df.shape}")

# Keep sentences with 5 to 63 tokens (drops fewer than 5 and 64 or more).
train_df = train_df[(train_df['words'] > 4) & (train_df['words'] < 64)]
dev_df = dev_df[(dev_df['words'] > 4) & (dev_df['words'] < 64)]
print(f"filtered train size : {train_df.shape}, filtered dev size : {dev_df.shape}")

# Select necessary columns (drops source and the helper 'words' column).
train_df = train_df.filter(['sentence', 'label'])
dev_df = dev_df.filter(['sentence', 'label'])

# Mix train and dev, then re-separate them: first 80% -> train, last 20% -> dev.
total_df = pd.concat([train_df, dev_df])
total_df = total_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

train_samples = int(len(total_df) * 0.8)
dev_samples = len(total_df) - train_samples

dev_df = total_df.tail(dev_samples)
train_df = total_df.head(train_samples)
# The two slices must partition the shuffled frame.
assert len(train_df) + len(dev_df) == len(total_df)
print(f"shuffled train size : {train_df.shape}, shuffled dev size : {dev_df.shape}")

original train size : (15157, 4), original dev size : (7511, 4)
filtered train size : (14484, 4), filtered dev size : (7061, 4)
shuffled train size : (17236, 2), shuffled dev size : (4309, 2)


In [108]:
# Show the first rows of the re-split training frame as the cell output.
train_df.head()

Unnamed: 0,sentence,label
0,As his eyes focused he realized he was looking...,1
1,The increase will not be matched by dividend r...,1
2,"If the complaint is proved , a nuisance order ...",0
3,"Let me chop you that much , you eat up that let",0
4,Workers in blue overalls drifted around us and...,1


In [109]:
SEED = 42  # fixed so the toy subsets are reproducible
TOY_TRAIN_ROWS, TOY_DEV_ROWS = 1000, 200

# Full dataset: header-less, index-less CSVs (sentence, label).
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(output_dir, exist_ok=True)
train_df.to_csv(f'{output_dir}/train.csv', index=False, header=False)
dev_df.to_csv(f'{output_dir}/dev.csv', index=False, header=False)

# Class proportions of the full splits.
print("Train Split Perc : ", train_df.groupby('label').size()/len(train_df),'\n')
print("Dev Split Perc : ", dev_df.groupby('label').size()/len(dev_df),'\n')

# Toy dataset: the first rows of a reshuffled copy of each split.
os.makedirs(output_dir_toy, exist_ok=True)

train_df = train_df.sample(frac=1, random_state=SEED).reset_index(drop=True)
dev_df = dev_df.sample(frac=1, random_state=SEED).reset_index(drop=True)

toy_train = train_df.head(TOY_TRAIN_ROWS)
toy_dev = dev_df.head(TOY_DEV_ROWS)

# Divide by the actual subset size so the proportions stay correct even if a
# split has fewer rows than the requested toy size.
print("Train Split Perc : ", toy_train.groupby('label').size()/len(toy_train),'\n')
print("Dev Split Perc : ", toy_dev.groupby('label').size()/len(toy_dev),'\n')

toy_train.to_csv(f'{output_dir_toy}/train.csv', index=False, header=False)
toy_dev.to_csv(f'{output_dir_toy}/dev.csv', index=False, header=False)

Train Split Perc :  label
0    0.715653
1    0.284347
dtype: float64 

Dev Split Perc :  label
0    0.707357
1    0.292643
dtype: float64 

Train Split Perc :  label
0    0.705
1    0.295
dtype: float64 

Dev Split Perc :  label
0    0.71
1    0.29
dtype: float64 



In [110]:
#Data Config
# Entries that are identical in the full and the toy config.
# (json.dump serialises the integer class ids as string keys.)
shared_entries = {
    "input_files": {"train": "train.csv", "dev": "dev.csv"},
    "classes": {0: "nometaphor", 1: "metaphor"},
}

# One config.json per output directory: the full dataset first, then the toy one.
# The names are "metaphor" / "metaphor_toy"; they previously read "jokes" /
# "formality_toy", left over from the earlier sections. The toy description
# also gains the "; Toy dataset" suffix used by the other toy configs.
for target_dir, dataset_name, dataset_description in (
    (output_dir, "metaphor", "Derived from VUA"),
    (output_dir_toy, "metaphor_toy", "Derived from VUA; Toy dataset"),
):
    config = {
        "name": dataset_name,
        "description": dataset_description,
        **shared_entries,
    }
    with open(f"{target_dir}/config.json", "w") as fob:
        json.dump(config, fob)

# Filter Data

In [39]:
import pandas as pd
import os, json, shutil
from collections import Counter, OrderedDict

In [32]:
## formality
data_input = '../data/processed/formality'
data_output = '../data/processed_filtered/formality'

# Both splits are header-less CSVs, so their columns are addressed by position.
formality_t, formality_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [33]:
# Character frequencies over all train + dev sentences, most common first.
# NOTE: the two joined strings are concatenated directly, so no space is
# counted between the last train sentence and the first dev sentence.
char_counts = Counter(' '.join(formality_t[0]) + ' '.join(formality_d[0]))
p = OrderedDict(char_counts.most_common())
print(p)
# Same characters without the counts, for copy-pasting into a marker list.
print('\n', list(p))

OrderedDict([(' ', 2280015), ('e', 982719), ('t', 770665), ('o', 756069), ('a', 641948), ('i', 572556), ('n', 558029), ('s', 509686), ('h', 457329), ('r', 428444), ('l', 360810), ('u', 313756), ('d', 292019), ('y', 262229), ('m', 221332), ('.', 220916), ('w', 188720), ('g', 176637), ('c', 170854), ('f', 155649), ('b', 134281), ('p', 118800), ('k', 110853), ('I', 103104), ('v', 95533), (',', 84555), ('T', 47063), ("'", 45414), ('!', 40085), ('E', 33123), ('A', 32658), ('S', 30865), ('O', 29809), ('H', 26929), ('?', 26852), ('N', 24120), ('Y', 23070), ('D', 20951), ('L', 19131), ('M', 19070), ('W', 18745), ('j', 18628), ('R', 17674), ('B', 16991), ('C', 14101), ('G', 13595), ('x', 13413), ('U', 11484), ('P', 11439), ('F', 9829), ('-', 9403), ('"', 9262), ('q', 7274), ('z', 7037), ('K', 6774), (')', 6382), ('1', 6195), ('J', 6089), ('0', 5422), ('V', 5253), ('2', 5204), (':', 4589), ('(', 4524), ('3', 2791), ('5', 2565), ('*', 2404), ('4', 2355), (';', 2335), ('9', 2076), ('/', 1909), ('8

In [34]:
# Substrings whose presence flags a sentence as "weird".
formality_weird_markers = [
    '_', '@', 'Q', '=', '#', '>', '^', '%', '~', '`', ']', '+', '[', '<',
    '’', '¨', '´', '}', '{', '“', '—', '…', '”', 'é', '|', '¡', 'Ü', '–',
    'ı', '♥', '¿', 'ñ', '·', '\\', '\u200b', 'ü', '¢', 'ö', '§', 'á', 'è',
    '˝', '嘉', '義', '人', '因', '為', '綠', '豆', '加', '薏', '仁', '©', '™',
    '‘', '☺', 'ŕ', 'ā', 'ə', '®', 'š', '†', 'Æ', '恭', '喜', '發', '財', '♡',
    '½', 'í', 'ƒ', 'Ä', 'ù', 'س', 'ا', 'م', 'ه', 'º', '¹', 'œ', '•', 'ó',
    '►', 'λ', '◄', 'à', '»', 'ĕ', 'û', 'ï',
    '(', ')', ':', '--', '....', '!!!', 'www', 'http',
]

# Add two helper columns to each split:
#   weird  - True when the sentence contains at least one marker
#   tokens - number of pieces after splitting on single spaces
for frame in (formality_t, formality_d):
    frame['weird'] = frame[0].apply(
        lambda text: any(marker in text for marker in formality_weird_markers)
    )
    frame['tokens'] = frame[0].apply(lambda text: len(text.split(' ')))


In [35]:
# Shape before vs. after dropping flagged sentences and those outside 5-30 tokens.
for frame in (formality_t, formality_d):
    keep = (~frame['weird']) & (frame['tokens'] >= 5) & (frame['tokens'] <= 30)
    print(frame.shape, frame[keep].shape)

(169735, 4) (154938, 4)
(42434, 4) (38709, 4)


In [36]:
SEED = 42  # fixed so the same rows are kept on every run

# Drop flagged sentences and those outside 5-30 tokens; keep columns 0 and 1 only.
# NOTE: not re-runnable as-is - the helper columns are gone after the first run.
keep_t = (~formality_t['weird']) & (formality_t['tokens'] >= 5) & (formality_t['tokens'] <= 30)
keep_d = (~formality_d['weird']) & (formality_d['tokens'] >= 5) & (formality_d['tokens'] <= 30)
formality_t = formality_t[keep_t].filter([0, 1])
formality_d = formality_d[keep_d].filter([0, 1])

# Shuffle with a fixed seed, because the shuffle decides which rows survive
# the truncation below.
formality_t = formality_t.sample(frac=1, random_state=SEED).reset_index(drop=True)
formality_d = formality_d.sample(frac=1, random_state=SEED).reset_index(drop=True)

# Cap the splits at 50000 train / 12500 dev rows.
formality_t = formality_t.head(50000)
formality_d = formality_d.head(12500)


In [37]:
# Training rows per value of column 1 (the label column of the header-less CSV)
# after filtering and truncation.
formality_t.groupby(1).size()

1
0    23560
1    26440
dtype: int64

In [40]:
# Write the filtered splits and carry the dataset config over unchanged.
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(data_output, exist_ok=True)
formality_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
formality_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
# Last expression: the destination path returned by shutil.copy is the cell output.
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/formality/config.json'

In [52]:
## Arousal
data_input = '../data/processed/arousal'
data_output = '../data/processed_filtered/arousal'

# Both splits are header-less CSVs, so their columns are addressed by position.
arousal_t, arousal_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [53]:
# Character frequencies over all train + dev sentences, most common first.
# NOTE: the two joined strings are concatenated directly, so no space is
# counted between the last train sentence and the first dev sentence.
char_counts = Counter(' '.join(arousal_t[0]) + ' '.join(arousal_d[0]))
p = OrderedDict(char_counts.most_common())
print(p)
# Same characters without the counts, for copy-pasting into a marker list.
print('\n', list(p))

OrderedDict([(' ', 144718), ('e', 79546), ('t', 57680), ('a', 53839), ('o', 52043), ('n', 46969), ('i', 46767), ('s', 43431), ('r', 40361), ('h', 31456), ('l', 28804), ('d', 24916), ('u', 19126), ('c', 18819), ('m', 15939), ('f', 14100), ('g', 13587), ('p', 13325), ('y', 11882), ('w', 11653), ('b', 9178), (',', 8470), ('.', 7980), ('v', 6690), ('k', 5292), ('I', 3115), ('T', 2694), ('A', 2191), ('S', 2020), ('-', 1960), ("'", 1796), ('C', 1693), ('"', 1616), ('0', 1516), ('M', 1386), ('x', 1325), ('1', 1148), ('H', 1070), ('N', 1056), ('W', 1051), ('B', 1012), ('P', 959), ('’', 881), ('E', 854), ('R', 848), ('F', 831), ('j', 767), ('2', 757), ('D', 720), ('9', 696), ('L', 691), ('z', 672), ('q', 638), (':', 638), ('O', 637), ('”', 605), ('“', 601), ('G', 577), ('5', 551), ('Y', 491), ('K', 470), (')', 470), ('3', 469), ('(', 465), ('U', 455), ('J', 412), ('?', 380), ('8', 344), ('4', 340), ('V', 320), ('7', 315), ('6', 307), ('$', 301), (';', 276), ('—', 153), ('!', 140), ('/', 119), (

In [54]:
# Substrings whose presence flags a sentence as "weird".
# (Repeated entries are kept as in the original list; they do not change the result.)
arousal_weird_markers = [
    '\u2002', '–', '%', '♭', 'Q', '…', '\xad', '♯', '_', '[', ']', 'X', 'ó',
    '#', '‘', '·', '=', '+', '\xa0', '|', '`', 'é', '<', '>', 'ç', '®', '*',
    'ñ', 'ã', '@', 'è', 'õ', '♮', 'ü', 'í',
    '(', ')', ':', '--', '....', '!!!', 'www', 'http', '_', ':', "”", "“",
]

# Add two helper columns to each split:
#   weird  - True when the sentence contains at least one marker
#   tokens - number of pieces after splitting on single spaces
for frame in (arousal_t, arousal_d):
    frame['weird'] = frame[0].apply(
        lambda text: any(marker in text for marker in arousal_weird_markers)
    )
    frame['tokens'] = frame[0].apply(lambda text: len(text.split(' ')))


In [55]:
# Shape before vs. after dropping flagged sentences and those outside 5-40 tokens.
for frame in (arousal_t, arousal_d):
    keep = (~frame['weird']) & (frame['tokens'] >= 5) & (frame['tokens'] <= 40)
    print(frame.shape, frame[keep].shape)

(6901, 4) (5178, 4)
(1726, 4) (1320, 4)


In [56]:
# Training rows per value of column 1 (presumably the label - confirm against
# the cell that produced the processed CSV). arousal_t is still unfiltered here.
arousal_t.groupby(1).size()

1
0    2422
1    4479
dtype: int64

In [57]:
# Drop flagged sentences and those outside 5-40 tokens; keep columns 0 and 1 only.
# NOTE: not re-runnable as-is - the helper columns are gone after the first run.
keep_t = (~arousal_t['weird']) & (arousal_t['tokens'] >= 5) & (arousal_t['tokens'] <= 40)
keep_d = (~arousal_d['weird']) & (arousal_d['tokens'] >= 5) & (arousal_d['tokens'] <= 40)
arousal_t = arousal_t[keep_t].filter([0, 1])
arousal_d = arousal_d[keep_d].filter([0, 1])

# Write the filtered splits and carry the dataset config over unchanged.
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(data_output, exist_ok=True)
arousal_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
arousal_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
# Last expression: the destination path returned by shutil.copy is the cell output.
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/arousal/config.json'

In [71]:
## Emo
data_input = '../data/processed/emo'
data_output = '../data/processed_filtered/emo'

# Both splits are header-less CSVs, so their columns are addressed by position.
emo_t, emo_d = (
    pd.read_csv(f'{data_input}/{split}.csv', header=None)
    for split in ('train', 'dev')
)

In [72]:
# Character frequencies over all train + dev sentences, most common first.
# NOTE: the two joined strings are concatenated directly, so no space is
# counted between the last train sentence and the first dev sentence.
char_counts = Counter(' '.join(emo_t[0]) + ' '.join(emo_d[0]))
p = OrderedDict(char_counts.most_common())
print(p)
# Same characters without the counts, for copy-pasting into a marker list.
print('\n', list(p))

OrderedDict([(' ', 313640), ('e', 119655), ('o', 97454), ('t', 94683), ('a', 87416), ('i', 68036), ('n', 67804), ('s', 61513), ('r', 56078), ('h', 53958), ('l', 48032), ('u', 37771), ('d', 37031), ('y', 35929), ('.', 31654), ('m', 31511), ('g', 27342), ('c', 23940), ('w', 23914), ('f', 19013), ('p', 17490), ('k', 16851), ('b', 16071), ('I', 14849), ('v', 11605), (',', 9914), ("'", 9734), ('!', 9656), ('T', 6468), ('@', 4690), ('S', 3578), ('?', 3516), ('W', 3406), ('H', 3190), ('A', 3087), ('O', 2970), ('M', 2844), ('Y', 2767), ('j', 2736), ('x', 2387), ('L', 2042), ('’', 1973), ('D', 1930), ('N', 1921), ('B', 1907), ('G', 1905), ('C', 1834), ('E', 1704), ('/', 1500), ('-', 1358), ('z', 1292), ('P', 1256), ('R', 1202), (';', 1180), ('q', 1050), ('F', 1049), ('J', 1021), ('&', 993), ('0', 976), ('1', 956), (':', 875), ('2', 865), ('K', 805), ('3', 648), ('U', 639), ('V', 596), ('4', 555), ('_', 555), ('5', 435), ('6', 399), (')', 384), ('8', 366), ('7', 344), ('(', 330), ('9', 321), ('*

In [73]:
# Substrings whose presence flags a sentence as "weird".
# (Repeated entries are kept as in the original list; they do not change the result.)
emo_weird_markers = [
    '\u2002', '–', '%', '♭', 'Q', '…', '\xad', '♯', '_', '[', ']', 'X', 'ó',
    '#', '‘', '·', '=', '+', '\xa0', '|', '`', 'é', '<', '>', 'ç', '®', '*',
    'ñ', 'ã', '@', 'è', 'õ', '♮', 'ü', 'í',
    '(', ')', ':', '--', '....', '!!!', 'www', 'http', '_', ':', "”", "“",
    'ï', '¿', '½', '$', '~', '+', ']', '%', '[', '^', '`', '”', '“', '|',
    '‘', '\\', '—', '–', '。', 'Â', '¡', '′', '{', '}',
]

# Add two helper columns to each split:
#   weird  - True when the sentence contains at least one marker
#   tokens - number of pieces after splitting on single spaces
for frame in (emo_t, emo_d):
    frame['weird'] = frame[0].apply(
        lambda text: any(marker in text for marker in emo_weird_markers)
    )
    frame['tokens'] = frame[0].apply(lambda text: len(text.split(' ')))


In [74]:
# Shape before vs. after dropping flagged sentences and those outside 5-40 tokens.
for frame in (emo_t, emo_d):
    keep = (~frame['weird']) & (frame['tokens'] >= 5) & (frame['tokens'] <= 40)
    print(frame.shape, frame[keep].shape)

(17617, 4) (12708, 4)
(4405, 4) (3230, 4)


In [75]:
# Drop flagged sentences and those outside 5-40 tokens; keep columns 0 and 1 only.
emo_keep_t = (~emo_t['weird']) & (emo_t['tokens'] >= 5) & (emo_t['tokens'] <= 40)
emo_keep_d = (~emo_d['weird']) & (emo_d['tokens'] >= 5) & (emo_d['tokens'] <= 40)
emo_t = emo_t[emo_keep_t].filter([0, 1])
emo_d = emo_d[emo_keep_d].filter([0, 1])

In [76]:
# Training rows per value of column 1 (presumably the label - confirm against
# the cell that produced the processed CSV), counted after filtering.
emo_t.groupby(1).size()

1
0     2611
1    10097
dtype: int64

In [78]:
# Write the filtered splits and carry the dataset config over unchanged.
# exist_ok avoids the check-then-create race of os.path.exists + makedirs.
os.makedirs(data_output, exist_ok=True)
emo_t.to_csv(f'{data_output}/train.csv', header=False, index=False)
emo_d.to_csv(f'{data_output}/dev.csv', header=False, index=False)
# Last expression: the destination path returned by shutil.copy is the cell output.
shutil.copy(f'{data_input}/config.json', f'{data_output}/config.json')

'../data/processed_filtered/emo/config.json'