# Reformat the data into one-hot format
**Author**: Paokuan Chin (lkkcpaul@gmail.com)

**Achievements**: The reformatted (one-hot, "wide") tables are saved as `train_wide.csv`, `test_wide.csv`, and `val_wide.csv` in the notebook's working directory.

In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer


In [18]:
# Load the test split: a header-less, tab-separated file with three fields.
test_df = pd.read_csv(
    '../data/test.tsv',
    sep='\t',
    header=None,
    names=['Text', 'Classes', 'ID'],
)
test_df.head()

Unnamed: 0,Text,Classes,ID
0,I’m really sorry about your situation :( Altho...,25,eecwqtt
1,It's wonderful because it's awful. At not with.,0,ed5f85d
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1
4,They got bored from haunting earth for thousan...,27,eem5uti


In [16]:
# Build both lookup tables from the emotion list: each line of the file holds
# one emotion name, and its 0-based line position is its class index.
with open('../data/emotions.txt', 'r') as emotions_file:
    idx2lab = dict(enumerate(row.strip() for row in emotions_file))
# Invert the index -> name table to get name -> index.
lab2idx = {name: position for position, name in idx2lab.items()}
lab2idx

{'admiration': 0,
 'amusement': 1,
 'anger': 2,
 'annoyance': 3,
 'approval': 4,
 'caring': 5,
 'confusion': 6,
 'curiosity': 7,
 'desire': 8,
 'disappointment': 9,
 'disapproval': 10,
 'disgust': 11,
 'embarrassment': 12,
 'excitement': 13,
 'fear': 14,
 'gratitude': 15,
 'grief': 16,
 'joy': 17,
 'love': 18,
 'nervousness': 19,
 'optimism': 20,
 'pride': 21,
 'realization': 22,
 'relief': 23,
 'remorse': 24,
 'sadness': 25,
 'surprise': 26,
 'neutral': 27}

In [20]:
def idx_list_2_label_list(indices):
    """Convert comma-separated class indices into a list of emotion names.

    Args:
        indices: Class indices separated by commas, e.g. ``"1,25"``. A
            non-string value (for example an integer, which is what pandas
            yields when it infers a numeric dtype for a column holding only
            single-index rows) is converted with ``str()`` before splitting.

    Returns:
        A list with one emotion name per index, looked up in ``idx2lab``,
        in the same order as the indices.

    Raises:
        ValueError: If a comma-separated token is not a valid integer.
        KeyError: If an index has no entry in ``idx2lab``.
    """
    # str() keeps string input unchanged and makes integer cells splittable,
    # so the function no longer fails with AttributeError on numeric columns.
    return [idx2lab[int(i)] for i in str(indices).split(',')]
idx_list_2_label_list("1,25")

['amusement', 'sadness']

In [22]:
# Add a 'Labels' column holding the emotion names decoded from 'Classes'.
test_df = test_df.assign(
    Labels=test_df['Classes'].apply(idx_list_2_label_list)
)
test_df.head(10)

Unnamed: 0,Text,Classes,ID,Labels
0,I’m really sorry about your situation :( Altho...,25,eecwqtt,[sadness]
1,It's wonderful because it's awful. At not with.,0,ed5f85d,[admiration]
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3,[excitement]
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1,[gratitude]
4,They got bored from haunting earth for thousan...,27,eem5uti,[neutral]
5,Thank you for asking questions and recognizing...,15,ef2nq7i,[gratitude]
6,You’re welcome,15,efdbh17,[gratitude]
7,100%! Congrats on your job too!,15,ef0ec3b,[gratitude]
8,I’m sorry to hear that friend :(. It’s for the...,24,ee8utmi,[remorse]
9,"Girlfriend weak as well, that jump was pathetic.",25,eeni74k,[sadness]


In [23]:
# Pin the label vocabulary to the emotion list rather than learning it from
# the test split. Learned from data, an emotion that never occurs in the test
# split would get no column here (making the later column selection fail) and
# would be dropped when the same binarizer transforms the other splits.
# With `classes=` given, the output columns follow the order of `lab2idx`.
OHLabeler = MultiLabelBinarizer(classes=list(lab2idx))

# One indicator column per emotion, one row per example of the test split.
temp = pd.DataFrame(
    OHLabeler.fit_transform(test_df['Labels']),
    columns=OHLabeler.classes_,
)
temp.head()

In [35]:
# All emotion names except 'neutral', in the order they appear in lab2idx.
emos = [*lab2idx]
emos.remove('neutral')
# Append the indicator columns to the test frame, with 'neutral' placed last.
test_df = test_df.join(temp.loc[:, [*emos, 'neutral']])
test_df.head()

Unnamed: 0,Text,Classes,ID,Labels,admiration,amusement,anger,annoyance,approval,caring,...,love,nervousness,optimism,pride,realization,relief,remorse,sadness,surprise,neutral
0,I’m really sorry about your situation :( Altho...,25,eecwqtt,[sadness],0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
1,It's wonderful because it's awful. At not with.,0,ed5f85d,[admiration],1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,"Kings fan here, good luck to you guys! Will be...",13,een27c3,[excitement],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,"I didn't know that, thank you for teaching me ...",15,eelgwd1,[gratitude],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,They got bored from haunting earth for thousan...,27,eem5uti,[neutral],0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [36]:
# Write the widened test split to disk without the row index.
test_df.to_csv(path_or_buf='test_wide.csv', index=False)

In [38]:
# Training split: load the TSV and decode the class indices into emotion names.
train_df = pd.read_csv(
    '../data/train.tsv',
    sep='\t',
    header=None,
    names=['Text', 'Classes', 'ID'],
).assign(Labels=lambda frame: frame['Classes'].apply(idx_list_2_label_list))
# Encode with the existing OHLabeler so the columns match the test split.
temp = pd.DataFrame(
    OHLabeler.transform(train_df['Labels']),
    columns=OHLabeler.classes_,
)
# Append the indicator columns ('neutral' last) and save without the index.
train_df = train_df.join(temp[[*emos, 'neutral']])
train_df.to_csv('train_wide.csv', index=False)

In [39]:
# Validation split: load the TSV and decode the class indices into emotion names.
val_df = pd.read_csv(
    '../data/dev.tsv',
    sep='\t',
    header=None,
    names=['Text', 'Classes', 'ID'],
).assign(Labels=lambda frame: frame['Classes'].apply(idx_list_2_label_list))
# Encode with the existing OHLabeler so the columns match the other splits.
temp = pd.DataFrame(
    OHLabeler.transform(val_df['Labels']),
    columns=OHLabeler.classes_,
)
# Append the indicator columns ('neutral' last) and save without the index.
val_df = val_df.join(temp[[*emos, 'neutral']])
val_df.to_csv('val_wide.csv', index=False)