<h1>Loading Dataset & Creating Train / Val / Test Samples for Next Sequence of Labels Prediction</h1>
We load the dataset and split every conversation with a sliding context window of 3 utterances. Each sample has the format: [UTTERANCE 1 UTTERANCE 2 UTTERANCE 3] [Sequence of Labels associated with UTTERANCE 4]


<i>vers. 10/2023</i>

In [None]:
from datasets import load_dataset
import pandas as pd
from tqdm import tqdm

In [None]:
# IMPORT DATASET
# Load the DailyDialog corpus through the `datasets` library. The returned
# object is indexed by split name ('train', 'validation', 'test') further down.
dataset = load_dataset('daily_dialog')

In [None]:
# MAPPING FUNCTIONS

def get_act(label):
    """Map a dialogue-act id to its name.

    Ids 0-4 map to '__dummy__', 'inform', 'question', 'directive' and
    'commissive'. Any other value is reported on stdout and the function
    falls through, implicitly returning None.
    """
    act_names = ('__dummy__', 'inform', 'question', 'directive', 'commissive')

    # The position in the tuple is the act id. Comparing with `==` means any
    # value equal to an id matches, not only plain ints.
    for act_id, act_name in enumerate(act_names):
        if label == act_id:
            return act_name

    print('wtf act : ', label)



def get_emo(label):
    """Map an emotion id to its name.

    Ids 0-6 map to 'neutral', 'anger', 'disgust', 'fear', 'happiness',
    'sadness' and 'surprise'. Any other value is reported on stdout and the
    function falls through, implicitly returning None.
    """
    emotion_names = (
        'neutral',
        'anger',
        'disgust',
        'fear',
        'happiness',
        'sadness',
        'surprise',
    )

    # The position in the tuple is the emotion id. Comparing with `==` means
    # any value equal to an id matches, not only plain ints.
    for emotion_id, emotion_name in enumerate(emotion_names):
        if label == emotion_id:
            return emotion_name

    print('wtf emotion : ', label)

In [None]:
# EXTRACT CONVERSATIONAL SAMPLES WITH A WINDOW OF 3: 3 SPEAKER TURNS + LABELS OF THE NEXT TURN (THE RESPONSE)
# SPEAKER TURNS ARE SEPARATED WITH THE <SEP> TAG. EX: U1<SEP>U2<SEP>U3
# LABELS IN A SEQUENCE ARE JOINED WITH THE + SIGN. EX: ACT+EMOTION

def extract_dataset(ds, window=3):
    """Build (context, next-turn labels) samples from a dialogue split.

    For every dialogue, each turn at position ``j >= window`` yields one
    sample: the ``window`` preceding turns joined with ``<SEP>`` as the input
    text, and the labels of turn ``j`` as the target.

    Args:
        ds: Split exposing 'dialog', 'act' and 'emotion' columns, where each
            row holds one value per turn of the dialogue.
        window: Number of preceding turns used as context.

    Returns:
        A tuple ``(dialogues, annots)`` of equal-length lists. Each annotation
        is the act name, followed by ``+`` and the emotion name when the
        emotion id of the turn is not 0.
    """
    dialogues = []
    annots = []

    # Read each column once. Indexing ds['dialog'][i] per row repeats the
    # column access on every iteration, which may rebuild the whole column
    # each time on Hugging Face datasets.
    rows = zip(ds['dialog'], ds['act'], ds['emotion'])

    for dialog, acts, emotion in tqdm(rows, total=len(ds)):
        for j in range(window, len(dialog)):
            dialogues.append('<SEP>'.join(dialog[j - window:j]))

            label = get_act(acts[j])
            # Emotion id 0 is left out of the label sequence; only other
            # emotions are appended after the act.
            if emotion[j] != 0:
                label = '+'.join([label, get_emo(emotion[j])])
            annots.append(label)

    return dialogues, annots

In [None]:
# Label names used as one-hot output columns: the dialogue-act names first,
# followed by the emotion names.
cols = [
    'inform', 'question', 'directive', 'commissive',
    'neutral', 'anger', 'disgust', 'fear', 'happiness', 'sadness', 'surprise',
]

In [None]:
# CONVERT SAMPLE LABELS TO ONE-HOT ENCODING FOR MULTILABEL CLASSIFICATION

def one_hot_encode(ds):
    """Turn '+'-joined label strings into one 0/1 column per label name.

    Args:
        ds: Table exposing a 'text' column and a 'label' column, where each
            label is a string of label names joined with ``+``.

    Returns:
        A dict mapping every name in the module-level ``cols`` list to a list
        of 0/1 flags (one per row), plus an 'input' key holding the texts.

    The labels of the first row and the dict keys are printed as a quick
    sanity check.
    """
    dialogs = []
    ohe_dic = {x: [] for x in cols}

    # Iterate rows by position. A ds['text'][i] lookup is label-based on a
    # pandas Series and fails when the frame has no default 0..n-1 index.
    rows = zip(ds['text'], ds['label'])

    for i, (text, label) in enumerate(tqdm(rows, total=len(ds))):
        dialogs.append(text)
        labels = label.split('+')

        if i == 0:
            print(labels)
            print(ohe_dic.keys())

        for name, flags in ohe_dic.items():
            flags.append(1 if name in labels else 0)

    # Added after the loop so 'input' is never treated as a label column.
    ohe_dic['input'] = dialogs

    return ohe_dic

In [None]:
# BUILD SAMPLES FOR THE TRAIN SPLIT
# 1) Slide the context window over every train dialogue and save the
#    resulting text / label pairs as CSV.
# 2) One-hot encode the labels and save that table to a second CSV.

dialogues, annots = extract_dataset(dataset['train'])
df = pd.DataFrame({'text': dialogues, "label": annots})
df.to_csv('daily_dialog_train_next_window3.csv', encoding = 'UTF-8', index = False)

ohe_dic = one_hot_encode(df)
df_ohe = pd.DataFrame(ohe_dic)
df_ohe.to_csv('daily_dialog_train_next_ohe_window3.csv', encoding = 'UTF-8', index = False)

In [None]:
# BUILD SAMPLES FOR THE VALIDATION SPLIT
# 1) Slide the context window over every validation dialogue and save the
#    resulting text / label pairs as CSV.
# 2) One-hot encode the labels and save that table to a second CSV.

dialogues, annots = extract_dataset(dataset['validation'])
df = pd.DataFrame({'text': dialogues, "label": annots})
df.to_csv('daily_dialog_val_next_window3.csv', encoding = 'UTF-8', index = False)

ohe_dic = one_hot_encode(df)
df_ohe = pd.DataFrame(ohe_dic)
df_ohe.to_csv('daily_dialog_val_next_ohe_window3.csv', encoding = 'UTF-8', index = False)

In [None]:
# BUILD SAMPLES FOR THE TEST SPLIT
# 1) Slide the context window over every test dialogue and save the
#    resulting text / label pairs as CSV.
# 2) One-hot encode the labels and save that table to a second CSV.

dialogues, annots = extract_dataset(dataset['test'])
df = pd.DataFrame({'text': dialogues, "label": annots})
df.to_csv('daily_dialog_test_next_window3.csv', encoding = 'UTF-8', index = False)

ohe_dic = one_hot_encode(df)
df_ohe = pd.DataFrame(ohe_dic)
df_ohe.to_csv('daily_dialog_test_next_ohe_window3.csv', encoding = 'UTF-8', index = False)