In [10]:
from transformers import pipeline
import logging
logging.getLogger("transformers").setLevel(logging.WARNING)

def classify_text(text, classifier):
    predicted_label = classifier(text)

    # # Sort the list of dictionaries based on the 'score' in descending order
    # sorted_data = sorted(predicted_label[0], key=lambda x: x['score'], reverse=True)
    # return sorted_data[0]['label']
    return predicted_label[0]['label']

model = pipeline("text-classification", model="helper_models/emo_clf/best-model", return_all_scores=False)



### Empathetic Dialogue Datasets

In [2]:
import pandas as pd

# read data
train_data = pd.read_csv('train_emo_prompt.tsv', sep='\t')
eval_data = pd.read_csv('valid_emo_prompt.tsv', sep='\t')

# combine data
data = pd.concat([train_data, eval_data], axis=0)

# change the header
header = {'context': 'label', 'prompt': 'text'}
data = data.rename(columns=header)
len(data)

20408

In [3]:
# data

In [4]:
plutchik = ['anticipating', 'joy', 'trust', 'fear', 'surprise', 'sad', 'disgust', 'anger']

def filter_data(df, label):
    # Define a dictionary to map the values
    mapping = {
        'anxious': 'anticipating',
        'joyful': 'joy',
        'content': 'joy',
        'trusting': 'trust',
        'afraid': 'fear',
        'terrified': 'fear',
        'surprised': 'surprise',
        'lonely': 'sad',
        'devastated': 'sad',
        'disgusted': 'disgust',
        'angry': 'anger',
        'annoyed': 'anger',
        'furious': 'anger'
    }
    # Replace values in the 'context' column using the dictionary
    df['label']  = df['label'].replace(mapping)
   
   # get the data with plutchuik label
    p_df = df[df['label'].isin(label)] 

    # get the data with no plutchuik label
    np_df = df[~df['label'].isin(label)] 

    return p_df, np_df

p_data, np_data = filter_data(data, plutchik)
print(len(p_data))
print(len(np_data))

10089
10319


In [5]:
text = []
old_label = []
new_label = []

for i in range(len(np_data)):
    # classify the text to get new label
    label = classify_text(np_data.iloc[i]['text'], model)
    text.append(np_data.iloc[i]['text'])
    old_label.append(np_data.iloc[i]['label'])
    new_label.append(label)

# combine in one dataframe
new_p_data = pd.DataFrame({
    'label': new_label,
    'text': text,
})


In [6]:
# combine with p_data
new_data = pd.concat([p_data, new_p_data], axis=0)
len(new_data)

20408

In [7]:
# save to file
new_data.to_csv("emo_plutchik_label.tsv", "\t", index=False)

### DailyDialog Datasets

In [2]:
def clean_str(str):
    str = str.replace(" . ", ". ")
    str = str.replace(" , ", ", ")
    str = str.replace(" ? ", "? ")
    str = str.replace(" ' ", "'")
    str = str.replace(" ’ ", "'")
    return str

def get_emotion(value):
    mapping = {'0': 'other', '1': 'anger', '2': 'disgust', '3': 'fear', '4': 'happiness', '5': 'sad', '6': 'surprise'}
    return mapping.get(str(value), 'Invalid')


In [4]:
import pandas as pd

# Open file
path = '../datasets/raw/dailydialog/'
text = open(path+'dialogues_text.txt', 'r')
emo = open(path+'dialogues_emotion.txt', 'r')

# read data
data_text = []
for line in text:
    data_text.append(line)

data_emo = []
for line in emo:
    data_emo.append(line)

text = []
label = []

for i in range(len(data_text)):
    split_text = data_text[i].split("__eou__")
    split_emo= data_emo[i].split(" ")

    for j in range(len(split_text)-1):
        utt_label = get_emotion(split_emo[j])
        utt_text = clean_str(split_text[j])
        if utt_label == 'Invalid':
            print(f"Label: {utt_label} \nText: {utt_text}\n")
        else:
            # print(f"Label: {label} \nText: {text}\n")
            text.append(utt_text)
            label.append(utt_label)

dd_combine = pd.DataFrame({
    'label': label,
    'text': text,
})

dd_combine.head()

Label: Invalid 
Text:  OK, let's go and ask. 



Unnamed: 0,label,text
0,disgust,The kitchen stinks.
1,other,I'll throw out the garbage.
2,happiness,"So Dick, how about getting some coffee for ton..."
3,disgust,Coffee? I don't honestly like that kind of st...
4,other,"Come on, you can at least try a little, besid..."


Plutchik's emotion wheel define happiness is derive from Joy and Trust.<br>
From Daily Dialog datasets, except 'happiness' and 'other' is included in plutchik's primary emotion. Therefore, we will re-labelling only for data with those label with classifier we build before.

In [12]:
# get the data with no plutchuik label
np_dd = dd_combine[~dd_combine['label'].isin(plutchik)] 
np_dd.label.unique()

np_text = []
new_label = []

for i in range(len(np_dd)):
    # classify the text to get new label
    label = classify_text(np_dd.iloc[i]['text'], model)
    np_text.append(np_dd.iloc[i]['text'])
    # old_label.append(np_dd.iloc[i]['label'])
    new_label.append(label)

# combine in one dataframe
new_dd_np = pd.DataFrame({
    'label': new_label,
    'text': np_text,
})
new_dd_np.head()

Unnamed: 0,label,text
0,disgust,I'll throw out the garbage.
1,disgust,"So Dick, how about getting some coffee for ton..."
2,joy,"Come on, you can at least try a little, besid..."
3,sad,"Not for me, Dick."
4,anger,Are things still going badly with your housegu...


In [13]:
len(np_text)

98457

In [14]:
# get data with plutchik label
p_dd = dd_combine[dd_combine['label'].isin(plutchik)] 

# combine with classified non plutchik data
new_dd_data = pd.concat([p_dd, new_dd_np], axis=0)
len(new_dd_data)

102979

In [8]:
# save to file
new_dd_data.to_csv("dd_plutchik_label.tsv", "\t", index=False)

### Topical-Chat

In [1]:
from transformers import pipeline
import logging
logging.getLogger("transformers").setLevel(logging.WARNING)

def classify_pbi(text, classifier):
    predicted_label = classifier(text)
    return predicted_label[0]['label']

model = pipeline("text-classification", model="helper_models/bg_clf/best-model", return_all_scores=False)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
text = 'Yeah, their services are good.'
classify_pbi(text, model)

'No'

In [4]:
# Access the data
import json
import pandas as pd

json_path = "../datasets/raw/topicalchat/"
with open(json_path+"train.json", "r") as json_files:
    data = json.load(json_files)

# Flatten the nested structure
flattened_data = []
for key, value in data.items():
    for content_item in value['content']:
        flattened_data.append({
            'conversation_id': key,
            'article_url': value['article_url'],
            'config': value['config'],
            'message': content_item['message'],
            'agent': content_item['agent'],
            'sentiment': content_item['sentiment'],
            'knowledge_source': content_item['knowledge_source'],
            'turn_rating': content_item['turn_rating']
        })

# Create a DataFrame
df = pd.DataFrame(flattened_data)

In [6]:
df_explode = df.explode('knowledge_source')
unique_values = df_explode['knowledge_source'].unique()
print(unique_values)

['FS1' 'FS2' 'FS3' 'Personal Knowledge' 'AS1' 'AS2' 'AS4' 'AS3']


In [40]:
pk = df_explode[df_explode['knowledge_source'] == 'Personal Knowledge']
fs = df_explode[df_explode['knowledge_source'] == 'FS2']
df_concate = pd.concat([pk, fs])
len(df_concate)

100749

In [41]:
col_to_remove = ['conversation_id', 'article_url', 'config', 'agent', 'sentiment', 'turn_rating']
df_shrink = df_concate.drop(columns= col_to_remove)

In [44]:
# labelling the prepared data
pb_text = []
pb_label = []

for i in range(len(df_shrink)):
    # classify the text to get new label
    label = classify_pbi(df_shrink.iloc[i]['message'], model)
    pb_text.append(df_shrink.iloc[i]['message'])
    pb_label.append(label)

In [45]:
# combine in one dataframe
tc_pb_data = pd.DataFrame({
    'label': pb_label,
    'text': pb_text,
})

print(f"Data size:{len(tc_pb_data)}")

# remove duplicate
tc_pb_clean = tc_pb_data.drop_duplicates()
print(f"    Clean data size:{len(tc_pb_clean)}\n")

# check each label data size
t = tc_pb_clean[tc_pb_clean['label']=='Yes']
f = tc_pb_clean[tc_pb_clean['label']=='No']

print(f"Yes label: {len(t)}")
print(f"No label: {len(f)}")

Data size:100749
    Clean data size:90585

Yes label: 24707
No label: 65878


In [47]:
fs1 = df_explode[df_explode['knowledge_source'] == 'FS1']
fs3 = df_explode[df_explode['knowledge_source'] == 'FS3']
fs_13 = pd.concat([fs1, fs3])
df_fs = fs_13.drop(columns= col_to_remove)

In [51]:
from tqdm import tqdm

# labelling the prepared data
pb2_text = []
pb2_label = []

for i in range(len(df_fs)):
    # classify the text to get new label
    label = classify_pbi(df_fs.iloc[i]['message'], model)
    pb2_text.append(df_fs.iloc[i]['message'])
    pb2_label.append(label)

In [52]:
# combine in one dataframe
tc_pb_data2 = pd.DataFrame({
    'label': pb2_label,
    'text': pb2_text,
})

print(f"Data size:{len(tc_pb_data2)}")

# remove duplicate
tc_pb_clean2 = tc_pb_data2.drop_duplicates()
print(f"    Clean data size:{len(tc_pb_clean)}\n")

# check each label data size
t2 = tc_pb_clean2[tc_pb_clean2['label']=='Yes']
f2 = tc_pb_clean2[tc_pb_clean2['label']=='No']

print(f"Yes label: {len(t2)}")
print(f"No label: {len(f2)}")

Data size:101177
    Clean data size:90585

Yes label: 25286
No label: 73598


In [54]:
yes_label = pd.concat([t, t2])
no_label = pd.concat([f, f2])
print(f"yes total: {len(yes_label)}")
print(f"no total: {len(no_label)}")

yes total: 49993
no total: 139476


In [57]:
split_no_label = no_label.sample(frac=0.35, random_state=42)
split_no_label.reset_index(drop=True, inplace=True)
len(split_no_label)

48817

In [58]:
final_data = pd.concat([yes_label, split_no_label])
final_data

Unnamed: 0,label,text
2,Yes,I love to dance a lot. How about you?
12,Yes,I would love to go there. I used to like readi...
21,Yes,"I used to in my childhood but not any more, I ..."
22,Yes,On Paper and yes I do recall seeing Super-hero...
23,Yes,"Right me neither, there were so many good choi..."
...,...,...
48812,No,I would hope that it would force my NO Saints ...
48813,No,Wow. In Japan they love baseball too. Many fan...
48814,No,The one in Canada is the Toronto Raptors right...
48815,No,Ouch! That's all I'll say. I would have brok...


In [59]:
len(final_data)

98810

In [60]:
# save to file
final_data.to_csv("tc_personal_background.tsv", "\t", index=False)

In [32]:
# personal_background data overview
data = pd.read_csv('personal_background.tsv', sep='\t')
data["label"] = data["label"].replace({True: 'Yes', False: 'No'})
t = data[data['label']=='Yes']
f = data[data['label']=='No']

print(f"Yes label: {len(t)}")
print(f"No label: {len(f)}")

Yes label: 3227
No label: 3789
