# Dataset prepration for Emory NLP Fine-tune

In [1]:
import os
import glob
import json
import random
from pathlib import Path
import pandas as pd # type: ignore

### Read data files

In [2]:
df_train = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/emory_nlp_FT/data_files/emorynlp_train_final.csv")
df_test = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/emory_nlp_FT/data_files/emorynlp_test_final.csv")
df_dev = pd.read_csv("/Utilisateurs/umushtaq/emotion_analysis_comics/emory_nlp_FT/data_files/emorynlp_dev_final.csv")

In [5]:
df_train

Unnamed: 0,Utterance,Speaker,Emotion,Scene_ID,Utterance_ID,Season,Episode,Start_Time,End_Time
0,"What you guys don't understand is, for us, kis...",['Monica Geller'],Joyful,1,1,1,2,00:00:02.877,00:00:07.548
1,"Yeah, right!.......Y'serious?",['Joey Tribbiani'],Neutral,1,2,1,2,00:00:04.504,00:00:07.548
2,"Oh, yeah!",['Phoebe Buffay'],Joyful,1,3,1,2,00:00:07.924,00:00:09.508
3,Everything you need to know is in that first k...,['Rachel Green'],Powerful,1,4,1,2,00:00:11.970,00:00:17.683
4,Absolutely.,['Monica Geller'],Powerful,1,5,1,2,00:00:14.139,00:00:15.097
...,...,...,...,...,...,...,...,...,...
7546,Fuggetaboutit. How you doin?,['Joey Tribbiani'],Peaceful,17,7,4,24,00:21:13.313,00:21:17.650
7547,Mmm.,['Felicity'],Peaceful,17,8,4,24,00:21:16.108,00:21:17.650
7548,"Oh, yeah.",['Joey Tribbiani'],Peaceful,17,9,4,24,00:21:20.404,00:21:24.573
7549,Dad!!,['Ross Geller'],Scared,17,14,4,24,00:21:22.614,00:21:24.573


In [6]:
df_train.Emotion.value_counts()

Emotion
Neutral     2485
Joyful      1677
Scared       941
Mad          785
Peaceful     638
Powerful     551
Sad          474
Name: count, dtype: int64

In [7]:
df_train.columns

Index(['Utterance', 'Speaker', 'Emotion', 'Scene_ID', 'Utterance_ID', 'Season',
       'Episode', 'Start_Time', 'End_Time'],
      dtype='object')

In [9]:
df_train[(df_train.Season == 1) & (df_train.Episode == 2)]


Unnamed: 0,Utterance,Speaker,Emotion,Scene_ID,Utterance_ID,Season,Episode,Start_Time,End_Time
0,"What you guys don't understand is, for us, kis...",['Monica Geller'],Joyful,1,1,1,2,00:00:02.877,00:00:07.548
1,"Yeah, right!.......Y'serious?",['Joey Tribbiani'],Neutral,1,2,1,2,00:00:04.504,00:00:07.548
2,"Oh, yeah!",['Phoebe Buffay'],Joyful,1,3,1,2,00:00:07.924,00:00:09.508
3,Everything you need to know is in that first k...,['Rachel Green'],Powerful,1,4,1,2,00:00:11.970,00:00:17.683
4,Absolutely.,['Monica Geller'],Powerful,1,5,1,2,00:00:14.139,00:00:15.097
...,...,...,...,...,...,...,...,...,...
120,No.,['Monica Geller'],Neutral,11,10,1,2,00:21:55.063,00:21:56.522
121,"You are, you're welling up.",['Ross Geller'],Powerful,11,11,1,2,00:21:56.732,00:22:00.443
122,Am not!,['Monica Geller'],Mad,11,12,1,2,00:21:58.191,00:22:00.443
123,You're gonna be an aunt.,['Ross Geller'],Joyful,11,13,1,2,00:22:00.652,00:22:03.779


### Paths

In [8]:
current_dir = Path.cwd()
dataset_dir = current_dir / "emotion_analysis_comics" / "emory_nlp_FT" / "datasets"

In [9]:
dataset_dir

PosixPath('/Utilisateurs/umushtaq/emotion_analysis_comics/emory_nlp_FT/datasets')

### Prepare prompts

In [10]:
# Formatting Fx
# Build questoin
# Build answer

In [11]:
def formatting_fct(instruction="", input="", output=""):
    
    prompt_d ={
            
        "instruction": f"""{instruction}""",
        "input": f"""{input}""",
        "output": f"""{output}"""
            
        }
    
    return prompt_d

In [20]:
def build_instruction():
    
    emotion_classes = ["Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", "Neutral"]
    formatted_classes = ", ".join([f'"{emotion}"' for emotion in emotion_classes])

    instruction = f"""### You are an expert in Emotion Analysis for the Friends TV show.

You are given an utterance from a Friends episode.

Your task is to classify the utterance with a single emotion class from these options: "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", or "Neutral".

Output Instructions:
1. Return ONLY a JSON object with a single emotion class
2. The JSON must have this exact structure: {{"emotion_class": "EMOTION"}}
3. EMOTION must be one of the specified emotion classes
4. Do NOT include any additional text or explanation
5. Identify only one applicable emotions only from the following classes:
   {formatted_classes}

"""    

    return instruction

In [21]:
def build_tagged_text(utterance):

    # tagged_utterances_l = []

    # for idx, utterance in enumerate(utterances_l):
        
    #     start_tag = "<UT" + str(idx+1) + ">"
    #     end_tag = "</UT" + str(idx+1) + ">"
    #     tagged_utterance = start_tag + utterance + end_tag
    #     tagged_utterances_l.append(tagged_utterance)
        
    # tagged_title_text = ''.join(tagged_utterances_l)
    
    #tagged_utterance = "<UT>" + utterance + "</UT>"
    question = f"""\n\nNow classify this utterance: {utterance}"""

    return question

In [22]:
# utterance_emotions_l = []
#     emotion_class_labels = ["Anger", "Disgust", "Fear", "Sadness", "Surprise", "Joy"]

#     if utterance_emotions == 'Neutral':
        
#         utterance_emotions_l.append([utterance_emotions])
    
#     else:
#         utterance_emotions = utterance_emotions.split("-")
       
#         #emotion_annotation_l = []

#         for idx, emotion_annotation in enumerate(utterance_emotions):

#             if '0' not in emotion_annotation:
         
#                 #emotion_annotation_l.append(emotion_class_labels[idx])
#                 utterance_emotions_l.append(emotion_annotation[:-1])
            
#         #title_emotions_l.append(emotion_annotation_l)

In [23]:
def build_answer(utterance_emotion):    
                

    return json.dumps({"emotion_class": [utterance_emotion]})

### Build Data Files

In [24]:
#df_train = df[df.split == 'TRAIN'].reset_index()

data_file_train = []

for index, _ in df_train.iterrows():
    
    i = index

    instruction = build_instruction()
    question = build_tagged_text(df_train.iloc[i].Utterance)
    answer = build_answer(df_train.iloc[i].Emotion)
    
    data_file_train.append( formatting_fct(instruction, question, answer) )

In [25]:
len(data_file_train)

7551

In [28]:
for i in range(5):
    print(data_file_train[i])

{'instruction': '### You are an expert in Emotion Analysis for the Friends TV show.\n\nYou are given an utterance from a Friends episode.\n\nYour task is to classify the utterance with a single emotion class from these options: "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", or "Neutral".\n\nOutput Instructions:\n1. Return ONLY a JSON object with a single emotion class\n2. The JSON must have this exact structure: {"emotion_class": "EMOTION"}\n3. EMOTION must be one of the specified emotion classes\n4. Do NOT include any additional text or explanation\n5. Identify only one applicable emotions only from the following classes:\n   "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", "Neutral"\n\n', 'input': "\n\nNow classify this utterance: What you guys don't understand is, for us, kissing is as important as any part of it.", 'output': '{"emotion_class": ["Joyful"]}'}
{'instruction': '### You are an expert in Emotion Analysis for the Friends TV show.\n\nYou are given an ut

In [29]:
#df_test = df[df.split == 'TEST'].reset_index()

data_file_test = []

for index, _ in df_test.iterrows():
    
    i = index

    instruction = build_instruction()
    question = build_tagged_text(df_test.iloc[i].Utterance)
    answer = build_answer(df_test.iloc[i].Emotion)
    
    data_file_test.append( formatting_fct(instruction, question, answer) )

In [30]:
len(data_file_test)

984

In [31]:
for i in range(5):
    print(data_file_test[i])

{'instruction': '### You are an expert in Emotion Analysis for the Friends TV show.\n\nYou are given an utterance from a Friends episode.\n\nYour task is to classify the utterance with a single emotion class from these options: "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", or "Neutral".\n\nOutput Instructions:\n1. Return ONLY a JSON object with a single emotion class\n2. The JSON must have this exact structure: {"emotion_class": "EMOTION"}\n3. EMOTION must be one of the specified emotion classes\n4. Do NOT include any additional text or explanation\n5. Identify only one applicable emotions only from the following classes:\n   "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", "Neutral"\n\n', 'input': "\n\nNow classify this utterance: I'm supposed to attach a brackety thing to the side things, using a bunch of these little worm guys. I have no brackety thing, I see no whim guys whatsoever and- I cannot feel my legs.", 'output': '{"emotion_class": ["Mad"]}'}
{'instruct

In [32]:
#df_test = df[df.split == 'TEST'].reset_index()

data_file_dev = []

for index, _ in df_dev.iterrows():
    
    i = index

    instruction = build_instruction()
    question = build_tagged_text(df_dev.iloc[i].Utterance)
    answer = build_answer(df_dev.iloc[i].Emotion)
    
    data_file_dev.append( formatting_fct(instruction, question, answer) )

In [33]:
len(data_file_dev)

954

In [34]:
for i in range(5):
    print(data_file_dev[i])

{'instruction': '### You are an expert in Emotion Analysis for the Friends TV show.\n\nYou are given an utterance from a Friends episode.\n\nYour task is to classify the utterance with a single emotion class from these options: "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", or "Neutral".\n\nOutput Instructions:\n1. Return ONLY a JSON object with a single emotion class\n2. The JSON must have this exact structure: {"emotion_class": "EMOTION"}\n3. EMOTION must be one of the specified emotion classes\n4. Do NOT include any additional text or explanation\n5. Identify only one applicable emotions only from the following classes:\n   "Mad", "Scared", "Sad", "Powerful", "Peaceful", "Joyful", "Neutral"\n\n', 'input': '\n\nNow classify this utterance: Coffee.', 'output': '{"emotion_class": ["Neutral"]}'}
{'instruction': '### You are an expert in Emotion Analysis for the Friends TV show.\n\nYou are given an utterance from a Friends episode.\n\nYour task is to classify the utterance wit

### Create and save JSON files

In [35]:
dataset_dir.mkdir(parents=True, exist_ok=True)

In [36]:
file_path = Path(dataset_dir) / "emorynlp_utterance_p2_train.json"

with open(file_path, 'w') as file:
    
    json.dump(data_file_train, file)

In [37]:
file_path = Path(dataset_dir) / "emorynlp_utterance_p2_test.json"

with open(file_path, 'w') as file:
    
    json.dump(data_file_test, file)

In [38]:
file_path = Path(dataset_dir) / "emorynlp_utterance_p2_dev.json"

with open(file_path, 'w') as file:
    
    json.dump(data_file_dev, file)