In [1]:
import json 

def transform_conversations(data):
    """Prefix every utterance's text with its speaker and emotion.

    Each utterance text becomes ``"<speaker> <emotion> said: <text>"``.  The
    input is not modified; new conversation and utterance dicts are built.

    Args:
        data: iterable of conversation dicts.  Each must provide
            ``"conversation_ID"``, ``"emotion-cause_pairs"`` and
            ``"conversation"`` (an iterable of utterance dicts with
            ``"utterance_ID"``, ``"text"``, ``"speaker"`` and ``"emotion"``).

    Returns:
        A list with one new dict per conversation, holding the keys
        ``"conversation_ID"``, ``"conversation"`` and ``"emotion-cause_pairs"``.
        The ``"emotion-cause_pairs"`` value is the same object as in the
        input (it is not copied).  Utterance keys other than the four listed
        above are not carried over.

    Raises:
        KeyError: if any of the required keys is missing.
    """

    def _tag_utterance(turn):
        # Build the enriched text first, then the reduced utterance record.
        tagged = f"{turn['speaker']} {turn['emotion']} said: {turn['text']}"
        return {
            "utterance_ID": turn["utterance_ID"],
            "text": tagged,
            "speaker": turn["speaker"],
            "emotion": turn["emotion"],
        }

    def _tag_conversation(dialogue):
        rebuilt = {
            "conversation_ID": dialogue["conversation_ID"],
            "conversation": [],
            "emotion-cause_pairs": dialogue["emotion-cause_pairs"],
        }
        rebuilt["conversation"] += map(_tag_utterance, dialogue["conversation"])
        return rebuilt

    return list(map(_tag_conversation, data))

# extract data from val_file.json

import os 

input_file_path ="val_file.json"

# Decode explicitly as UTF-8 instead of relying on the platform's locale
# encoding, so non-ASCII characters in the utterances load the same everywhere.
with open(input_file_path, "r", encoding="utf-8") as infile:
    data = json.load(infile)

# Prefix each utterance text with "<speaker> <emotion> said: ".
processed_data = transform_conversations(data)

# json.dump escapes non-ASCII characters by default; the encoding is stated
# anyway so the written file does not depend on the locale.
output_file_path = "val_new.json"
with open(output_file_path, "w", encoding="utf-8") as outfile:
    json.dump(processed_data, outfile)

# Last expression of the cell: displays the path that was written.
output_file_path


'val_new.json'

In [2]:
def process_sliding_padded_conversation_with_speaker_bio(conversation, num_padding=5):
    """Build one padded sliding-window example per utterance, with BIO cause tags.

    For every utterance the window consists of ``num_padding`` slots before
    it, the utterance itself, and ``num_padding`` slots after it.  Slots that
    fall outside the conversation are filled with ``"[PAD]"``.  The slots are
    joined with ``" [SEP] "`` and the result is tokenised on whitespace.

    Cause spans linked to the utterance are located in the window text with
    ``str.find`` (first occurrence only).  The token at the match start is
    tagged ``"B"``, the following tokens of the span ``"I"``, and everything
    else ``"O"``.  A span that does not occur in the window is ignored.

    Args:
        conversation: dict with ``"conversation_ID"``, ``"conversation"``
            (list of utterance dicts with ``"utterance_ID"``, ``"text"``,
            ``"speaker"``, ``"emotion"``) and ``"emotion-cause_pairs"``
            (sequence of 2-item sequences of strings).  The target utterance
            id is the part of the first string before its first ``"_"``; the
            cause text is everything after the first ``"_"`` of the second
            string (presumably ``"<cause_utterance_id>_<span>"`` -- confirm
            against the dataset description).
        num_padding: number of context slots on each side of the utterance.

    Returns:
        List with one dict per utterance, containing ``"conversation_ID"``,
        ``"utterance_ID"``, ``"padded_text"``, ``"bio_tags"``,
        ``"utterance_emotion"``, ``"utterance_speaker"`` and
        ``"speakers_in_context"``.  ``"bio_tags"`` and
        ``"speakers_in_context"`` each have one entry per whitespace token of
        ``"padded_text"``.

    Raises:
        KeyError: if a required key is missing.
        IndexError: if the second string of a pair contains no ``"_"``.
    """
    utterances = conversation["conversation"]
    conversation_texts = [utterance["text"] for utterance in utterances]
    speakers = [utterance["speaker"] for utterance in utterances]
    padding_token = "[PAD]"

    # Map target utterance id (as a string) -> list of cause texts.
    emotion_causes = {}
    for pair in conversation["emotion-cause_pairs"]:
        target_id = pair[0].split("_")[0]
        # maxsplit=1 keeps the whole span; a plain split("_") would cut the
        # cause text at its first internal underscore.
        cause_text = pair[1].split("_", 1)[1]
        emotion_causes.setdefault(target_id, []).append(cause_text)

    processed = []
    for i, utterance in enumerate(utterances):
        # Left side: history up to and including the current utterance,
        # left-padded so that it always spans num_padding + 1 slots.
        left_context_text = ([padding_token] * num_padding + conversation_texts[:i+1])[-(num_padding+1):]
        # NOTE(review): left padding slots carry None while right padding
        # slots and [SEP] tokens carry the string 'null'.  Kept unchanged
        # because consumers of the output may depend on it -- confirm whether
        # the two should be unified.
        left_context_speakers = ([None] * num_padding + speakers[:i+1])[-(num_padding+1):]
        # Right side: the following utterances, right-padded to num_padding slots.
        right_context_text = (conversation_texts[i+1:] + [padding_token] * num_padding)[:num_padding]
        right_context_speakers = (speakers[i+1:] + ['null'] * num_padding)[:num_padding]

        context_parts = left_context_text + right_context_text
        context_speakers = left_context_speakers + right_context_speakers
        full_text = " [SEP] ".join(context_parts)

        # One speaker label per token, plus one 'null' per [SEP] separator.
        full_speakers = []
        for part, speaker in zip(context_parts, context_speakers):
            full_speakers.extend([speaker] * len(part.split()) + ['null'])
        # The loop adds a separator label after every slot; drop the one
        # after the last slot, which has no [SEP] following it.
        full_speakers.pop()

        bio_tags = ["O"] * len(full_text.split())

        for cause in emotion_causes.get(str(utterance["utterance_ID"]), []):
            if not cause.strip():
                # str.find("") returns 0, which would tag the first window
                # token as a cause; a blank span labels nothing.
                continue
            start_pos = full_text.find(cause)
            if start_pos == -1:
                continue
            # Character offset -> token index: count the tokens before the match.
            start_index = len(full_text[:start_pos].split())
            if start_index >= len(bio_tags):
                continue
            end_index = min(start_index + len(cause.split()), len(bio_tags))
            bio_tags[start_index] = "B"
            for j in range(start_index + 1, end_index):
                bio_tags[j] = "I"

        processed.append({
            "conversation_ID": conversation["conversation_ID"],
            "utterance_ID": utterance["utterance_ID"],
            "padded_text": full_text,
            "bio_tags": bio_tags,
            "utterance_emotion": utterance["emotion"],
            "utterance_speaker": utterance["speaker"],
            "speakers_in_context": full_speakers
        })

    return processed


In [3]:
# read data from val_new.json

import json 

input_file_path = "val_new.json"

# Placeholder value; replaced below once the file has been parsed.
data = []

# Read the whole file and parse it as JSON.
with open(input_file_path) as infile:
    data = json.loads(infile.read())

def process_full_dataset(dataset):
    """Flatten the per-utterance examples of every conversation into one list.

    Args:
        dataset: iterable of conversation dicts accepted by
            ``process_sliding_padded_conversation_with_speaker_bio``.

    Returns:
        A single list holding the examples of all conversations, in
        conversation order.
    """
    return [
        example
        for conversation in dataset
        for example in process_sliding_padded_conversation_with_speaker_bio(conversation)
    ]

# Turn every conversation into its per-utterance sliding-window examples.
full_dataset = data
processed_full_dataset = process_full_dataset(full_dataset)

# Serialise the examples and write them out in one go.
output_file_path = "val_file_new.json"
with open(output_file_path, "w") as outfile:
    outfile.write(json.dumps(processed_full_dataset))

print(f"File saved in {output_file_path}")

File saved in val_file_new.json
