In [3]:
import json, pickle, json
from collections import defaultdict

In [4]:
# Base data directory; used below as the prefix for the ground-truth JSON output.
path='/home/maryam/llamaPersonaResp/Data'

### **Loading Data**

In [5]:
# Load the pickled conversations. The cells below treat `wikipedia` as a mapping
# whose values are sequences of utterance dicts with 'id', 'reply_to', 'user'
# and 'text' keys.
# NOTE: pickle.load can execute arbitrary code -- only open files you trust.
with open('/home/maryam/llamaPersonaResp/Data/common/wikipedia_conv.pkl', 'rb') as conv_file:
    wikipedia = pickle.load(conv_file)

In [6]:
# Ground truth: for every conversation, record who posted the final utterance
# and what that utterance said, then save the pairs as JSON.
gtruth = []
for root, conversations in wikipedia.items():
    # An empty conversation has no last utterance; indexing [-1] would raise
    # IndexError, so skip it (same guard the profile-building cell uses).
    if not conversations:
        continue
    last_user = conversations[-1]['user']
    last_utterance = conversations[-1]['text']
    gtruth.append((last_user, last_utterance))

# Tuples are serialised as two-element JSON arrays: [user, text].
with open(path + '/wikipedia_gtruth.json', 'w') as f:
    json.dump(gtruth, f, indent=4)

In [7]:
# For each conversation, collect every non-blank message written by the user
# who posted its final utterance, keyed by that user.
user_for_profiles = defaultdict(list)

for conversation in wikipedia.values():
    if conversation:
        last_user = conversation[-1]['user']
        for utterance in conversation:
            # Check authorship first so the text is only inspected for this user.
            if utterance['user'] != last_user:
                continue
            if utterance['text'].strip():
                user_for_profiles[last_user].append(utterance['text'])

In [8]:
# Persist the per-user profile messages for later stages.
profile_pickle_path = '/home/maryam/llamaPersonaResp/Data/common/user_messages_profile.pkl'
with open(profile_pickle_path, 'wb') as out_file:
    pickle.dump(user_for_profiles, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [9]:
# Map every user to the list of all messages they sent, across all
# conversations, in iteration order (blank messages included).
user_messages = defaultdict(list)
for conversation in wikipedia.values():
    for utterance in conversation:
        user_messages[utterance['user']].append(utterance['text'])

user_messages_path = '/home/maryam/llamaPersonaResp/Data/common/user_messages.pkl'
with open(user_messages_path, 'wb') as out_file:
    pickle.dump(user_messages, out_file, protocol=pickle.HIGHEST_PROTOCOL)

### **Generate Dialogue History for each conversation**

In [17]:
from collections import defaultdict

def generate_histories(conversation):
    """Enumerate every root-to-leaf reply chain of one conversation.

    Args:
        conversation: iterable of message dicts. Each must provide 'id'
            (anything ``int()`` accepts) and 'reply_to' (the parent's id, or
            ``None`` for a message that starts a thread).

    Returns:
        A pair ``(all_paths, id_to_message)``:
        * ``all_paths`` -- list of paths; each path is a list of int message
          ids running from a root down to a leaf. Roots are visited in
          order of first appearance and children in the order their replies
          appear in ``conversation``.
        * ``id_to_message`` -- dict mapping each int id to its message dict.

    Messages whose ``reply_to`` refers to an id that is not reachable from any
    root never appear in a path (they are still present in ``id_to_message``).
    """
    children = defaultdict(list)
    id_to_message = {}
    root_ids = []

    for message in conversation:
        current_id = int(message['id'])
        reply_to = message['reply_to']
        id_to_message[current_id] = message
        if reply_to is not None:
            children[int(reply_to)].append(current_id)
        else:
            # Keep every root: remembering only the most recent one would
            # silently drop all earlier threads of a multi-root conversation.
            root_ids.append(current_id)

    all_paths = []

    # dict.fromkeys de-duplicates while preserving first-seen order.
    for root_id in dict.fromkeys(root_ids):
        root_children = children.get(root_id)
        if not root_children:
            # A root nobody replied to is a complete path on its own.
            all_paths.append([root_id])
            continue

        # Iterative depth-first walk: recursion would raise RecursionError on
        # reply chains deeper than the interpreter's recursion limit.
        # `path` holds the ids from the root to the node being expanded and
        # `pending[k]` iterates over the unvisited children of `path[k]`.
        path = [root_id]
        pending = [iter(root_children)]
        while pending:
            for child in pending[-1]:
                path.append(child)
                grandchildren = children.get(child)
                if grandchildren:
                    # Descend; the partially consumed iterator stays on the
                    # stack so the remaining siblings are resumed afterwards.
                    pending.append(iter(grandchildren))
                    break
                # Leaf reached: record a snapshot of the path and backtrack.
                all_paths.append(path.copy())
                path.pop()
            else:
                # All children of path[-1] have been handled.
                pending.pop()
                path.pop()

    return all_paths, id_to_message


def extract_dialogue_histories(conversations):
    """Build (history, response) id pairs and an id -> text lookup.

    Args:
        conversations: iterable of conversations, each in the form accepted
            by ``generate_histories``.

    Returns:
        A pair ``(dialogue_histories, utterance_dict)``:
        * ``dialogue_histories`` -- list of ``(history, response_id)`` where
          ``history`` is a tuple holding the first ``i`` ids of a path and
          ``response_id`` is the id that follows them, for every ``i >= 1``.
        * ``utterance_dict`` -- dict mapping int message id to the message's
          'text' value; a later message with the same id overwrites an
          earlier one.
    """
    pairs = []
    text_by_id = {}

    for conversation in conversations:
        paths, id_to_message = generate_histories(conversation)

        # Record the text of every message, whether or not it lies on a path.
        text_by_id.update(
            (int(msg_id), msg['text']) for msg_id, msg in id_to_message.items()
        )

        # Every proper, non-empty prefix of a path is a history; the id right
        # after the prefix is the response to predict.
        for path in paths:
            pairs.extend(
                (tuple(path[:cut]), path[cut]) for cut in range(1, len(path))
            )

    return pairs, text_by_id


In [18]:
dialogue_histories, utterance_dict = extract_dialogue_histories(wikipedia.values())

# Persist the id-based history/response pairs and the id -> text lookup.
for artefact, target_path in (
    (dialogue_histories, '/home/maryam/llamaPersonaResp/Data/common/dialogue_histories_id.pkl'),
    (utterance_dict, '/home/maryam/llamaPersonaResp/Data/common/utterance_dict.pkl'),
):
    with open(target_path, 'wb') as f:
        pickle.dump(artefact, f, protocol=pickle.HIGHEST_PROTOCOL)


### **Split Dialogue Histories into Train/Test/Dev**

In [19]:
import random

# Fixed seed so the split is identical on every Restart & Run All.
SPLIT_SEED = 42

# Shuffle a copy with a private RNG: this leaves `dialogue_histories` in its
# original order (so re-running this cell yields the same split) and does not
# disturb the global `random` state.
shuffled_histories = list(dialogue_histories)
random.Random(SPLIT_SEED).shuffle(shuffled_histories)

# 80% train / 10% test / 10% dev; dev takes whatever remains after rounding.
total_samples = len(shuffled_histories)
train_split = int(0.8 * total_samples)
test_split = int(0.9 * total_samples)

train_data = shuffled_histories[:train_split]
test_data = shuffled_histories[train_split:test_split]
dev_data = shuffled_histories[test_split:]

# NOTE(review): samples are shuffled individually, so (history, response)
# pairs cut from the same conversation path can land in different splits.
# Confirm that this overlap between train and test/dev is intended.

# The three slices must partition the data exactly.
assert len(train_data) + len(test_data) + len(dev_data) == total_samples

print("Train data size:", len(train_data))
print("Test data size:", len(test_data))
print("Dev data size:", len(dev_data))


Train data size: 211343
Test data size: 26418
Dev data size: 26418


In [20]:
# Write each split to its own pickle file (default pickle protocol).
for split_data, split_path in (
    (train_data, '/home/maryam/llamaPersonaResp/Data/common/train_data_id.pkl'),
    (test_data, '/home/maryam/llamaPersonaResp/Data/common/test_data_id.pkl'),
    (dev_data, '/home/maryam/llamaPersonaResp/Data/common/dev_data_id.pkl'),
):
    with open(split_path, 'wb') as f:
        pickle.dump(split_data, f)
