In [3]:
import json
import pickle
import random
from collections import defaultdict

In [4]:
# Root directory that holds the project's data files.
path = '/home/maryam/llamaPersonaResp/Data'

### **Loading Data**

In [24]:
def read_jsonl_file(file_path):
  """Read a JSONL file and return its records as a list.

  Each non-blank line is parsed as one JSON document. Blank or
  whitespace-only lines are skipped silently. Lines that fail to parse are
  reported on stdout (with their 1-based line number) and skipped, so a
  single bad record does not abort the whole load.

  Args:
    file_path: Path of the JSONL file to read.

  Returns:
    A list with one parsed object per valid line, in file order.
  """
  data = []
  # Decode as UTF-8 explicitly instead of relying on the platform/locale default.
  with open(file_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
      if not line.strip():
        continue  # blank lines (e.g. a trailing newline) are not records
      try:
        data.append(json.loads(line))
      except json.JSONDecodeError as e:
        print(f"Error decoding JSON on line {line_number}: {e}")
  return data


# Load every record of the raw wikipedia.jsonl file into memory
# (one parsed JSON object per line; unparseable lines are skipped by the reader).
file_path = f'{path}/original/wikipedia.jsonl'
data = read_jsonl_file(file_path)


In [None]:
# Group the utterances into conversations, keyed by each utterance's 'root' value.
wikipedia = defaultdict(list)
for utterance in data:
    root_id = utterance['root']
    wikipedia[root_id].append(utterance)

# Persist the grouped conversations as a pickle.
conversations_file = '/home/maryam/llamaPersonaResp/Data/common/wikipedia_conv.pkl'
with open(conversations_file, 'wb') as f:
    pickle.dump(wikipedia, f)

In [None]:
# Collect, for every user, the text of each message they wrote
# (in the order the conversations and their utterances are iterated).
user_messages = defaultdict(list)
for conversation in wikipedia.values():
  for utterance in conversation:
    author, text = utterance['user'], utterance['text']
    user_messages[author].append(text)

# Persist the per-user message lists as a pickle.
messages_file = '/home/maryam/llamaPersonaResp/Data/common/user_messages.pkl'
with open(messages_file, 'wb') as handle:
    pickle.dump(user_messages, handle, protocol=pickle.HIGHEST_PROTOCOL)

### **Generate Dialogue History for each conversation**

In [None]:
def generate_histories(conversation):
    """Map every message id to the chain of message ids that leads to it.

    Messages are processed in the order given. A message whose 'reply-to'
    is None starts a chain containing only itself. Any other message gets
    its parent's chain followed by its own id. If the parent has no chain
    yet (it is missing from the conversation or appears later in it), the
    chain falls back to just [parent id, message id].

    Args:
        conversation: Iterable of message dicts with 'id' and 'reply-to' keys.

    Returns:
        Dict mapping message id -> list of ids, ending with the message itself.
    """
    chains = {}
    for msg in conversation:
        msg_id = msg['id']
        parent_id = msg['reply-to']
        if parent_id is None:
            chains[msg_id] = [msg_id]
            continue
        # Unknown parent: start the chain at the parent id itself.
        prefix = chains.get(parent_id, [parent_id])
        chains[msg_id] = prefix + [msg_id]
    return chains

def extract_dialogue_histories(conversations, max_history=None):
    """Build unique (history, response id) pairs from a set of conversations.

    For every message that replies to another message, the history is the
    chain of message ids preceding it (as computed by generate_histories),
    excluding the message itself. Root messages (reply-to is None) produce
    no pair because they have no preceding context.

    Histories are de-duplicated across all conversations: when several
    replies share the exact same history, only the first one encountered is
    kept as the response for that history.

    Args:
        conversations: Iterable of conversations, each an iterable of message
            dicts with 'id' and 'reply-to' keys. Each conversation is iterated
            twice, so it must be re-iterable (e.g. a list).
        max_history: Optional positive int. When given, each history is
            truncated to its last `max_history` ids before de-duplication.
            None (default) keeps the full chain.

    Returns:
        List of (history, response_id) tuples, where history is a tuple of
        message ids ordered from oldest to newest.

    Raises:
        ValueError: If max_history is given but is not positive.
    """
    # Guard explicitly: a slice of [-0:] would silently return the full list.
    if max_history is not None and max_history < 1:
        raise ValueError("max_history must be a positive integer or None")

    dialogue_histories = []
    seen_histories = set()

    for conversation in conversations:
        history_dict = generate_histories(conversation)
        for message in conversation:
            if message['reply-to'] is None:
                continue  # root message: nothing to respond to
            current_id = message['id']
            context_ids = history_dict[current_id][:-1]  # drop the response itself
            if max_history is not None:
                context_ids = context_ids[-max_history:]
            history = tuple(context_ids)
            if history not in seen_histories:
                dialogue_histories.append((history, current_id))
                seen_histories.add(history)

    return dialogue_histories

In [27]:
# Build the (history, response id) pairs across all conversations, then persist them.
dialogue_histories = extract_dialogue_histories(wikipedia.values())
histories_file = '/home/maryam/llamaPersonaResp/Data/common/dialogue_histories.pkl'
with open(histories_file, 'wb') as f:
    pickle.dump(dialogue_histories, f, protocol=pickle.HIGHEST_PROTOCOL)

### **Split Dialogue Histories into Train/Test/Dev**

In [28]:
# Reproducible 80/10/10 train/test/dev split of the dialogue histories.
# A dedicated, seeded RNG makes the split identical on every run, and
# shuffling a copy keeps `dialogue_histories` in its original order so that
# re-running this cell always yields the same partition.
SPLIT_SEED = 42
shuffled_histories = list(dialogue_histories)
random.Random(SPLIT_SEED).shuffle(shuffled_histories)

total_samples = len(shuffled_histories)
train_split = int(0.8 * total_samples)  # end index of the train slice
test_split = int(0.9 * total_samples)   # end index of the test slice

train_data = shuffled_histories[:train_split]
test_data = shuffled_histories[train_split:test_split]
dev_data = shuffled_histories[test_split:]

# The three slices must partition the data set exactly.
assert len(train_data) + len(test_data) + len(dev_data) == total_samples

print("Train data size:", len(train_data))
print("Test data size:", len(test_data))
print("Dev data size:", len(dev_data))


Train data size: 171935
Test data size: 21492
Dev data size: 21492


In [16]:
# Write each split's (history, response id) pairs to its own pickle file.
split_files = (
    ('/home/maryam/llamaPersonaResp/Data/common/train_data_id.pkl', train_data),
    ('/home/maryam/llamaPersonaResp/Data/common/test_data_id.pkl', test_data),
    ('/home/maryam/llamaPersonaResp/Data/common/dev_data_id.pkl', dev_data),
)
for split_file, split_data in split_files:
  with open(split_file, 'wb') as f:
    pickle.dump(split_data, f)


In [17]:
# ======================================================================================================
# get_user: for each (history, response id) pair, look up the user who wrote the response
# (i.e. the speaker whose reply is to be selected for that history).
# ======================================================================================================

def get_user(id_histories, dataset):
  """Return the author of each response in `id_histories`.

  Args:
    id_histories: Iterable of (history, response_id) pairs.
    dataset: Mapping whose values are conversations, each an iterable of
      utterance dicts with 'id' and 'user' keys. The mapping's keys are
      not used.

  Returns:
    A list of users, in the order of `id_histories`. Pairs whose response
    id does not occur in `dataset` are skipped, so the result can be
    shorter than `id_histories`.
  """
  # Index every utterance id -> author; a later duplicate id overrides an earlier one.
  author_by_id = {
      utt['id']: utt['user']
      for thread in dataset.values()
      for utt in thread
  }
  return [
      author_by_id[resp_id]
      for _, resp_id in id_histories
      if resp_id in author_by_id
  ]

In [18]:
# Look up the response author for every example in each split (train, dev, test order).
train_response_user, dev_response_user, test_response_user = (
    get_user(split, wikipedia) for split in (train_data, dev_data, test_data)
)

In [20]:
# Write each split's list of response authors to its own pickle file.
response_user_files = (
    ('/home/maryam/llamaPersonaResp/Data/common/train_response_user.pkl', train_response_user),
    ('/home/maryam/llamaPersonaResp/Data/common/dev_response_user.pkl', dev_response_user),
    ('/home/maryam/llamaPersonaResp/Data/common/test_response_user.pkl', test_response_user),
)
for users_file, users in response_user_files:
    with open(users_file, 'wb') as f:
        pickle.dump(users, f, protocol=pickle.HIGHEST_PROTOCOL)