In [3]:
import json, pickle, json
from collections import defaultdict

# **Main Data**

In [4]:
def read_jsonl_file(file_path):
  """Read a JSON Lines file and return its records as a list.

  Args:
    file_path: Path to a UTF-8 encoded text file holding one JSON document
      per line.

  Returns:
    A list with one decoded object per successfully parsed line, in file
    order.

  Empty or whitespace-only lines are skipped silently. Lines that fail to
  parse are reported on stdout together with their 1-based line number and
  then skipped, so one corrupt record does not abort the whole load.
  """
  data = []
  # Open explicitly as UTF-8 instead of relying on the platform default
  # encoding, which is not UTF-8 everywhere.
  with open(file_path, 'r', encoding='utf-8') as f:
    for line_number, line in enumerate(f, start=1):
      if not line.strip():
        continue  # blank / trailing lines are not records
      try:
        data.append(json.loads(line))
      except json.JSONDecodeError as e:
        print(f"Error decoding JSON on line {line_number}: {e}")
  return data

# =============================
# 1️⃣ Load Dataset
# =============================
# NOTE(review): absolute, machine-specific path; the notebook only runs where
# this file exists at exactly this location.
file_path = '/home/maryam/llamaPersonaResp/Original_Data/wikipedia.jsonl'
# `data` is a flat list with one entry per successfully parsed line of the file.
data = read_jsonl_file(file_path)

In [5]:
# Group the flat utterance list by each utterance's 'root' value:
# wikipedia[root] is the list of utterances sharing that root, in the order
# they appear in `data`.
wikipedia = defaultdict(list)

for utterance in data:
    wikipedia[utterance['root']].append(utterance)

In [6]:
# Bare expression: renders the entire grouped mapping as the cell output.
# NOTE(review): consider displaying a single conversation instead of the full dump.
wikipedia

defaultdict(list,
            {'524288': [{'id': '524288',
               'root': '524288',
               'text': 'You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\". ',
               'user': 'Frightner',
               'meta': {'is-admin': False},
               'reply-to': None,
               'timestamp': '1.189190940E09'},
              {'id': '524289',
               'root': '524288',
               'text': 'Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed citizens to make photocopies or photograph rare archive materials. ',
               'user': 'Revizionist',
               'meta': {'is-admin': False},
               'reply-to': None,
     

In [7]:
# Map every user name to the list of message texts that user wrote,
# gathered over all conversations in `wikipedia`.
user_messages = defaultdict(list)
for thread in wikipedia.values():
  for utt in thread:
    author, text = utt['user'], utt['text']
    user_messages[author].append(text)

In [None]:
# Persist the per-user message lists with the highest available pickle protocol.
# NOTE(review): this writes under /content/drive/... while the other cells of
# this notebook read and write under /home/maryam/...; confirm the intended location.
with open('/content/drive/MyDrive/Perspective Paper/Original_Data/user_messages.pkl', 'wb') as handle:
    pickle.dump(user_messages, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [10]:
def generate_histories(conversation):
    """Map every message id in a conversation to its reply chain.

    Args:
        conversation: Iterable of message dicts, each with 'id' and
            'reply-to' keys.

    Returns:
        A dict {message_id: [ancestor ids..., message_id]}. A message whose
        'reply-to' is None maps to a one-element chain. A reply extends the
        chain already recorded for its parent; if the parent has not been
        recorded yet (it appears later in the iteration, or not at all), the
        chain is just [parent_id, message_id].
    """
    chains = {}
    for msg in conversation:
        msg_id, parent_id = msg['id'], msg['reply-to']
        if parent_id is None:
            prefix = []
        else:
            # Fall back to the bare parent id when its chain is unknown.
            prefix = chains.get(parent_id, [parent_id])
        # Build a fresh list so chains never share storage with each other.
        chains[msg_id] = [*prefix, msg_id]
    return chains

def extract_dialogue_histories(conversations, max_history=None):
    """Build (history, response_id) pairs for every reply in every conversation.

    Args:
        conversations: Iterable of conversations; each conversation is a
            re-iterable sequence of message dicts with 'id' and 'reply-to'
            keys (it is traversed twice).
        max_history: Optional cap on how many of the most recent context ids
            are kept in each history. None (the default) keeps the full
            reply chain.

    Returns:
        A list of (history, response_id) tuples where `history` is a tuple of
        the message ids preceding the response in its reply chain. Only the
        first response encountered for a given history is kept; the
        de-duplication spans all conversations.

    Raises:
        ValueError: If max_history is given and is smaller than 1.
    """
    if max_history is not None and max_history < 1:
        # A slice with -0 would silently keep the whole chain, so reject it.
        raise ValueError("max_history must be a positive integer or None")

    dialogue_histories = []
    seen_histories = set()

    for conversation in conversations:
        history_dict = generate_histories(conversation)
        for message in conversation:
            current_id = message['id']
            if message['reply-to'] is None:
                continue  # conversation openers have no context to respond to

            # Everything in the chain except the response itself is context.
            context_ids = history_dict[current_id][:-1]
            if max_history is not None:
                context_ids = context_ids[-max_history:]
            history = tuple(context_ids)

            if history not in seen_histories:
                seen_histories.add(history)
                dialogue_histories.append((history, current_id))

    return dialogue_histories

# Build the (history, response_id) pairs over all grouped conversations.
dialogue_histories = extract_dialogue_histories(wikipedia.values())


In [11]:
# Number of (history, response_id) pairs currently held in dialogue_histories.
len(dialogue_histories)

214919

In [13]:
# Cache the extracted pairs on disk so later sessions can reload them
# instead of recomputing.
with open('/home/maryam/llamaPersonaResp/Original_Data/dialogue_histories.pkl', 'wb') as f:
    pickle.dump(dialogue_histories, f, protocol=pickle.HIGHEST_PROTOCOL)

In [14]:
# Reload the cached pairs; this rebinds dialogue_histories to the unpickled list.
with open('/home/maryam/llamaPersonaResp/Original_Data/dialogue_histories.pkl', 'rb') as f:
  dialogue_histories = pickle.load(f)

In [15]:
import random

# Seed the global generator so the shuffle, and therefore the membership of
# each split, is identical on every Restart & Run All. Later cells that draw
# from `random` also become reproducible when the notebook is run top to bottom.
SPLIT_SEED = 42
random.seed(SPLIT_SEED)

# In-place shuffle: dialogue_histories itself is reordered.
random.shuffle(dialogue_histories)

# 80% train / 10% test / 10% dev. `train_split` and `test_split` are the slice
# boundaries (end of train, end of test), not sizes of the splits.
total_samples = len(dialogue_histories)
train_split = int(0.8 * total_samples)
test_split = int(0.9 * total_samples)

train_data = dialogue_histories[:train_split]
test_data = dialogue_histories[train_split:test_split]
dev_data = dialogue_histories[test_split:]

# The three slices must partition the data exactly.
assert len(train_data) + len(test_data) + len(dev_data) == total_samples

print("Train data size:", len(train_data))
print("Test data size:", len(test_data))
print("Dev data size:", len(dev_data))


Train data size: 171935
Test data size: 21492
Dev data size: 21492


In [16]:
# Persist the three id-level splits, one pickle file per split.
# These dumps use the default pickle protocol (no `protocol=` argument).
with open('/home/maryam/llamaPersonaResp/Original_Data/train_data.pkl', 'wb') as f:
  pickle.dump(train_data, f)

with open('/home/maryam/llamaPersonaResp/Original_Data/test_data.pkl', 'wb') as f:
  pickle.dump(test_data, f)

with open('/home/maryam/llamaPersonaResp/Original_Data/dev_data.pkl', 'wb') as f:
  pickle.dump(dev_data, f)


In [18]:
# Reload the persisted splits; train_data / test_data / dev_data are rebound to
# whatever is stored on disk, which later cells then use.
with open('/home/maryam/llamaPersonaResp/Original_Data/train_data.pkl', 'rb') as f:
  train_data = pickle.load(f)

with open('/home/maryam/llamaPersonaResp/Original_Data/test_data.pkl', 'rb') as f:
  test_data = pickle.load(f)

with open('/home/maryam/llamaPersonaResp/Original_Data/dev_data.pkl', 'rb') as f:
  dev_data = pickle.load(f)

# Sanity check: report the size of each reloaded split.
print("Train data size:", len(train_data))
print("Test data size:", len(test_data))
print("Dev data size:", len(dev_data))


Train data size: 171935
Test data size: 21492
Dev data size: 21492


# **Preparing Data for the Without-User-Profile Experiment**

In [19]:
# Bare expression: displays the training split, a list of
# (history_id_tuple, response_id) pairs.
# NOTE(review): consider train_data[:5] to keep the output short.
train_data

[(('164666',), '164667'),
 (('416144',), '416145'),
 (('88050',), '88051'),
 (('576508', '576511', '576512', '576513'), '576514'),
 (('74025',), '74026'),
 (('459880',), '459881'),
 (('581521',), '581522'),
 (('149014',), '149015'),
 (('516298',), '516299'),
 (('263013', '263021'), '263022'),
 (('239342',), '239343'),
 (('546695', '546696', '546697', '546698', '546699'), '546700'),
 (('91766',), '91767'),
 (('78203',), '78204'),
 (('387778',), '387779'),
 (('71441',), '71442'),
 (('343678', '343685'), '343686'),
 (('205188', '205189', '205190'), '205191'),
 (('31536',), '31537'),
 (('326845',), '326846'),
 (('259246', '259247', '259248'), '259249'),
 (('85214', '85215', '85216', '85217'), '85218'),
 (('376817',), '376818'),
 (('419835', '419836', '419837', '419838'), '419839'),
 (('389206',), '389207'),
 (('299016',), '299017'),
 (('267028', '267029', '267030', '267031', '267032'), '267033'),
 (('91766', '91769', '91770'), '91771'),
 (('407971', '407972', '407973', '407974'), '407975')

In [20]:
# ======================================================================================================
# For each (history, response_id) pair, return the user who wrote the response
# (the speaker whose reply we are going to select).
# ======================================================================================================

def get_user(id_histories, dataset):
  """Look up the author of each response id.

  Args:
    id_histories: Iterable of (history, response_id) pairs; only the
      response id is used.
    dataset: Mapping whose values are lists of utterance dicts carrying
      'id' and 'user' keys.

  Returns:
    A list with the 'user' of every response id found in `dataset`, in the
    order of `id_histories`. Pairs whose response id is not found are
    skipped, so the result can be shorter than `id_histories`.
  """
  author_by_id = {
      utt['id']: utt['user']
      for thread in dataset.values()
      for utt in thread
  }
  return [
      author_by_id[resp_id]
      for _context, resp_id in id_histories
      if resp_id in author_by_id
  ]

In [21]:
# Author of the target response for every example of each split. Each list
# lines up one-to-one with its split only if every response id is present in
# `wikipedia`, because get_user skips ids it cannot find.
train_response_user = get_user(train_data, wikipedia)
dev_response_user = get_user(dev_data, wikipedia)
test_response_user = get_user(test_data, wikipedia)

In [22]:
# =============================================================
# Build the distractor list: for each response author, one message
# drawn at random from all messages written by that same user.
# =============================================================
def get_distractor(response_user):
  """Draw one random message per listed user.

  Args:
    response_user: Iterable of user names; one draw is made per entry, in order.

  Returns:
    A list of messages, the i-th one chosen uniformly at random from the
    module-level `user_messages` entry of the i-th user.

  Nothing here excludes the user's true response from the candidates, so a
  draw may coincide with it. Each draw consumes the global `random` state.
  """
  return [random.choice(list(user_messages[author])) for author in response_user]

In [23]:
# ========================================================
# Map the utterance ids in the dialogue histories (and the
# response id) to their actual texts.
# ========================================================
def id_to_text(id_histories, dataset):
    """Turn id-based (history, response) pairs into text-based ones.

    Args:
        id_histories: Iterable of (history_ids, response_id) pairs.
        dataset: Mapping whose values are lists of utterance dicts carrying
            'id' and 'text' keys.

    Returns:
        A 2-tuple (text_histories, utterance_dict):
          * text_histories: list of (history_text, response_text) pairs, where
            history_text is the texts of the known history ids joined with
            '__eou__' (unknown ids are dropped) and response_text is "" when
            the response id is unknown;
          * utterance_dict: the id -> text lookup built from `dataset`.
    """
    utterance_dict = {
        utt['id']: utt['text']
        for thread in dataset.values()
        for utt in thread
    }

    def render(history_ids, response_id):
        # Resolve one pair; ids missing from the lookup contribute nothing.
        turns = (utterance_dict[uid] for uid in history_ids if uid in utterance_dict)
        return '__eou__'.join(turns), utterance_dict.get(response_id, "")

    text_histories = [
        render(history_ids, response_id)
        for history_ids, response_id in id_histories
    ]
    return text_histories, utterance_dict

In [24]:
def generate_final_data(text_histories, wrong_response):
  """Build binary response-selection examples.

  Args:
    text_histories: Iterable of (history, correct_response) pairs.
    wrong_response: Iterable of distractor responses, aligned with
      `text_histories`; pairing stops at the shorter of the two.

  Returns:
    A list of (history, chosen_response, label) tuples. For each example a
    coin flip decides whether the correct response (label 1) or the
    distractor (label 0) is kept. One draw from the global `random` state is
    made per example.
  """
  final_list = []
  for (context, true_reply), distractor in zip(text_histories, wrong_response):
      keep_true = random.choice([True, False])
      candidate = true_reply if keep_true else distractor
      # int(True) == 1 marks the true response, int(False) == 0 the distractor.
      final_list.append((context, candidate, int(keep_true)))
  return final_list

In [25]:
# One random same-user message per example, for each split. Each call draws
# from the global `random` state, so the call order (train, dev, test) affects
# which messages are picked.
train_distractor = get_distractor(train_response_user)
dev_distractor = get_distractor(dev_response_user)
test_distractor = get_distractor(test_response_user)

In [26]:
# Resolve ids to text for every split. `utterance_dict` is rebound by each
# call; id_to_text builds it from `wikipedia` alone, so all three calls return
# the same id -> text lookup.
train_text_histories, utterance_dict = id_to_text(train_data, wikipedia)
dev_text_histories, utterance_dict = id_to_text(dev_data, wikipedia)
test_text_histories, utterance_dict = id_to_text(test_data, wikipedia)

In [27]:
# Train and dev examples are (history, chosen_response, label) tuples, where the
# chosen response is randomly either the true one (label 1) or the distractor (label 0).
final_train_data = generate_final_data(train_text_histories, train_distractor)
final_dev_data = generate_final_data(dev_text_histories, dev_distractor)

In [28]:
# Test examples keep both candidates side by side:
# [history_text, true_response, distractor] (no label, no random choice).
final_test_data = [
    [context, true_response, distractor]
    for (context, true_response), distractor in zip(test_text_histories, test_distractor)
]

In [30]:
# Persist the response-selection data whose distractors were chosen among the
# response author's own messages. Train/dev hold (history, response, label)
# tuples; test holds [history, true_response, distractor] lists.

with open('/home/maryam/llamaPersonaResp/Original_Data/final_train.pkl', 'wb') as f:
  pickle.dump(final_train_data, f)

with open('/home/maryam/llamaPersonaResp/Original_Data/final_dev.pkl', 'wb') as f:
  pickle.dump(final_dev_data, f)

with open('/home/maryam/llamaPersonaResp/Original_Data/final_test.pkl', 'wb') as f:
  pickle.dump(final_test_data, f)

## **Preparing Data for the With-User-Profile Experiment**

In [None]:
# Load the Experiment 1 train/dev/test sets.
# NOTE(review): these files live under /content/drive/... and are not written
# anywhere in the visible part of this notebook; confirm they hold the same
# examples, in the same order, as the final_*.pkl files saved above.
# NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/TrainSet.pkl', 'rb') as f:
  Ex1_train = pickle.load(f)

with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/DevSet.pkl', 'rb') as f:
  Ex1_dev = pickle.load(f)

with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/TestSet.pkl', 'rb') as f:
  Ex1_test = pickle.load(f)

In [None]:
def generate_datase(data, response_user):
  """Append speaker-profile information to the first field of every example.

  Args:
    data: Iterable of indexable examples with at least three fields;
      field 0 is the text that gets extended, fields 1 and 2 are copied through.
    response_user: Iterable of user keys aligned with `data`; pairing stops at
      the shorter of the two.

  Returns:
    A list of 3-tuples (augmented_text, example[1], example[2]) where
    augmented_text is field 0 followed by the user's profile rendered as
    "key:value" pairs and the user's frequent words joined by spaces.

  NOTE(review): relies on the module-level `user_profiles` (user -> mapping)
  and `user_entity` (user -> iterable of strings), neither of which is defined
  in the visible part of this notebook; confirm where they come from.
  """
  merged_dataset = []
  for example, speaker in zip(data, response_user):
    profile_text = ', '.join(
        f"{key}:{value}" for key, value in user_profiles[speaker].items()
    )
    common_words = ' '.join(user_entity[speaker])
    augmented = f"{example[0]}|Speaker Profile: {profile_text}|Speaker Frequent Words: {common_words}"
    merged_dataset.append((augmented, example[1], example[2]))
  return merged_dataset

In [None]:
# Attach speaker profiles to each Experiment 1 split.
# NOTE(review): the *_response_user lists were derived from train_data /
# dev_data / test_data earlier in this notebook; this pairing assumes the
# Ex1_* sets list their examples in that same order. Confirm.
trainset = generate_datase(Ex1_train, train_response_user)
devset = generate_datase(Ex1_dev, dev_response_user)
testset = generate_datase(Ex1_test, test_response_user)

In [None]:
# Persist the profile-augmented splits next to the Experiment 1 inputs
# (the *_SUP.pkl files).
with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/TrainSet_SUP.pkl', 'wb') as f:
  pickle.dump(trainset, f)

with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/DevSet_SUP.pkl', 'wb') as f:
  pickle.dump(devset, f)

with open('/content/drive/MyDrive/Thesis/Experiment/Data/Experiment1/TestSet_SUP.pkl', 'wb') as f:
  pickle.dump(testset, f)