In [31]:
import json, re, pickle
from typing import List, Dict, Any
from collections import defaultdict

## **Read File**

In [23]:
def read_jsonl_file(file_path: str) -> List[Dict[str, Any]]:
  data: List[Dict[str, Any]] = []
  with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
      data.append(json.loads(line))
  return data

**Wikipedia data**

In [24]:
def clean_text(text):
    text = text.strip()
    text = text.lower()
    text = re.sub(r"\\[nrt]", " ", text)                  # Remove escape characters like \n, \t
    text = re.sub(r"\\?\'\'", '"', text)                  # Replace double single-quotes with "
    text = re.sub(r'\s+', ' ', text)                      # Normalize spaces
    text = re.sub(r'[^a-z0-9.,!?\'" ]', '', text)         # Remove strange characters
    text = re.sub(r'\.{2,}', '.', text)                   # Replace multiple dots with one
    return text.strip()


In [16]:
wikipedia = read_jsonl_file("/home/maryam/llamaPersonaResp/Data/original/wikipedia.jsonl")

In [25]:
wikipedia

[{'id': '524288',
  'root': '524288',
  'text': 'you should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for "information purposes".',
  'user': 'Frightner',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189190940E09'},
 {'id': '524289',
  'root': '524288',
  'text': 'yes i agree. the law permits usage of documents, photographs and other materials for educational and informational purposes. there was a normative act issued by the government of the republic of macedonia that even allowed citizens to make photocopies or photograph rare archive materials.',
  'user': 'Revizionist',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189204860E09'},
 {'id': '1',
  'root': '1',
  'text': "yes, that's good. revathy's page looked very reliable, that's why we used that as a source.",
  'user': 'Johannes003',
  'meta': {'is-a

In [26]:
for item in wikipedia:
    item['text'] = clean_text(item['text'])

In [27]:
wikipedia[:] = [item for item in wikipedia if item['text'].strip() != '']

## **Converting Original Datasets to prefered format.**

In [28]:
def data_transformation(dataset: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    for utt in dataset:
        if 'reply-to' in utt:
            utt['reply_to'] = utt.pop('reply-to')

    filtered_utt: List[Dict[str, Any]] = []
    for utt in dataset:
        filtered_utt.append({key: utt[key] for key in ["id","user",'root', "reply_to", 'text', "timestamp"]})

    conversations: defaultdict[str, List[Dict[str, Any]]] = defaultdict(list)
    for utt in filtered_utt:
        root = utt["root"]
        conversations[root].append(utt)

    for id, conversation in conversations.items():
        for utterance in conversation:
            if utterance.get('reply_to') is not None:
                try:
                    utterance['reply_to'] = int(utterance['reply_to'])
                except ValueError:
                    utterance['reply_to'] = utterance['reply_to']
            if utterance.get('id') == utterance.get('root'):
                pass
            elif utterance.get('reply_to') is None:
                try:
                    utterance['reply_to'] = int(utterance['id']) - 1
                except ValueError:
                    utterance['reply_to'] = utterance['root']

    print(f"We have {len(conversations)} conversations in this dataset")

    return conversations


In [33]:
wikipedia_dataset = data_transformation(wikipedia)

We have 125059 conversations in this dataset


In [34]:
wikipedia_dataset

defaultdict(list,
            {'524288': [{'id': '524288',
               'user': 'Frightner',
               'root': '524288',
               'reply_to': None,
               'text': 'you should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for "information purposes".',
               'timestamp': '1.189190940E09'},
              {'id': '524289',
               'user': 'Revizionist',
               'root': '524288',
               'reply_to': 524288,
               'text': 'yes i agree. the law permits usage of documents, photographs and other materials for educational and informational purposes. there was a normative act issued by the government of the republic of macedonia that even allowed citizens to make photocopies or photograph rare archive materials.',
               'timestamp': '1.189204860E09'}],
             '1': [{'id': '1',
               'us

In [35]:
with open('/home/maryam/llamaPersonaResp/Data/common/wikipedia_conv.pkl', 'wb') as f:
    pickle.dump(wikipedia_dataset, f)