In [1]:
import json
from typing import List, Dict, Any
from collections import defaultdict

## **Read File**

In [2]:
def read_jsonl_file(file_path: str) -> List[Dict[str, Any]]:
  """Load a JSON Lines file into a list of parsed records.

  Every non-blank line of the file is decoded as one JSON document.
  Blank or whitespace-only lines are skipped instead of being handed to
  the JSON decoder, which would reject them.

  Args:
    file_path: Path to a UTF-8 encoded JSON Lines file.

  Returns:
    The decoded records, in the order they appear in the file.

  Raises:
    OSError: If the file cannot be opened.
    json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  data: List[Dict[str, Any]] = []
  with open(file_path, 'r', encoding='utf-8') as f:
    for line in f:
      # An empty string is not valid JSON, so tolerate blank separator
      # or trailing lines rather than failing the whole load.
      if not line.strip():
        continue
      data.append(json.loads(line))
  return data

**Wikipedia data**

In [3]:
# NOTE(review): absolute, machine-specific path; this cell only runs where the file exists at exactly this location.
wikipedia = read_jsonl_file("/home/maryam/llamaPersonaResp/Original_Data/wikipedia.jsonl")

In [4]:
# Preview only the first few records; echoing the whole list floods the notebook output.
wikipedia[:3]

[{'id': '524288',
  'root': '524288',
  'text': 'You should look at all of the point on the template not just the last one, the template also says the image belonging to the republic of macedonia is in the public domain if it being used for \\"information purposes\\". ',
  'user': 'Frightner',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189190940E09'},
 {'id': '524289',
  'root': '524288',
  'text': 'Yes I agree. The law permits usage of documents, photographs and other materials for educational and informational purposes. There was a normative act issued by the government of the Republic of Macedonia that even allowed citizens to make photocopies or photograph rare archive materials. ',
  'user': 'Revizionist',
  'meta': {'is-admin': False},
  'reply-to': None,
  'timestamp': '1.189204860E09'},
 {'id': '1',
  'root': '1',
  'text': "Yes, that's good. Revathy's page looked very reliable, that's why we used that as a source. ",
  'user': 'Johannes003',
  'meta':

## **Converting Original Datasets to preferred format.**

In [5]:
def data_transformation(dataset: List[Dict[str, Any]]) -> Dict[str, List[Dict[str, Any]]]:
    """Group utterances into conversations keyed by their ``root`` value.

    For every input record a new, trimmed record is built with exactly the
    keys ``id``, ``user``, ``root``, ``reply_to``, ``text`` and
    ``timestamp``. The input records themselves are left untouched.

    The ``reply_to`` value is derived as follows:

    * It is read from the ``'reply-to'`` key when present, otherwise from
      ``'reply_to'``; a record with neither key is treated as having no
      reply pointer (``None``).
    * A non-``None`` pointer is converted with ``int()`` when possible and
      kept unchanged when ``int()`` raises ``ValueError``.
    * A ``None`` pointer on a record whose ``id`` differs from its ``root``
      is replaced with ``int(id) - 1``, or with ``root`` when ``id`` is not
      numeric. A record whose ``id`` equals its ``root`` keeps ``None``.

    Args:
        dataset: Utterance records. Each must provide ``id``, ``user``,
            ``root``, ``text`` and ``timestamp``.

    Returns:
        A ``defaultdict(list)`` mapping each ``root`` value to the trimmed
        records sharing it, in input order. Because it is a ``defaultdict``,
        indexing a missing key creates an empty list for it.

    Raises:
        KeyError: If a record lacks one of the required keys.
    """
    conversations: Dict[str, List[Dict[str, Any]]] = defaultdict(list)

    for utt in dataset:
        # Read the reply pointer under either spelling without renaming the
        # key on the caller's record (the input must stay reusable).
        if 'reply-to' in utt:
            reply_to = utt['reply-to']
        else:
            reply_to = utt.get('reply_to')

        if reply_to is not None:
            try:
                reply_to = int(reply_to)
            except ValueError:
                # Non-numeric pointer: keep the original value as-is.
                pass
        elif utt['id'] != utt['root']:
            # No explicit pointer on a non-root utterance: fall back to the
            # numerically preceding id, or to the root for non-numeric ids.
            try:
                reply_to = int(utt['id']) - 1
            except ValueError:
                reply_to = utt['root']

        conversations[utt['root']].append({
            "id": utt["id"],
            "user": utt["user"],
            "root": utt["root"],
            "reply_to": reply_to,
            "text": utt["text"],
            "timestamp": utt["timestamp"],
        })

    print(f"We have {len(conversations)} conversations in this dataset")

    return conversations


In [6]:
# Group the loaded utterances into conversations keyed by their root value.
wikipedia_dataset = data_transformation(wikipedia)

We have 125292 conversations in this dataset


In [None]:
# Preview a few conversations instead of echoing the entire mapping into the output.
dict(list(wikipedia_dataset.items())[:3])