In [None]:
 #visualization tool for displaying long load/processing times
!pip install tqdm --quiet
#data processing
!pip install pandas --quiet
#workhorse for converting text into embeddings/vectors
!pip install sentence-transformers==2.2.2 --quiet
#data framework for LLM applications
!pip install llama-index==0.9.29  --quiet
#logging output
!pip install loguru==0.7.0 --quiet
#convenient pretty printing library
!pip install rich --quiet


In [None]:
!curl -o Data_Med.json https://raw.githubusercontent.com/paramjeetn/Doctor_Copilot_V1/main/Data/Data_Med.json


In [4]:
#load IPython's autoreload extension; mode 2 reloads imported modules before
#each cell runs, so edits to local .py files are picked up without a kernel restart
%load_ext autoreload
%autoreload 2

#standard libraries
import sys
#add the parent directory to the import path (presumably for project-local
#modules one level up; none are imported in the cells shown here)
sys.path.append('../')

import json
import os
import time
from typing import List, Tuple
from math import ceil

#external libraries
import pandas as pd
import numpy as np
from rich import print
from rich.pretty import pprint #nifty library for pretty printing
from torch import cuda
from tqdm import tqdm



In [5]:
#root folder on Google Colab is: /content/ (also the working directory the
#download cell saves into). Outside Colab that folder does not exist, so fall
#back to the current working directory; point this at e.g. '../data' if the
#file lives elsewhere.
root_folder = '/content/' if os.path.isdir('/content/') else os.getcwd()
data_file = 'Data_Med.json'
#full path of the corpus file read by the loading cell
data_path = os.path.join(root_folder, data_file)


In [6]:
def load_json(file_path: str, encoding: str = 'utf-8') -> list[dict]:
  """Read a JSON file and return the parsed object.

  Args:
    file_path: Path of the JSON file to read.
    encoding: Text encoding used to decode the file. Defaults to UTF-8 so the
      result does not depend on the platform's locale encoding.

  Returns:
    Whatever `json.load` produces for the file; callers in this notebook
    expect a list of dicts.

  Raises:
    OSError: if the file cannot be opened.
    json.JSONDecodeError: if the file does not contain valid JSON.
  """
  # Decode explicitly: the corpus may contain non-ASCII characters, and the
  # default encoding of open() varies between platforms.
  with open(file_path, encoding=encoding) as f:
    return json.load(f)

#parse the corpus file into memory and report how many top-level records it holds
data = load_json(data_path)
print(f'Total # of papers: {len(data)}')

In [7]:
#length of each paper in whitespace-separated words, summarised with describe()
contents = [paper['content'] for paper in data]
content_lengths = list(map(lambda text: len(text.split()), contents))
df = pd.DataFrame(content_lengths, columns=['# Words'])
df.describe()

Unnamed: 0,# Words
count,14.0
mean,862.285714
std,423.42258
min,362.0
25%,639.0
50%,789.5
75%,1025.75
max,2116.0


In [8]:
#no separate tokenisation step is done here (original note: not needed for chroma)
from llama_index.text_splitter import SentenceSplitter #text splitter used to chunk each paper

#set chunk size and instantiate your SentenceSplitter
#NOTE(review): chunk_size is presumably measured in tokens - confirm against the llama-index 0.9.29 docs
chunk_size = 256
#chunk_overlap=0 requests no overlap between consecutive chunks
gpt35_txt_splitter = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=0)


In [9]:
def split_contents(corpus: list[dict],
                   text_splitter: SentenceSplitter,
                   content_field: str='content'
                   ) -> list[list[str]]:
    """Split the text of every document in a corpus into chunks.

    Args:
        corpus: Documents to process; each must contain `content_field`.
        text_splitter: Object whose `split_text` method turns one text into
            its chunks.
        content_field: Key under which each document stores its text.

    Returns:
        One entry per document, in corpus order, holding whatever
        `split_text` returned for that document's text.
    """
    chunked_corpus = []
    for record in corpus:
        chunked_corpus.append(text_splitter.split_text(record[content_field]))
    return chunked_corpus


In [None]:
#chunk every paper's 'content' field; content_splits[i] holds the chunks for data[i]
content_splits = split_contents(data, gpt35_txt_splitter,'content')
#spot-check: show the chunks produced for the first paper
print(content_splits[0])

In [12]:
def join_metadata(data: list[dict], content_splits: list[list[str]]) -> list[dict]:
    """Flatten documents and their text chunks into one list of records.

    For every document a summary record is emitted first, followed by one
    record per chunk in chunk order. Each record has the keys "doc_id",
    "content_id", "doc_name" and "content".

    Args:
        data: Source documents. Each must provide "doc_id", "doc_name" and
            "Summary"; "doc_id" is concatenated with a string suffix, so it
            must be a str.
        content_splits: content_splits[i] is the list of text chunks that
            belongs to data[i].

    Returns:
        The flattened records. Summary records get the content_id
        "<doc_id>_Summary"; chunk records get "<doc_id>_cont_<n>" with n
        counting from 1.

    Raises:
        IndexError: if content_splits has fewer entries than data.
        KeyError: if a document lacks one of the required keys.
    """
    joined_documents = []

    for i, record in enumerate(data):
        doc_id = record["doc_id"]
        doc_name = record["doc_name"]

        # Emit the summary once per document, ahead of its chunks. Doing this
        # outside the chunk loop means a document that produced no chunks
        # still keeps its summary record.
        joined_documents.append({
            "doc_id": doc_id,
            "content_id": doc_id + "_Summary",
            "doc_name": doc_name,
            "content": record["Summary"],
        })

        # Chunk numbering in content_id is 1-based.
        for chunk_number, text in enumerate(content_splits[i], start=1):
            joined_documents.append({
                "doc_id": doc_id,
                "content_id": doc_id + "_cont_" + str(chunk_number),
                "doc_name": doc_name,
                "content": text,
            })

    return joined_documents

In [None]:
#flatten papers and their chunks into one list of records carrying the paper metadata
docs = join_metadata(data, content_splits )
#preview the first five records
print(docs[:5])

In [14]:
#total number of flattened records (summary records plus chunk records)
print(len(docs))

In [15]:
import json

#persist the flattened records as pretty-printed JSON
#ensure_ascii=False writes non-ASCII characters verbatim, so the file encoding
#is pinned to UTF-8 instead of being left to the platform default
with open('Split_Data_Med.json', 'w', encoding='utf-8') as f:
    json.dump(docs, f, indent=4, ensure_ascii=False)