![image](https://miro.medium.com/v2/resize:fit:720/format:webp/0*BeCPaB7nKSlMQJlx)

In [None]:
import random, os
import pandas as pd
import qdrant_client
from faker import Faker
from llama_index.core.schema import TextNode
from reportlab.lib.pagesizes import letter
from reportlab.lib.styles import ParagraphStyle
from llama_index.llms.llama_cpp import LlamaCPP
from reportlab.platypus import SimpleDocTemplate, Paragraph
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.core.node_parser.text import SentenceSplitter
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import (
                            VectorStoreIndex,
                            SimpleDirectoryReader,
                            load_index_from_storage,
                            StorageContext,
                            ServiceContext
                            )
os.chdir("/Users/1zuu/Desktop/LLM RESEARCH 2024/Cool LLM")

In [2]:
fake = Faker()

medical_condition_data = {
                        'Hypertension': {
                                        'medications': ['Lisinopril', 'Amlodipine', 'Losartan', 'Hydrochlorothiazide'],
                                        'cholesterol_range': (100, 200),
                                        'glucose_range': (70, 110),
                                        'blood_pressure_range': (140, 90)
                                        },
                        'Diabetes': {
                                    'medications': ['Metformin', 'Insulin', 'Glipizide', 'Sitagliptin'],
                                    'cholesterol_range': (100, 200),
                                    'glucose_range': (130, 200),
                                    'blood_pressure_range': (130, 80)
                                    }
                        }

qdrant_client = qdrant_client.QdrantClient(location=":memory:")

## Build Synthetic Medical Records

In [3]:
def generate_patient_records(num_patients):
    patient_records = []
    for _ in range(num_patients):
        patient_id = fake.uuid4()
        name = fake.name()
        age = random.randint(18, 90)
        gender = random.choice(['Male', 'Female'])
        blood_type = random.choice(['A+', 'B+', 'AB+', 'O+', 'A-', 'B-', 'AB-', 'O-'])
        medical_condition = random.choice(list(medical_condition_data.keys()))
        patient_records.append({
                            'Patient_ID': patient_id,
                            'Name': name,
                            'Age': age,
                            'Gender': gender,
                            'Blood_Type': blood_type,
                            'Medical_Condition': medical_condition
                            })
    return patient_records

def generate_test_results(num_patients):
    test_results = []
    for _ in range(num_patients):
        patient_id = fake.uuid4()
        medical_condition = random.choice(list(medical_condition_data.keys()))
        cholesterol_range = medical_condition_data[medical_condition]['cholesterol_range']
        glucose_range = medical_condition_data[medical_condition]['glucose_range']
        blood_pressure_range = medical_condition_data[medical_condition]['blood_pressure_range']
        cholesterol = random.uniform(cholesterol_range[0], cholesterol_range[1])
        glucose = random.uniform(glucose_range[0], glucose_range[1])
        systolic = random.randint(blood_pressure_range[1], blood_pressure_range[0]) 
        diastolic = random.randint(60, systolic) 
        blood_pressure = f"{systolic}/{diastolic}"
        test_results.append({
                            'Patient_ID': patient_id,
                            'Medical_Condition': medical_condition,
                            'Cholesterol': cholesterol,
                            'Glucose': glucose,
                            'Blood_Pressure': blood_pressure
                            })
    return test_results

def generate_prescriptions(num_patients):
    prescriptions = []
    for i in range(num_patients):
        patient_id = fake.uuid4()
        medical_condition = random.choice(list(medical_condition_data.keys()))
        medication = random.choice(medical_condition_data[medical_condition]['medications'])
        dosage = f"{random.randint(1, 3)} pills"
        duration = f"{random.randint(1, 30)} days"
        prescriptions.append({
                            'Patient_ID': patient_id,
                            'Medical_Condition': medical_condition,
                            'Medication': medication,
                            'Dosage': dosage,
                            'Duration': duration
                            })
    return prescriptions

def generate_medical_history_dataset(num_patients):
    patient_records = generate_patient_records(num_patients)
    test_results = generate_test_results(num_patients)
    prescriptions = generate_prescriptions(num_patients)
    
    medical_history = []
    for i in range(num_patients):
        patient_id = patient_records[i]['Patient_ID']
        record = {**patient_records[i], **test_results[i], **prescriptions[i]}
        medical_history.append(record)
    return pd.DataFrame(medical_history)

In [4]:
medical_history_dataset = generate_medical_history_dataset(100)
medical_history_dataset.to_csv('data/01/medical_history_dataset.csv', index=False)
print("Synthetic medical history dataset created and saved to 'medical_history_dataset.csv'")

Synthetic medical history dataset created and saved to 'medical_history_dataset.csv'


## Process Medical Data

In [5]:
df_med = pd.read_csv('data/01/medical_history_dataset.csv')
df_med.head()

Unnamed: 0,Patient_ID,Name,Age,Gender,Blood_Type,Medical_Condition,Cholesterol,Glucose,Blood_Pressure,Medication,Dosage,Duration
0,d6607b34-7879-4e5c-812f-324399e00453,Kristen Strickland,22,Male,A-,Diabetes,190.314847,85.220164,120/68,Metformin,1 pills,6 days
1,b32e2b69-c30c-47d0-87f9-4efe02e5e2bf,Alison Owens,72,Female,O-,Diabetes,137.123513,193.91229,109/83,Metformin,3 pills,23 days
2,eb4eab08-9e90-4138-98f6-0fc3e6678c76,Jacqueline Ferrell,69,Female,A-,Diabetes,155.265142,163.708596,82/72,Sitagliptin,2 pills,7 days
3,f9a465c8-20a4-4a21-925a-fe2637c2d10c,Craig Smith,67,Male,A+,Hypertension,173.171781,167.583069,108/96,Hydrochlorothiazide,3 pills,4 days
4,f254c8fc-32f0-44fb-8946-1b0ea0e801b0,James Welch,60,Male,B+,Diabetes,124.331272,133.844722,102/65,Sitagliptin,2 pills,18 days


#### Convert CSV to PDF

In [6]:
def create_pdf_from_dataframe(
                            dataframe, 
                            output_file
                            ):
    doc = SimpleDocTemplate(output_file, pagesize=letter)
    styles = ParagraphStyle(name='Normal', fontSize=12)
    
    content = []
    for index, row in dataframe.iterrows():
        row_content = []
        for column_name, value in row.items():
            row_content.append(f"{column_name}: {value}")
    
        content.append(Paragraph(", ".join(row_content), styles))
        content.append(Paragraph("<br/><br/>", styles)) 

    doc.build(content)

create_pdf_from_dataframe(df_med, "data/01/output.pdf")

# Load data to LlamaIndex

In [7]:
documents = SimpleDirectoryReader(input_files=['data/01/output.pdf']).load_data()
text_parser = SentenceSplitter(
                            chunk_size=1024,
                            )
text_chunks = []
doc_idxs = []
nodes = []

for doc_idx, doc in enumerate(documents):
    cur_text_chunks = text_parser.split_text(doc.text)
    text_chunks.extend(cur_text_chunks)
    doc_idxs.extend([doc_idx] * len(cur_text_chunks))

for idx, text_chunk in enumerate(text_chunks):
    node = TextNode(
                    text=text_chunk,
                    )
    src_doc = documents[doc_idxs[idx]]
    node.metadata = src_doc.metadata
    nodes.append(node)

## Build Embeddings and Vector Store

In [8]:
embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en")
for node in nodes:
    node_embedding = embed_model.get_text_embedding(
    node.get_content(metadata_mode="all")
    )
    node.embedding = node_embedding

In [12]:
model_path = "./llms/local/mistral-7b-instruct-v0.1.Q4_0.gguf"
model_url = "https://huggingface.co/TheBloke/Mistral-7B-Instruct-v0.1-GGUF/resolve/main/mistral-7b-instruct-v0.1.Q4_0.gguf"

if not os.path.exists(model_path):
    llm = LlamaCPP(
                model_url=model_url,
                model_path=None,
                temperature=0.1,
                max_new_tokens=256,
                context_window=3900,
                generate_kwargs={},
                model_kwargs={"n_gpu_layers": 1},
                verbose=True,
                )
else:
    llm = LlamaCPP(
                model_url=model_url,
                model_path=model_path,
                temperature=0.1,
                max_new_tokens=256,
                context_window=3900,
                generate_kwargs={},
                model_kwargs={"n_gpu_layers": 1},
                verbose=True,
                )

In [None]:
service_context = ServiceContext.from_defaults(
                                                llm=llm, 
                                                embed_model=embed_model
                                                )


vector_store = QdrantVectorStore(
                                client=qdrant_client, 
                                collection_name="patient_collection"
                                )
storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents(
                                        documents, 
                                        storage_context=storage_context, 
                                        service_context=service_context
                                        )

vector_store.add(nodes)

In [None]:
query_str = "Can you tell me about the key concepts for safety finetuning"
query_embedding = embed_model.get_query_embedding(query_str)
query_embedding