In [23]:
import pandas as pd
import minsearch
import requests

## Ingestion

In [1]:
# Fetch the single-file minsearch module into the working directory.
# NOTE(review): the imports cell at the top already runs `import minsearch`, so on a
# fresh kernel minsearch.py must exist (or minsearch be installed some other way)
# before that import can succeed.
url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py"
response = requests.get(url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of saving an error page as minsearch.py.
response.raise_for_status()

with open("minsearch.py", "wb") as file:
    file.write(response.content)

In [3]:
# Read only the first 500 rows of the CSV (nrows=500).
health_df = pd.read_csv("healthcare_dataset.csv",nrows=500)

In [4]:
# Display the loaded frame.
health_df

Unnamed: 0,Name,Age,Gender,Blood Type,Medical Condition,Date of Admission,Doctor,Hospital,Insurance Provider,Billing Amount,Room Number,Admission Type,Discharge Date,Medication,Test Results
0,Bobby JacksOn,30,Male,B-,Cancer,2024-01-31,Matthew Smith,Sons and Miller,Blue Cross,18856.281306,328,Urgent,2024-02-02,Paracetamol,Normal
1,LesLie TErRy,62,Male,A+,Obesity,2019-08-20,Samantha Davies,Kim Inc,Medicare,33643.327287,265,Emergency,2019-08-26,Ibuprofen,Inconclusive
2,DaNnY sMitH,76,Female,A-,Obesity,2022-09-22,Tiffany Mitchell,Cook PLC,Aetna,27955.096079,205,Emergency,2022-10-07,Aspirin,Normal
3,andrEw waTtS,28,Female,O+,Diabetes,2020-11-18,Kevin Wells,"Hernandez Rogers and Vang,",Medicare,37909.782410,450,Elective,2020-12-18,Ibuprofen,Abnormal
4,adrIENNE bEll,43,Female,AB+,Cancer,2022-09-19,Kathleen Hanna,White-White,Aetna,14238.317814,458,Urgent,2022-10-09,Penicillin,Abnormal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,denNIs cLarkE,22,Female,AB+,Obesity,2019-09-17,Mark Baker,Warren-Brown,Cigna,8033.611655,293,Elective,2019-10-01,Lipitor,Inconclusive
496,jeNNiFER spEnCe,65,Female,A+,Obesity,2019-05-14,Austin Perry,Ruiz Group,Medicare,7391.534745,270,Elective,2019-05-20,Aspirin,Inconclusive
497,biLlY fITZGERALd,73,Female,B+,Asthma,2020-05-31,Dr. Mackenzie Bell,Anderson-Brown,Aetna,35945.884474,146,Emergency,2020-06-28,Ibuprofen,Inconclusive
498,krisTEN JamES,82,Female,AB+,Hypertension,2020-03-27,Kimberly Calhoun,Hull Sons and,Medicare,10780.742008,472,Emergency,2020-04-04,Ibuprofen,Inconclusive


In [5]:
# Normalise the headers to snake_case: spaces become underscores, letters lower-case.
health_df.columns = health_df.columns.str.replace(' ', '_').str.lower()
# Cast the numeric columns to strings, one column at a time.
for column in ('age', 'billing_amount', 'room_number'):
    health_df[column] = health_df[column].astype(str)

In [6]:
# One dict per row, keyed by column name.
documents = health_df.to_dict(orient='records')

In [24]:
# Inspect the first record.
documents[0]

{'name': 'Bobby JacksOn',
 'age': '30',
 'gender': 'Male',
 'blood_type': 'B-',
 'medical_condition': 'Cancer',
 'date_of_admission': '2024-01-31',
 'doctor': 'Matthew Smith',
 'hospital': 'Sons and Miller',
 'insurance_provider': 'Blue Cross',
 'billing_amount': '18856.281305978155',
 'room_number': '328',
 'admission_type': 'Urgent',
 'discharge_date': '2024-02-02',
 'medication': 'Paracetamol',
 'test_results': 'Normal'}

In [8]:
# List the normalised column names.
health_df.columns

Index(['name', 'age', 'gender', 'blood_type', 'medical_condition',
       'date_of_admission', 'doctor', 'hospital', 'insurance_provider',
       'billing_amount', 'room_number', 'admission_type', 'discharge_date',
       'medication', 'test_results'],
      dtype='object')

In [9]:
# Index every column except `name` as a text field; no keyword fields are declared.
index = minsearch.Index(
    text_fields=[
        'age',
        'gender',
        'blood_type',
        'medical_condition',
        'date_of_admission',
        'doctor',
        'hospital',
        'insurance_provider',
        'billing_amount',
        'room_number',
        'admission_type',
        'discharge_date',
        'medication',
        'test_results',
    ],
    keyword_fields=[],
)

In [11]:
# Fit the search index on the records.
index.fit(documents)

<minsearch.Index at 0x77cb14f61850>

## RAG Flow

In [12]:
def search(query):
    """Return up to 5 hits from the module-level ``index`` for ``query``.

    The search runs with an empty filter dict and an empty boost dict.
    """
    return index.search(
        query=query,
        filter_dict={},
        boost_dict={},
        num_results=5,
    )

In [13]:
# Prompt skeleton: instructions, then the user's question, then the retrieved records.
prompt_template = """
You're a healthcare data analyzer. Answer the QUESTION based on the CONTEXT from our healthcare records database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# How one retrieved record is rendered inside the CONTEXT section.
# Only these seven fields are shown to the model; any other keys are ignored.
entry_template = """
age: {age}
gender: {gender}
blood_type: {blood_type}
medical_condition: {medical_condition}
insurance_provider: {insurance_provider}
billing_amount: {billing_amount}
test_results: {test_results}
""".strip()

def build_prompt(query, search):
    """Render the LLM prompt for ``query`` from a sequence of retrieved records.

    Args:
        query: The user's question, substituted for ``{question}``.
        search: Iterable of record dicts (the search results). Each dict must
            contain every key used by ``entry_template``; extra keys are ignored.
            Note that inside this function the name shadows the ``search``
            function.

    Returns:
        The filled-in prompt with surrounding whitespace stripped. With no
        records the prompt ends right after the ``CONTEXT:`` header.

    Raises:
        KeyError: If a record is missing a field that ``entry_template`` uses.
    """
    # Entries are separated by a blank line; the final strip() removes any
    # trailing whitespace, so a join yields the same prompt as appending
    # "\n\n" after every entry.
    context = "\n\n".join(entry_template.format(**doc) for doc in search)
    return prompt_template.format(question=query, context=context).strip()

In [14]:
def llm(prompt, model='phi3'):
    """Send ``prompt`` as a single user message and return the reply text.

    Relies on the module-level ``client``, which is created in a later cell;
    that cell must have run before this function is called.

    Args:
        prompt: Text sent as the content of the only (user) message.
        model: Name of the model to query. Defaults to ``'phi3'``.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return response.choices[0].message.content

In [15]:
def rag(query):
    """Answer ``query``: retrieve records, build the prompt, ask the LLM.

    Chains ``search`` -> ``build_prompt`` -> ``llm`` and returns the LLM's reply.
    """
    return llm(build_prompt(query, search(query)))

In [16]:
# NOTE(review): this import sits mid-notebook; consider moving it to the imports cell.
from openai import OpenAI

# OpenAI client pointed at a server on localhost:11434 rather than the default endpoint.
# NOTE(review): the api_key value looks like a placeholder for the local server,
# not a real secret — confirm.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [20]:
# NOTE(review): this is a counting question, but search() retrieves at most 5 records,
# so the model only ever sees a handful of rows rather than the whole dataset.
query = 'Number of medical_condition having Normal test_results'

In [21]:
# Run the full search -> prompt -> LLM pipeline for the query.
rag(query)

'Three patients with the medical condition "Normal test results" in our database are all females, do not have cancer as their primary medical condition and include a patient having blood type A+ from UnitedHealthcare. The specific ages of these individuals range between 18 to 76 years old.'

In [22]:
# `_` is IPython's "last output" variable, so this prints the answer only while the
# most recent cell output is still the rag(query) result.
# NOTE(review): fragile on re-runs; binding the answer to a name would be more robust.
print(_)

Three patients with the medical condition "Normal test results" in our database are all females, do not have cancer as their primary medical condition and include a patient having blood type A+ from UnitedHealthcare. The specific ages of these individuals range between 18 to 76 years old.


## Retrieval evaluation