In [47]:
import pandas as pd

health_df = pd.read_csv("healthcare_dataset.csv",nrows=150)

In [48]:
health_df.columns = health_df.columns.str.lower().str.replace(' ','_')
# Convert specific columns to strings if needed
health_df['age'] = health_df['age'].astype(str)
health_df['billing_amount'] = health_df['billing_amount'].astype(str)
health_df['room_number'] = health_df['room_number'].astype(str)

In [49]:
health_df.insert(0, 'id', health_df.index)

In [50]:
health_df.columns

Index(['id', 'name', 'age', 'gender', 'blood_type', 'medical_condition',
       'date_of_admission', 'doctor', 'hospital', 'insurance_provider',
       'billing_amount', 'room_number', 'admission_type', 'discharge_date',
       'medication', 'test_results'],
      dtype='object')

In [51]:
health_df = health_df[['id','age', 'gender', 'blood_type', 'medical_condition','billing_amount',
       'insurance_provider','medication', 'test_results']].drop_duplicates()

In [52]:
health_df.shape

(150, 9)

In [53]:
documents = health_df.to_dict(orient='records')

In [54]:
documents[0]

{'id': 0,
 'age': '30',
 'gender': 'Male',
 'blood_type': 'B-',
 'medical_condition': 'Cancer',
 'billing_amount': '18856.281305978155',
 'insurance_provider': 'Blue Cross',
 'medication': 'Paracetamol',
 'test_results': 'Normal'}

In [55]:
prompt_template = """
You emulate a user of our healthcare database.
Formulate 5 questions this user might ask based on a provided database.
Make the questions specific to this database.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

age: {age}
gender: {gender}
blood_type: {blood_type}
medical_condition: {medical_condition}
insurance_provider: {insurance_provider}
medication: {medication}
billing_amount: {billing_amount}
test_results: {test_results}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2", "question3", "question4", "question5"]}}
""".strip()

In [56]:
prompt = prompt_template.format(**documents[0])

In [57]:
print(prompt)

You emulate a user of our healthcare database.
Formulate 5 questions this user might ask based on a provided database.
Make the questions specific to this database.
The record should contain the answer to the questions, and the questions should
be complete and not too short. Use as fewer words as possible from the record. 

The record:

age: 30
gender: Male
blood_type: B-
medical_condition: Cancer
insurance_provider: Blue Cross
medication: Paracetamol
billing_amount: 18856.281305978155
test_results: Normal

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2", "question3", "question4", "question5"]}


In [58]:
def llm(prompt):
    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [59]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [None]:
questions = llm(prompt)

In [None]:
import json

In [None]:
json.loads(questions)

In [22]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [23]:
from tqdm.auto import tqdm


In [24]:
results = {}


In [26]:
for doc in tqdm(documents): 
    doc_id = doc['name']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    questions = json.loads(questions_raw)
    results[doc_id] = questions['questions']

  0%|          | 0/500 [00:00<?, ?it/s]

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [27]:
for doc in tqdm(documents): 
    doc_id = doc['name']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    print(f"Generated questions for {doc_id}: {questions_raw}")  # Debugging line

    if not questions_raw:
        print(f"No questions generated for {doc_id}. Skipping...")
        continue

    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except json.JSONDecodeError as e:
        print(f"JSON decoding error for {doc_id}: {e}")
        continue


  0%|          | 0/500 [00:00<?, ?it/s]

Generated questions for LesLie TErRy: {
  "questions": [
    "What additional diagnostic tests might be necessary to ascertain my medical condition?",
    "Are there lifestyle changes I should adopt considering that the test results are inconclusive and I have obesity?",
    "As an individual over the age of fifty-five, am I eligible for any special healthcare programs beyond Medicare due to my specific medical condition?",
    "How does being a blood type A+ potentially impact treatment options available or restrictions that might apply given your current medical information and obesity status?",
    "Could you provide detailed explanation on the billing amount in relation to standard costs for individuals with similar age, gender, blood type and condition such as inconclusive results?"
  ]
}
Generated questions for DaNnY sMitH: {
  "questions": [
    "As a female with blood type A-, what dietary or exercise recommendations would be suitable for managing my obesity?",
    "What are th

InternalServerError: Error code: 500 - {'error': {'message': 'an unknown error was encountered while running the model ', 'type': 'api_error', 'param': None, 'code': None}}

In [None]:
if not questions_raw.strip():  # Checks if the output is empty or just whitespace
    print(f"Empty or invalid output for {doc_id}. Skipping...")
    continue

In [None]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [None]:
final_results[0]

In [None]:
df_results = pd.DataFrame(final_results, columns=['name', 'question'])

In [None]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)