In [23]:
import pandas as pd

health_df = pd.read_csv("healthcare_dataset.csv",nrows=100)

In [24]:
health_df.columns = health_df.columns.str.lower().str.replace(' ','_')
# Convert specific columns to strings if needed
health_df['age'] = health_df['age'].astype(str)
health_df['billing_amount'] = health_df['billing_amount'].astype(str)
health_df['room_number'] = health_df['room_number'].astype(str)

In [25]:
health_df.insert(0, 'id', health_df.index)

In [26]:
health_df.columns

Index(['id', 'name', 'age', 'gender', 'blood_type', 'medical_condition',
       'date_of_admission', 'doctor', 'hospital', 'insurance_provider',
       'billing_amount', 'room_number', 'admission_type', 'discharge_date',
       'medication', 'test_results'],
      dtype='object')

In [27]:
health_df = health_df[['id','age', 'gender', 'medical_condition','billing_amount',
       'insurance_provider', 'test_results']].drop_duplicates()

In [28]:
health_df.shape

(100, 7)

In [29]:
documents = health_df.to_dict(orient='records')

In [30]:
documents[0]

{'id': 0,
 'age': '30',
 'gender': 'Male',
 'medical_condition': 'Cancer',
 'billing_amount': '18856.281305978155',
 'insurance_provider': 'Blue Cross',
 'test_results': 'Normal'}

In [55]:
prompt_template = """
You emulate a user of our healthcare database.
Formulate 2 questions this user might ask based on a provided database.
Make the questions specific to this database. Question length should not be more than 12 words.

The record should contain the answer to the questions, and the questions should
be complete and not too short. Use any two words from the record to generate question. 


The record:

medical_condition: {medical_condition}
insurance_provider: {insurance_provider}
billing_amount: {billing_amount}
test_results: {test_results}

Provide the output in parsable JSON without using code blocks:

{{"questions": ["question1", "question2"]}}
""".strip()

In [56]:
prompt = prompt_template.format(**documents[0])

In [57]:
print(prompt)

You emulate a user of our healthcare database.
Formulate 2 questions this user might ask based on a provided database.
Make the questions specific to this database. Question length should not be more than 12 words.

The record should contain the answer to the questions, and the questions should
be complete and not too short. Use any two words from the record to generate question. 


The record:

medical_condition: Cancer
insurance_provider: Blue Cross
billing_amount: 18856.281305978155
test_results: Normal

Provide the output in parsable JSON without using code blocks:

{"questions": ["question1", "question2"]}


In [58]:
from openai import OpenAI

client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',
)

In [61]:
import json
import re

def llm(prompt):
    response = client.chat.completions.create(
        model='phi',
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

# Fetch questions using llm
questions = llm(prompt)

# Use regular expression to extract the JSON part of the response
json_match = re.search(r'{.*}', questions, re.DOTALL)

if json_match:
    json_str = json_match.group(0)
    try:
        parsed_questions = json.loads(json_str)
        print(parsed_questions)
    except json.JSONDecodeError as e:
        print("Failed to parse JSON response:", e)
        print("Extracted JSON part:", json_str)
else:
    print("No valid JSON found in the response.")
    print("Raw response:", questions)

{'questions': ['What is my test results for Cancer diagnosis?', 'My insurance provider details, please.']}


In [70]:
import json
import re
from tqdm.auto import tqdm

def generate_questions(doc):
    # Prepare the prompt
    prompt = prompt_template.format(**doc)

    # Get response from the model
    response = client.chat.completions.create(
        model='phi',
        messages=[{"role": "user", "content": prompt}]
    )

    # Get the content from the response
    json_response = response.choices[0].message.content

    # Clean up the response to extract valid JSON
    json_match = re.search(r'{.*}', json_response, re.DOTALL)
    if json_match:
        return json_match.group(0)  # Return the cleaned JSON string
    else:
        return None  # Return None if no valid JSON is found

results = {}
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    # Generate questions and handle errors
    questions_raw = generate_questions(doc)
    if questions_raw:  # Only parse if valid JSON was extracted
        try:
            questions = json.loads(questions_raw)
            results[doc_id] = questions['questions']
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON for doc_id {doc_id}: {e}")
            print(f"Raw response: {questions_raw}")
    else:
        print(f"No valid JSON found for doc_id {doc_id}.")

  0%|          | 0/100 [00:00<?, ?it/s]

No valid JSON found for doc_id 1.
Failed to parse JSON for doc_id 3: Extra data: line 6 column 2 (char 145)
Raw response: {
  "insurance_provider": "Medicare", 
  "medical_condition": "Diabetes", 
  "billing_amount": 37909.78240987528, 
  "test_results": "Abnormal"
},
{"questions": [
  "What is the insurance provider for this patient?" , # Check with the user
  "Which medical condition is recorded in this patient data by healthcare database?"  # Double-check with user to make sure it's correct.
]}
Failed to parse JSON for doc_id 5: Expecting property name enclosed in double quotes: line 1 column 2 (char 1)
Raw response: {'questions': ['What is UnitedHealthcare?', 'Is it the medical condition you have?']}
Failed to parse JSON for doc_id 6: Expecting ',' delimiter: line 1 column 49 (char 48)
Raw response: {"questions": ["What is the medical condition?" "What insurance provider does it apply with? - Medicare - billing amount for the treatment - inconclusive test results?", "Medical condit

KeyError: 'questions'

In [68]:
import json
import re
from tqdm.auto import tqdm

def generate_questions(doc):
    # Prepare the prompt
    prompt = prompt_template.format(**doc)

    # Get response from the model
    response = client.chat.completions.create(
        model='phi3',
        messages=[{"role": "user", "content": prompt}]
    )

    # Get the content from the response
    json_response = response.choices[0].message.content

    # Clean up the response to extract valid JSON
    json_match = re.search(r'{.*}', json_response, re.DOTALL)
    if json_match:
        return json_match.group(0).replace('\n', '').replace('\r', '').strip()  # Clean and return the JSON string
    else:
        return None  # Return None if no valid JSON is found

results = {}

# Attempt to load results from a previous run if available
try:
    with open('results.json', 'r') as f:
        results = json.load(f)
except (FileNotFoundError, json.JSONDecodeError):
    # Start with an empty dictionary if the file doesn't exist or is corrupted
    results = {}

# Process documents and resume from the last point
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue  # Skip already processed documents

    # Generate questions and handle errors
    questions_raw = generate_questions(doc)
    if questions_raw:  # Only parse if valid JSON was extracted
        try:
            questions = json.loads(questions_raw)
            results[doc_id] = questions.get('questions', [])
        except json.JSONDecodeError as e:
            print(f"Failed to parse JSON for doc_id {doc_id}: {e}")
            print(f"Raw response: {questions_raw}")
    else:
        print(f"No valid JSON found for doc_id {doc_id}.")

    # Save progress after each document
    with open('results.json', 'w') as f:
        json.dump(results, f)

print("Processing complete!")

  0%|          | 0/100 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [69]:
results

{'0': ['What is my billing amount for Cancer treatment with Blue Cross?',
  'Can I have normal test results while diagnosed with cancer?'],
 '1': ['How does Medicare handle billing for obesity-related treatments?',
  'What additional tests are suggested after inconclusive test results?'],
 '2': ['Within Aetna, what coverage is there for obesity treatment costs amounting to $27955.096078842456?',
  "What are the recommended lifestyle changes or treatments for a patient with normal test results but diagnosed as morbidly obese by Aetna's standards?"],
 '3': ['Can Medicare cover my diabetes treatment costs?',
  'What led to abnormal test results for Diabetes?'],
 '4': ['What is my billing amount for cancer treatment?',
  'Which insurance provider covers abnormal test results?'],
 '5': ['what is my total billing amount for Asthma services?',
  'who am I billed by for the management of my Asthma?'],
 '6': ["What is this patient's medical condition?",
  'Which insurance provider covers this t

In [27]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions_raw = generate_questions(doc)
    print(f"Generated questions for {doc_id}: {questions_raw}")  # Debugging line

    if not questions_raw:
        print(f"No questions generated for {doc_id}. Skipping...")
        continue

    try:
        questions = json.loads(questions_raw)
        results[doc_id] = questions['questions']
    except json.JSONDecodeError as e:
        print(f"JSON decoding error for {doc_id}: {e}")
        continue


  0%|          | 0/500 [00:00<?, ?it/s]

Generated questions for LesLie TErRy: {
  "questions": [
    "What additional diagnostic tests might be necessary to ascertain my medical condition?",
    "Are there lifestyle changes I should adopt considering that the test results are inconclusive and I have obesity?",
    "As an individual over the age of fifty-five, am I eligible for any special healthcare programs beyond Medicare due to my specific medical condition?",
    "How does being a blood type A+ potentially impact treatment options available or restrictions that might apply given your current medical information and obesity status?",
    "Could you provide detailed explanation on the billing amount in relation to standard costs for individuals with similar age, gender, blood type and condition such as inconclusive results?"
  ]
}
Generated questions for DaNnY sMitH: {
  "questions": [
    "As a female with blood type A-, what dietary or exercise recommendations would be suitable for managing my obesity?",
    "What are th

InternalServerError: Error code: 500 - {'error': {'message': 'an unknown error was encountered while running the model ', 'type': 'api_error', 'param': None, 'code': None}}

In [None]:
final_results = []

for doc_id, questions in results.items():
    for q in questions:
        final_results.append((doc_id, q))

In [None]:
final_results[0]

In [None]:
df_results = pd.DataFrame(final_results, columns=['name', 'question'])

In [None]:
df_results.to_csv('../data/ground-truth-retrieval.csv', index=False)