In [1]:
import pandas as pd
import numpy as np
from langchain.vectorstores import FAISS
from langchain.text_splitter import CharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
import ollama
import kagglehub
import os

In [2]:
# Fetch the latest version of the "Disease Symptoms and Patient Profile Dataset"
# from Kaggle. The dataset contents are in English, so the code is written in
# English as well.
path = kagglehub.dataset_download(
    "uom190346a/disease-symptoms-and-patient-profile-dataset"
)
print("Path to dataset files:", path)

# Load the downloaded CSV file into a DataFrame.
csv_file = os.path.join(path, "Disease_symptom_and_patient_profile_dataset.csv")
df_medical = pd.read_csv(csv_file)

Path to dataset files: C:\Users\manue\.cache\kagglehub\datasets\uom190346a\disease-symptoms-and-patient-profile-dataset\versions\2


In [3]:
# Convert the DataFrame to text so it can be supplied as context later on:
# every row becomes a one-line clinical description, and every description is
# its own list entry so that each chunk corresponds to exactly one row.
def _describe_case(row):
    """Render one dataset row as a single-line clinical description.

    ``row`` is anything indexable by the dataset's column names (here, the
    Series yielded by ``DataFrame.iterrows``). Columns are read in the order
    they appear in the sentence, so a missing column raises ``KeyError`` for
    the first absent name.
    """
    return (
        f"Disease: {row['Disease']}. "
        f"Symptoms - Fever: {row['Fever']}, Cough: {row['Cough']}, Fatigue: {row['Fatigue']}, Difficulty Breathing: {row['Difficulty Breathing']}. "
        f"Patient Profile - Age: {row['Age']}, Gender: {row['Gender']}, Blood Pressure: {row['Blood Pressure']}, Cholesterol Level: {row['Cholesterol Level']}. "
        f"Diagnosis Outcome: {row['Outcome Variable']}."
    )


# One description per disease/patient row. The row index is not needed, and a
# comprehension keeps the per-row temporaries out of the notebook namespace.
texts = [_describe_case(row) for _, row in df_medical.iterrows()]

In [4]:
# Create the embeddings client and a FAISS store.
# A lightweight embedding model is used because the author's machine has
# limited resources.
EMBEDDING_MODEL = "all-minilm:33m"
embeddings = OllamaEmbeddings(model=EMBEDDING_MODEL)

# Every string in the list is one chunk with one embedding, so descriptions of
# different cases are never mixed inside a chunk (one dataset row = one chunk).
vectorstore = FAISS.from_texts(texts, embeddings)

In [5]:
def retrieve_medical_info(question, vectorstore, k=5, max_distance=None):
    """Return the stored documents most similar to ``question``.

    Args:
        question: Free-text query to embed and search for.
        vectorstore: Object exposing ``similarity_search_with_score(query, k=...)``
            that returns ``(document, score)`` pairs.
        k: Maximum number of documents to request from the store.
        max_distance: Optional cut-off. When given, only documents whose score
            is ``<= max_distance`` are kept. This assumes the score is a
            distance (lower means closer), which is what LangChain's FAISS
            store returns with its default L2 strategy -- it is not a cosine
            similarity. With the default ``None`` nothing is filtered out.

    Returns:
        A list of documents in the order returned by the store. Without a
        ``max_distance`` the list is empty only when the store itself returns
        no hits, so callers that want an "insufficient context" path should
        pass a threshold.
    """
    scored_hits = vectorstore.similarity_search_with_score(question, k=k)

    if max_distance is None:
        return [doc for doc, _ in scored_hits]

    return [doc for doc, score in scored_hits if score <= max_distance]

In [6]:
def medical_diagnosis_assistant(question, vectorstore, model="llama3.2:1b"):
    """Answer a symptom question using retrieved case descriptions as context.

    Args:
        question: Free-text question to answer.
        vectorstore: Vector store passed on to ``retrieve_medical_info``.
        model: Name of the Ollama chat model to query. Defaults to the small
            model this notebook was written with; pass a larger one for
            better answers.

    Returns:
        The text content of the model's reply, or a fixed "insufficient
        information" sentence when retrieval returns no context.
    """
    # Retrieve the medical knowledge relevant to the question.
    medical_contexts = retrieve_medical_info(question, vectorstore)

    # Without any context there is nothing to ground an answer on, so stop
    # here instead of letting the model guess.
    if not medical_contexts:
        return "I do not have sufficient information to provide a reliable diagnosis."

    # Merge all retrieved descriptions into a single context string.
    context = "\n".join(doc.page_content for doc in medical_contexts)

    # Build the prompt (instructions + context + question).
    prompt = f"""
You are a medical assistant trained to interpret structured case descriptions and symptom profiles.

Using only the information from the context below, identify what condition or disease could explain the presented symptoms. 

If the answer is unclear or incomplete, try to provide the most likely explanation based on the available context.
If the answer is not found in the context, say:
"I do not have sufficient information to provide a reliable diagnosis."

Context:
{context}

Question:
{question}

Answer:
"""
    # Send the complete prompt to the chat model as a single user message.
    response = ollama.chat(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )

    return response['message']['content']

In [9]:
# Example query for the assistant.
# NOTE(review): the case descriptions built in this notebook list only fever,
# cough, fatigue and difficulty breathing as symptoms, so "chest pain" is not a
# field in the retrieved context -- treat answers that cite it with caution.
question = "What condition is associated with chest pain and fatigue?"

# Run retrieval + generation against the FAISS store built earlier.
answer = medical_diagnosis_assistant(question, vectorstore)

# Show the question and the model's answer together.
print("\nQuestion:", question)
print("Answer:", answer)


Question: What condition is associated with chest pain and fatigue?
Answer: Based on the context provided, the condition that could explain the symptoms of chest pain and fatigue is Coronary Artery Disease (CAD). This is because CAD typically presents with:

- Chest pain or discomfort
- Fatigue
- Difficulty breathing

These symptoms are consistent with the described presentation in multiple case descriptions, including Myocardial Infarction, Hypertensive Heart Disease, and Coronary Artery Disease. The fact that a patient can be diagnosed as having CAD even if they have hypertension (high blood pressure) or high cholesterol levels indicates that chest pain and fatigue can be associated with CAD.

Additionally, the diagnosis outcome is positive for patients with coronary artery disease in multiple case descriptions, further supporting this conclusion.
