In [208]:
import numpy as np

## Document Loading

In [209]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [210]:
loader = CSVLoader(
    file_path="/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["id", "symptom"],
    },
)
data = loader.load()

In [211]:
data[:5]

[Document(page_content='id: id\nsymptom: symptom', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 0}),
 Document(page_content='id: 1\nsymptom: Family history of ankylosing spondylitis', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 1}),
 Document(page_content='id: 2\nsymptom: tummy ache', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 2}),
 Document(page_content='id: 2\nsymptom: hurt in belly', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 3}),
 Document(page_content='id: 2\nsymptom: pain in belly', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 4})]

## Document Splitting

Probably deal with this later


## Vector Stores and Embedding

In [212]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [213]:
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

In [214]:
from langchain_community.vectorstores import Chroma

In [8]:
persist_directory = '/Users/mansipandya/Desktop/KnidianMD/docs/chroma/'

In [9]:
# Create the vector store
vectordb = Chroma.from_documents(
    documents=data,
    embedding=embedding,
    persist_directory=persist_directory
)

In [215]:
print(vectordb._collection.count())

12066


In [216]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

## Retrieval

In [219]:
question = "A 24-year-old woman presents with acute swelling in her right knee, pain that hinders her ability to bear weight on her right leg, and a one-week history of intermittent low-grade fevers and malaise. What symptoms in the database present in this sentence?"
docs = vectordb.max_marginal_relevance_search(question,k=5, fetch_k=5)

In [221]:
docs

[Document(page_content='id: 4658\nsymptom: knee tenderness with swelling suprapatellar bilateral', metadata={'row': 11969, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 4658\nsymptom: Tenderness on palpation of suprapatellar region of both knees with swelling', metadata={'row': 11968, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 1109\nsymptom: Pain of knee region', metadata={'row': 3234, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 4646\nsymptom: Posterior knee pain', metadata={'row': 11943, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 4657\nsymptom: Superior unilateral or bilateral knee joint pain', metadata={'row': 11967, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'})]

In [220]:
for i in range(len(docs)):
    print(docs[i].page_content)

id: 4658
symptom: knee tenderness with swelling suprapatellar bilateral
id: 4658
symptom: Tenderness on palpation of suprapatellar region of both knees with swelling
id: 1109
symptom: Pain of knee region
id: 4646
symptom: Posterior knee pain
id: 4657
symptom: Superior unilateral or bilateral knee joint pain


## Chaining Prompts

In [294]:
from openai import OpenAI

client = OpenAI()

def get_completion_from_messages(messages, model="gpt-3.5-turbo", temperature=0, max_tokens=500):
    response = client.chat.completions.create(
        model=model,
        messages=messages,
        temperature=temperature, 
        max_tokens=max_tokens, 
    )
    return response.choices[0].message.content

In [312]:
delimiter = "####"

medical_history = "A 24-year-old woman presents with acute swelling in her right knee, pain that hinders her ability to bear weight on her right leg, and a one-week history of intermittent low-grade fevers and malaise"

system_message = f"""
You will be provided with a question. \
The question will be delimited with {delimiter} characters. \
Output a python list, where each entry in the list is a symptom present in the medical history text below \

{medical_history}

If a symptoms is mentioned, it must be associated with it must be present in the medical history above. \
If no symptoms are found, output an empty list. \

Only output the list of objects, with nothing else.
"""
user_message_1 = f"""What symptoms are present in the medical history?"""

messages =  [  
{'role':'system', 
 'content': system_message},    
 
{'role':'user', 
 'content': f"{delimiter}{user_message_1}{delimiter}"},  
] 

response_1 = get_completion_from_messages(messages)

print(response_1)

['acute swelling', 'pain', 'fevers', 'malaise']


In [313]:
type(response_1)

str

In [314]:
response_1 = response_1.strip('[]')
response_1 = response_1.split(', ')
response_1 = [symptom.strip("'") for symptom in response_1]

In [315]:
response_1

['acute swelling', 'pain', 'fevers', 'malaise']

In [241]:
type(response_1)

list

In [250]:
for i in range (len(response_1)):
    question = f"{response_1[i]}. What symptoms in the database present is this?"
    docs = vectordb.max_marginal_relevance_search(question,k=5, fetch_k=10)
    for each_doc in docs:
        print(each_doc.page_content)

id: 4658
symptom: Tenderness on palpation of suprapatellar region of both knees with swelling
id: 4431
symptom: Knee joint - tender (finding)
id: 4432
symptom: stiffness of knee joint (diagnosis)
id: 1276
symptom: Finding of joint swelling
id: 1109
symptom: Pain of knee region
id: 7
symptom: Hindlimb pain
id: 1039
symptom: Unable to move leg
id: 3949
symptom: Pain in lower limb worsen upon elevation of the extremity
id: 3780
symptom: Exertional leg pain
id: 3472
symptom: Lower leg pain worsened by lying down
id: 2438
symptom: Fevers (fever that comes and goes)
id: 18
symptom: Febrile
id: 4389
symptom: No fever
id: 514
symptom: FeversScarlet
id: 2822
symptom: body temperature low
id: 527
symptom: Malar rash
id: 4261
symptom: Malarial coma
id: 373
symptom: Meningitis-like
id: 2437
symptom: Malaria is endemic in recent area visited
id: 696
symptom: Infection


## QA Chain

Tried using the QA Chain but it gives really bad answers

#### Retrieval QA Chain with Prompt

In [189]:
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo-0125")

In [156]:
from langchain_core.prompts import PromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter
from langchain_core.runnables import RunnablePassthrough


template = """ Given the question and context:
{context}
Question: {question}

Use the question along with the retriever to get all the 
"""

custom_rag_prompt = PromptTemplate.from_template(template)

retriever = vectordb.as_retriever(search_type="mmr", search_kwargs={"k": 20})

def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)


rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | custom_rag_prompt
    | llm
    | StrOutputParser()
)

In [157]:
question = "A 24-year-old woman presents with acute swelling in her right knee, pain that hinders her ability to bear weight on her right leg, and a one-week history of intermittent low-grade fevers and malaise. What symptoms in the context are present in the previous sentence?"

In [158]:
rag_chain.invoke(question)

'symptoms present in the context:\n\n- Symptom: Pain of knee region\n- Symptom: Knee pain\n- Symptom: Pain in unspecified knee\n- Symptom: Painful knee'

## Preprocessing the Input Questions

In [251]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [252]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.discard('no')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mansipandya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [253]:
df = pd.read_csv('/Users/mansipandya/Desktop/KnidianMD/data/test_cases.csv')
original_text = df.iloc[0].medical_history
original = df.iloc[0].medical_history.lower() 
pattern = r'\b\d+(\.\d+)?\s*\w+/\w+\b'
original = re.sub(pattern, '.', original)
original = original.replace('-', ' ')
original = ''.join(char for char in original if char.isalpha() or char.isspace() or char == '.')
words = original.split()
filtered_words = [word for word in words if word not in stop_words]
cleaned_text = ' '.join(filtered_words)
sentences = cleaned_text.split('.')
cleaned_sentences = [sentence.strip() for sentence in sentences]

## Function to Deal with entire Input Text for Retrieval

In [254]:
cleaned_sentences

['year old woman presents acute swelling right knee pain hinders ability bear weight right leg one week history intermittent low grade fevers malaise',
 'history daily heroin use otherwise healthy',
 'reports no recent sexual activity',
 'examination right knee erythematous hot obvious joint effusion',
 'limited minimal range motion knee',
 'track marks antecubital fossae',
 'joint aspiration shows leukocyte count cells per mm neutrophils',
 '']

In [277]:
def run(sentences):
    symptom_list = []
    symptom_id_list = []
    k_number=5
    for sentence in sentences:
        if sentence == '':
            continue
        question = f"{sentence}. What symptoms in the database present in this sentence?"
        docs = vectordb.max_marginal_relevance_search(question,k=k_number, fetch_k=10)
        for i in range(k_number):
            text = docs[i].page_content
            lines = text.split('\n')
            for line in lines:
                if line.startswith('id:') and line!= 'id: id':
                    symptom_id = line.split(': ', 1)[1]
                    symptom_id_list.append(int(symptom_id))
                if line.startswith('symptom:') and line != 'symptom: symptom':
                    symptom = line.split(': ', 1)[1]
                    symptom_list.append(symptom)

        #create a dictionary of the two
        symptom_dict = {symptom_id_list[i]: symptom_list[i] for i in range(len(symptom_list))}
                
    return symptom_dict

In [278]:
symptom_list = run(cleaned_sentences)

In [279]:
symptom_list

{1109: 'Arthralgia of knee',
 4658: 'Tenderness on palpation of suprapatellar region of both knees with swelling',
 4657: 'Superior unilateral or bilateral knee joint pain',
 2745: 'Personal history of drug therapy',
 1469: 'Drug UserIntravenous',
 265: 'History of - alcoholism (context-dependent category)',
 1301: 'Drug addiction',
 3116: 'Unsafe sexual practices',
 306: 'decreased sexual interest (symptom)',
 2965: 'Sexual Dysfunction',
 3836: 'bleeding after sexual intercourse',
 4431: 'Tenderness of knee joint.',
 4217: 'joint tenderness was seen',
 3720: 'Skin of the knee',
 450: 'Limitation of joint movement',
 4432: 'stiffness of knee joint (diagnosis)',
 1162: 'cicatrix',
 4686: 'Tenderness in popliteal fossa',
 4587: 'Winged scapula',
 4238: 'Tuberculum Arthriticum',
 3527: 'Testiculat trauma',
 2824: 'neutrophilic leukocytosis',
 3713: 'Increased leukocyte count in CSF',
 437: 'Decreased blood leukocyte number',
 2204: 'Bronchoalveolar lavage with neutophils',
 4318: 'Leucocy

## Function to Deal with entire Input Text for New Custom Retrieval Chain

In [319]:
cleaned_sentences

['year old woman presents acute swelling right knee pain hinders ability bear weight right leg one week history intermittent low grade fevers malaise',
 'history daily heroin use otherwise healthy',
 'reports no recent sexual activity',
 'examination right knee erythematous hot obvious joint effusion',
 'limited minimal range motion knee',
 'track marks antecubital fossae',
 'joint aspiration shows leukocyte count cells per mm neutrophils',
 '']

In [326]:
def run_updated(sentences):
    symptom_list = []
    symptom_id_list = []
    k_number=5
    for sentence in sentences:
        if sentence == '':
            continue
        medical_history = sentence
        delimiter = "####"
        system_message = f"""
        You will be provided with a question. \
        The question will be delimited with {delimiter} characters. \
        Output a python list, where each entry in the list is a symptom present in the medical history text below \

        {medical_history}

        If a symptoms is mentioned, it must be associated with it must be present in the medical history above. \
        If no symptoms are found, output an empty list. \

        Only output the list of objects, with nothing else.
        """
        user_message_1 = f"""What symptoms are present in the medical history?"""

        messages =  [  
        {'role':'system', 
        'content': system_message},    
        {'role':'user', 
        'content': f"{delimiter}{user_message_1}{delimiter}"},  
        ] 

        response = get_completion_from_messages(messages)

        response = response.strip('[]')
        response = response.split(', ')
        response = [symptom.strip("'") for symptom in response]

        for symptom in response:
            question = f"{symptom}. What symptoms in the database are present in the previous sentence?"
            docs = vectordb.max_marginal_relevance_search(question,k=k_number, fetch_k=10)
            for i in range(k_number):
                text = docs[i].page_content
                lines = text.split('\n')
                for line in lines:
                    if line.startswith('id:') and line!= 'id: id':
                        symptom_id = line.split(': ', 1)[1]
                        symptom_id_list.append(int(symptom_id))
                    if line.startswith('symptom:') and line != 'symptom: symptom':
                        symptom = line.split(': ', 1)[1]
                        symptom_list.append(symptom)

    #create a dictionary of the two
    symptom_dict = {symptom_id_list[i]: symptom_list[i] for i in range(len(symptom_list))}
                
    return symptom_dict

In [327]:
symptom_list = run_updated(cleaned_sentences)

In [328]:
symptom_list

{263: 'swelling of abdomen',
 4347: 'swells',
 1504: 'Family history of swelling',
 1109: 'Pain of knee region',
 4646: 'Posterior knee pain',
 4424: 'Anterior knee pain',
 2438: 'Fevers (fever that comes and goes)',
 3317: 'Fever of unknown origin',
 18: 'Febrile',
 4389: 'No fever',
 2822: 'body temperature low',
 50: 'feeling unwell',
 2653: 'Myelitis',
 1469: 'injecting drug',
 2745: 'Personal history of drug therapy',
 1301: 'Drug addiction',
 324: 'Cocaine user',
 142: 'reported a history of cancer',
 698: 'Detail of history of travel',
 650: 'Personal history of urinary tract infection',
 4419: 'No age 51 to 99',
 265: 'History of - alcoholism (context-dependent category)',
 1495: 'Dermal erythema',
 3892: 'Erythrocythaemia',
 1774: 'Erythroderma',
 3224: 'Figurate erythema',
 3734: 'flushed hot',
 401: 'Joint inflammation',
 1431: 'Ultrasound examination of joint',
 4217: 'Joint tenderness.',
 1276: 'Finding of joint swelling (finding)',
 450: 'Range of joint movement reduced',