## Document Loading

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader

In [2]:
loader = CSVLoader(
    file_path="/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv",
    csv_args={
        "delimiter": ",",
        "quotechar": '"',
        "fieldnames": ["id", "symptom"],
    },
)
data = loader.load()

In [3]:
data[:5]

[Document(page_content='id: id\nsymptom: symptom', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 0}),
 Document(page_content='id: 1\nsymptom: Family history of ankylosing spondylitis', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 1}),
 Document(page_content='id: 2\nsymptom: tummy ache', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 2}),
 Document(page_content='id: 2\nsymptom: hurt in belly', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 3}),
 Document(page_content='id: 2\nsymptom: pain in belly', metadata={'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv', 'row': 4})]

## Document Splitting

Since each row in the csv as a seperate entity in the vector database, splitting might not be required and I can move on to Vector Stores and Embedding.

## Vector Stores and Embedding

In [12]:
import getpass
import os

os.environ["OPENAI_API_KEY"] = getpass.getpass()

In [13]:
from langchain_openai import OpenAIEmbeddings

embedding = OpenAIEmbeddings()

In [20]:
from langchain_community.vectorstores import Chroma

persist_directory = '/Users/mansipandya/Desktop/KnidianMD/docs/chroma'

# Create the vector store
vectordb = Chroma.from_documents(
    documents=data,
    embedding=embedding,
    persist_directory=persist_directory
)

In [21]:
print(vectordb._collection.count())

12066


In [39]:
vectordb.persist()

In [40]:
vectordb = None

In [61]:
vectordb = Chroma(persist_directory=persist_directory, embedding_function=embedding)

## Retrieval

In [70]:
question = "A 24-year-old woman presents with acute swelling in her right knee, pain that hinders her ability to bear weight on her right leg, and a one-week history of intermittent low-grade fevers and malaise. What symptoms in the database present in this sentence?"
docs = vectordb.max_marginal_relevance_search(question,k=5, fetch_k=10)

In [102]:
docs

[Document(page_content='id: 4658\nsymptom: knee tenderness with swelling suprapatellar bilateral', metadata={'row': 11969, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 1109\nsymptom: Pain of knee region', metadata={'row': 3234, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 4429\nsymptom: Knee pain with prolonged sitting.', metadata={'row': 11105, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 1276\nsymptom: Finding of joint swelling', metadata={'row': 3651, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'}),
 Document(page_content='id: 4431\nsymptom: Tenderness of knee joint.', metadata={'row': 11107, 'source': '/Users/mansipandya/Desktop/KnidianMD/data/symptoms_db.csv'})]

## Preprocessing the Input Questions

In [91]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

In [92]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stop_words.discard('no')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mansipandya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [93]:
df = pd.read_csv('/Users/mansipandya/Desktop/KnidianMD/data/test_cases.csv')
original_text = df.iloc[0].medical_history
original = df.iloc[0].medical_history.lower() 
pattern = r'\b\d+(\.\d+)?\s*\w+/\w+\b'
original = re.sub(pattern, '.', original)
original = original.replace('-', ' ')
original = ''.join(char for char in original if char.isalpha() or char.isspace() or char == '.')
words = original.split()
filtered_words = [word for word in words if word not in stop_words]
cleaned_text = ' '.join(filtered_words)
sentences = cleaned_text.split('.')
cleaned_sentences = [sentence.strip() for sentence in sentences]

## Function to Deal with entire Input Text

In [94]:
cleaned_sentences

['year old woman presents acute swelling right knee pain hinders ability bear weight right leg one week history intermittent low grade fevers malaise',
 'history daily heroin use otherwise healthy',
 'reports no recent sexual activity',
 'examination right knee erythematous hot obvious joint effusion',
 'limited minimal range motion knee',
 'track marks antecubital fossae',
 'joint aspiration shows leukocyte count cells per mm neutrophils',
 '']

In [119]:
def run(sentences):
    symptom_list = []
    k_number=5
    for sentence in sentences:
        if sentence == ' ':
            continue
        question = f"{sentence}. What symptoms in the database present in this sentence?"
        docs = vectordb.max_marginal_relevance_search(question,k=k_number, fetch_k=10)
        print(f"Following Symptoms are for Sentence: {sentence}")
        for i in range(k_number):
            text = docs[i].page_content
            lines = text.split('\n')
            for line in lines:
                if line.startswith('symptom:'):
                    symptom = line.split(': ', 1)[1]
                    print(symptom)
                    symptom_list.append(symptom)
                
    return symptom_list

In [120]:
symptom_list = run(cleaned_sentences)

Following Symptoms are for Sentence: year old woman presents acute swelling right knee pain hinders ability bear weight right leg one week history intermittent low grade fevers malaise
Pain of knee region
knee tenderness with swelling suprapatellar bilateral
Superior unilateral or bilateral knee joint pain
Arthralgia of knee
knees aching
Following Symptoms are for Sentence: history daily heroin use otherwise healthy
Personal history of drug therapy
injecting drug
Drug UserIntravenous
History of - alcoholism (context-dependent category)
Drug addiction
Following Symptoms are for Sentence: reports no recent sexual activity
High risk sexual behavior
decreased sexual interest (symptom)
Unsafe sexual practices
Sexual Dysfunction
bleeding after sexual intercourse
Following Symptoms are for Sentence: examination right knee erythematous hot obvious joint effusion
Tenderness of knee joint.
Tenderness on palpation of suprapatellar region of both knees with swelling
stiffness of knee joint (diagno

In [118]:
symptom_list

['Pain of knee region',
 'knee tenderness with swelling suprapatellar bilateral',
 'Superior unilateral or bilateral knee joint pain',
 'Aching in knees',
 'Arthralgia (knee)',
 'Personal history of drug therapy',
 'injecting drug',
 'Drug UserIntravenous',
 'History of - alcoholism (context-dependent category)',
 'Drug addiction',
 'High risk sexual behavior',
 'decreased sexual interest (symptom)',
 'Unsafe sexual practices',
 'Sexual Dysfunction',
 'bleeding after sexual intercourse',
 'Tenderness of knee joint.',
 'Tenderness on palpation of suprapatellar region of both knees with swelling',
 'stiffness of knee joint (diagnosis)',
 'Skin of the knee',
 'Arthralgia of knee',
 'limited motion of range',
 'Range of joint movement reduced',
 'Limitation of joint movement',
 'Hip limite range of motion',
 'stiffness of knee joint (diagnosis)',
 'Pain or tenderness on palpation of bicipital groove',
 'Tarsus',
 'Anconeal bursa',
 'Orthopedic Disorder',
 'Skin of the elbows',
 'neutrophil