In [None]:
!pip install --upgrade langchain openai faiss-cpu
!pip install -U langchain-community
!pip install tiktoken
!pip install streamlit
!pip install fuzzywuzzy
!pip install --upgrade langchain
!pip install langchain_openai
!pip install ipywidgets
!pip install gradio

Collecting langchain
  Downloading langchain-0.3.10-py3-none-any.whl.metadata (7.1 kB)
Collecting openai
  Downloading openai-1.57.1-py3-none-any.whl.metadata (24 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.4 kB)
Collecting langchain-core<0.4.0,>=0.3.22 (from langchain)
  Downloading langchain_core-0.3.22-py3-none-any.whl.metadata (6.3 kB)
Downloading langchain-0.3.10-py3-none-any.whl (1.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.0/1.0 MB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading openai-1.57.1-py3-none-any.whl (389 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m389.8/389.8 kB[0m [31m19.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading faiss_cpu-1.9.0.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (27.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.5/27.5 MB[0m [31m18.8 MB/s[0m eta [36m0:0

In [None]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
import json
import faiss

# Build and persist the FAISS index of veterinary cases.
#
# Reads veterinary_cases.json, embeds one text document per case, and writes:
#   - veterinary_cases.index : the raw FAISS index (vectors only)
#   - metadata.json          : a copy of the case records

# Imported here (script-level) so this cell stays runnable on its own.
import os
from getpass import getpass

# Never hardcode the API key in the notebook: take it from the environment and
# prompt once (without echo) if it is not set. Storing it back in the
# environment lets later cells reuse it without prompting again.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Initialize embeddings
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    openai_api_key=OPENAI_API_KEY
)

# Load the dataset (explicit encoding so the result does not depend on the platform default)
with open("veterinary_cases.json", "r", encoding="utf-8") as f:
    case_titles = json.load(f)

# Prepare documents: one text blob per case, fields in a fixed order.
# Code that later rebuilds a docstore for this index must produce the documents
# in this same order, because only the vectors are saved below.
documents = [
    Document(
        page_content=(
            f"Animal Type: {case['Animal Type']}\n"
            f"Breed: {case['Breed']}\n"
            f"Age: {case['Age']}\n"
            f"Medical History: {case['Medical History']}\n"
            f"Symptoms: {case['Symptoms']}\n"
            f"Blood Test Results: {case['Blood test results']}\n"
            f"Diagnosis: {case['Diagnosis']}\n"
            f"Treatment: {case['Treatment']}"
        )
    )
    for case in case_titles
]

# Announce the work BEFORE starting it; embedding every case is the slow step.
print("FAISS index is being CREATED and SAVED. PLEASE WAIT UNTIL SHOWS UP IN COLAB FILES")

# Build FAISS index
faiss_vectorstore = FAISS.from_documents(documents, embeddings)

# Save the FAISS index (vectors only; the documents themselves are not stored in this file)
faiss.write_index(faiss_vectorstore.index, "veterinary_cases.index")

with open("metadata.json", "w", encoding="utf-8") as f:
    json.dump(case_titles, f)

print("Vector database complete and saved.")


FAISS index is being CREATED and SAVED. PLEASE WAIT UNTIL SHOWS UP IN COLAB FILES
Vector database complete and saved.


In [None]:
import json
import faiss
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document, HumanMessage
import ipywidgets as widgets
from IPython.display import display
import gradio as gr

# Load the saved case index, rebuild its docstore, index the textbook sections,
# and create the retriever and chat model used by handle_query.

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Imported here (script-level) so this cell stays runnable on its own.
import os
from getpass import getpass

# Never hardcode the API key in the notebook: take it from the environment and
# prompt once (without echo) if it is not set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Load the veterinary cases JSON
with open("veterinary_cases.json", "r", encoding="utf-8") as f:
    case_titles = json.load(f)

# Initialize FAISS index
index = faiss.read_index("veterinary_cases.index")

# The id map below pairs vector i with case i. Fail fast if the saved index and
# the JSON file disagree on size, instead of failing (or mis-pairing) at query time.
if index.ntotal != len(case_titles):
    raise ValueError(
        f"veterinary_cases.index holds {index.ntotal} vectors but "
        f"veterinary_cases.json has {len(case_titles)} cases; rebuild the index."
    )

# initialize the InMemoryDocstore
# Keys are the case's position as a string.
# NOTE(review): assumes the index was built from this same file in the same order -- confirm.
docstore = InMemoryDocstore({
    str(i): Document(
        page_content=(
            f"Animal Type: {case['Animal Type']}\n"
            f"Breed: {case['Breed']}\n"
            f"Age: {case['Age']}\n"
            f"Medical History: {case['Medical History']}\n"
            f"Symptoms: {case['Symptoms']}\n"
            f"Blood Test Results: {case['Blood test results']}\n"
            f"Diagnosis: {case['Diagnosis']}\n"
            f"Treatment: {case['Treatment']}"
        ),
        metadata={
            "Animal Type": case["Animal Type"],
            "Symptoms": case["Symptoms"].lower(),  # Store symptoms in lowercase for filtering
            "title": f"Case {i+1}"
        }
    )
    for i, case in enumerate(case_titles)
})


# Initialize FAISS vectorstore
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={i: str(i) for i in range(index.ntotal)}
)

# Load and process the textbook data
textbook_path = "textbook_summaries.json"
with open(textbook_path, "r", encoding="utf-8") as f:
    textbook_data = json.load(f)

# Create Document objects for each section.
# The fallback metadata is built only when a section has no "metadata" key, so
# a section that carries metadata but no "title" no longer raises KeyError.
textbook_documents = [
    Document(
        page_content=section["content"],
        metadata=section["metadata"] if "metadata" in section else {"source": section["title"]}
    )
    for section in textbook_data
]

# Generate embeddings and index the textbook sections
textbook_vectorstore = FAISS.from_documents(textbook_documents, embeddings)

# Configure the result count on the retriever itself: the query code asks for
# 10 cases per call, and a per-call `k` may not be honoured by every
# langchain-core version.
retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 10})
llm = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)

# Query handling with enhanced filtering and debugging
def handle_query(query):
    """Answer a veterinary query from similar cases and textbook sections.

    Steps:
      1. Reject queries that are too short or nonspecific.
      2. Retrieve similar veterinary cases and keep those whose symptoms or
         diagnosis fuzzily match at least one query word.
      3. Retrieve textbook sections once and reuse them for every later step.
      4. Extract TF-IDF keywords from the kept cases and textbook sections,
         then keep the textbook sections that match one of those keywords.
      5. Send the top two cases and top two textbook sections to the LLM.

    Uses the module-level ``retriever``, ``textbook_vectorstore`` and ``llm``.

    Args:
        query: Free-text symptoms, animal type, or question.

    Returns:
        The LLM answer text, or ``None`` when the query is rejected or no
        context could be assembled (a message is printed in either case).
    """
    # Reject queries shorter than 3 characters, without any ASCII letter, or
    # made up solely of exactly-three-letter words (e.g. "cat dog").
    if len(query.strip()) < 3 or not re.search(r'[a-zA-Z]', query) or all(re.fullmatch(r'[a-zA-Z]{3}', word) for word in query.split()):
        print("Query is too short or nonspecific. Please enter more specific symptoms or details.")
        return None

    # Split query into keywords for filtering
    # NOTE(review): stop words are not removed here, so short common words can
    # fuzzy-match almost any case -- confirm whether that is intended.
    keyword_filters = query.lower().split()
    print(f"Query Keywords: {keyword_filters}")  # Debugging

    # Retrieve veterinary case documents (`invoke` replaces the deprecated
    # `get_relevant_documents`).
    retrieved_docs = retriever.invoke(query, k=10)  # Retrieve up to 10 documents
    print(f"Retrieved {len(retrieved_docs)} veterinary documents")  # Debugging

    def matches_query(doc):
        # A case is kept when any query word fuzzily matches its symptoms
        # (threshold 70) or its diagnosis text (threshold 75).
        symptoms = doc.metadata.get("Symptoms", "")
        diagnosis = doc.page_content.split('Diagnosis: ')[-1].split('Treatment: ')[0].strip()
        return (
            any(fuzz.partial_ratio(keyword, symptoms) > 70 for keyword in keyword_filters)
            or any(fuzz.partial_ratio(keyword, diagnosis) > 75 for keyword in keyword_filters)
        )

    # Filter veterinary documents
    filtered_docs = [doc for doc in retrieved_docs if matches_query(doc)]

    # Fetch the textbook sections ONCE. The same query used to be sent to the
    # textbook retriever up to three times, each costing an embedding request.
    textbook_docs = textbook_vectorstore.as_retriever().invoke(query)

    # Build the corpus for TF-IDF using both veterinary documents and textbook sections
    corpus = [
        f"{doc.metadata.get('Symptoms', '')} {doc.page_content}"
        for doc in filtered_docs
        if doc.page_content.strip()  # Ensure non-empty veterinary document content
    ] + [
        doc.page_content
        for doc in textbook_docs
        if doc.page_content.strip()  # Ensure non-empty textbook section content
    ]

    if not corpus:
        print("No meaningful content in the filtered documents or textbook sections. Using fallback.")
        corpus = [
            f"{doc.metadata.get('Symptoms', '')} {doc.page_content}"
            for doc in retrieved_docs[:3]  # Fallback: Top 3 retrieved veterinary docs
        ] + [
            doc.page_content
            for doc in textbook_docs[:3]  # Fallback: Top 3 textbook sections
        ]

    # Extract, per corpus document, its five highest-scoring TF-IDF terms.
    related_keywords = set()
    vectorizer = TfidfVectorizer(stop_words="english", max_features=20)
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()

        for doc_idx in range(tfidf_matrix.shape[0]):
            # Get TF-IDF scores for the document
            tfidf_scores = zip(feature_names, tfidf_matrix[doc_idx].toarray().flatten())

            # Select top keywords
            top_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:5]
            related_keywords.update(keyword for keyword, _score in top_keywords)
    except ValueError as e:
        # Raised by the vectorizer when it cannot build a vocabulary from the
        # corpus; continue without keywords.
        print(f"TF-IDF Error: {e}")
        related_keywords = set()

    print(f"Extracted relevant words from Veterinary Cases (TF-IDF): {related_keywords}")

    def matches_keywords(doc):
        # A section is kept when any keyword fuzzily matches its text
        # (threshold 40) or occurs in it as a substring.
        content = doc.page_content.lower()
        return any(
            fuzz.partial_ratio(keyword.lower(), content) > 40 or keyword.lower() in content
            for keyword in related_keywords
        )

    # Retrieve relevant textbook sections
    textbook_results = [doc for doc in textbook_docs if matches_keywords(doc)]
    print(f"Filtered {len(textbook_results)} relevant textbook sections")

    # Combine relevant veterinary cases and textbook sections (top two of each)
    combined_context = "\n\n".join(
        [
            f"Veterinary Case {i+1}:\n"
            f"{doc.page_content}\n"
            f"Treatment Mentioned: {doc.page_content.split('Treatment: ')[-1].strip()}"
            for i, doc in enumerate(filtered_docs[:2])
        ] +
        [
            f"Textbook Section {i+1}:\n{doc.page_content}" for i, doc in enumerate(textbook_results[:2])
        ]
    )

    if not combined_context.strip():
        print("No relevant cases or textbook sections found. Please refine your query.")
        return None

    # Query the LLM with combined context
    prompt = (
        "Based on the following context, provide possible explanations for the symptoms described. "
        "Highlight potential conditions and suggest further evaluation where necessary. "
        "Do not make definitive statements about the diagnosis:\n\n"
        "Structure your answer like so (bold each and separate each by a new line):\n\n"
        "**Case:** All information from json\n"
        "**Animal Type:**\n"
        "**Breed:**\n"
        "**Potential Other Symptoms:**\n"
        "**Potential Diagnosis:**\n"
        "**Potential Treatments:**\n\n"
        "Then a separate small paragraph including textbook analysis and info.\n\n"
        f"Query: {query}\n\nContext:\n{combined_context}"
    )

    response = llm.invoke([HumanMessage(content=prompt)])

    # Display results
    print("\nGenerated Answer:")
    print(response.content)
    return response.content


# # Interactive input with ipywidgets NOTEBOOK
# Notebook front end: a one-line text box whose submissions are answered
# inside a dedicated output area shown beneath it.
output = widgets.Output()
query_input = widgets.Text(
    placeholder='Enter symptoms, animal type, or specific question...',
    description='Query:',
)


def on_submit(change):
    """Run the current text-box value through handle_query and show the result.

    Registered as the text box's submit callback. The argument supplied by the
    widget is not used; the query is read from ``query_input.value``. Anything
    handle_query prints is captured by ``output``, which is cleared first so
    only the latest answer is visible.
    """
    with output:
        output.clear_output()
        handle_query(query_input.value)


# Fire the callback when the user submits the text box.
query_input.on_submit(on_submit)

# Render the text box followed by the answer area.
display(query_input, output)

# def query_interface(query):
#     result = handle_query(query)  # Call your processing function
#     if not result:  # Handle cases where the result is empty or None
#         return "Query is too short or nonspecific. Please enter more specific symptoms or details."
#     return result

# interface = gr.Interface(
#     fn=query_interface,
#     inputs=gr.Textbox(label="Enter your query"),
#     outputs=gr.Markdown(label="Results"),
#     title="Veterinary Diagnostic Assistant"
# )

# interface.launch()

# print(f"Total documents in FAISS index: {index.ntotal}")
# print(f"Textbook Vectorstore contains {len(textbook_documents)} documents")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Text(value='', description='Query:', placeholder='Enter symptoms, animal type, or specific question...')

Output()

In [None]:
import json
import faiss
import re
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
from langchain.vectorstores import FAISS
from langchain.embeddings import OpenAIEmbeddings
from langchain_openai import ChatOpenAI
from langchain.docstore import InMemoryDocstore
from langchain.schema import Document, HumanMessage
import ipywidgets as widgets
from IPython.display import display
import gradio as gr

# Load the saved case index, rebuild its docstore, index the textbook sections,
# and create the retriever and chat model used by handle_query.

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

# Imported here (script-level) so this cell stays runnable on its own.
import os
from getpass import getpass

# Never hardcode the API key in the notebook: take it from the environment and
# prompt once (without echo) if it is not set.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]

# Load the veterinary cases JSON
with open("veterinary_cases.json", "r", encoding="utf-8") as f:
    case_titles = json.load(f)

# Initialize FAISS index
index = faiss.read_index("veterinary_cases.index")

# The id map below pairs vector i with case i. Fail fast if the saved index and
# the JSON file disagree on size, instead of failing (or mis-pairing) at query time.
if index.ntotal != len(case_titles):
    raise ValueError(
        f"veterinary_cases.index holds {index.ntotal} vectors but "
        f"veterinary_cases.json has {len(case_titles)} cases; rebuild the index."
    )

# initialize the InMemoryDocstore
# Keys are the case's position as a string.
# NOTE(review): assumes the index was built from this same file in the same order -- confirm.
docstore = InMemoryDocstore({
    str(i): Document(
        page_content=(
            f"Animal Type: {case['Animal Type']}\n"
            f"Breed: {case['Breed']}\n"
            f"Age: {case['Age']}\n"
            f"Medical History: {case['Medical History']}\n"
            f"Symptoms: {case['Symptoms']}\n"
            f"Blood Test Results: {case['Blood test results']}\n"
            f"Diagnosis: {case['Diagnosis']}\n"
            f"Treatment: {case['Treatment']}"
        ),
        metadata={
            "Animal Type": case["Animal Type"],
            "Symptoms": case["Symptoms"].lower(),  # Store symptoms in lowercase for filtering
            "title": f"Case {i+1}"
        }
    )
    for i, case in enumerate(case_titles)
})


# Initialize FAISS vectorstore
embeddings = OpenAIEmbeddings(model="text-embedding-3-small", openai_api_key=OPENAI_API_KEY)
faiss_vectorstore = FAISS(
    embedding_function=embeddings,
    index=index,
    docstore=docstore,
    index_to_docstore_id={i: str(i) for i in range(index.ntotal)}
)

# Load and process the textbook data
textbook_path = "textbook_summaries.json"
with open(textbook_path, "r", encoding="utf-8") as f:
    textbook_data = json.load(f)

# Create Document objects for each section.
# The fallback metadata is built only when a section has no "metadata" key, so
# a section that carries metadata but no "title" no longer raises KeyError.
textbook_documents = [
    Document(
        page_content=section["content"],
        metadata=section["metadata"] if "metadata" in section else {"source": section["title"]}
    )
    for section in textbook_data
]

# Generate embeddings and index the textbook sections
textbook_vectorstore = FAISS.from_documents(textbook_documents, embeddings)

# Configure the result count on the retriever itself: the query code asks for
# 10 cases per call, and a per-call `k` may not be honoured by every
# langchain-core version.
retriever = faiss_vectorstore.as_retriever(search_kwargs={"k": 10})
llm = ChatOpenAI(model="gpt-4", openai_api_key=OPENAI_API_KEY)

# Query handling with enhanced filtering and debugging
def handle_query(query):
    """Answer a veterinary query from similar cases and textbook sections.

    Steps:
      1. Reject queries that are too short or nonspecific.
      2. Retrieve similar veterinary cases and keep those whose symptoms or
         diagnosis fuzzily match at least one query word.
      3. Retrieve textbook sections once and reuse them for every later step.
      4. Extract TF-IDF keywords from the kept cases and textbook sections,
         then keep the textbook sections that match one of those keywords.
      5. Send the top two cases and top two textbook sections to the LLM.

    Uses the module-level ``retriever``, ``textbook_vectorstore`` and ``llm``.

    Args:
        query: Free-text symptoms, animal type, or question.

    Returns:
        The LLM answer text, or ``None`` when the query is rejected or no
        context could be assembled (a message is printed in either case).
    """
    # Reject queries shorter than 3 characters, without any ASCII letter, or
    # made up solely of exactly-three-letter words (e.g. "cat dog").
    if len(query.strip()) < 3 or not re.search(r'[a-zA-Z]', query) or all(re.fullmatch(r'[a-zA-Z]{3}', word) for word in query.split()):
        print("Query is too short or nonspecific. Please enter more specific symptoms or details.")
        return None

    # Split query into keywords for filtering
    # NOTE(review): stop words are not removed here, so short common words can
    # fuzzy-match almost any case -- confirm whether that is intended.
    keyword_filters = query.lower().split()
    print(f"Query Keywords: {keyword_filters}")  # Debugging

    # Retrieve veterinary case documents (`invoke` replaces the deprecated
    # `get_relevant_documents`).
    retrieved_docs = retriever.invoke(query, k=10)  # Retrieve up to 10 documents
    print(f"Retrieved {len(retrieved_docs)} veterinary documents")  # Debugging

    def matches_query(doc):
        # A case is kept when any query word fuzzily matches its symptoms
        # (threshold 70) or its diagnosis text (threshold 75).
        symptoms = doc.metadata.get("Symptoms", "")
        diagnosis = doc.page_content.split('Diagnosis: ')[-1].split('Treatment: ')[0].strip()
        return (
            any(fuzz.partial_ratio(keyword, symptoms) > 70 for keyword in keyword_filters)
            or any(fuzz.partial_ratio(keyword, diagnosis) > 75 for keyword in keyword_filters)
        )

    # Filter veterinary documents
    filtered_docs = [doc for doc in retrieved_docs if matches_query(doc)]

    # Fetch the textbook sections ONCE. The same query used to be sent to the
    # textbook retriever up to three times, each costing an embedding request.
    textbook_docs = textbook_vectorstore.as_retriever().invoke(query)

    # Build the corpus for TF-IDF using both veterinary documents and textbook sections
    corpus = [
        f"{doc.metadata.get('Symptoms', '')} {doc.page_content}"
        for doc in filtered_docs
        if doc.page_content.strip()  # Ensure non-empty veterinary document content
    ] + [
        doc.page_content
        for doc in textbook_docs
        if doc.page_content.strip()  # Ensure non-empty textbook section content
    ]

    if not corpus:
        print("No meaningful content in the filtered documents or textbook sections. Using fallback.")
        corpus = [
            f"{doc.metadata.get('Symptoms', '')} {doc.page_content}"
            for doc in retrieved_docs[:3]  # Fallback: Top 3 retrieved veterinary docs
        ] + [
            doc.page_content
            for doc in textbook_docs[:3]  # Fallback: Top 3 textbook sections
        ]

    # Extract, per corpus document, its five highest-scoring TF-IDF terms.
    related_keywords = set()
    vectorizer = TfidfVectorizer(stop_words="english", max_features=20)
    try:
        tfidf_matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()

        for doc_idx in range(tfidf_matrix.shape[0]):
            # Get TF-IDF scores for the document
            tfidf_scores = zip(feature_names, tfidf_matrix[doc_idx].toarray().flatten())

            # Select top keywords
            top_keywords = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)[:5]
            related_keywords.update(keyword for keyword, _score in top_keywords)
    except ValueError as e:
        # Raised by the vectorizer when it cannot build a vocabulary from the
        # corpus; continue without keywords.
        print(f"TF-IDF Error: {e}")
        related_keywords = set()

    print(f"Extracted relevant words from Veterinary Cases (TF-IDF): {related_keywords}")

    def matches_keywords(doc):
        # A section is kept when any keyword fuzzily matches its text
        # (threshold 40) or occurs in it as a substring.
        content = doc.page_content.lower()
        return any(
            fuzz.partial_ratio(keyword.lower(), content) > 40 or keyword.lower() in content
            for keyword in related_keywords
        )

    # Retrieve relevant textbook sections
    textbook_results = [doc for doc in textbook_docs if matches_keywords(doc)]
    print(f"Filtered {len(textbook_results)} relevant textbook sections")

    # Combine relevant veterinary cases and textbook sections (top two of each)
    combined_context = "\n\n".join(
        [
            f"Veterinary Case {i+1}:\n"
            f"{doc.page_content}\n"
            f"Treatment Mentioned: {doc.page_content.split('Treatment: ')[-1].strip()}"
            for i, doc in enumerate(filtered_docs[:2])
        ] +
        [
            f"Textbook Section {i+1}:\n{doc.page_content}" for i, doc in enumerate(textbook_results[:2])
        ]
    )

    if not combined_context.strip():
        print("No relevant cases or textbook sections found. Please refine your query.")
        return None

    # Query the LLM with combined context
    prompt = (
        "Based on the following context, provide possible explanations for the symptoms described. "
        "Highlight potential conditions and suggest further evaluation where necessary. "
        "Do not make definitive statements about the diagnosis:\n\n"
        "Structure your answer like so (bold each and separate each by a new line):\n\n"
        "**Case:** All information from json\n"
        "**Animal Type:**\n"
        "**Breed:**\n"
        "**Potential Other Symptoms:**\n"
        "**Potential Diagnosis:**\n"
        "**Potential Treatments:**\n\n"
        "Then a separate small paragraph including textbook analysis and info.\n\n"
        f"Query: {query}\n\nContext:\n{combined_context}"
    )

    response = llm.invoke([HumanMessage(content=prompt)])

    # Display results
    print("\nGenerated Answer:")
    print(response.content)
    return response.content


# # Interactive input with ipywidgets NOTEBOOK
# query_input = widgets.Text(
#     description='Query:',
#     placeholder='Enter symptoms, animal type, or specific question...'
# )

# output = widgets.Output()

# def on_submit(change):
#     with output:
#         output.clear_output()
#         handle_query(query_input.value)

# query_input.on_submit(on_submit)

# display(query_input, output)

def query_interface(query):
    """Gradio callback: run the query pipeline and always return displayable text.

    handle_query returns nothing in two different situations -- the query was
    rejected as too short/nonspecific, or no relevant cases or textbook sections
    were found -- and only prints which one occurred. Printed text is not part
    of the value returned to Gradio, so the fallback message covers both
    situations instead of always blaming the query length.

    Args:
        query: Text entered by the user.

    Returns:
        The generated answer, or a fallback message when there is no answer.
    """
    result = handle_query(query)  # Call your processing function
    if not result:  # Handle cases where the result is empty or None
        return (
            "No answer could be generated. The query may be too short or nonspecific, "
            "or no relevant cases or textbook sections were found. "
            "Please enter more specific symptoms or details."
        )
    return result

# Gradio front end: a single text box feeds query_interface, and the returned
# text is rendered as Markdown.
interface = gr.Interface(
    title="Veterinary Diagnostic Assistant",
    fn=query_interface,
    inputs=gr.Textbox(label="Enter your query"),
    outputs=gr.Markdown(label="Results"),
)

# Start serving the interface.
interface.launch()

# print(f"Total documents in FAISS index: {index.ntotal}")
# print(f"Textbook Vectorstore contains {len(textbook_documents)} documents")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://21a50b6c2b1ed06334.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


