<a href="https://colab.research.google.com/github/meghamkpatel/physicaltherapyassistant/blob/main/PhysioPhrame_Test_and_Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import streamlit as st
from dotenv import load_dotenv
from pinecone import Pinecone
from langchain_openai.embeddings import OpenAIEmbeddings

# Load environment variables
OPENAI_API_KEY = st.secrets["OPENAI_API_KEY"]

# Initialize Pinecone
PINECONE_API_KEY = st.secrets["PINECONE_API_KEY"]
pc = Pinecone(api_key=PINECONE_API_KEY)

# Initialize OpenAI embeddings
embeddings = OpenAIEmbeddings(openai_api_key=OPENAI_API_KEY)


  from tqdm.autonotebook import tqdm


In [2]:
from langchain_community.document_loaders import DirectoryLoader

def load_local_documents(directory):
    loader = DirectoryLoader(directory)
    documents = loader.load()
    
    # Replace newline characters with spaces in each document
    for doc in documents:
        doc.page_content = doc.page_content.replace("\n", " ")
    
    return documents

# Define the directory containing the documents
directory = 'content/Textbook'

# Load local documents
documents = load_local_documents(directory)

In [27]:
from pinecone import Pinecone
from langchain_openai.embeddings import OpenAIEmbeddings

# Initialize Pinecone
pinecone_api_key = st.secrets["PINECONE_API_KEY"]
pinecone = Pinecone(api_key=pinecone_api_key)

# Create or load the index
index_name = "physical-therapy"
index = pc.Index(index_name)


vectorstore = pinecone.Index(name=index_name)

# Generate embeddings
embeddings = OpenAIEmbeddings()
vector_dict = {f"doc_{i}": embeddings.embed_query(doc.page_content) for i, doc in enumerate(documents)}


In [4]:
import pandas as pd

# Load the content in a pandas DataFrame
df = pd.DataFrame([doc.page_content for doc in documents], columns=["text"])

In [5]:
from giskard.rag import KnowledgeBase, generate_testset, evaluate

# Create a Knowledge Base
knowledge_base = KnowledgeBase(df)

In [6]:
# Generate the Test Set
testset = generate_testset(
    knowledge_base,
    num_questions=20,
    agent_description="A chatbot answering questions about physical therapy",
)

2024-04-24 15:14:41,657 pid:882 MainThread giskard.rag  INFO     Finding topics in the knowledge base.


  warn(


2024-04-24 15:14:51,269 pid:882 MainThread giskard.rag  INFO     Found 1 topics in the knowledge base.


Generating questions: 100%|██████████| 20/20 [15:53<00:00, 47.66s/it]


In [7]:
test_set_df = testset.to_pandas()

for index, row in enumerate(test_set_df.head(3).iterrows()):
    print(f"Question {index + 1}: {row[1]['question']}")
    print(f"Reference answer: {row[1]['reference_answer']}")
    print("Reference context:")
    print(row[1]['reference_context'])
    print("******************", end="\n\n")

Question 1: What are the key considerations for the post-operative rehabilitation program after a Total Shoulder Arthroplasty and Hemiarthroplasty?
Reference answer: Many different factors influence the post-operative rehabilitation outcome, including surgical approach, concomitant repair of the rotator cuff, arthroplasty secondary to fracture, arthroplasty secondary to rheumatoid arthritis or osteonecrosis, and individual patient factors including co-morbidities.
Reference context:
Document 5: Rehabilitation Protocol for Sternoclavicular Joint Reconstruction  This protocol is intended to guide clinicians through the post-operative course for sternoclavicular joint reconstruction. This protocol is time based (dependent on tissue healing) as well as criterion based. Specific intervention should be based on the needs of the individual and should consider exam findings and clinical decision making. The timeframes for expected outcomes contained within this guideline may vary based on surg

In [8]:
testset.save("test-set.jsonl")


In [9]:
from langchain.prompts import PromptTemplate

template = """
Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: {context}

Question: {question}
"""

prompt = PromptTemplate.from_template(template)
print(prompt.format(context="Here is some context", question="Here is a question"))


Answer the question based on the context below. If you can't 
answer the question, reply "I don't know".

Context: Here is some context

Question: Here is a question



In [35]:
from langchain_core.output_parsers import StrOutputParser

# Define a function to answer questions
def answer_fn(question, history=None):
    query_vector = embeddings.embed_query(question)
    results = index.query(vector=query_vector, top_k=5, include_metadata=True)
    context = [x['metadata']['text'] for x in results['matches']]

    #relevant_docs = [result.id for result in results]
    #context = " ".join([documents[int(doc.split('_')[1])].page_content for doc in relevant_docs])
    
    # Create the RAG Chain
    chain = (
        {"context": context, "question": question},
        template,
        StrOutputParser()
    )
    
    return chain.invoke({"context": context, "question": question})

In [36]:
# Evaluate the model on the test set
report = evaluate(answer_fn, testset=testset, knowledge_base=knowledge_base)

Asking questions to the agent:   0%|          | 0/20 [00:02<?, ?it/s]


AttributeError: 'tuple' object has no attribute 'invoke'

In [None]:
# Display the report
display(report)