In [1]:
import streamlit as st
import dspy
import time
from DSPyPineconeRM import PineconeRM
from helper import load_gemini_model
from dspy.teleprompt import *   

In [2]:
gemini_flash = load_gemini_model()
pinecone_retriever = PineconeRM
dspy.settings.configure(lm=gemini_flash, rm=pinecone_retriever)

In [3]:
class GenerateAnswerWithContext(dspy.Signature):
    """Generate an answer based on the provided context and question."""

    context = dspy.InputField(desc="Relevant facts to consider")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Answer derived from the context")

class RAG(dspy.Module):
    """Retrieval-Augmented Generation (RAG) module for question answering."""

    def __init__(self, num_passages=5):
        super().__init__()
        self.retrieve = pinecone_retriever(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswerWithContext)
    
    def forward(self, question):
        """
        Process the query and generate an answer using RAG.

        Args:
            query (str): The input question.

        Returns:
            dspy.Prediction: Object containing the context, answer, and confidence score.
        """
        context = self.retrieve(question).passages
        context = [passage.long_text for passage in context]
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(
            context=context, 
            answer=prediction.answer
        )

In [4]:

rag = RAG()
question = "What are the requirements for in-state tuition?"
prediction = rag(question)


#for key, vlaue in vars(prediction).items():



In [5]:
print(prediction.answer)

Context:
[1] «In-State Tuition Eligibility To be eligible for in-state tuition rates, students must be domiciled in Virginia for a minimum of one year before the first official day of classes. When students apply for in-state tuition, they should be prepared to present documentation to support their claim. See the following "Domicile Requirements" section for details


In [6]:
gemini_flash.inspect_history(n=1)




Generate an answer based on the provided context and question.

---

Follow the following format.

Context: Relevant facts to consider

Question: ${question}

Reasoning: Let's think step by step in order to ${produce the answer}. We ...

Answer: Answer derived from the context

---

Context:
[1] «In-State Tuition Eligibility To be eligible for in-state tuition rates, students must be domiciled in Virginia for a minimum of one year before the first official day of classes. When students apply for in-state tuition, they should be prepared to present documentation to support their claim. See the following "Domicile Requirements" section for details. To change one's tuition status from out-of-state to in-state the student must initiate the process by completing the "Domicile Determination Form" section of the Virginia Community College System Application for Admission Form (125-030), which can be obtained online at www.nvec.edu/forms or at any campus Student Services Center. It must be 

'\n\n\nGenerate an answer based on the provided context and question.\n\n---\n\nFollow the following format.\n\nContext: Relevant facts to consider\n\nQuestion: ${question}\n\nReasoning: Let\'s think step by step in order to ${produce the answer}. We ...\n\nAnswer: Answer derived from the context\n\n---\n\nContext:\n[1] «In-State Tuition Eligibility To be eligible for in-state tuition rates, students must be domiciled in Virginia for a minimum of one year before the first official day of classes. When students apply for in-state tuition, they should be prepared to present documentation to support their claim. See the following "Domicile Requirements" section for details. To change one\'s tuition status from out-of-state to in-state the student must initiate the process by completing the "Domicile Determination Form" section of the Virginia Community College System Application for Admission Form (125-030), which can be obtained online at www.nvec.edu/forms or at any campus Student Servi

In [None]:
def generate_questions(document_content, num_questions=10):
    prompt = f"""
    Given the following document, generate {num_questions} diverse questions that can be answered using information from the document.
    Ensure the questions cover different topics and vary in complexity.

    Document:
    {document_content}


    Generate {num_questions} questions:
    1.
    2.
    ...
    {num_questions}.
    """
    questions = gemini_flash(prompt).split('\n')
    return [q.split('. ', 1)[1] for q in questions if q.strip()]

In [None]:
with open("full_document_content.txt", "r", encoding="utf-8") as f:
    full_catalog_text = f.read()

In [None]:
questions = generate_questions(full_catalog_text, 5)

for question in questions:
    print(question)

In [3]:
def get_contexts_for_questions(questions, retriever):
    """
    Get contexts for a list of questions using the retriever model.
    
    Args:
    questions (list): List of questions generated by generate_questions function.
    retriever: The retriever model (e.g., PineconeRM instance).
    
    Returns:
    list: List of tuples containing (question, context).
    """
    question_context_pairs = []
    
    for question in questions:
        # Use the retriever to get context for each question
        retrieved_result = retriever(question)
        
        # Assuming the retriever returns a result with a 'passages' attribute
        # containing the retrieved contexts
        context = " ".join([passage.long_text for passage in retrieved_result.passages])
        
        question_context_pairs.append((question, context))
    
    return question_context_pairs


seed_questions_from_catalog = [
    "What are the requirements for the English as a Second Language programs?",
    "What is the maximum number of P grades that can be applied toward graduation?",
    "What services are available for veteran students?",
    "What are the eligibility requirements for in-state tuition?",
    "What is the Satisfactory Academic Progress policy, and how does it affect financial aid?",
    "How can I get help with academic advising?"
]

seed_contexts_for_questions = get_contexts_for_questions(seed_questions_from_catalog, pinecone_retriever(k=5))

for question, context in seed_contexts_for_questions:
    print(f"Question: {question}\nContext: {context}\n")



Question: What are the requirements for the English as a Second Language programs?
Context: The portfolio process and the related course activities will allow students to practice self-evaluation, introspection, analysis, and synthesis. Lecture 4 hours per week. English as a Second Language ESL 20 (10 CR.) English as a Second Language II Prerequisite(s): Recommendation for ESL Level 2. Provides intensive instruction and practice at the low intermediate level. Provides an introduction to the sound system, stress, intonational, and rhythmic patterns of English through listening and speaking exercises. Includes individualized instruction to improve basic reading comprehension. Requires practice in writing with emphasis on building basic sentence structures, grammar, and sentence-level writing. Credits are not applicable toward graduation. Lecture 10 hours per week. ESL 21 (5 CR.) Written Communication Improves students' competence in grammatical patterns of written English. Requires pract

In [4]:
seed_question_answers = [
    "The context provides information about several English as a Second Language (ESL) courses, but it doesn't explicitly list comprehensive requirements for ESL programs.",
    "The maximum number of P grades that can be applied toward graduation is 7 credit hours.",
    """Based on the provided context, the following services are available for veteran students at NOVA:

1. Office of Military and Veteran Services: This office helps all members of the military community (active duty, veterans, and family members) achieve their education and career goals.
2. Veterans' advisors: Each campus has a veterans' advisor to assist with completing necessary forms and maintaining eligibility for benefits.
3. Assistance with VA educational benefits: The college helps veterans and their dependents determine eligibility for and maintain educational benefits from the Department of Veterans Affairs.
4. Enrollment reporting: The office assists students in reporting their enrollment each semester to the Department of Veterans Affairs through the Veteran Enrollment Report Form (VERF).
5. Class protection: To prevent classes from being dropped due to nonpayment, the college allows students to submit documentation such as the VERF, Certificate of Eligibility (COE), or statement of benefits.
6. Guidance on changes in enrollment: The office helps students report any changes in enrollment that may affect their VA benefits.
7. Assistance with Tuition Assistance (TA): For military students using TA, the office provides support in case of dropping/cancelling TA due to military-related duties or assignments.
8. Support for military mobilization or active duty orders: The Office of Military Services assists students who need to withdraw from the College after the census date due to military obligations.
9. Veterans' advisors on each campus: These advisors provide information and assistance specific to veterans' needs.

These services are designed to support veteran students throughout their academic journey at NOVA, from enrollment to graduation, and to help them navigate their VA benefits effectively.""",
    """Based on the provided context, the eligibility requirements for in-state tuition at Northern Virginia Community College (NOVA) are as follows:

1. Domicile requirement: Students must be domiciled in Virginia for a minimum of one year before the first official day of classes.
2. Domicile definition: An individual must have their "present, fixed home where you return following 3. temporary absences and where you intend to stay indefinitely" in Virginia.
4. Dual criteria: Individuals must both reside in Virginia and intend to keep it as their home indefinitely.
Documentation: Students may need to present documentation to support their claim, such as:
- Residence during the past year prior to the first day of the semester
- State to which income taxes are filed or paid
- Driver's license
- Motor vehicle registration
- Voter registration
- Employment
- Property ownership
- Sources of financial support
- Other social or economic ties with Virginia
5. Process: To change tuition status from out-of-state to in-state, students must complete the "Domicile Determination Form" section of the Virginia Community College System Application for Admission Form (125-030).
6. Special cases:
- Children and dependent spouses of Virginia domiciliaries may be eligible for in-state tuition.
- Dependent children who are U.S. citizens may be eligible to establish Virginia domicile separate - from their noncitizen parents.
- Active-duty military members and their dependents have specific provisions for in-state tuition eligibility.

It's important to note that meeting any or all of these factors does not automatically result in Virginia domicile, and the college reserves the right to evaluate each case individually.
""",
    """Based on the provided context, the Satisfactory Academic Progress (SAP) policy at NOVA and its effect on financial aid can be summarized as follows:

1. Definition: SAP is a set of standards that students receiving federal financial aid must meet to maintain their eligibility.
2. Application: The policy applies to federal aid, state funds, institutional funds, and foundation scholarships.
3. Measurement: SAP is measured by two main factors:
a) Cumulative grade point average (Qualitative)
b) Credits earned as a percentage of those attempted (Quantitative or Pace of Completion)
4. Time frame: Students must complete their programs before attempting 150% of the credits required for their program.
5. Evaluation: The Financial Aid Office evaluates SAP before awarding aid and after grades are posted each term.
6. Financial Aid Statuses:
a) Good Standing (GS): Students meeting all SAP requirements
b) Warning Status (WS): First-time failure to meet SAP requirements
c) Suspension: Failure to meet SAP requirements after the warning period
7. Effect on Financial Aid:
- Students in good standing retain financial aid eligibility
- Students on warning status retain eligibility for one term
- Students on suspension lose financial aid eligibility unless they successfully appeal and are placed on probation
8. Quantitative Standard: Students must receive satisfactory grades in at least 67% of cumulative credits attempted.
9. Appeals: Students can appeal their suspension status to potentially retain financial aid eligibility.

The policy ensures that students receiving financial aid are making progress towards completing their academic programs within a reasonable time frame.""",
"""Based on the provided context, I can provide the following information about getting help with academic advising at NOVA:

1. Academic advising is provided by faculty in academic departments and Counselors/Advisors in the Student Services Centers.
2. All students are encouraged to seek information and assistance from academic advisors for both career planning and curriculum planning.
3. New students should work with a counselor or academic advisor to select a program that meets their educational objectives.
4. Once a student has chosen a major, they will be referred to a faculty advisor or counselor who will assist in planning their program for subsequent terms.
5. Students should meet with their advisor to discuss progress toward graduation near the midpoint of their program.
6. Virtual advising is offered through live chat and rapid response email. Students can log on during specified hours to chat with an advisor.
7. Students can use the online Advisement Report in NOVAConnect to monitor their progress toward their degree or certificate.
8. For more information, students are directed to see NOVA's Advising & Counseling web page.

The context emphasizes the importance of the advising relationship as a continuous developmental process involving open communication. It encourages students to actively participate in advising activities to gain an understanding of campus and College resources and develop skills for making informed, independent decisions."""
]

print(len(seed_question_answers))
for i in range(len(seed_question_answers)):
    print(f"Question: {seed_questions_from_catalog[i]}\nAnswer: {seed_question_answers[i]}\n")

6
Question: What are the requirements for the English as a Second Language programs?
Answer: The context provides information about several English as a Second Language (ESL) courses, but it doesn't explicitly list comprehensive requirements for ESL programs.

Question: What is the maximum number of P grades that can be applied toward graduation?
Answer: The maximum number of P grades that can be applied toward graduation is 7 credit hours.

Question: What services are available for veteran students?
Answer: Based on the provided context, the following services are available for veteran students at NOVA:

1. Office of Military and Veteran Services: This office helps all members of the military community (active duty, veterans, and family members) achieve their education and career goals.
2. Veterans' advisors: Each campus has a veterans' advisor to assist with completing necessary forms and maintaining eligibility for benefits.
3. Assistance with VA educational benefits: The college he

In [5]:
seed_dataset = []

for i in range(len(seed_questions_from_catalog)):
    question = seed_questions_from_catalog[i]
    context = seed_contexts_for_questions[i][1]  # The context is the second item in each tuple
    answer = seed_question_answers[i]
    
    seed_dataset.append(dspy.Example({
        "context": context,
        "question": question,
        "answer": answer
        }).with_inputs("question"))

# Print out the first example to verify
print(f"Question: {seed_dataset[0].question}")
print(f"Context: {seed_dataset[0].context[:200]}...")  # Truncated for brevity
print(f"Answer: {seed_dataset[0].answer}")

# Print the total number of seed examples
print(f"\nTotal number of seed examples: {len(seed_dataset)}")

Question: What are the requirements for the English as a Second Language programs?
Context: The portfolio process and the related course activities will allow students to practice self-evaluation, introspection, analysis, and synthesis. Lecture 4 hours per week. English as a Second Language ...
Answer: The context provides information about several English as a Second Language (ESL) courses, but it doesn't explicitly list comprehensive requirements for ESL programs.

Total number of seed examples: 6


In [6]:
train_data = seed_dataset  
#test_data = seed_dataset[5:]   

In [None]:
# Use the dataset with your optimizer
optimizer = BootstrapFewShotWithRandomSearch(metric=rag_assessment_metric)
compiled_rag = optimizer.compile(RAG(), trainset=train_data)

In [None]:
for example in test_data:
    prediction = compiled_rag(example.question)
    print(f"Question: {example.question}")
    print(f"Predicted Answer: {prediction.answer}")
    print(f"Correct Answer: {example.answer}")
    print()

In [11]:
class GenerateAnswerWithContext(dspy.Signature):
    """Generate an answer based on the provided context and question."""

    context = dspy.InputField(desc="Helpful information for answering the question.")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="Answer derived from the context")


class RAG(dspy.Module):
    """Retrieval-Augmented Generation (RAG) module for question answering."""

    def __init__(self, num_passages=5, retriever_factory=None):
        super().__init__()
        self.retriever_factory = retriever_factory() or (lambda: pinecone_retriever)
        self.retrieve = self.retriever_factory(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswerWithContext)
    
    def forward(self, question):
        """
        Process the query and generate an answer using RAG.

        Args:
            query (str): The input question.

        Returns:
            dspy.Prediction: Object containing the context, answer, and confidence score.
        """
        context = self.retrieve(question).passages
        context = [passage.long_text for passage in context]
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(
            context=context, 
            answer=prediction.answer
        )
    
    def __getstate__(self):
        """Custom getstate to avoid pickling the retriever"""
        state = self.__dict__.copy()
        del state['retrieve']
        return state
    
    def __setstate__(self, state):
        """Custom setstate to recreate the retriever"""
        self.__dict__.update(state)
        self.retrieve = self.retriever_factory()

In [None]:
class CheckAnswerInContext(dspy.Signature):
    """Check if the correct answer is present in the given context."""
    
    context = dspy.InputField(desc="Context provided to the chatbot")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    is_present = dspy.OutputField(desc="'True' if the answer is in the context, 'False' otherwise")

class AssessRAGResponse(dspy.Signature):
    """Assess the quality of a chatbot's response based on the context provided,
    the question asked, and the actual correct answer."""
    
    question = dspy.InputField(desc="The question asked to the chatbot")
    chatbot_response = dspy.InputField(desc="The chatbot's response")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    answer_in_context = dspy.InputField(desc="Whether the correct answer is in the context")
    assessment = dspy.OutputField(desc="'Yes' if the response is correct, 'No' if incorrect")
    explanation = dspy.OutputField(desc="Explanation for the assessment")

def rag_assessment_metric(example, pred, trace=None):
    with dspy.context(lm=gemini_flash):
        # First, check if the answer is in the context
        context_check = dspy.Predict(CheckAnswerInContext)(
            context=example.context,
            correct_answer=example.answer
        )
        
        # Then, assess the chatbot's response
        assessment = dspy.Predict(AssessRAGResponse)(
            question=example.question,
            chatbot_response=pred.answer,
            correct_answer=example.answer,
            answer_in_context=context_check.is_present
        )
    
    is_correct = assessment.assessment.lower() == 'yes'
    
    if trace is not None:
        return is_correct
    
    return 1.0 if is_correct else 0.0

In [12]:
class CheckResponseInContext(dspy.Signature):
    """Check if the response is present in the given context."""
    
    context = dspy.InputField(desc="Context provided to the chatbot")
    response = dspy.InputField(desc="The response to check for in the context. If the answer is not present in the context, the response will indicate so.")
    answer_in_context = dspy.OutputField(desc="'Yes' if the chatbot's response is similar to the correct answer, 'No' otherwise")

class InContextAssessment(dspy.Signature):
    """Compare the response of a chatbot to the actual correct answer and determine if the chatbot's response
    is similar to the actual correct answer."""
    
    question = dspy.InputField(desc="The question asked to the chatbot")
    chatbot_response = dspy.InputField(desc="The chatbot's response")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    assessment = dspy.OutputField(desc="'Yes' if the chatbot's response is similar to the correct answer, 'No' otherwise")
    
class NotInContextAssessment(dspy.Signature):
    """Compare the response of a chatbot to the actual correct answer and determine if the chatbot's response
    is similar to the actual correct answer."""
    
    question = dspy.InputField(desc="The question asked to the chatbot")
    chatbot_response = dspy.InputField(desc="The chatbot's response")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    indicates_not_in_context = dspy.OutputField(desc="'Yes' if the chatbot's response indicates that the answer is not in the context it was provided, 'No' otherwise")
    assessment = dspy.OutputField(desc="'Yes' if the chatbot's response is similar to the correct answer, 'No' otherwise")

check_context = dspy.ChainOfThought(CheckResponseInContext)
in_context_assess = dspy.ChainOfThought(InContextAssessment)
not_in_context_assess = dspy.ChainOfThought(NotInContextAssessment)

def rag_assessment_metric(example, pred, trace=None):
    with dspy.context(lm=gemini_flash):
        # First, check if the answer is in the context
        context_check = check_context(
            context=example.context,
            response=example.answer
        )
        in_context = True if check_context.answer_in_context.lower() == 'yes' else False
        score = 0.0
        
        # Then, assess the chatbot's response
        if in_context:
            in_context_assessment = in_context_assess(
                question=example.question,
                chatbot_response=pred.answer,
                correct_answer=example.answer,
            )
            if in_context_assessment.assessment.lower() == 'yes':
                score += 1.5
        else:
            not_in_context_assessment = not_in_context_assess(
                question=example.question,
                chatbot_response=pred.answer,
                correct_answer=example.answer,
            )
            if not_in_context_assessment.indicates_not_in_context.lower() == 'yes':
                score += 1.0
            if not_in_context_assessment.assessment.lower() == 'yes':
                score += .5
                
    is_correct = assessment.assessment.lower() == 'yes'
    
    if trace is not None:
        return is_correct
    
    return 1.0 if is_correct else 0.0

In [14]:
class CheckResponseInContext(dspy.Signature):
    """Given a chatbot's response to a question, check if the response indicates that answer to the question
    is not present in the context the chatbot was provided with."""
    
    response = dspy.InputField(desc="The response to check. If the answer is not present in the context, the response will indicate so.")
    answer_in_context = dspy.OutputField(desc="Does the chatbot's response indicate that the answer is not present in the context the chatbot was provided with?", prefix="[Yes/No]:")

class InContextAssessment(dspy.Signature):
    """Compare a chatbot's response to a question to the correct answer of the question and determine if the chatbot's response
    is similar to the correct answer."""
    
    question = dspy.InputField(desc="The question asked to the chatbot")
    chatbot_response = dspy.InputField(desc="The chatbot's response to the question")
    correct_answer = dspy.InputField(desc="The correct answer to the question")
    partially_correct = dspy.OutputField(desc="Based on the correct answer, was the chatbot's response at least partially correct? 'Yes' if the chatbot's response contains some of the content in the correct answer, 'No' otherwise")
    mostly_correct = dspy.OutputField(desc="Is the chatbot's response similar to the actual correct answer? 'Yes' if the chatbot's response covers most or all of the content in the correct answer, 'No' otherwise")
    
class NotInContextAssessment(dspy.Signature):
    """Compare a chatbot's response to a question to the correct answer of the question and determine if the chatbot's response
    is similar to the correct answer."""
    
    question = dspy.InputField(desc="The question asked to the chatbot")
    chatbot_response = dspy.InputField(desc="The chatbot's response")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    indicates_not_in_context = dspy.OutputField(desc="Does the chatbot's response indicate that the answer to the question is not in the context it was provided? 'Yes' if the chatbot's response indicates that the answer is not in the context it was provided, 'No' otherwise")
    mostly_correct = dspy.OutputField(desc="Is the chatbot's response similar to the actual correct answer? 'Yes' if the chatbot's response covers most or all of the content in the correct answer, 'No' otherwise")
    
class FormatAssessment(dspy.Signature):
    """Compare a chatbot's response to a question to the correct answer of the question and determine if the chatbot's response
    is formatted similarly to the correct answer."""
    
    chatbot_response = dspy.InputField(desc="The chatbot's response")
    correct_answer = dspy.InputField(desc="The actual correct answer")
    assessment = dspy.OutputField(desc="Is the chatbot's response formatted similarly to the correct answer? 'Yes' if the chatbot's response is formatted similarly to the correct answer, 'No' otherwise")


check_context = dspy.ChainOfThought(CheckResponseInContext)
in_context_assess = dspy.ChainOfThought(InContextAssessment)
not_in_context_assess = dspy.ChainOfThought(NotInContextAssessment)
check_format = dspy.ChainOfThought(FormatAssessment)

def rag_assessment_metric2(example, pred, trace=None):
    with dspy.context(lm=gemini_flash):
        # First, check if the answer is in the context
        context_check = check_context(
            response=example.answer
        )
        in_context = True if context_check.answer_in_context.lower() == 'yes' else False
        good_response = False
        score = 0.0
        
        # Assess the chatbot's response
        if in_context:
            in_context_assessment = in_context_assess(
                question=example.question,
                chatbot_response=pred.answer,
                correct_answer=example.answer,
            )
            if in_context_assessment.partially_correct.lower() == 'yes':
                score += 1.0
                if in_context_assessment.mostly_correct.lower() == 'yes':
                    good_response = True
                    score += 1.5
        else:
            not_in_context_assessment = not_in_context_assess(
                question=example.question,
                chatbot_response=pred.answer,
                correct_answer=example.answer,
            )
            if not_in_context_assessment.indicates_not_in_context.lower() == 'yes':
                score += 1.5
                if not_in_context_assessment.mostly_correct.lower() == 'yes':
                    good_response = True
                    score += 1.0
                    
        # Check if the chatbot's response is formatted similarly to the correct answer
        format_assessment = check_format(
            chatbot_response=pred.answer,
            correct_answer=example.answer,
        )
        if format_assessment.assessment.lower() == 'yes':
            score += .5
            
    if trace is not None:
        return good_response
    
    return score / 3.0
        

In [15]:
def retriever_factory():
    return PineconeRM  # Or however you create your pinecone_retriever


rag = RAG(retriever_factory=retriever_factory)
optimizer = BootstrapFewShotWithRandomSearch(metric=rag_assessment_metric2)
compiled_rag = optimizer.compile(rag, trainset=train_data)

Going to sample between 1 and 4 traces per predictor.
Will attempt to bootstrap 16 candidate sets.


Average Metric: 1.0 / 5  (20.0): 100%|██████████| 5/5 [00:09<00:00,  1.90s/it]                


Score: 20.0 for set: [0]
New best sscore: 20.0 for seed -3
Scores so far: [20.0]
Best score: 20.0


Average Metric: 0.9999999999999999 / 5  (20.0): 100%|██████████| 5/5 [00:11<00:00,  2.23s/it] 


Score: 20.0 for set: [5]
Scores so far: [20.0, 20.0]
Best score: 20.0


100%|██████████| 5/5 [00:43<00:00,  8.77s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.6666666666666666 / 5  (13.3): 100%|██████████| 5/5 [00:12<00:00,  2.45s/it]


Score: 13.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33]
Best score: 20.0
Average of max per entry across top 1 scores: 0.2
Average of max per entry across top 2 scores: 0.36666666666666664
Average of max per entry across top 3 scores: 0.4333333333333333
Average of max per entry across top 5 scores: 0.4333333333333333
Average of max per entry across top 8 scores: 0.4333333333333333
Average of max per entry across top 9999 scores: 0.4333333333333333


100%|██████████| 5/5 [00:50<00:00, 10.15s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 1.0 / 5  (20.0): 100%|██████████| 5/5 [00:14<00:00,  2.82s/it]                


Score: 20.0 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0]
Best score: 20.0
Average of max per entry across top 1 scores: 0.2
Average of max per entry across top 2 scores: 0.36666666666666664
Average of max per entry across top 3 scores: 0.36666666666666664
Average of max per entry across top 5 scores: 0.4333333333333333
Average of max per entry across top 8 scores: 0.4333333333333333
Average of max per entry across top 9999 scores: 0.4333333333333333


100%|██████████| 5/5 [00:51<00:00, 10.34s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 1.3333333333333335 / 5  (26.7): 100%|██████████| 5/5 [00:14<00:00,  2.84s/it] 


Score: 26.67 for set: [5]
New best sscore: 26.67 for seed 1
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67]
Best score: 26.67
Average of max per entry across top 1 scores: 0.26666666666666666
Average of max per entry across top 2 scores: 0.4333333333333333
Average of max per entry across top 3 scores: 0.4666666666666667
Average of max per entry across top 5 scores: 0.5333333333333333
Average of max per entry across top 8 scores: 0.5333333333333333
Average of max per entry across top 9999 scores: 0.5333333333333333


100%|██████████| 5/5 [00:52<00:00, 10.53s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.8333333333333333 / 5  (16.7): 100%|██████████| 5/5 [00:12<00:00,  2.59s/it] 


Score: 16.67 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67]
Best score: 26.67
Average of max per entry across top 1 scores: 0.26666666666666666
Average of max per entry across top 2 scores: 0.4333333333333333
Average of max per entry across top 3 scores: 0.4666666666666667
Average of max per entry across top 5 scores: 0.5666666666666667
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6333333333333333


100%|██████████| 5/5 [00:54<00:00, 10.85s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.16666666666666666 / 5  (3.3): 100%|██████████| 5/5 [00:10<00:00,  2.14s/it] 


Score: 3.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33]
Best score: 26.67
Average of max per entry across top 1 scores: 0.26666666666666666
Average of max per entry across top 2 scores: 0.4333333333333333
Average of max per entry across top 3 scores: 0.4666666666666667
Average of max per entry across top 5 scores: 0.5666666666666667
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6333333333333333


100%|██████████| 5/5 [00:50<00:00, 10.03s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 1.6666666666666667 / 5  (33.3): 100%|██████████| 5/5 [00:10<00:00,  2.11s/it]


Score: 33.33 for set: [5]
New best sscore: 33.33 for seed 4
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33]
Best score: 33.33
Average of max per entry across top 1 scores: 0.33333333333333337
Average of max per entry across top 2 scores: 0.36666666666666664
Average of max per entry across top 3 scores: 0.5333333333333333
Average of max per entry across top 5 scores: 0.5333333333333333
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6333333333333333


100%|██████████| 5/5 [00:45<00:00,  9.03s/it]


Bootstrapped 1 full traces after 5 examples in round 0.


Average Metric: 1.1666666666666665 / 5  (23.3): 100%|██████████| 5/5 [00:10<00:00,  2.19s/it]


Score: 23.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33]
Best score: 33.33
Average of max per entry across top 1 scores: 0.33333333333333337
Average of max per entry across top 2 scores: 0.36666666666666664
Average of max per entry across top 3 scores: 0.4333333333333333
Average of max per entry across top 5 scores: 0.6
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6333333333333333


100%|██████████| 5/5 [00:45<00:00,  9.09s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 2.3333333333333335 / 5  (46.7): 100%|██████████| 5/5 [00:11<00:00,  2.36s/it] 


Score: 46.67 for set: [5]
New best sscore: 46.67 for seed 6
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67]
Best score: 46.67
Average of max per entry across top 1 scores: 0.4666666666666667
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.5666666666666667
Average of max per entry across top 5 scores: 0.6333333333333333
Average of max per entry across top 8 scores: 0.6666666666666667
Average of max per entry across top 9999 scores: 0.6666666666666667


100%|██████████| 5/5 [00:41<00:00,  8.33s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.6666666666666666 / 5  (13.3): 100%|██████████| 5/5 [00:13<00:00,  2.71s/it]


Score: 13.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33]
Best score: 46.67
Average of max per entry across top 1 scores: 0.4666666666666667
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.5666666666666667
Average of max per entry across top 5 scores: 0.6333333333333333
Average of max per entry across top 8 scores: 0.6666666666666667
Average of max per entry across top 9999 scores: 0.6666666666666667


100%|██████████| 5/5 [00:48<00:00,  9.75s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 1.8333333333333333 / 5  (36.7): 100%|██████████| 5/5 [00:10<00:00,  2.12s/it] 


Score: 36.67 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67]
Best score: 46.67
Average of max per entry across top 1 scores: 0.4666666666666667
Average of max per entry across top 2 scores: 0.5333333333333333
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.6333333333333333
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6666666666666667


100%|██████████| 5/5 [00:48<00:00,  9.68s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.6666666666666666 / 5  (13.3): 100%|██████████| 5/5 [00:15<00:00,  3.16s/it] 


Score: 13.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33]
Best score: 46.67
Average of max per entry across top 1 scores: 0.4666666666666667
Average of max per entry across top 2 scores: 0.5333333333333333
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.6333333333333333
Average of max per entry across top 8 scores: 0.6333333333333333
Average of max per entry across top 9999 scores: 0.6666666666666667


100%|██████████| 5/5 [00:55<00:00, 11.03s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 2.5 / 5  (50.0): 100%|██████████| 5/5 [00:19<00:00,  3.82s/it]                


Score: 50.0 for set: [5]
New best sscore: 50.0 for seed 10
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333


100%|██████████| 5/5 [00:52<00:00, 10.42s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.16666666666666666 / 5  (3.3): 100%|██████████| 5/5 [00:13<00:00,  2.62s/it] 


Score: 3.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0, 3.33]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333


100%|██████████| 5/5 [00:57<00:00, 11.53s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.3333333333333333 / 5  (6.7): 100%|██████████| 5/5 [00:18<00:00,  3.69s/it]  


Score: 6.67 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0, 3.33, 6.67]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333


100%|██████████| 5/5 [00:53<00:00, 10.77s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 1.6666666666666667 / 5  (33.3): 100%|██████████| 5/5 [00:15<00:00,  3.08s/it]


Score: 33.33 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0, 3.33, 6.67, 33.33]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333


100%|██████████| 5/5 [00:42<00:00,  8.42s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 1.3333333333333335 / 5  (26.7): 100%|██████████| 5/5 [00:15<00:00,  3.05s/it] 


Score: 26.67 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0, 3.33, 6.67, 33.33, 26.67]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333


100%|██████████| 5/5 [00:39<00:00,  7.88s/it]


Bootstrapped 0 full traces after 5 examples in round 0.


Average Metric: 0.8333333333333333 / 5  (16.7): 100%|██████████| 5/5 [00:14<00:00,  2.90s/it] 

Score: 16.67 for set: [5]
Scores so far: [20.0, 20.0, 13.33, 20.0, 26.67, 16.67, 3.33, 33.33, 23.33, 46.67, 13.33, 36.67, 13.33, 50.0, 3.33, 6.67, 33.33, 26.67, 16.67]
Best score: 50.0
Average of max per entry across top 1 scores: 0.5
Average of max per entry across top 2 scores: 0.5666666666666667
Average of max per entry across top 3 scores: 0.6333333333333333
Average of max per entry across top 5 scores: 0.7
Average of max per entry across top 8 scores: 0.7
Average of max per entry across top 9999 scores: 0.7333333333333333
19 candidate programs found.





In [16]:
for example in test_data:
    prediction = compiled_rag(example.question)
    print(f"Question: {example.question}")
    print(f"Predicted Answer: {prediction.answer}")
    print(f"Correct Answer: {example.answer}")
    print()

Question: How can I get help with academic advising?
Predicted Answer: Context:
[1] «All students are encouraged to seek information and assistance from academic advisors in career planning in addition to curriculum planning. Even students not enrolled in a specific curricular major may seek assistance from academic advisors and counselors to help select courses during enrollment. 25 | 2024-2025 NOVA Catalog | Academic Planning Students should use the
Correct Answer: Based on the provided context, I can provide the following information about getting help with academic advising at NOVA:

1. Academic advising is provided by faculty in academic departments and Counselors/Advisors in the Student Services Centers.
2. All students are encouraged to seek information and assistance from academic advisors for both career planning and curriculum planning.
3. New students should work with a counselor or academic advisor to select a program that meets their educational objectives.
4. Once a stude

In [17]:
gemini_flash.inspect_history(n=1)




Generate an answer based on the provided context and question.

---

Question: What is the Satisfactory Academic Progress policy, and how does it affect financial aid?

Context: The portfolio process and the related course activities will allow students to practice self-evaluation, introspection, analysis, and synthesis. Lecture 4 hours per week. English as a Second Language ESL 20 (10 CR.) English as a Second Language II Prerequisite(s): Recommendation for ESL Level 2. Provides intensive instruction and practice at the low intermediate level. Provides an introduction to the sound system, stress, intonational, and rhythmic patterns of English through listening and speaking exercises. Includes individualized instruction to improve basic reading comprehension. Requires practice in writing with emphasis on building basic sentence structures, grammar, and sentence-level writing. Credits are not applicable toward graduation. Lecture 10 hours per week. ESL 21 (5 CR.) Written Communication



In [21]:
test_question = "What are some work study programs available at NOVA?"
prediction = compiled_rag(test_question)
print(f"Question: {test_question}")
print(f"Predicted Answer: {prediction.answer}")

Question: What are some work study programs available at NOVA?
Predicted Answer: Context:
[1] «The topics vary from job skills to personal enrichment interests. Various community education programs and seminars focus attention on social issues. Workforce development services for business, industry, and professional organizations provide special courses at NOVA for their employees. These programs can be taught at the College or in the workplace. Many noncredit programs are offered each semester to serve special community


In [None]:
gemini_flash.inspect_history(n=1)

In [20]:
save_path = './v2.json'
compiled_rag.save(save_path)

[('generate_answer', Predict(StringSignature(context, question -> rationale, answer
    instructions='Generate an answer based on the provided context and question.'
    context = Field(annotation=str required=True json_schema_extra={'desc': 'Relevant facts to consider', '__dspy_field_type': 'input', 'prefix': 'Context:'})
    question = Field(annotation=str required=True json_schema_extra={'__dspy_field_type': 'input', 'prefix': 'Question:', 'desc': '${question}'})
    rationale = Field(annotation=str required=True json_schema_extra={'prefix': "Reasoning: Let's think step by step in order to", 'desc': '${produce the answer}. We ...', '__dspy_field_type': 'output'})
    answer = Field(annotation=str required=True json_schema_extra={'desc': 'Answer derived from the context', '__dspy_field_type': 'output', 'prefix': 'Answer:'})
))), ('retrieve', <DSPyPineconeRM.PineconeRM object at 0x32796e450>)]


In [None]:
class GenerateAnswer(dspy.Signature):
    """Assess the context provided by the college catalog and answer the given questions that are predominantly about community college policies, programs, and procedures. Focus on providing accurate information related to admissions, enrollment, academic programs, student services, and other topics covered in the college catalog. If the information to answer a question is not available in the catalog, clearly state that you don't have that information. Always prioritize accuracy and relevance to the official college information contained in the catalog."""

    context = dspy.InputField(desc="Helpful information for answering the question.")
    question = dspy.InputField()
    answer = dspy.OutputField(desc="A detailed answer that is supported by the context.")


class RAG(dspy.Module):
    def __init__(self, num_passages=5, retriever_factory=None):
        super().__init__()
        self.retriever_factory = retriever_factory() or (lambda: pinecone_retriever)
        self.retrieve = self.retriever_factory(k=num_passages)
        self.generate_answer = dspy.ChainOfThought(GenerateAnswer)
    
    def forward(self, question):
        context = self.retrieve(question).passages
        context = [passage.long_text for passage in context]
        prediction = self.generate_answer(context=context, question=question)
        return dspy.Prediction(
            context=context, 
            answer=prediction.answer
        )
    
    def __getstate__(self):
        """Custom getstate to avoid pickling the retriever"""
        state = self.__dict__.copy()
        del state['retrieve']
        return state
    
    def __setstate__(self, state):
        """Custom setstate to recreate the retriever"""
        self.__dict__.update(state)
        self.retrieve = self.retriever_factory()

In [None]:
class Evaluator(dspy.Signature):
    """Evaluate the quality of a system's answer to a question according to a given criterion."""
    
    context = dspy.InputField(desc="The context for answering the question.")
    criterion = dspy.InputField(desc="The evaluation criterion.")
    question = dspy.InputField(desc="The question asked to the system.")
    ground_truth_answer = dspy.InputField(desc="An expert written Ground Truth Answer to the question.")
    predicted_answer = dspy.InputField(desc="The system's answer to the question.")
    rating = dspy.OutputField(desc="A rating between 1 and 5. IMPORTANT!! Only output the rating as an `int` and nothing else.")

class RatingParser(dspy.Signature):
    """Parse the rating from a string."""
    
    raw_rating_response = dspy.InputField(desc="The string that contains the rating in it.")
    rating = dspy.OutputField(desc="An integer valued rating.")
    
class Summarizer(dspy.Signature):
    """Summarize the information provided in the search results in 5 sentences."""
    
    question = dspy.InputField(desc="a question to a search engine")
    context = dspy.InputField(desc="context filtered as relevant to the query by a search engine")
    summary = dspy.OutputField(desc="a 5 sentence summary of information in the context that would help answer the question.")

class RAGMetricProgram(dspy.Module):
    def __init__(self):
        self.evaluator = dspy.ChainOfThought(Evaluator)
        self.rating_parser = dspy.Predict(RatingParser)
        self.summarizer = dspy.ChainOfThought(Summarizer)
    
    def forward(self, gold, pred, trace=None):
        predicted_answer = pred.answer
        question = gold.question
        ground_truth_answer = gold.gold_answer
        
        detail = "Is the assessed answer detailed?"
        faithful = "Is the assessed answer factually supported by the context?"
        ground_truth = f"The Ground Answer Truth to the Question: {question} is given as: \n \n {ground_truth_answer} \n \n How aligned is this Predicted Answer? {predicted_answer}"
        
        # Judgement
        with dspy.context(lm=gemini_flash):
            context = pinecone_retriever(k=5)(question).passages
            # Context Summary
            context = self.summarizer(question=question, context=context).summary
            raw_detail_response = self.evaluator(context=context, 
                                 criterion=detail,
                                 question=question,
                                 ground_truth_answer=ground_truth_answer,
                                 predicted_answer=predicted_answer).rating
            raw_faithful_response = self.evaluator(context=context, 
                                 criterion=faithful,
                                 question=question,
                                 ground_truth_answer=ground_truth_answer,
                                 predicted_answer=predicted_answer).rating
            raw_ground_truth_response = self.evaluator(context=context, 
                                 criterion=ground_truth,
                                 question=question,
                                 ground_truth_answer=ground_truth_answer,
                                 predicted_answer=predicted_answer).rating
        
        # Structured Output Parsing
        with dspy.context(lm=gemini_flash):
            detail_rating = self.rating_parser(raw_rating_response=raw_detail_response).rating
            faithful_rating = self.rating_parser(raw_rating_response=raw_faithful_response).rating
            ground_truth_rating = self.rating_parser(raw_rating_response=raw_ground_truth_response).rating
        
        total = float(detail_rating) + float(faithful_rating)*2 + float(ground_truth_rating)
    
        return total / 5.0



In [None]:
metric_test_ground_truth_answer = """
Cross encoders score the relevance of a document to a query. They are commonly used to rerank documents.
"""

metric_test_query = "What do cross encoders do?"
metric_test_example = dspy.Example(question=metric_test_query, gold_answer=metric_test_ground_truth_answer)


# If this is your first time exploring LLM metrics,
# I recommend trying the exercise of improving this answer to achieve a higher LLM rating.

metric_test_pred = dspy.Example(answer="They re-rank documents.")

llm_metric = RAGMetricProgram()
llm_metric_rating = llm_metric(metric_test_example, metric_test_pred)
print(llm_metric_rating)

def MetricWrapper(gold, pred, trace=None):
    return llm_metric(gold, pred)

In [None]:
gemini_flash.inspect_history(n=1)