# Lesson 3: Automating Model-Graded Evals

In [2]:
import warnings
warnings.filterwarnings('ignore')

## Import the API keys for our 3rd party APIs.

In [3]:
from utils import get_circle_api_key
cci_api_key = get_circle_api_key()

In [4]:
from utils import get_gh_api_key
gh_api_key = get_gh_api_key()

In [5]:
from utils import get_openai_api_key
openai_api_key = get_openai_api_key()

## Set up our GitHub branch

In [6]:
from utils import get_repo_name
course_repo = get_repo_name()
course_repo

'mfunaki-circleci/llmops-course2'

In [7]:
from utils import get_branch
course_branch = get_branch()
course_branch

'dl-cci-idolized-jasmine-37'

## The sample application: AI-powered quiz generator

Here is our sample application from the previous lesson that you will continue working on.


In [9]:
!cat app.py

from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser

delimiter = "####"


def read_file_into_string(file_path):
    """Return the entire text of the file at ``file_path``.

    Errors are not raised to the caller: a missing file, or any other
    exception while opening or reading, is reported with ``print`` and the
    function returns ``None`` instead.
    """
    try:
        with open(file_path, "r") as source:
            text = source.read()
    except FileNotFoundError:
        print(f"The file at '{file_path}' was not found.")
    except Exception as err:
        print(f"An error occurred: {err}")
    else:
        # Only reached when the open/read/close sequence completed cleanly.
        return text
    return None


quiz_bank = read_file_into_string("quiz_bank.txt")

system_message = f"""
Follow these steps to generate a customized quiz for the user.
The question will be delimited with four hashtags i.e {delimiter}

The user will provide a category that they want to create a quiz for. Any questions included in the quiz
should only refer to the category.

Step 1:{delimiter} First identify the category user is asking about from the following list:
* Geography
* Science
* Art

Step 2:{delimiter} Determine the subje

## A first model-graded eval
Build a prompt that tells the LLM to evaluate the output of the quizzes.

In [10]:
delimiter = "####"

In [11]:
eval_system_prompt = f"""You are an assistant that evaluates \
  whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the \
  format of Question N:{delimiter} <question N>?"""

Simulate an LLM response to use for a first test.

In [12]:
llm_response = """
Question 1:#### What is the largest telescope in space called and what material is its mirror made of?

Question 2:#### True or False: Water slows down the speed of light.

Question 3:#### What did Marie and Pierre Curie discover in Paris?
"""

Build the prompt for the evaluation (eval).

In [13]:
# Human message for the grader. `llm_response` is substituted into the text
# right here by the f-string, so the resulting string contains no placeholders
# of its own. The instructions ask for a format-only verdict: Y for a quiz,
# N otherwise.
# NOTE(review): this string is later handed to ChatPromptTemplate.from_messages,
# which presumably treats `{name}` as a template variable; a response that
# contains literal braces would likely need them escaped first — confirm.
eval_user_message = f"""You are evaluating a generated quiz \
based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {llm_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like \
a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expected format.

Output Y if the response is a quiz, \
output N if the response does not look like a quiz.
"""

Use langchain to build the prompt template for evaluation.

In [14]:
from langchain.prompts import ChatPromptTemplate
# Pair the grader's system instructions with the human message that carries
# the response under evaluation.
eval_prompt = ChatPromptTemplate.from_messages(
    [
        ("system", eval_system_prompt),
        ("human", eval_user_message),
    ]
)

Choose an LLM.

In [15]:
from langchain.chat_models import ChatOpenAI
llm = ChatOpenAI(model="gpt-3.5-turbo",
                 temperature=0)

Import an output parser from LangChain so the model's reply is returned as a plain, readable string.

In [16]:
from langchain.schema.output_parser import StrOutputParser
output_parser = StrOutputParser()

Connect all the pieces together in the variable `eval_chain`.

In [17]:
eval_chain = eval_prompt | llm | output_parser

Test the evaluator on the known-good response by invoking `eval_chain`; it should output `Y`.

In [18]:
eval_chain.invoke({})

'Y'

Create function 'create_eval_chain'.

In [19]:
def create_eval_chain(
    agent_response,
    llm=None,
    output_parser=None,
):
  """Build a chain that grades whether `agent_response` is formatted as a quiz.

  The response is baked into the prompt, so the returned chain has no input
  variables and is run with `chain.invoke({})`. The prompt instructs the grader
  to output Y when the response looks like a quiz and N otherwise.

  Args:
    agent_response: Text produced by the assistant under evaluation.
    llm: Chat model used as the grader. When omitted, a
      `ChatOpenAI(model="gpt-3.5-turbo", temperature=0)` is created per call.
    output_parser: Parser applied to the model output. When omitted, a new
      `StrOutputParser()` is created per call.

  Returns:
    The composed `eval_prompt | llm | output_parser` chain.
  """
  # Resolve defaults lazily: default-argument expressions run once, when the
  # function is defined, which would build the client at definition/import
  # time and share one instance across every call.
  if llm is None:
    llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)
  if output_parser is None:
    output_parser = StrOutputParser()

  # The message strings below are parsed by ChatPromptTemplate as format
  # templates. A literal "{" or "}" in the response would be read as a
  # template variable and make invoke({}) fail, so double them to escape.
  escaped_response = f"{agent_response}".replace("{", "{{").replace("}", "}}")

  delimiter = "####"
  eval_system_prompt = f"""You are an assistant that evaluates whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the format of Question N:{delimiter} <question N>?"""

  eval_user_message = f"""You are evaluating a generated quiz based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {escaped_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expected format.

Output Y if the response is a quiz, output N if the response does not look like a quiz.
"""
  eval_prompt = ChatPromptTemplate.from_messages([
      ("system", eval_system_prompt),
      ("human", eval_user_message),
  ])

  return eval_prompt | llm | output_parser

Create a new response to test with the eval chain — this one is deliberately not a quiz.

In [20]:
known_bad_result = "There are lots of interesting facts. Tell me more about what you'd like to know"

In [21]:
bad_eval_chain = create_eval_chain(known_bad_result)

In [22]:
# Evaluate the known-bad response; the grader should answer 'N'
bad_eval_chain.invoke({})

'N'

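Add the new `create_eval_chain` function to the test files. `test_assistant.py` holds the existing tests; the eval chain itself is defined in `test_release_evals.py`, shown in the following cell.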

In [23]:
!cat test_assistant.py

from app import assistant_chain
import os

from dotenv import load_dotenv, find_dotenv

_ = load_dotenv(find_dotenv())


def test_science_quiz():
    """A science quiz request must mention at least one expected subject keyword."""
    expected_subjects = ["davinci", "telescope", "physics", "curie"]
    answer = assistant_chain().invoke({"question": "Generate a quiz about science."})
    print(answer)

    # Case-insensitive substring match against the generated quiz text.
    lowered_answer = answer.lower()
    matches = [subject for subject in expected_subjects if subject.lower() in lowered_answer]
    assert matches, f"Expected the assistant questions to include '{expected_subjects}', but it did not"


def test_geography_quiz():
    """A geography quiz request must mention at least one expected subject keyword."""
    expected_subjects = ["paris", "france", "louvre"]
    answer = assistant_chain().invoke({"question": "Generate a quiz about geography."})
    print(answer)

    # Case-insensitive substring match against the generated quiz text.
    lowered_answer = answer.lower()
    matches = [subject for subject in expected_subjects if subject.lower() in lowered_answer]
    assert matches, f"Expected the assistant questions to include '{expected_subjects}', but it did not"


def test_decline_unknown

In [24]:
# Command to see the content of the file
!cat test_release_evals.py

from app import assistant_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.schema.output_parser import StrOutputParser
import pytest


def create_eval_chain(
    agent_response,
    llm=ChatOpenAI(model="gpt-3.5-turbo", temperature=0),
    output_parser=StrOutputParser(),
):
    delimiter = "####"
    eval_system_prompt = f"""You are an assistant that evaluates whether or not an assistant is producing valid quizzes.
  The assistant should be producing output in the format of Question N:{delimiter} <question N>?"""

    eval_user_message = f"""You are evaluating a generated quiz based on the context that the assistant uses to create the quiz.
  Here is the data:
    [BEGIN DATA]
    ************
    [Response]: {agent_response}
    ************
    [END DATA]

Read the response carefully and determine if it looks like a quiz or test. Do not evaluate if the information is correct
only evaluate if the data is in the expecte

**_Note:_** If you want to inspect the config, run `!head circle_config.yml`.

Command: !cat circle_config.yml


Push new files into CircleCI's Git repo.

In [25]:
from utils import push_files
push_files(course_repo, 
           course_branch, 
           ["app.py",
            "test_release_evals.py",
            "test_assistant.py"],
           config="circle_config.yml"
          )

uploading app.py
uploading test_assistant.py
uploading test_release_evals.py


Trigger the Release Evaluations.

In [None]:
from utils import trigger_release_evals
trigger_release_evals(course_repo, 
                      course_branch, 
                      ["app.py",
                       "test_assistant.py",
                       "test_release_evals.py"],
                      cci_api_key)