# Galileo - Chain Poll Eval

### Imports

In [47]:
import os
import sys
import json
import cohere

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.pydantic_v1 import BaseModel, Field
from typing import List
from langchain_openai import ChatOpenAI

from dotenv import load_dotenv
load_dotenv(dotenv_path="../.env")

True

### System Preamble and Prompt Setup

In [48]:
# Retrieved passages used both as grounding documents for generation and as the
# context the judge evaluates against. Order matters: each passage is numbered by
# its 1-based position when the context string is built.
docs = [
    "'Kings and Queens' is Killing Joke's third single from their fifth studio album, 'Night Time'. It was originally released by E.G. Records on 21 March 1985 as a 12' and 7' single in the UK, and a 7' single by Polydor in the Netherlands. It was produced by Chris Kimsey. The 12' single featured 'Kings and Queens (A Right Royal Mix)' as an A-side, and both 'The Madding Crowd (Remixed by Killing Joke)' and 'Kings and Queens' as B-sides. E.G.'s 7' single and Polydor's 7' single exempted 'Kings and Queens (A Right Royal Mix)' and instead featured 'Kings and Queens' as the A-side, and 'The Madding Crowd (Remixed by Killing Joke)' as the B-side. E.G. also released a remix of the song, 'Kings and Queens (Knaves Mix)', as an A-side and featured the same B-sides as the 12' single of 'Kings and Queens'",
    "Acceptance is an American rock band from Seattle, Washington, formed in 1998. They released their first EP, 'Lost for Words', in 2000, followed by 'Black Lines to Battlefields' in 2003 (this EP was also re-released with live bonus tracks). Their debut album, 'Phantoms', was released in 2005.",
    "Killing Joke is the eleventh studio album by English rock band Killing Joke, released on 28 July 2003 by record label Zuma Recordings.",
    "Killing Joke are an English rock band formed in October 1978 in Notting Hill, London, England. The original line-up included Jaz Coleman (vocals, keyboards), Paul Ferguson (drums), Geordie Walker (guitars) and Youth (bass).",
    "Paul Ferguson (born Matthew Paul Ferguson, 31 March 1958) is a rock drummer, best known for his work in the post-punk/industrial group Killing Joke and cult English punk band Pink Parts. Following a stint as the drummer with the London-based Matt Stagger Band in 1978, Ferguson became a founding member of Killing Joke and served as their drummer from 1979 to 1987. He was known as 'Big Paul Ferguson' during this period.",
    "'A New Day' is a non-album single by Killing Joke. It was released by E.G. Records in July 1984 as a 12' and 7' single. The 12' single featured a dub mix of 'A New Day' as the A-side and 'A New Day' as the B-side. The 7' single featured a shorter version of 'A New Day' as the A-side and 'Dance Day' as the B-side. A completely different version of 'A New Day', which was not a mix, later appeared on the 2008 reissue of Killing Joke's fifth studio album, 'Night Time'. The single reached No. 51 in the UK Singles Chart. A promotional video was filmed for the song, marking the first time the band had made a video for a non-album single.",
    "Batman: The Killing Joke is a 1988 DC Comics one-shot graphic novel featuring the characters Batman and the Joker written by Alan Moore and illustrated by Brian Bolland. 'The Killing Joke' provides an origin story for the supervillain the Joker, loosely adapted from the 1951 story arc 'The Man Behind the Red Hood!'. Taking place over two timelines, 'The Killing Joke' depicts the Joker attempting to drive Jim Gordon insane and Batman's desperate attempt to stop him.",
    "Killing Joke is the debut studio album by English rock band Killing Joke. It was released in August 1980 by record label E.G.",
    "Pandemonium is the ninth studio album by English rock band Killing Joke, released on 2 August 1994 by record label Butterfly. This album marked Killing Joke's return after a four-year-long hiatus, the longest the band has taken since it was initially founded. This album also featured the return of founding member Youth, who replaced Paul Raven on bass.",
    "'Ha' or 'Ha': Killing Joke Live is the first commercially distributed live recording by English rock band Killing Joke. It was recorded at Larry's Hideaway in Toronto, Ontario, Canada on 9 and 10 August 1982, and released on 4 November by record label Virgin.",
]

# Number every passage (1-based) and concatenate them, one "Document N: ..." line
# per passage, into the single context string handed to the judge.
CONTEXT = "".join(
    f"Document {position}: {passage}\n"
    for position, passage in enumerate(docs, start=1)
)


In [49]:
# Building blocks of the generation prompt. Each value keeps its leading and
# trailing newline so the pieces drop into the prompt template unchanged.

# system-style preamble describing the assistant persona
PREAMBLE = """
You are Coral, a brilliant, sophisticated, AI-assistant chatbot trained to assist human users by providing thorough responses. You are powered by Command, a large language model built by the company Cohere. Today's date is Thursday, April 25, 2024.
"""

# instruction telling the model how to answer
TASK = """
Answer the question using the information in the context.
"""

# the user question to be answered from the documents
QUESTION = """
Which band was formed first Killing Joke or Acceptance ?
"""



In [50]:
# Skeleton of the generation prompt: the preamble first, then the task and the
# question, ending with an "Answer:" cue. Filled in with str.format using the
# placeholders preamble, task and question.
PROMPT_TEMPLATE = """
{preamble}

Task:
{task}
Question:
{question}
Answer:
"""


In [57]:
# Chain poll evaluation prompt given to the judge model.
# Placeholders: {context} (the numbered documents) and {response} (the answer
# being judged). The judge is asked for a JSON object with the keys
# "reasonings", "summary" and "judgement".
# Wording fixes only: repaired the garbled opening sentence and the
# "compare"/"reponse" typos so the instructions read unambiguously.
CP_PROMPT = \
"""
You are an LLM Hallucination Evaluator tasked with judging whether the provided Response is relevant to the documents in the provided Context to ensure that there is no hallucination and that the response adheres to the context provided.

## Context
{context}

## Response
{response}

## Task
Produce a judgement that compares the relevance of the context to the response. For each judgement:
1. Think step by step and check if the claims made by the Response are fully supported by the documents in the Context.
2. First analyze each document with detailed reasoning for EACH of the documents in context including how it does or does not support the response. Respond with this in the "reasonings" key in the JSON. Make sure ALL documents are included.
3. Then perform an overall analysis summarizing the results across all documents. Respond with this in the "summary" key in the JSON
4. Based on the results and analysis, also include a 1 (if yes) or a 0 (if no) if the response is fully supported by looking at all the documents. Include this as the "judgement" key in the JSON.
"""


### Setting up Cohere API

In [58]:
# Cohere client; the API key is looked up in the COHERE_API_KEY environment variable.
co = cohere.Client(
    log_warning_experimental_features=False,
    api_key=os.getenv("COHERE_API_KEY"),
)

# Fill the prompt template with the preamble, task and question.
generation_prompt = PROMPT_TEMPLATE.format(
    question=QUESTION,
    task=TASK,
    preamble=PREAMBLE,
)

# Chat request: model name, the assembled message, sampling temperature, and
# every passage wrapped as a {"text": ...} document.
generation_params = dict(
    model="command-r-plus",
    message=generation_prompt,
    temperature=0.3,
    documents=[{"text": passage} for passage in docs],
)

# Keep only the text of the chat result.
response = co.chat(**generation_params).text

In [59]:
# display the answer text produced by the chat call
response


'Killing Joke was formed in October 1978, while Acceptance was formed in 1998. Therefore, Killing Joke was formed first.'

### Running Chain Poll Eval

In [62]:
# set up pydantic classes describing the JSON output expected from the judge
# Reasoning: one per-document entry of the judge's analysis.
# (Documented with comments rather than a class docstring so the schema built
# from this model is left exactly as it was.)
class Reasoning(BaseModel):
    # 1-based number of the document, as labelled in the context string
    document: int = Field(description="document number from the provided context")
    # the judge's explanation of whether this document supports the response
    reasoning: str = Field(description="detailed reasoning step by step for this document to evaluate if it supports the Response or not")

# JudgementRecord: the complete JSON object requested from the judge for one run.
# (Documented with comments rather than a class docstring so the schema built
# from this model is left exactly as it was.)
class JudgementRecord(BaseModel):
    # per-document reasoning entries
    reasonings: List[Reasoning] = Field(description="summarized reasoning for this judgement by individually evaluating the relevance of the Response against EACH documents in Context. Think step-by-step and provide verbose, detailed reasoning to explain you judged the relevance.")
    # overall explanation across all documents
    summary: str = Field(description="final reasoning explanation that summarizes all reasonings for all documents")
    # binary verdict: 1 = supported by the context, 0 = not supported
    judgement: int = Field(description="1 if the reasoning indicates that the Response is relevant to and supported by the Context or else 0")

# set up the chain poll prompt with JSON format guidance: the parser's format
# instructions are appended to the evaluation prompt as a pre-filled variable,
# leaving "context" and "response" to be supplied on each invocation
parser = JsonOutputParser(pydantic_object=JudgementRecord)
cp_prompt = PromptTemplate(
    template=CP_PROMPT + "\n## Format instructions\n{format_instructions}",
    input_variables=["context", "response"],
    partial_variables={"format_instructions": parser.get_format_instructions()},
)
# set up GPT-4 Turbo as judge.
# The key is read from the OPENAI_API_KEY environment variable instead of being
# pasted into the notebook, so no secret ends up in the saved file.
# NOTE(review): assumes OPENAI_API_KEY is defined in the environment or in the
# .env file loaded at the top of the notebook — confirm before running.
api_key = os.getenv("OPENAI_API_KEY")
model = ChatOpenAI(api_key=api_key, model="gpt-4-turbo")
# prompt -> judge model -> JSON parser
judgement_chain = cp_prompt | model | parser

# Poll the judge 5 times on the same context/response pair, tagging every
# returned record with its 1-based run number.
judgement_params = dict(context=CONTEXT, response=response)
judgements = []
for run_number in range(1, 6):
    verdict = judgement_chain.invoke(judgement_params)
    verdict["run"] = run_number
    judgements.append(verdict)


In [63]:
from pprint import pprint

# Pretty-print every judgement record, each followed by a 20-dash rule.
separator = "-" * 20
for record in judgements:
    pprint(record)
    print(separator)


{'judgement': 1,
 'reasonings': [{'document': 1,
                 'reasoning': 'This document details the release and features '
                              "of the single 'Kings and Queens' by Killing "
                              'Joke but does not mention the formation dates '
                              'of Killing Joke or Acceptance, therefore it '
                              'does not support the response.'},
                {'document': 2,
                 'reasoning': 'This document provides information on the '
                              'American rock band Acceptance, including its '
                              'formation year (1998). This supports part of '
                              'the response concerning the formation year of '
                              'Acceptance.'},
                {'document': 3,
                 'reasoning': 'This document discusses the release of an album '
                              'by Killing Joke but does not provide '
  

### Chain Poll Score

In [64]:
# track yes judgements and total judgements
# The total is taken from the judgements actually collected rather than a
# separately hard-coded 5, so the score stays correct if the number of polling
# runs changes.
TOTAL_RECORDS = len(judgements)
yes_records = sum(item["judgement"] for item in judgements)
# calculate chain poll score: the fraction of runs that judged the response
# supported; undefined (NaN) when no judgements were collected
chain_poll_score = yes_records / TOTAL_RECORDS if TOTAL_RECORDS else float("nan")


In [65]:
# report the chain poll score as a percentage with two decimal places
print(f"Chain Poll Score (%): {chain_poll_score:.2%}")

Chain Poll Score (%): 100.00%


In [66]:
from datasets import Dataset
from ragas.metrics import faithfulness
from ragas import evaluate
from langchain_openai import ChatOpenAI

# Score the same question/answer pair with the Ragas faithfulness metric, using
# GPT-4o as the judging LLM.
judge = ChatOpenAI(model="gpt-4o", api_key=api_key)

# Single-row evaluation set: one question, one answer, and that answer's list
# of context passages.
data_samples = dict(
    question=[QUESTION],
    answer=[response],
    contexts=[docs],
)
dataset = Dataset.from_dict(data_samples)

# Run the evaluation and keep only the faithfulness entry of the result.
evaluation = evaluate(dataset, metrics=[faithfulness], llm=judge)
score = evaluation["faithfulness"]


Evaluating: 100%|██████████| 1/1 [00:03<00:00,  3.42s/it]


In [67]:
# report the Ragas faithfulness score as a percentage with two decimal places
print(f"Ragas Score (%): {score:.2%}")

Ragas Score (%): 100.00%
