In [1]:
import random

In [3]:
def monte_carlo_eval(prompt, trials=100):
    """Estimate a prompt's quality as the mean score of randomly simulated responses.

    Each trial draws one response type uniformly at random with
    ``random.choice`` and adds its score (3, 2 or 1) to a running total.
    The simulation never reads ``prompt``; the argument only identifies
    what is nominally being evaluated.

    Parameters:
    prompt (str): Prompt being evaluated (not used by the simulation).
    trials (int): Number of random trials to average over; must be >= 1.

    Returns:
    float: Mean score over all trials, between 1 and 3 inclusive.

    Raises:
    ValueError: If ``trials`` is less than 1 (the mean would be undefined).
    """
    if trials < 1:
        raise ValueError("trials must be a positive integer")

    # Score attached to each simulated response type
    scores = {'highly relevant': 3, 'somewhat relevant': 2, 'irrelevant': 1}
    # Keys keep insertion order, so seeded runs draw the same sequence as before
    response_types = list(scores)

    total_score = sum(scores[random.choice(response_types)] for _ in range(trials))

    # Average score represents the evaluation
    return total_score / trials

def elo_eval(prompt, base_rating=1500, K=30, opponent_rating=1600):
    """Simulate one Elo match for a prompt and return its updated rating.

    The outcome ('win', 'loss' or 'draw') is drawn uniformly at random; the
    simulation never reads ``prompt``. The rating moves by
    ``K * (actual_score - expected_score)``, where the expected score comes
    from the standard Elo formula against a fixed-rating opponent.

    Parameters:
    prompt (str): Prompt being evaluated (not used by the simulation).
    base_rating (float): Rating of the prompt before the match.
    K (float): Maximum change in rating.
    opponent_rating (float): Fixed rating of the simulated opponent.

    Returns:
    float: Rating after the simulated match.
    """
    # Randomly decide whether the prompt 'wins', 'loses', or 'draws'
    outcome = random.choice(['win', 'loss', 'draw'])
    actual_score = {'win': 1, 'loss': 0, 'draw': 0.5}[outcome]

    # Expected score of the prompt against the opponent (Elo formula)
    R_base = 10 ** (base_rating / 400)
    R_opponent = 10 ** (opponent_rating / 400)
    expected_score = R_base / (R_base + R_opponent)

    # Move the rating towards the observed result
    return base_rating + K * (actual_score - expected_score)

In [9]:
def elo_ratings_func(prompts, elo_ratings, K=30, opponent_rating=1600):
    """
    Play one simulated Elo match per prompt and adjust its rating.

    For every prompt a random outcome is drawn, the expected score against
    a fixed-rating opponent is computed, and the stored rating is shifted
    by K * (actual - expected). The ratings dict is modified in place and
    the same object is returned.

    Parameters:
    prompts (list): Prompts to evaluate; each must be a key of elo_ratings.
    elo_ratings (dict): Current Elo rating per prompt (updated in place).
    K (int): Maximum change in rating.
    opponent_rating (int): Fixed rating of the simulated opponent.

    Returns:
    dict: The updated elo_ratings mapping.
    """
    # Points awarded for each possible match result
    outcome_points = {'win': 1, 'loss': 0, 'draw': 0.5}
    outcome_names = list(outcome_points)

    for name in prompts:
        # Draw the result first so the random sequence matches per-prompt order
        points = outcome_points[random.choice(outcome_names)]

        # Elo expected score of this prompt against the opponent
        own_strength = 10 ** (elo_ratings[name] / 400)
        rival_strength = 10 ** (opponent_rating / 400)
        win_probability = own_strength / (own_strength + rival_strength)

        elo_ratings[name] += K * (points - win_probability)

    return elo_ratings

# Example usage: rank a handful of prompts by their simulated Elo rating
prompts = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
    "What project did OpenAI showcase in 2018?",
    "How did the AI agents in OpenAI Five work together?",
]
elo_ratings = dict.fromkeys(prompts, 1500)  # every prompt starts at 1500

# Each pass of this loop is one round of simulated matches for all prompts
for _ in range(10):
    elo_ratings = elo_ratings_func(prompts, elo_ratings)

# Highest-rated prompt first
sorted_prompts = sorted(prompts, key=elo_ratings.get, reverse=True)

# Report the ranking together with the final ratings
for prompt in sorted_prompts:
    print(f"{prompt}: {elo_ratings[prompt]}")

Who founded OpenAI?: 1540.716754023036
How did the AI agents in OpenAI Five work together?: 1534.5983988735882
What did OpenAI release in 2016?: 1534.5803899984946
What project did OpenAI showcase in 2018?: 1533.6478333919786
What was the initial goal of OpenAI?: 1511.8342195312985


In [10]:
def evaluate_prompt(main_prompt, test_cases):
    """Score the main prompt and every test case with both simulated evaluators.

    Parameters:
    main_prompt (str): Prompt stored under the 'main_prompt' key.
    test_cases (list): Additional prompts, stored under 'test_case_1',
        'test_case_2', ... in the order given.

    Returns:
    dict: Maps each key to a dict holding the 'Monte Carlo Evaluation' and
        'Elo Rating Evaluation' results for that prompt.
    """
    def _score(text):
        # Monte Carlo runs before Elo, so random draws happen in a fixed order
        return {
            'Monte Carlo Evaluation': monte_carlo_eval(text),
            'Elo Rating Evaluation': elo_eval(text),
        }

    evaluations = {'main_prompt': _score(main_prompt)}

    # Test cases are numbered starting from 1
    for position, case in enumerate(test_cases, start=1):
        evaluations[f'test_case_{position}'] = _score(case)

    return evaluations

In [12]:
# Prompt under evaluation, plus related questions that are scored alongside it
main_prompt = "why we use OepenAI?"
test_cases = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
    "What project did OpenAI showcase in 2018?",
    "How did the AI agents in OpenAI Five work together?",
]
result = evaluate_prompt(main_prompt, test_cases)

result



{'main_prompt': {'Monte Carlo Evaluation': 2.0,
  'Elo Rating Evaluation': 1504.2019499940866},
 'test_case_1': {'Monte Carlo Evaluation': 2.11,
  'Elo Rating Evaluation': 1504.2019499940866},
 'test_case_2': {'Monte Carlo Evaluation': 2.1,
  'Elo Rating Evaluation': 1504.2019499940866},
 'test_case_3': {'Monte Carlo Evaluation': 1.87,
  'Elo Rating Evaluation': 1489.2019499940866},
 'test_case_4': {'Monte Carlo Evaluation': 1.92,
  'Elo Rating Evaluation': 1519.2019499940866},
 'test_case_5': {'Monte Carlo Evaluation': 2.11,
  'Elo Rating Evaluation': 1489.2019499940866}}

## RAGAS Evaluation 

In [6]:
%pip install --pre -U "weaviate-client==4.*"

Collecting weaviate-client==4.*
  Downloading weaviate_client-4.4b8-py3-none-any.whl.metadata (3.5 kB)
Collecting deprecated<2.0.0,>=1.2.14 (from weaviate-client==4.*)
  Using cached Deprecated-1.2.14-py2.py3-none-any.whl.metadata (5.4 kB)
Collecting httpx==0.26.0 (from weaviate-client==4.*)
  Downloading httpx-0.26.0-py3-none-any.whl.metadata (7.6 kB)
Collecting validators==0.22.0 (from weaviate-client==4.*)
  Downloading validators-0.22.0-py3-none-any.whl.metadata (4.7 kB)
Collecting authlib<2.0.0,>=1.2.1 (from weaviate-client==4.*)
  Downloading Authlib-1.3.0-py2.py3-none-any.whl.metadata (3.8 kB)
Collecting grpcio<2.0.0,>=1.57.0 (from weaviate-client==4.*)
  Downloading grpcio-1.60.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.0 kB)
Collecting grpcio-tools<2.0.0,>=1.57.0 (from weaviate-client==4.*)
  Downloading grpcio_tools-1.60.0-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.2 kB)
Collecting grpcio-health-checking<2.0.0,>=1.57.0

In [20]:
%pip install chardet

Collecting chardet
  Downloading chardet-5.2.0-py3-none-any.whl.metadata (3.4 kB)
Downloading chardet-5.2.0-py3-none-any.whl (199 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m199.4/199.4 kB[0m [31m340.6 kB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hInstalling collected packages: chardet
Successfully installed chardet-5.2.0
Note: you may need to restart the kernel to use updated packages.


In [18]:
import requests
import os

In [1]:
import weaviate
from weaviate.embedded import EmbeddedOptions

In [7]:
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter  
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [29]:
from dotenv import load_dotenv,find_dotenv

In [8]:
# Data loader
def data_loader(file_path='../RAG/prompts/context.txt'):
    """Load a text file and split it into document chunks.

    Parameters:
    file_path (str): Path of the text file handed to TextLoader.

    Returns:
    The result of CharacterTextSplitter.split_documents for the loaded
    documents, using chunk_size=500 and chunk_overlap=50.
    """
    # Read the file first so a bad path fails before the splitter is built
    documents = TextLoader(file_path).load()

    # Chunk the data
    splitter = CharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    return splitter.split_documents(documents)

In [30]:
def create_retriever(chunks):
  """Index document chunks in an embedded Weaviate instance and return a retriever.

  Parameters:
  chunks (list): Documents to embed and store, e.g. the output of data_loader.

  Returns:
  The retriever obtained from the populated vector store via as_retriever().
  """
  # Load variables from the nearest .env file into the process environment.
  # OpenAIEmbeddings() below is constructed without arguments, so it is
  # expected to pick up its credentials (OPENAI_API_KEY) from there.
  # The key itself is deliberately never printed: cell output is saved with
  # the notebook and would leak the secret.
  load_dotenv(find_dotenv())

  # Setup vector database
  client = weaviate.Client(
    embedded_options = EmbeddedOptions()
  )

  # Populate vector database
  vectorstore = Weaviate.from_documents(
      client = client,
      documents = chunks,
      embedding = OpenAIEmbeddings(),
      by_text = False
  )

  # Define vectorstore as retriever to enable semantic search
  retriever = vectorstore.as_retriever()
  return retriever

In [33]:
# Load and chunk the context file using data_loader's default path
chunks =  data_loader()

# Display the resulting chunks for inspection
chunks

[Document(page_content='OpenAI was initially founded in 2015 by Sam Altman, Elon Musk, Ilya Sutskever and Greg Brockman as a \nnon-profit organization with the stated goal to “advance digital intelligence in the way that is most \nlikely to benefit humanity as a whole.” The company assembled a team of the best researchers in the \nfield of AI to pursue the goal of building AGI in a safe way.', metadata={'source': '../RAG/prompts/context.txt'}),
 Document(page_content='The early years of OpenAI were marked with rapid experimentation. The company made significant progress \non research in deep learning and reinforcement learning, and released ‘OpenAI Gym’ in 2016, a toolkit \nfor developing and comparing reinforcement learning algorithms.', metadata={'source': '../RAG/prompts/context.txt'}),
 Document(page_content='OpenAI showcased the capabilities of these reinforcement learning algorithms through its ‘OpenAI Five’ \nproject in 2018, which trained five independent AI agents to play a co

In [34]:
# Build the retriever over the chunks loaded above
retriever = create_retriever(chunks)

sk-***REDACTED*** (API key removed from the saved output; rotate this key if it was ever valid)
embedded weaviate is already listening on port 8079


Retrying langchain.embeddings.openai.embed_with_retry.<locals>._embed_with_retry in 4.0 seconds as it raised Timeout: Request timed out: HTTPSConnectionPool(host='api.openai.com', port=443): Read timed out. (read timeout=600).


AuthenticationError: Incorrect API key provided: sk-RtbPi***************************************BOq7. You can find your API key at https://platform.openai.com/account/api-keys.

In [14]:
# Define LLM (temperature=0 is set explicitly below)
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Define prompt template; {question} and {context} are the two placeholders
# filled in by the chain below
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline: the first stage maps "context" to the retriever and
# "question" to a passthrough, then the stages are piped in order:
# prompt -> llm -> string output parser
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

  warn_deprecated(


In [15]:
from datasets import Dataset

# Evaluation questions, paired by position with their reference answers
questions = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
]

# One list of reference answers per question
ground_truths = [
    ["Sam Altman, Elon Musk, Ilya Sutskever and Greg Brockman"],
    ["To advance digital intelligence in a way that benefits humanity"],
    ["OpenAI Gym, a toolkit for developing and comparing reinforcement learning algorithms"],
]

answers, contexts = [], []

# Inference: for each question, generate an answer and then record the
# text of the documents the retriever returns for the same question
for query in questions:
  generated = rag_chain.invoke(query)
  answers.append(generated)
  retrieved = retriever.get_relevant_documents(query)
  contexts.append([doc.page_content for doc in retrieved])

# Column-oriented dict: every value has one entry per question
data = dict(
    question=questions,            # list
    answer=answers,                # list
    contexts=contexts,             # list of lists
    ground_truths=ground_truths,   # list of lists
)

# Convert dict to dataset
dataset = Dataset.from_dict(data)

  from .autonotebook import tqdm as notebook_tqdm
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/langchain_community/chat_models/openai.py:458: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Gui

In [None]:
%pip install ragas

In [32]:
from ragas import evaluate

from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Score the dataset with the four imported RAGAS metrics
result = evaluate(
    dataset=dataset,
    metrics=[context_precision, context_recall, faithfulness, answer_relevancy],
)

# Tabular view of the evaluation result
df = result.to_pandas()

ModuleNotFoundError: No module named 'ragas'

In [17]:
# Display the table obtained from result.to_pandas()
df

Unnamed: 0,question,answer,contexts,ground_truths,context_precision,context_recall,faithfulness,answer_relevancy
0,Who founded OpenAI?,"OpenAI was founded by Sam Altman, Elon Musk, I...",[OpenAI was initially founded in 2015 by Sam A...,"[Sam Altman, Elon Musk, Ilya Sutskever and Gre...",1.0,1.0,1.0,0.9592
1,What was the initial goal of OpenAI?,The initial goal of OpenAI was to advance digi...,[OpenAI was initially founded in 2015 by Sam A...,[To advance digital intelligence in a way that...,1.0,1.0,1.0,1.0
2,What did OpenAI release in 2016?,"OpenAI released 'OpenAI Gym' in 2016, a toolki...",[The early years of OpenAI were marked with ra...,"[OpenAI Gym, a toolkit for developing and comp...",1.0,1.0,1.0,0.899171


#### Integration with Retrieval-Augmented Generation Assessment:
##### Monte Carlo for Robustness Testing: Use Monte Carlo simulations to test the robustness of the RAG system across a wide range of possible retrieval scenarios. This helps in understanding how different types of retrieved information can impact the quality of the generated content.
##### Elo Rating for Continuous Improvement: Utilize the Elo rating system to continuously assess and improve the RAG model. By comparing new outputs with previous ones and adjusting ratings accordingly, the system can learn which types of retrieval-augmented generations work best.