In [1]:
import random

In [4]:
def monte_carlo_eval(prompt, trials=100):
    """
    Score a prompt by averaging the scores of randomly simulated responses.

    Every trial draws one of three response labels uniformly at random and
    adds its score (3, 2 or 1). The prompt text is not inspected, so the
    result reflects only the random draws, not the prompt's actual quality.

    Parameters:
    prompt (str): Prompt being evaluated (currently unused by the simulation).
    trials (int): Number of random trials to average over. Defaults to 100.

    Returns:
    float: Mean score over all trials, between 1 and 3.

    Raises:
    ValueError: If trials is less than 1 (the average would be undefined).
    """
    if trials <= 0:
        raise ValueError("trials must be a positive integer")

    # Simulated response categories and the score awarded to each
    response_types = ['highly relevant', 'somewhat relevant', 'irrelevant']
    scores = {'highly relevant': 3, 'somewhat relevant': 2, 'irrelevant': 1}

    # One random draw per trial, in the same order as a plain loop would make
    # them, so seeded runs stay reproducible.
    total_score = sum(scores[random.choice(response_types)] for _ in range(trials))

    # Average score represents the evaluation
    return total_score / trials

def elo_eval(prompt, base_rating=1500, K=30, opponent_rating=1600):
    """
    Apply one simulated Elo update to a prompt's rating.

    The match outcome is drawn uniformly at random from win/loss/draw; the
    prompt text is not inspected. The rating is then moved by
    K * (actual_score - expected_score), where expected_score follows the
    standard Elo formula against a fixed-rating opponent.

    Parameters:
    prompt (str): Prompt being evaluated (currently unused by the simulation).
    base_rating (float): Rating of the prompt before the match. Defaults to 1500.
    K (float): Maximum change in rating for a single match. Defaults to 30.
    opponent_rating (float): Fixed rating of the simulated opponent.
        Defaults to 1600.

    Returns:
    float: The rating after the simulated match.
    """
    # Simulate the outcome of the prompt against standard criteria
    # Here, we randomly decide if the prompt 'wins', 'loses', or 'draws'
    outcomes = ['win', 'loss', 'draw']
    outcome = random.choice(outcomes)

    # Expected score of the prompt against the opponent (Elo formula)
    R_base = 10 ** (base_rating / 400)
    R_opponent = 10 ** (opponent_rating / 400)
    expected_score = R_base / (R_base + R_opponent)

    # Calculate the new rating based on the outcome
    actual_score = {'win': 1, 'loss': 0, 'draw': 0.5}[outcome]
    return base_rating + K * (actual_score - expected_score)

In [6]:
def elo_ratings_func(prompts, elo_ratings, K=30, opponent_rating=1600):
    """
    Play one simulated Elo round for every prompt and update its rating.

    Each prompt gets a single match whose outcome is drawn at random; the
    rating stored under that prompt is adjusted accordingly. The ratings
    dict is modified in place and also returned.

    Parameters:
    prompts (list): Prompts to run through one round each.
    elo_ratings (dict): Maps each prompt to its current Elo rating.
    K (int): Largest possible rating change from a single match.
    opponent_rating (int): Rating of the fixed simulated opponent.

    Returns:
    dict: The same elo_ratings object, holding the updated ratings.
    """
    score_by_outcome = {'win': 1, 'loss': 0, 'draw': 0.5}

    for prompt in prompts:
        # Random result of this prompt's match
        result = random.choice(['win', 'loss', 'draw'])

        # Elo expected score of the prompt against the opponent
        own_strength = 10 ** (elo_ratings[prompt] / 400)
        rival_strength = 10 ** (opponent_rating / 400)
        win_probability = own_strength / (own_strength + rival_strength)

        # Shift the rating by how far the result beat or missed expectation
        surprise = score_by_outcome[result] - win_probability
        elo_ratings[prompt] = elo_ratings[prompt] + K * surprise

    return elo_ratings

# Example usage
# The outcomes are simulated with the `random` module, so fix the seed to make
# the ranking below reproducible across kernel restarts.
random.seed(42)

prompts = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
    "What project did OpenAI showcase in 2018?",
    "How did the AI agents in OpenAI Five work together?",
]
elo_ratings = {prompt: 1500 for prompt in prompts}  # Initial ratings

# Conduct multiple rounds of evaluation
for _ in range(10):  # Number of rounds
    elo_ratings = elo_ratings_func(prompts, elo_ratings)

# Sort prompts by their final Elo ratings, highest first
sorted_prompts = sorted(prompts, key=elo_ratings.get, reverse=True)

# Print the ranked prompts
for prompt in sorted_prompts:
    print(f"{prompt}: {elo_ratings[prompt]}")

What project did OpenAI showcase in 2018?: 1590.8215462818882
Who founded OpenAI?: 1551.2547216834469
How did the AI agents in OpenAI Five work together?: 1545.5864698331568
What was the initial goal of OpenAI?: 1541.5568025583048
What did OpenAI release in 2016?: 1490.7235766925844


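Prompts evaluation

Note: the match outcomes are simulated at random rather than derived from the prompt text, and the ratings quoted below come from a different run than the output printed above, so the two sets of numbers do not match.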
#### "What was the initial goal of OpenAI?": 1583.6551603182484
This prompt has the highest rating, suggesting it was evaluated as the most relevant, accurate, or valuable.
#### "Who founded OpenAI?": 1550.8315837034786
This prompt also performed well, but slightly less so than the first one.
#### "What project did OpenAI showcase in 2018?": 1524.894352475904 (moderate)
#### "What did OpenAI release in 2016?": 1518.8441077283887
These prompts have lower ratings, indicating they were evaluated as less relevant or valuable compared to the top-rated prompts.
#### "How did the AI agents in OpenAI Five work together?": 1501.4300442180024
This prompt is closest to the baseline rating of 1500, suggesting its performance was near average under the evaluation criteria.

In [7]:
def evaluate_prompt(main_prompt, test_cases):
    """
    Run the Monte Carlo and Elo evaluations on a prompt and its test cases.

    Parameters:
    main_prompt (str): The primary prompt to evaluate.
    test_cases (list): Additional prompts, evaluated in the order given.

    Returns:
    dict: One entry keyed 'main_prompt' plus one per test case keyed
        'test_case_1', 'test_case_2', ...; each value is a dict holding the
        'Monte Carlo Evaluation' and 'Elo Rating Evaluation' results.
    """
    def _score(text):
        # Monte Carlo first, then Elo, for every text that is scored
        return {
            'Monte Carlo Evaluation': monte_carlo_eval(text),
            'Elo Rating Evaluation': elo_eval(text),
        }

    report = {'main_prompt': _score(main_prompt)}

    # Test cases are numbered from 1 in the result keys
    for position, case in enumerate(test_cases, start=1):
        report[f'test_case_{position}'] = _score(case)

    return report

In [8]:
# Prompt under evaluation (product name spelled correctly) and the reference
# questions it is evaluated alongside.
main_prompt = "why we use OpenAI?"
test_cases = ["Who founded OpenAI?", 
                "What was the initial goal of OpenAI?",
                "What did OpenAI release in 2016?", 
                "What project did OpenAI showcase in 2018?",
                "How did the AI agents in OpenAI Five work together?"
                ]
# Score the main prompt and every test case with both simulated evaluators
result = evaluate_prompt(main_prompt, test_cases)
print(result)



{'main_prompt': {'Monte Carlo Evaluation': 2.06, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_1': {'Monte Carlo Evaluation': 2.09, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_2': {'Monte Carlo Evaluation': 2.18, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_3': {'Monte Carlo Evaluation': 2.04, 'Elo Rating Evaluation': 1504.2019499940866}, 'test_case_4': {'Monte Carlo Evaluation': 1.85, 'Elo Rating Evaluation': 1519.2019499940866}, 'test_case_5': {'Monte Carlo Evaluation': 2.05, 'Elo Rating Evaluation': 1489.2019499940866}}


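### Interpretation
Note: both evaluators draw their outcomes at random, so the values change between runs; the figures discussed below come from a different run than the output printed above.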
#### 1. Monte Carlo Evaluation:
Scores Range: From 1 to 3, with higher scores indicating greater relevance or quality of the prompt.
###### Interpretation:
1.94 (Main Prompt): Slightly below average relevance or quality.
2.06, 2.02, 1.89, 1.98, 2.03 (Test Cases): Scores around 2 suggest moderate relevance or quality. The variation indicates some test cases are deemed slightly more relevant or higher quality than others.
#### 2. Elo Rating Evaluation:
Base Rating: Usually starts at 1500, with changes based on the 'performance' of the prompt against a set of standards.
Higher than 1500: Indicates the prompt performed better than average.
Lower than 1500: Indicates the prompt performed worse than average.
###### Interpretation:
1489.20 (Main Prompt): Slightly below the average performance.
1519.20 (Test Cases 1, 2, 4, 5): These prompts are rated above the average, suggesting better performance.
1504.20 (Test Case 3): Slightly above average performance.
#### Overall Interpretation:
Main Prompt: Both evaluations suggest that the main prompt is slightly below average in terms of relevance and quality.
Test Cases: Generally, the test cases are rated as average or slightly above average in both relevance and quality. Test Cases 1, 2, 4, and 5 seem to perform particularly well in the Elo evaluation, indicating they might be more effective or well-structured prompts compared to the main prompt and Test Case 3.

## RAGAS Evaluation 

In [13]:
import requests
from langchain.document_loaders import TextLoader
from langchain.text_splitter import CharacterTextSplitter  
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv
from langchain.embeddings import OpenAIEmbeddings
from langchain.vectorstores import Weaviate
import weaviate
from weaviate.embedded import EmbeddedOptions
from dotenv import load_dotenv,find_dotenv
# 
from langchain.chat_models import ChatOpenAI
from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

In [10]:
# Data loader
def data_loader(file_path= 'prompts/context.txt', chunk_size=500, chunk_overlap=50):
    """
    Load a text file and split it into chunks for indexing.

    Parameters:
    file_path (str): Path of the text file to load with TextLoader.
    chunk_size (int): chunk_size passed to CharacterTextSplitter.
        Defaults to 500.
    chunk_overlap (int): chunk_overlap passed to CharacterTextSplitter.
        Defaults to 50.

    Returns:
    list: The chunked documents produced by split_documents.
    """
    loader = TextLoader(file_path)
    documents = loader.load()

    # Chunk the data
    text_splitter = CharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    chunks = text_splitter.split_documents(documents)
    return chunks

In [11]:
def create_retriever(chunks):
    """
    Index document chunks in an embedded Weaviate instance and return a retriever.

    Parameters:
    chunks (list): Document chunks to store in the vector database.

    Returns:
    The retriever obtained from the populated vector store via as_retriever().
    """
    # Pull environment variables (the OpenAI API key is expected there) from .env
    load_dotenv(find_dotenv())

    # Embedded Weaviate instance acting as the vector database
    weaviate_client = weaviate.Client(embedded_options=EmbeddedOptions())

    # Embed the chunks with OpenAI embeddings and store them
    store = Weaviate.from_documents(
        documents=chunks,
        embedding=OpenAIEmbeddings(),
        client=weaviate_client,
        by_text=False,
    )

    # Expose the vector store as a retriever to enable semantic search
    return store.as_retriever()

In [18]:
# Preview the document chunks. They are loaded here because this cell sits
# above the cell that first assigns `chunks`; without the load it would raise
# NameError on a fresh "Restart Kernel -> Run All".
chunks = data_loader()
chunks

[Document(page_content='OpenAI was initially founded in 2015 by Sam Altman, Elon Musk, Ilya Sutskever and Greg Brockman as a \nnon-profit organization with the stated goal to “advance digital intelligence in the way that is most \nlikely to benefit humanity as a whole.” The company assembled a team of the best researchers in the \nfield of AI to pursue the goal of building AGI in a safe way.', metadata={'source': 'prompts/context.txt'}),
 Document(page_content='The early years of OpenAI were marked with rapid experimentation. The company made significant progress \non research in deep learning and reinforcement learning, and released ‘OpenAI Gym’ in 2016, a toolkit \nfor developing and comparing reinforcement learning algorithms.', metadata={'source': 'prompts/context.txt'}),
 Document(page_content='OpenAI showcased the capabilities of these reinforcement learning algorithms through its ‘OpenAI Five’ \nproject in 2018, which trained five independent AI agents to play a complex multipla

In [12]:
# Chunk the default context file, then index the chunks and get a retriever
chunks = data_loader()
retriever = create_retriever(chunks=chunks)

Started /Users/mahlettaye/.cache/weaviate-embedded: process ID 6103


{"action":"startup","default_vectorizer_module":"none","level":"info","msg":"the default vectorizer modules is set to \"none\", as a result all new schema classes without an explicit vectorizer setting, will use this vectorizer","time":"2024-01-17T15:44:28+03:00"}
{"action":"startup","auto_schema_enabled":true,"level":"info","msg":"auto schema enabled setting is set to \"true\"","time":"2024-01-17T15:44:28+03:00"}
{"level":"info","msg":"No resource limits set, weaviate will use all available memory and CPU. To limit resources, set LIMIT_RESOURCES=true","time":"2024-01-17T15:44:28+03:00"}
{"action":"grpc_startup","level":"info","msg":"grpc server listening at [::]:50060","time":"2024-01-17T15:44:28+03:00"}
{"action":"restapi_management","level":"info","msg":"Serving weaviate at http://127.0.0.1:8079","time":"2024-01-17T15:44:29+03:00"}
  warn_deprecated(
{"level":"info","msg":"Created shard langchain_c28e95b5f02d4a6d817409505d6046b4_LVNDQoo0jmdk in 4.293042ms","time":"2024-01-17T15:44:2

In [14]:


# Define LLM
# NOTE(review): temperature=0 is presumably chosen to keep answers as
# repeatable as possible between runs - confirm.
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Define prompt template
# The template has two placeholders, {question} and {context}, which are
# filled by the mapping at the head of the chain below.
template = """You are an assistant for question-answering tasks. 
Use the following pieces of retrieved context to answer the question. 
If you don't know the answer, just say that you don't know. 
Use two sentences maximum and keep the answer concise.
Question: {question} 
Context: {context} 
Answer:
"""

prompt = ChatPromptTemplate.from_template(template)

# Setup RAG pipeline
# The chain is invoked with a question string: the mapping supplies `context`
# from the retriever and `question` via RunnablePassthrough, then the filled
# prompt goes to the LLM and the reply is parsed to a plain string.
rag_chain = (
    {"context": retriever,  "question": RunnablePassthrough()} 
    | prompt 
    | llm
    | StrOutputParser() 
)

  warn_deprecated(


In [15]:
from datasets import Dataset

# Evaluation questions and, for each one, a list of reference answers
questions = [
    "Who founded OpenAI?",
    "What was the initial goal of OpenAI?",
    "What did OpenAI release in 2016?",
]
ground_truths = [
    ["Sam Altman, Elon Musk, Ilya Sutskever and Greg Brockman"],
    ["To advance digital intelligence in a way that benefits humanity"],
    ["OpenAI Gym, a toolkit for developing and comparing reinforcement learning algorithms"],
]
answers = []
contexts = []

# Inference: for every question, record the generated answer and the text of
# the documents the retriever returns for it
for question in questions:
    answers.append(rag_chain.invoke(question))
    retrieved = retriever.get_relevant_documents(question)
    contexts.append([doc.page_content for doc in retrieved])

# Column-oriented layout: one list per field
data = dict(
    question=questions,            # list
    answer=answers,                # list
    contexts=contexts,             # list of lists
    ground_truths=ground_truths,   # list of lists
)

# Convert dict to dataset
dataset = Dataset.from_dict(data)

  from .autonotebook import tqdm as notebook_tqdm
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/langchain_community/chat_models/openai.py:458: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Gui

In [16]:
from ragas import evaluate
from ragas.metrics import (
    faithfulness,
    answer_relevancy,
    context_recall,
    context_precision,
)

# Metrics to compute, in the order they are passed to the evaluator
ragas_metrics = [context_precision, context_recall, faithfulness, answer_relevancy]

# Score the collected questions, answers and contexts with RAGAS
result = evaluate(dataset=dataset, metrics=ragas_metrics)

# Tabular view of the scores
df = result.to_pandas()

evaluating with [context_precision]


100%|██████████| 1/1 [00:06<00:00,  6.19s/it]


evaluating with [context_recall]


100%|██████████| 1/1 [00:10<00:00, 10.18s/it]


evaluating with [faithfulness]


100%|██████████| 1/1 [00:02<00:00,  2.92s/it]


evaluating with [answer_relevancy]


  0%|          | 0/1 [00:00<?, ?it/s]/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/langchain_community/embeddings/openai.py:500: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
  response = response.dict()
/Users/mahlettaye/prompt-evaluation/venv/lib/python3.10/site-packages/pydantic/main.py:979: PydanticDeprecatedSince20: The `dict` method is deprecated; use `model_dump` instead. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.5/migration/
100%|██████████| 1/1 [00:04<00:00,  4.50s/it]


In [17]:
# Display the RAGAS results table (one row per question, one column per metric)
df

Unnamed: 0,question,answer,contexts,ground_truths,context_precision,context_recall,faithfulness,answer_relevancy
0,Who founded OpenAI?,"OpenAI was founded by Sam Altman, Elon Musk, I...",[OpenAI was initially founded in 2015 by Sam A...,"[Sam Altman, Elon Musk, Ilya Sutskever and Gre...",1.0,1.0,1.0,0.9592
1,What was the initial goal of OpenAI?,The initial goal of OpenAI was to advance digi...,[OpenAI was initially founded in 2015 by Sam A...,[To advance digital intelligence in a way that...,1.0,1.0,1.0,1.0
2,What did OpenAI release in 2016?,"OpenAI released 'OpenAI Gym' in 2016, a toolki...",[The early years of OpenAI were marked with ra...,"[OpenAI Gym, a toolkit for developing and comp...",1.0,1.0,1.0,0.899171


#### Integration with Retrieval-Augmented Generation Assessment:
- **Monte Carlo for Robustness Testing:** Use Monte Carlo simulations to test the robustness of the RAG system across a wide range of possible retrieval scenarios. This helps in understanding how different types of retrieved information can impact the quality of the generated content.
- **Elo Rating for Continuous Improvement:** Utilize the Elo rating system to continuously assess and improve the RAG model. By comparing new outputs with previous ones and adjusting ratings accordingly, the system can learn which types of retrieval-augmented generations work best.