In [1]:
! pip install langchain_community tiktoken langchain-openai langchainhub chromadb langchain langchain_fireworks google-search-results requests gradio

Collecting langchain_community
  Downloading langchain_community-0.2.10-py3-none-any.whl.metadata (2.7 kB)
Collecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting langchain-openai
  Downloading langchain_openai-0.1.19-py3-none-any.whl.metadata (2.6 kB)
Collecting langchainhub
  Downloading langchainhub-0.1.20-py3-none-any.whl.metadata (659 bytes)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting langchain
  Downloading langchain-0.2.11-py3-none-any.whl.metadata (7.1 kB)
Collecting langchain_fireworks
  Downloading langchain_fireworks-0.1.6-py3-none-any.whl.metadata (4.0 kB)
Collecting google-search-results
  Downloading google_search_results-2.4.2.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting gradio
  Downloading gradio-4.39.0-py3-none-any.whl.metadata (15 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain_community)
  D

In [None]:
# import os
# os.environ['LANGCHAIN_TRACING_V2'] = 'true'
# os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
# os.environ['LANGCHAIN_API_KEY'] = 'API_KEY'

In [None]:
import os
os.environ['FIREWORKS_API_KEY'] = 'API_KEY'
os.environ["SERPER_API_KEY"] = 'API_KEY'

In [None]:
import os, bs4, re, time, json
from langchain import hub
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_fireworks import FireworksEmbeddings, ChatFireworks
from langchain_community.utilities import GoogleSerperAPIWrapper
from langchain.prompts import PromptTemplate
from collections import defaultdict
import gradio as gr

In [None]:
prompt_1 = """Task: Named Entity Recognition (NER)
Description: Perform Named Entity Recognition by analyzing the input text to identify named entities among nouns and classify them into specified categories: LOC (Location), MISC (Miscellaneous), ORG (Organization), and PER (Person). The output should be a dictionary where each key is a noun from the input sentence and the value is the corresponding entity tag.

Process:
1. Read the input text and identify nouns that potentially represent named entities.
2. Classify each noun according to the context it appears in:
   - LOC: Geographical locations like cities or countries.
   - MISC: Entities that don't fit other categories, such as events or products.
   - ORG: Any type of organization, including companies or governmental bodies.
   - PER: Names of individuals.
3. Output a dictionary with nouns from the sentence as keys and their corresponding tags as values. Nouns that are not named entities should be tagged as "None".

Example 1:
Input: "Tim Cook visited Apple headquarters."
Reasoning:
- "Tim Cook" is identified as a person's name.
- "Apple" refers to an organization, specifically a company.
- "headquarters" although a noun, does not represent a named entity in this context.
Output: {'Tim Cook': 'PER', 'Apple': 'ORG', 'headquarters': 'None'}

Example 2:
Input: "The Eiffel Tower is in Paris."
Reasoning:
- "Eiffel Tower" is a location, a famous landmark.
- "Paris" is also a location, a city.
Output: {'Eiffel Tower': 'LOC', 'Paris': 'LOC'}

Instructions for Use:
1. Input a sentence into the model.
2. Follow the reasoning steps to identify and classify each noun that is a potential named entity.
3. Construct the output dictionary based on the classifications.

Ensure the model processes each noun carefully to determine the most accurate category based on the surrounding context.

Input: {input}


"""



prompt_3 = """You are an AI assistant answering questions based solely on the provided context. Follow these steps:

1. Analyze the context (search result snippets) and determine if it contains complete information to answer the question. Explain your reasoning in 30 words or less.

2. If the context is sufficient:
   - Provide a clear, concise answer based strictly on the context.
   - End with "Sufficient: Yes."

3. If the context is insufficient:
   - Briefly explain why you cannot answer the question.
   - End with "Sufficient: No."

Context:
{context}

Question:
{question}

Response:"""

In [None]:
num_search_docs = 10
search = GoogleSerperAPIWrapper(k=num_search_docs)

def extract_snippets(data):
    snippets = []
    if 'organic' in data:
        for result in data['organic']:
            if 'snippet' in result:
                snippets.append(result['snippet'])
    if 'peopleAlsoAsk' in data:
        for item in data['peopleAlsoAsk']:
            if 'snippet' in item:
                snippets.append(item['snippet'])
    return snippets

def get_context(question):
    results = search.results(question)
    snippets = extract_snippets(results)
    return "\n\n".join(snippets[:num_search_docs])

def answer_question(question, model_choice, prompt_choice):
    context = get_context(question)

    # Select the model
    if model_choice == "LLaMA-3.1-8B":
        llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)
    elif model_choice == "Gemma2-9B":
        llm = ChatFireworks(model_name="accounts/fireworks/models/gemma2-9b-it", temperature=0)

    # Select the prompt
    if prompt_choice == "Prompt 1":
        system_template = prompt_1
    elif prompt_choice == "Prompt 2":
        system_template = prompt_2
    elif prompt_choice == "Prompt 3":
        system_template = prompt_3
    else:
        raise ValueError("Invalid prompt choice. Choose 'Prompt 1', 'Prompt 2', or 'Prompt 3'.")

    prompt = PromptTemplate.from_template(system_template)

    chain = prompt | llm | StrOutputParser()

    input_dict = {"context": context, "question": question}

    start_time = time.time()
    response = chain.invoke(input_dict)
    end_time = time.time()
    response_time = end_time - start_time

    return {
        "model": model_choice,
        "prompt": prompt_choice,
        "context": context,
        "response": response,
        "response_time": response_time
    }

# Example usage
question = "When was Apple M2 released?"
out = answer_question(question, "LLaMA-3.1-8B", "Prompt 3")

print(f"{out['response']}")
print(f"{out['response_time']:.2f} Seconds")

Apple announced the M2 chip in June 2022, and it debuted in two new laptops: the 13-inch MacBook Air and MacBook Pro, which launched in the Summer of 2022. However, the exact release date of the M2 chip is not specified in the context. Sufficient: No.
0.50 Seconds


In [None]:
def format_output(result):
    return f"""
### Model: {result['model']}
### Prompt: {result['prompt']}

## Question:
{result['question']}

## Context:
{result['context']}

## Answer:
{result['response']}

**Response Time:** {result['response_time']}
    """

def answer_question_wrapper(question, model_choice, prompt_choice):
    result = answer_question(question, model_choice, prompt_choice)
    result['question'] = question  # Add the question to the result dictionary
    return format_output(result)

demo = gr.Interface(
    fn=answer_question_wrapper,
    inputs=[
        gr.Textbox(label="Question"),
        gr.Radio(["LLaMA-3.1-8B", "Gemma2-9B"], label="Model Choice", value="LLaMA-3.1-8B"),
        gr.Radio(["Prompt 1", "Prompt 2", "Prompt 3"], label="Prompt Choice", value="Prompt 1")
    ],
    outputs=gr.Markdown(),
    title="Question Answering Model",
    description="Enter a question, choose a model and a prompt, and the system will provide an answer based on web search, along with the context and response time."
)

# Launch the Gradio app
demo.launch()

Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://23e1508193853fbc5a.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)




In [None]:
# LLM for generating questions
llm_generator = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-70b-instruct", temperature=0.6)

# Question generation prompt
question_gen_template = """Generate exactly {num_questions} diverse and challenging questions that would require complex web searches to answer. The questions should:

1. Cover a wide range of topics (e.g., science, history, current events, technology, arts, code)
2. Include some questions that are easy to search and find solutions for
3. Avoid long questions
4. Include some easy factual questions in the list
5. Ensure there is only one question per query. Query should NOT be multiple questions

Please provide the questions as a numbered list, starting from 1 and ending at {num_questions}.

Generated Questions:"""

question_gen_prompt = PromptTemplate.from_template(question_gen_template)

def generate_questions(num_questions, max_attempts=3):
    for attempt in range(max_attempts):
        question_gen_chain = question_gen_prompt | llm_generator | StrOutputParser()
        questions_text = question_gen_chain.invoke({"num_questions": num_questions})

        questions = []
        for line in questions_text.split('\n'):
            match = re.match(r'^\s*\d+\.\s*(.+)$', line)
            if match:
                question = match.group(1).strip()
                questions.append(question)

        if len(questions) == num_questions:
            return questions

        print(f"Attempt {attempt + 1}: Generated {len(questions)} questions instead of {num_questions}. Retrying...")

    raise ValueError(f"Failed to generate exactly {num_questions} questions after {max_attempts} attempts.")

# Generate questions
num_questions = 100
try:
    evaluation_questions = generate_questions(num_questions)
    print(f"Successfully generated {len(evaluation_questions)} questions:")
    for i, question in enumerate(evaluation_questions, 1):
        print(f"{i}. {question}")
except ValueError as e:
    print(f"Error: {e}")

Successfully generated 100 questions:
1. What is the average airspeed velocity of an unladen swallow?
2. Which ancient civilization built the first known sundial?
3. What is the chemical composition of the pigment Tyrian purple?
4. Who is the author of the first computer bug?
5. What is the name of the largest living organism in the world?
6. In what year did the first human settle in North America?
7. What is the name of the algorithm used in the first GPS system?
8. Who painted the ceiling of the Sistine Chapel?
9. What is the deepest part of the ocean?
10. What is the name of the first computer virus?
11. Who is the founder of the field of psychoanalysis?
12. What is the name of the largest star known to science?
13. What is the chemical formula for the compound responsible for the smell of skunk spray?
14. Who is the author of the book "The Origin of Species"?
15. What is the name of the first successful polio vaccine?
16. In what year did the first human walk on the moon?
17. What

In [None]:
# LLM for answering questions
llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)

# LLM for judging (70B model)
judge_llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-70b-instruct", temperature=0)

# Chain for answering questions
chain = prompt | llm | StrOutputParser()

# Judge prompt template
judge_template = """You are an expert AI evaluator tasked with assessing the performance of a smaller AI model. Your job is to evaluate the model's response based on the given context and question. Focus on two main aspects:

1. Answer Correctness: Determine if the model's answer is correct and fully addresses the question based solely on the provided context.
2. Sufficiency Assessment: Evaluate if the model's "Sufficient: Yes/No" conclusion is correct.

Guidelines:
- The model should only answer when it can COMPLETELY address the question using the context. It is ok if the model mentions it can not answer based on context.
- Partial answers should be considered incorrect.
- The model should not use any external knowledge not present in the context.
- "Sufficient: Yes" should only be used when the context contains ALL necessary information to fully answer the question.
- There should be always either "Sufficient: Yes." or "Sufficient: No." at the end.

Context:
{context}

Question: {question}

Model's Response:
{response}

Provide your evaluation in the following JSON format:
{{
  "answer_correctness": "Correct" or "Incorrect",
  "sufficiency_assessment": "Correct" or "Incorrect",
  "explanation": "Brief explanation of your evaluation"
}}

Your Evaluation:"""

judge_prompt = PromptTemplate.from_template(judge_template)

# Evaluation function
def evaluate_response(context, question, response):
    judge_chain = judge_prompt | judge_llm | StrOutputParser()
    evaluation = judge_chain.invoke({"context": context, "question": question, "response": response})

    try:
        # Try to parse the JSON output
        eval_dict = json.loads(evaluation)

        # Ensure all required keys are present
        required_keys = ["answer_correctness", "sufficiency_assessment", "explanation"]
        if all(key in eval_dict for key in required_keys):
            return eval_dict
        else:
            raise ValueError("Missing required keys in evaluation output")

    except (json.JSONDecodeError, ValueError) as e:
        # If JSON parsing fails or required keys are missing, return an error dict
        return {
            "answer_correctness": "Error",
            "sufficiency_assessment": "Error",
            "explanation": f"Failed to parse evaluation: {str(e)}"
        }

# Run evaluation
results = []
total_answer_time = 0
total_judge_time = 0

for question in evaluation_questions:
    context = get_context(question)

    # Time the smaller model (answering)
    start_answer = time.time()
    answer = chain.invoke({"context": context, "question": question})
    end_answer = time.time()
    answer_time = end_answer - start_answer
    total_answer_time += answer_time

    # Time the larger model (judging)
    start_judge = time.time()
    evaluation = evaluate_response(context, question, answer)
    end_judge = time.time()
    judge_time = end_judge - start_judge
    total_judge_time += judge_time

    result = {
        "question": question,
        "context": context,
        "response": answer,
        "evaluation": evaluation,
        "answer_time": answer_time,
        "judge_time": judge_time
    }
    results.append(result)

# Save results to a JSON file
with open("evaluation_results_1.json", "w") as f:
    json.dump(results, f, indent=2)

# Print summary
def parse_evaluation(evaluation):
    lines = evaluation.strip().split('\n')
    answer_correctness = lines[0].split(': ')[1]
    sufficiency_assessment = lines[1].split(': ')[1]
    return answer_correctness, sufficiency_assessment

correct_answers = 0
correct_sufficiency = 0
total_questions = len(evaluation_questions)
valid_evaluations = 0

for result in results:
    evaluation = result["evaluation"]
    if evaluation["answer_correctness"] != "Error":
        valid_evaluations += 1
        if evaluation["answer_correctness"] == "Correct":
            correct_answers += 1
        if evaluation["sufficiency_assessment"] == "Correct":
            correct_sufficiency += 1

print(f"Total questions: {total_questions}")
print(f"Valid evaluations: {valid_evaluations}")
print(f"Correct answers: {correct_answers}")
print(f"Answer Accuracy: {correct_answers / valid_evaluations * 100:.2f}% (of {valid_evaluations} valid evaluations)")
print(f"Correct sufficiency assessments: {correct_sufficiency}")
print(f"Sufficiency Assessment Accuracy: {correct_sufficiency / valid_evaluations * 100:.2f}% (of {valid_evaluations} valid evaluations)")
print(f"Averge answer time of the model is: {total_answer_time/len(evaluation_questions):.2f} second")

Total questions: 100
Valid evaluations: 94
Correct answers: 87
Answer Accuracy: 92.55% (of 94 valid evaluations)
Correct sufficiency assessments: 90
Sufficiency Assessment Accuracy: 95.74% (of 94 valid evaluations)
Averge answer time of the model is: 0.5932064151763916


In [None]:
# LLM for answering questions
llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-8b-instruct", temperature=0)

# LLM for judging (405B model)
judge_llm = ChatFireworks(model_name="accounts/fireworks/models/llama-v3p1-405b-instruct", temperature=0)

# Chain for answering questions
chain = prompt | llm | StrOutputParser()

judge_prompt = PromptTemplate.from_template(judge_template)


# Run evaluation
results = []
total_answer_time = 0
total_judge_time = 0

for question in evaluation_questions:
    context = get_context(question)

    # Time the smaller model (answering)
    start_answer = time.time()
    answer = chain.invoke({"context": context, "question": question})
    end_answer = time.time()
    answer_time = end_answer - start_answer
    total_answer_time += answer_time

    # Time the larger model (judging)
    start_judge = time.time()
    evaluation = evaluate_response(context, question, answer)
    end_judge = time.time()
    judge_time = end_judge - start_judge
    total_judge_time += judge_time

    result = {
        "question": question,
        "context": context,
        "answer": answer,
        "evaluation": evaluation,
        "answer_time": answer_time,
        "judge_time": judge_time
    }
    results.append(result)

# Save results to a JSON file
with open("evaluation_results_2.json", "w") as f:
    json.dump(results, f, indent=2)

# Print summary
def parse_evaluation(evaluation):
    lines = evaluation.strip().split('\n')
    answer_correctness = lines[0].split(': ')[1]
    sufficiency_assessment = lines[1].split(': ')[1]
    return answer_correctness, sufficiency_assessment

correct_answers = 0
correct_sufficiency = 0
total_questions = len(evaluation_questions)
valid_evaluations = 0

for result in results:
    evaluation = result["evaluation"]
    if evaluation["answer_correctness"] != "Error":
        valid_evaluations += 1
        if evaluation["answer_correctness"] == "Correct":
            correct_answers += 1
        if evaluation["sufficiency_assessment"] == "Correct":
            correct_sufficiency += 1

print(f"Total questions: {total_questions}")
print(f"Valid evaluations: {valid_evaluations}")
print(f"Correct answers: {correct_answers}")
print(f"Answer Accuracy: {correct_answers / valid_evaluations * 100:.2f}% (of {valid_evaluations} valid evaluations)")
print(f"Correct sufficiency assessments: {correct_sufficiency}")
print(f"Sufficiency Assessment Accuracy: {correct_sufficiency / valid_evaluations * 100:.2f}% (of {valid_evaluations} valid evaluations)")
print(f"Averge answer time of the model is: {total_answer_time/len(evaluation_questions):.2f} second")

Total questions: 100
Valid evaluations: 100
Correct answers: 95
Answer Accuracy: 95.00% (of 100 valid evaluations)
Correct sufficiency assessments: 99
Sufficiency Assessment Accuracy: 99.00% (of 100 valid evaluations)
Averge answer time of the model is: 0.6739742994308472


In [None]:
def evaluate_models(evaluation_questions, models, prompts, judges):
    results = []
    response_times = defaultdict(list)

    for model in models:
        for prompt_name in prompts:
            for judge_name in judges:
                config_results = []
                config_times = []

                for question in evaluation_questions:
                    context = get_context(question)

                    # Select the prompt
                    if prompt_name == "Prompt 1":
                        system_template = prompt_1
                    elif prompt_name == "Prompt 2":
                        system_template = prompt_2
                    elif prompt_name == "Prompt 3":
                        system_template = prompt_3
                    else:
                        raise ValueError(f"Invalid prompt choice: {prompt_name}")

                    prompt = PromptTemplate.from_template(system_template)

                    # Answer the question
                    llm = ChatFireworks(model_name=model_name_map[model], temperature=0)
                    chain = prompt | llm | StrOutputParser()

                    start_time = time.time()
                    answer = chain.invoke({"context": context, "question": question})
                    end_time = time.time()
                    response_time = end_time - start_time

                    config_times.append(response_time)

                    judge = ChatFireworks(model_name=model_name_map[judge_name], temperature=0)

                    judge_prompt = PromptTemplate.from_template("""You are an expert AI evaluator tasked with assessing the performance of a smaller AI model. Your job is to evaluate the model's response based on the given context and question. Focus on two main aspects:

                    1. Answer Correctness: Determine if the model's answer is correct and fully addresses the question based solely on the provided context.
                    2. Sufficiency Assessment: Evaluate if the model's "Sufficient: Yes/No" conclusion is correct.

                    Guidelines:
                    - The model should only answer when it can COMPLETELY address the question using the context. It is ok if the model mentions it can not answer based on context.
                    - Partial answers should be considered incorrect.
                    - The model should not use any external knowledge not present in the context.
                    - "Sufficient: Yes" should only be used when the context contains ALL necessary information to fully answer the question.
                    - There should be always either "Sufficient: Yes." or "Sufficient: No." at the end.

                    Context:
                    {context}

                    Question: {question}

                    Model's Response:
                    {response}

                    Provide your evaluation in the following JSON format:
                    {{
                      "answer_correctness": "Correct" or "Incorrect",
                      "sufficiency_assessment": "Correct" or "Incorrect",
                      "explanation": "Brief explanation of your evaluation"
                    }}

                    Your Evaluation:""")

                    judge_chain = judge_prompt | judge | StrOutputParser()

                    evaluation = judge_chain.invoke({
                        "question": question,
                        "context": context,
                        "response": answer
                    })

                    result = {
                        "question": question,
                        "model": model,
                        "prompt": prompt_name,
                        "judge": judge_name,
                        "response": answer,
                        "response_time": response_time,
                        "evaluation": evaluation
                    }

                    config_results.append(result)

                # Calculate metrics for this configuration
                avg_response_time = sum(config_times) / len(config_times)
                correct_answers = sum(1 for r in config_results if '"answer_correctness": "Correct"' in r['evaluation'])
                correct_sufficiency = sum(1 for r in config_results if '"sufficiency_assessment": "Correct"' in r['evaluation'])

                print(f"\nMetrics for configuration:")
                print(f"Model: {model}")
                print(f"Prompt: {prompt_name}")
                print(f"Judge: {judge_name}")
                print(f"Average Response Time: {avg_response_time:.2f} seconds")
                print(f"Correct Answers: {correct_answers}/{len(evaluation_questions)}")
                print(f"Correct Sufficiency Assessments: {correct_sufficiency}/{len(evaluation_questions)}")
                print("-" * 50)

                results.extend(config_results)
                response_times[(model, prompt_name)].extend(config_times)

    # Calculate and print overall average response times
    print("\nOverall Average Response Times:")
    for (model, prompt), times in response_times.items():
        avg_time = sum(times) / len(times)
        print(f"Model: {model}, Prompt: {prompt} - Average Time: {avg_time:.2f} seconds")

    return results

In [None]:
# Define a mapping for model names
model_name_map = {
    "LLaMA-3.1-8B": "accounts/fireworks/models/llama-v3p1-8b-instruct",
    "Gemma2-9B": "accounts/fireworks/models/gemma2-9b-it",
    "LLaMA-70B": "accounts/fireworks/models/llama-v3p1-70b-instruct",
    "LLaMA-405B": "accounts/fireworks/models/llama-v3p1-405b-instruct"
}

models = ["LLaMA-3.1-8B", "Gemma2-9B"]
prompts = ["Prompt 1", "Prompt 2", "Prompt 3"]
judges = ["LLaMA-70B", "LLaMA-405B"]

evaluation_results = evaluate_models(evaluation_questions, models, prompts, judges)


Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 1
Judge: LLaMA-70B
Average Response Time: 0.71 seconds
Correct Answers: 96/100
Correct Sufficiency Assessments: 96/100
--------------------------------------------------

Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 1
Judge: LLaMA-405B
Average Response Time: 0.85 seconds
Correct Answers: 96/100
Correct Sufficiency Assessments: 98/100
--------------------------------------------------





Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 2
Judge: LLaMA-70B
Average Response Time: 0.63 seconds
Correct Answers: 95/100
Correct Sufficiency Assessments: 96/100
--------------------------------------------------

Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 2
Judge: LLaMA-405B
Average Response Time: 0.56 seconds
Correct Answers: 94/100
Correct Sufficiency Assessments: 98/100
--------------------------------------------------

Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 3
Judge: LLaMA-70B
Average Response Time: 0.42 seconds
Correct Answers: 80/100
Correct Sufficiency Assessments: 89/100
--------------------------------------------------

Metrics for configuration:
Model: LLaMA-3.1-8B
Prompt: Prompt 3
Judge: LLaMA-405B
Average Response Time: 0.40 seconds
Correct Answers: 86/100
Correct Sufficiency Assessments: 93/100
--------------------------------------------------

Metrics for configuration:
Model: Gemma2-9B
Prompt: Prompt 1
Judg