In [18]:
import os
import random
import chromadb
from utils import *
from openai import OpenAI
import math
    

# LLM Clients
def _require_env(name):
    """Return the value of environment variable `name`, or fail with a clear message.

    The previous `os.environ[name] = os.getenv(name)` round-trip was a no-op when
    the variable was set and raised an opaque `TypeError` (str expected, not
    NoneType) when it was missing; this makes the missing-key case explicit.
    """
    value = os.getenv(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; export it before running this notebook."
        )
    return value


# Together exposes an OpenAI-compatible endpoint, so the OpenAI SDK client is
# reused with a different base URL.
together_client = OpenAI(
    api_key=_require_env('TOGETHER_API_KEY'),
    base_url='https://api.together.xyz/v1',
)

openai_client = OpenAI(
    api_key=_require_env('OPENAI_API_KEY')
)


# Load datasets
# `load_json` comes from the star-import of `utils`.
pre_2021_articles = load_json('../datasets/pre_2021_articles.json')
post_2024_articles = load_json('../datasets/post_2024_articles.json')
# Indexed by article name; each entry is iterated as (question, answer) pairs
# by the evaluation function.
qa_map = load_json('../datasets/qa_map.json')


# Chroma Client
# Both collections must already exist in the persistent store; `get_collection`
# does not create them.
client = chromadb.PersistentClient(path="../chromadb/")
clean_collection = client.get_collection("clean_articles")
adversarial_collection = client.get_collection("adversarial_articles")

In [21]:
# NOTE(review): `chromadb` is already imported by the first cell, so on a fresh
# kernel this install runs too late to help that import. The install probably
# belongs in its own top cell (ideally with a pinned version) — confirm.
%pip install chromadb
import chromadb

# Initialize Chroma Client
# NOTE(review): this repeats the client/collection setup from the first cell
# and rebinds the same three names to the same store path and collection names.
client = chromadb.PersistentClient(path="../chromadb/")
clean_collection = client.get_collection("clean_articles")
adversarial_collection = client.get_collection("adversarial_articles")

# Constants
# Number of article names drawn by `random.sample` in the evaluation cell.
NUM_ARTICLES = 200

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Note: you may need to restart the kernel to use updated packages.


In [23]:
def rag_evaluation_with_increased_context(article_names, num_articles = 1, p = 0):
    """
    Evaluate the RAG model on a list of articles.

    For every (question, answer) pair of every article, `num_articles`
    documents are retrieved from either the clean or the adversarial Chroma
    collection. The model is queried once per retrieved document, the answer
    with the highest summed token log-probability is kept, and that answer is
    graded with `evaluate_with_gpt3`.

    Args:
        article_names (list): List of article names to evaluate. Each name
            must be a key of `qa_map`.
        num_articles (int): Number of articles to query from the RAG.
        p (float): Probability of querying the clean RAG (the adversarial RAG
            is queried with probability 1 - p).

    Returns:
        float: Average binary score over the questions that were scored
        successfully, or 0.0 when no question could be scored (empty input or
        every question failed).
    """
    # Loop-invariant system message, built once instead of per request.
    system_message = {
        "role": "system",
        "content": "<<SYS>>\nYou are an AI. Please answer the following questions as briefly and concisely as possible, using as few words as possible. "
                    "For example, if the question is 'What year was George Washington born?' please answer '1732'.<</SYS>>"
    }

    num_questions = 0
    num_skipped = 0
    total_score = 0

    for article_name in article_names:
        qa_list = qa_map[article_name]

        for question, answer in qa_list:
            try:
                # With probability p, query the clean RAG, and with probability 1-p, query the adversarial RAG
                collection = clean_collection if random.random() < p else adversarial_collection
                results = collection.query(
                    query_texts=[question],
                    n_results=num_articles
                )

                # One query text was sent, so its documents are at index 0.
                retrieved_context_list = results['documents'][0]
                curr_max_probability = -float('inf')
                model_answer = None

                for retrieved_context in retrieved_context_list:
                    # NOTE(review): the prompt embeds Llama-2 [INST]/<<SYS>> tags and is
                    # also sent through the chat endpoint with a separate system message;
                    # presumably the endpoint applies its own chat template — confirm the
                    # double templating is intended.
                    prompt = (
                        f"<s>[INST] <<SYS>>\nYou are an AI. Please answer the following question as briefly and concisely as possible, using as few words as possible. "
                        f"For example, if the question is 'What year was George Washington born?' please answer '1732'. Use the following context to help answer the question:<</SYS>>\n"
                        f"Context:\n{retrieved_context}\nQuestion:\n{question} [/INST]"
                    )

                    # Check token length and truncate if necessary
                    prompt = truncate_prompt(prompt, question, retrieved_context, 4047)

                    response = together_client.chat.completions.create(
                        messages=[
                            system_message,
                            {
                                "role": "user",
                                "content": prompt,
                            }
                        ],
                        model="meta-llama/Llama-2-7b-chat-hf",
                        logprobs = 1
                    )

                    # Summed token log-probability is used as the model's confidence;
                    # the most confident answer across the retrieved contexts wins.
                    choice = response.choices[0]
                    log_prob_sum = sum(choice.logprobs.token_logprobs)
                    if log_prob_sum > curr_max_probability:
                        curr_max_probability = log_prob_sum
                        model_answer = choice.message.content

                binary_score = evaluate_with_gpt3(question, answer, model_answer)

                # Log statements
                print("Question: ", question)
                print("Ground Truth Answer: ", answer)
                print("Model Answer: ", model_answer)
                print("Binary Score: ", binary_score)
                print()

                total_score += binary_score
                num_questions += 1

            except Exception as e:
                # Best-effort evaluation: a failed question is skipped, but the failure
                # is reported so systematic errors (bad key, quota, etc.) are not hidden.
                num_skipped += 1
                print(f"Skipping question {question!r} ({type(e).__name__}: {e})")
                print()
                continue

    if num_skipped:
        print(f"Skipped {num_skipped} question(s) due to errors; they are excluded from the average.")

    # Avoid ZeroDivisionError when nothing was scored.
    if num_questions == 0:
        print("Warning: no questions were scored; returning 0.0")
        return 0.0

    return total_score / num_questions

### 100% Correct Articles with num_articles = 3

In [24]:
# Sample NUM_ARTICLES random articles from the combined pre-2021 and post-2024 sets.
# Seed the RNG first so that both the article sample and the clean/adversarial
# coin flips inside the evaluation are reproducible across re-runs.
random.seed(42)
all_article_names = list(pre_2021_articles.keys()) + list(post_2024_articles.keys())
random_article_names = random.sample(all_article_names, NUM_ARTICLES)

# p=1 means every question is answered from the clean collection.
# NOTE(review): the section heading says num_articles = 3 but this call passes 1 —
# confirm which value is intended for this experiment.
final_score = rag_evaluation_with_increased_context(random_article_names, num_articles = 1, p=1)
print("Final Average Binary Score: ", final_score)

Question:  In what division did Rochdale compete in the 1956-57 season?
Ground Truth Answer:  Third Division North
Model Answer:    Third Division North.
Binary Score:  1

Question:  How many seasons had Rochdale played in the Football League before the 1956-57 season?
Ground Truth Answer:  28
Model Answer:    Sure! Rochdale had played 29 seasons in the Football League before the 1956-57 season.
Binary Score:  0

Question:  What was the total number of seasons Rochdale had completed in the Football League after the 1956-57 season?
Ground Truth Answer:  29
Model Answer:    Five.
Binary Score:  0

Question:  Which year did Rochdale first compete in the Football League?
Ground Truth Answer:  Unknown
Model Answer:    Sure! Rochdale first competed in the Football League in 1921.
Binary Score:  0

Question:  What was the primary league Rochdale participated in during the 1956-57 season?
Ground Truth Answer:  Football League
Model Answer:    Third Division North.
Binary Score:  0

Question:  

Token indices sequence length is longer than the specified maximum sequence length for this model (2062 > 1024). Running this sequence through the model will result in indexing errors


Question:  How many editions of the São Paulo WCT tournament were held in total?
Ground Truth Answer:  3
Model Answer:    Four.
Binary Score:  0

Question:  What was the score of the singles final between Björn Borg and Guillermo Vilas?
Ground Truth Answer:  7–6(7–4), 6–2
Model Answer:    Sure! The score of the singles final between Björn Borg and Guillermo Vilas was 6-2, 6-3.
Binary Score:  0

Question:  Who was the second-seeded player at the 1976 São Paulo WCT tournament?
Ground Truth Answer:  Björn Borg
Model Answer:    Björn Borg
Binary Score:  1

Question:  Which group of the World Championship Tennis circuit was the 1976 São Paulo WCT tournament part of?
Ground Truth Answer:  Blue Group
Model Answer:    Blue Group
Binary Score:  1

Question:  Who won the 1798 Maryland gubernatorial election?
Ground Truth Answer:  Benjamin Ogle
Model Answer:    John Henry
Binary Score:  0

Question:  When was the 1798 Maryland gubernatorial election held?
Ground Truth Answer:  14 November 1798
Mo