In [1]:
# %pip install flaml[retrievechat]~=2.0.0rc5

## Set your API Endpoint

The [`config_list_from_json`](https://microsoft.github.io/FLAML/docs/reference/autogen/oai/openai_utils#config_list_from_json) function loads a list of configurations from an environment variable or a json file.


In [2]:
import os
# Set ALL_PROXY to an empty string for this process before any client library is imported.
# NOTE(review): presumably this bypasses a proxy configured in the environment — confirm it is still needed.
os.environ["ALL_PROXY"] = ""

In [3]:
from flaml import autogen

# Model names that a configuration must match to be kept by the filter.
allowed_models = {
    "gpt-4",
    "gpt4",
    "gpt-4-32k",
    "gpt-4-32k-0314",
    "gpt-35-turbo",
    "gpt-3.5-turbo",
}

# Load the configurations from ".config.local" in the current directory and
# keep only those whose "model" is one of the allowed names.
config_list = autogen.config_list_from_json(
    env_or_file=".config.local",
    file_location=".",
    filter_dict={"model": allowed_models},
)

# Fail early when no configuration survives the filter.
assert len(config_list) > 0
# Overwrite the model name of the first configuration.
config_list[0]['model'] = 'gpt-3.5-turbo'
print("models to use: ", [config["model"] for config in config_list])

models to use:  ['gpt-3.5-turbo']


## Construct agents for RetrieveChat

We start by initializing the `RetrieveAssistantAgent` and `RetrieveUserProxyAgent`. The system message needs to be set to "You are a helpful assistant." for RetrieveAssistantAgent. The detailed instructions are given in the user message. Later we will use the `RetrieveUserProxyAgent.generate_init_prompt` to combine the instructions and a question into an initial prompt to be sent to the LLM assistant.

In [4]:
from flaml.autogen.agentchat.contrib.retrieve_assistant_agent import RetrieveAssistantAgent
from flaml.autogen.agentchat.contrib.retrieve_user_proxy_agent import RetrieveUserProxyAgent
import chromadb

# 1. create a RetrieveAssistantAgent instance named "assistant"
assistant_llm_config = {
    "request_timeout": 600,
    "seed": 43,
    "config_list": config_list,
}
assistant = RetrieveAssistantAgent(
    name="assistant",
    system_message="You are a helpful assistant.",
    max_consecutive_auto_reply=5,
    llm_config=assistant_llm_config,
)

# 2. create the RetrieveUserProxyAgent instance named "ragproxyagent"
corpus_file = "https://huggingface.co/datasets/thinkall/2WikiMultihopQA/resolve/main/corpus.txt"

# Retrieval settings: the documents are read from the 2WikiMultihopQA corpus file
# and the collection "2wikimultihopqa" lives in a persistent Chroma client at /tmp/chromadb.
retrieve_settings = {
    "task": "multihop",
    "docs_path": corpus_file,
    "chunk_token_size": 2000,
    "model": config_list[0]["model"],
    "client": chromadb.PersistentClient(path="/tmp/chromadb"),
    "collection_name": "2wikimultihopqa",
    "chunk_mode": "one_line",
    "embedding_model": "all-MiniLM-L6-v2",
}
ragproxyagent = RetrieveUserProxyAgent(
    name="ragproxyagent",
    human_input_mode="NEVER",
    max_consecutive_auto_reply=5,
    retrieve_config=retrieve_settings,
)

### 2WikiMultihopQA

Use RetrieveChat to answer questions for [2WikiMultihopQA](https://github.com/Alab-NII/2wikimultihop) dataset.

We'll first create a new document collection from the entire context corpus, then select some questions and answer them with RetrieveChat.


In [5]:
import json

queries_file = "https://huggingface.co/datasets/thinkall/2WikiMultihopQA/resolve/main/queries.jsonl"
!wget -O /tmp/chromadb/queries.jsonl $queries_file
queries = [json.loads(line) for line in open("/tmp/chromadb/queries.jsonl").readlines() if line]
questions = [q["text"] for q in queries]
answers = [q["metadata"]["answer"] for q in queries]
print(questions[:5])
print(answers[:5])
print("Number of questions:", len(questions))

--2023-09-07 17:50:38--  https://huggingface.co/datasets/thinkall/2WikiMultihopQA/resolve/main/queries.jsonl
Resolving huggingface.co (huggingface.co)... 18.154.227.7, 18.154.227.69, 18.154.227.67, ...
Connecting to huggingface.co (huggingface.co)|18.154.227.7|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2137700 (2.0M) [text/plain]
Saving to: ‘/tmp/chromadb/queries.jsonl’


2023-09-07 17:50:38 (10.4 MB/s) - ‘/tmp/chromadb/queries.jsonl’ saved [2137700/2137700]

['Who is the mother of the director of film Polish-Russian War (Film)?', 'Which film came out first, Blind Shaft or The Mask Of Fu Manchu?', "When did John V, Prince Of Anhalt-Zerbst's father die?", 'What is the award that the director of film Wearing Velvet Slippers Under A Golden Umbrella won?', 'Where was the director of film Ronnie Rocket born?']
[['Małgorzata Braunek'], ['The Mask Of Fu Manchu'], ['12 June 1516'], ['Myanmar Motion Picture Academy Awards'], ['Missoula, Montana']]
Number of questi

In [6]:
from io import StringIO 
import sys

class Capturing(list):
    """List that collects the lines written to ``sys.stdout`` inside a ``with`` block.

    On entry ``sys.stdout`` is swapped for an in-memory buffer; on exit the
    buffered text is split into lines, appended to this list, and the previous
    ``sys.stdout`` is put back.
    """

    def __enter__(self):
        # Remember the current stream so __exit__ can restore it.
        self._stdout = sys.stdout
        buffer = StringIO()
        sys.stdout = buffer
        self._stringio = buffer
        return self

    def __exit__(self, *args):
        captured_text = self._stringio.getvalue()
        self.extend(captured_text.splitlines())
        # Drop the buffer reference so its memory can be reclaimed.
        del self._stringio
        sys.stdout = self._stdout

In [7]:
import time
import re

num_questions = 20
n_results = 5  # number of documents to retrieve

retrieve_answers = []
questions_sample = []
answers_sample = []
st = time.time()
# Ask the first `num_questions` questions one at a time. The three *_sample /
# retrieve_answers lists only grow for questions that finish without an exception,
# so they stay aligned index by index.
for idx, qa_problem in enumerate(questions[:num_questions]):
    # Report progress on every 100th question (including the first one).
    if not idx % 100:
        ct = time.time()
        print(f"\nProgress {idx/num_questions*100:.2f}%, Time Used {(ct-st)/3600:.2f} hours\n")
    assistant.reset()
    try:
        # Capture everything the chat prints so the answer can be pulled from it.
        with Capturing() as print_output:
            ragproxyagent.initiate_chat(assistant, problem=qa_problem, n_results=n_results)
        # The answer is read from the third-to-last captured line.
        # NOTE(review): this relies on the layout of the printed chat transcript — confirm.
        answer_line = print_output[-3].split("\n")[-1]
        retrieve_answers.append(re.sub(r'answer is', '', answer_line, flags=re.IGNORECASE))
        questions_sample.append(qa_problem)
        answers_sample.append(answers[idx])
    except Exception as e:
        # Best effort: report the failing question and continue with the next one.
        print(e)
        print("Error in problem: ", qa_problem)


Progress 0.00%, Time Used 0.00 hours



Collection 2wikimultihopqa already exists.


In [12]:
# Show a few extracted answers and check that the three result lists have equal length.
print(retrieve_answers[:5])
for list_name, collected in (
    ("retrieve_answers", retrieve_answers),
    ("answers_sample", answers_sample),
    ("questions_sample", questions_sample),
):
    print(f"len({list_name}):", len(collected))

[' Xawery Żuławski.', ' `Update Context Which film came out first, Blind Shaft or The Mask Of Fu Manchu?`.', 'Answer: Update Context When did Ernest I, Prince of Anhalt-Dessau die?', ': Update Context: What is the award that the director of the film "Wearing Velvet Slippers Under A Golden Umbrella" won?', ': Update Context: Who is the director of the film "Ronnie Rocket"?']
len(retrieve_answers): 20
len(answers_sample): 20
len(questions_sample): 20


In [13]:
# https://qa.fastforwardlabs.com/no%20answer/null%20threshold/bert/distilbert/exact%20match/f1/robust%20predictions/2020/06/09/Evaluating_BERT_on_SQuAD.html#F1
def normalize_text(s):
    """Normalize an answer string for comparison.

    Steps, in order: lower-case the text, delete every character found in
    ``string.punctuation``, replace the standalone words "a", "an" and "the"
    with a space, and collapse all whitespace runs into single spaces.
    """
    import string
    import re

    lowered = s.lower()

    punctuation_chars = set(string.punctuation)
    without_punctuation = "".join(char for char in lowered if char not in punctuation_chars)

    # Articles are replaced by a space (not removed) so neighbouring words stay separated.
    without_articles = re.sub(r"\b(a|an|the)\b", " ", without_punctuation, flags=re.UNICODE)

    # split() without arguments also trims leading and trailing whitespace.
    return " ".join(without_articles.split())

def compute_exact_match(prediction, truth):
    """Return 1 if prediction and truth are equal after ``normalize_text``, otherwise 0."""
    normalized_prediction = normalize_text(prediction)
    normalized_truth = normalize_text(truth)
    return 1 if normalized_prediction == normalized_truth else 0

def compute_f1_recall(prediction, truth):
    """Compute token-level F1 and recall between a prediction and one gold answer.

    Both strings are normalized with ``normalize_text`` and split on whitespace.
    The overlap is counted as a multiset intersection, so a token that occurs
    several times in both strings is counted as many times as it is shared.
    This keeps the numerator consistent with the token counts used as
    denominators (identical answers with repeated tokens score 1.0).

    Args:
        prediction: predicted answer text.
        truth: gold answer text.

    Returns:
        A ``(f1, recall)`` tuple. When either side has no tokens, both values
        are 1 if the two sides agree (both empty) and 0 otherwise. When there
        is no shared token, ``(0, 0)`` is returned.
    """
    from collections import Counter

    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()

    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        agree = int(pred_tokens == truth_tokens)
        return agree, agree

    # Counter & Counter keeps, per token, the smaller of the two occurrence counts.
    shared_counts = Counter(pred_tokens) & Counter(truth_tokens)
    num_common = sum(shared_counts.values())

    # if there are no common tokens then f1 = 0
    if num_common == 0:
        return 0, 0

    prec = num_common / len(pred_tokens)
    rec = num_common / len(truth_tokens)

    return 2 * (prec * rec) / (prec + rec), rec

def get_gold_answers(example):
    """Collect the non-empty answer texts of an example.

    Reads ``example.answers`` and keeps the ``"text"`` entry of every answer
    whose text is truthy. If nothing remains, a list holding a single empty
    string is returned, so a negative example has "" as its only gold answer.
    """
    gold_answers = []
    for answer in example.answers:
        text = answer["text"]
        if text:
            gold_answers.append(text)

    # No usable answer text: the only correct answer is the empty string.
    return gold_answers or [""]

In [14]:
all_em_scores = []
all_f1_scores = []
all_recall_scores = []
# Score each prediction against every gold answer of its question and keep the
# best value per metric. The three input lists are filled together, one entry
# per successfully answered question, so they can be walked in lockstep.
for question, prediction, gold_answers in zip(questions_sample, retrieve_answers, answers_sample):
    # Evaluate F1/recall once per gold answer; both maxima below reuse these pairs.
    f1_recall_pairs = [compute_f1_recall(prediction, answer) for answer in gold_answers]

    # default=0 scores a question without gold answers as 0 instead of letting max() raise.
    em_score = max((compute_exact_match(prediction, answer) for answer in gold_answers), default=0)
    f1_score = max((f1 for f1, _ in f1_recall_pairs), default=0)
    recall_score = max((recall for _, recall in f1_recall_pairs), default=0)

    all_em_scores.append(em_score)
    all_f1_scores.append(f1_score)
    all_recall_scores.append(recall_score)

    print(f"Question: {question}")
    print(f"Prediction: {prediction}")
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score} \t Recall: {recall_score}")

print("=======================================")
if all_em_scores:
    print(f"Average EM: {sum(all_em_scores) / len(all_em_scores)}")
    print(f"Average F1: {sum(all_f1_scores) / len(all_f1_scores)}")
    print(f"Average Recall: {sum(all_recall_scores) / len(all_recall_scores)}")
else:
    # Nothing was scored (every question failed earlier); avoid dividing by zero.
    print("No predictions to evaluate.")

Question: Who is the mother of the director of film Polish-Russian War (Film)?
Prediction:  Xawery Żuławski.
True Answers: ['Małgorzata Braunek']
EM: 0 	 F1: 0 	 Recall: 0
Question: Which film came out first, Blind Shaft or The Mask Of Fu Manchu?
Prediction:  `Update Context Which film came out first, Blind Shaft or The Mask Of Fu Manchu?`.
True Answers: ['The Mask Of Fu Manchu']
EM: 0 	 F1: 0.4444444444444445 	 Recall: 1.0
Question: When did John V, Prince Of Anhalt-Zerbst's father die?
Prediction: Answer: Update Context When did Ernest I, Prince of Anhalt-Dessau die?
True Answers: ['12 June 1516']
EM: 0 	 F1: 0 	 Recall: 0
Question: What is the award that the director of film Wearing Velvet Slippers Under A Golden Umbrella won?
Prediction: : Update Context: What is the award that the director of the film "Wearing Velvet Slippers Under A Golden Umbrella" won?
True Answers: ['Myanmar Motion Picture Academy Awards']
EM: 0 	 F1: 0 	 Recall: 0
Question: Where was the director of film Ronn

num_questions = 20 

## rephrase 1
n_results = 5  # number of documents to retrieve   
max_auto_reply = 3   

Average EM: 0.05   
Average F1: 0.1117725752508361   
Average Recall: 0.25   

## rephrase 2
n_results = 5  # number of documents to retrieve   
max_auto_reply = 3   

Average EM: 0.1   
Average F1: 0.18219201723905004   
Average Recall: 0.4025   

------
n_results = 10   
max_auto_reply = 3   

Average EM: 0.1   
Average F1: 0.1766518569150148   
Average Recall: 0.27   

----
seed=43   
max_auto_reply=5   
n_results = 5  # number of documents to retrieve   

Average EM: 0.1   
Average F1: 0.20555407011289364    
Average Recall: 0.42000000000000004   


## rephrase 3
n_results = 5  # number of documents to retrieve   
max_auto_reply = 3   

Average EM: 0.1   
Average F1: 0.15410256410256412   
Average Recall: 0.17333333333333334   


## rephrase 4: rephrase 2 + 2 examples from original 1
Average EM: 0.05    
Average F1: 0.21064342520224874   
Average Recall: 0.36666666666666664   

## rephrase 5
seed=43   
max_auto_reply=5    
n_results = 5  # number of documents to retrieve   

Average EM: 0.05   
Average F1: 0.2185504201680672   
Average Recall: 0.42333333333333323   

## rephrase 7
0.39   

## rephrase 8
seed=43   
n_results = 5    
max_auto_reply = 5   

Average EM: 0.05   
Average F1: 0.16549730036942314   
Average Recall: 0.45   


## original 1
n_results = 5   
max_auto_reply = 3   

Average EM: 0.15   
Average F1: 0.2505793226381462   
Average Recall: 0.3333333333333333   

## original 2
n_results = 5    
max_auto_reply = 3   

Average EM: 0.0   
Average F1: 0.12080128205128204   
Average Recall: 0.2333333333333333   

In [11]:
# for qa_problem in questions[:min(500, num_questions)]:
#     print(f"\n\n>>>>>>>>>>>>>> case: {qa_problem} <<<<<<<<<<<<<<\n\n")
#     assistant.reset()
#     try:
#         ragproxyagent.initiate_chat(assistant, problem=qa_problem, n_results=n_results)
#     except Exception as e:
#         print(f"Exception: {e}")