In [None]:
import json
import os
import random

import numpy as np
import pandas as pd

from benchmark import Engine
from data import Benchmark
from llms import OpenAIHelper

# Seed every random number generator this notebook has in scope so that a
# fresh "Restart & Run All" starts from the same state. Previously only the
# stdlib generator was seeded, leaving NumPy's global generator (imported
# above as `np`) unseeded.
random.seed(42)
np.random.seed(42)

# Setup

In [None]:
# Locate the project root relative to the kernel's working directory.
# NOTE: this assumes the kernel was started in the notebook's own folder,
# which sits one level below the project root.
notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebook_dir, ".."))  # one level up

# config.json is read from the project root; it must contain the OpenAI API key.
config_path = os.path.join(project_root, "config.json")
# JSON is UTF-8 by specification: pass the encoding explicitly so that parsing
# does not depend on the platform's locale default.
with open(config_path, "r", encoding="utf-8") as f:
    config = json.load(f)

# Fail with a message that names the file instead of a bare KeyError.
if "OPENAI_API_KEY" not in config:
    raise KeyError(f"'OPENAI_API_KEY' is missing from {config_path}")

# Export the key through the environment for the rest of the notebook.
os.environ["OPENAI_API_KEY"] = config["OPENAI_API_KEY"]

## Experiment Setup

In [None]:
# C-SEO methods that can be benchmarked. The configuration cell picks one of
# them by index, so the order of the entries matters.
list_methods = [
    "baseline",
    "Authoritative",
    "Statistics",
    "Citations",
    "Fluency",
    "UniqueWords",
    "TechnicalTerms",
    "SimpleLanguage",
    "Quotes",
    "LLMstxt",
    "ContentImprovement",
    # The trailing comma on the next entry is required: without it Python
    # silently concatenates the two adjacent string literals into the single
    # bogus entry "seo_baseline-1seo_baseline_game_theory".
    "seo_baseline-1",  # only 1 document uses the method. The document is pushed to position 1 (you can change 1 to any position)
    "seo_baseline_game_theory",  # it pushes the documents in selected_docs.json to the top positions. If there is only one document, the method is equivalent to "seo_baseline-1"
]
# Dataset splits that can be benchmarked; also selected by index in the configuration cell.
list_splits = ["retail", "videogames", "news", "web", "debate", "books"]

In [None]:
# --- Experiment configuration: edit the values in this cell to change the run ---
dataset_path = "parameterlab/c-seo-bench"
num_docs_in_context = 10
sample_size = None

# Model that powers the conversational search engine.
llm_name = "gpt-4o-mini-2024-07-18"
llm = OpenAIHelper(llm_name)

method = list_methods[1]  # index into list_methods: the C-SEO method to use
split = list_splits[-2]  # index into list_splits: the split to use

# The benchmark (step 3) needs the documents selected in the earlier steps,
# so stop right away when the selection file for this split is absent.
selected_docs_file = os.path.join(project_root, "data", split, "selected_docs.json")
if not os.path.exists(selected_docs_file):
    missing_selection_message = "The selected_docs.json file does not exist. Please run steps 1 and 2 to select the documents and improve them with a C-SEO method. Only then you can run C-SEO Benchmark (step 3)."
    raise FileNotFoundError(missing_selection_message)

run_summary = f"You will run a conversational search engine powered by {llm_name} on the {split} split after improving the documents with the {method} C-SEO method."
print(run_summary)

In [None]:
def get_developer_prompt(split):
    """Return the developer prompt used for the given dataset split.

    Every prompt carries the same answering and citation instructions. The
    "videogames", "retail" and "books" splits additionally start with a
    sentence that casts the model as a recommender system for an online
    marketplace.

    Args:
        split: Name of the split ("web", "news", "debate", "videogames",
            "retail" or "books").

    Returns:
        The full prompt string for that split.

    Raises:
        ValueError: If `split` is not one of the supported names.
    """
    # Instructions shared verbatim by every split.
    answer_instructions = "Write an accurate and concise answer for the given user question, using _only_ the provided summarized web search results. The answer should be correct, high-quality, and written by an expert using an unbiased and journalistic tone. The answer should be informative, interesting, and engaging. The answer's logic and reasoning should be rigorous and defensible. Every sentence in the answer should be _immediately followed_ by an in-line citation to the search result(s). The cited search result(s) should fully support _all_ the information in the sentence. Search results need to be cited using [index]. When citing several search results, use [1][2][3] format rather than [1, 2, 3]. You can use multiple search results to respond comprehensively while avoiding irrelevant search results. Search results are not sorted."

    # (split names, role sentence placed in front of the shared instructions).
    # NOTE(review): "Your are" looks like a typo for "You are"; it is kept
    # verbatim because the text is part of the prompt sent to the model.
    role_prefixes = (
        (("web", "news", "debate"), ""),
        (("videogames",), "You are a videogame recommender system for an online marketplace. "),
        (("retail",), "Your are a product recommender system for an online marketplace. "),
        (("books",), "Your are a book recommender system for an online marketplace. "),
    )

    for split_names, role_prefix in role_prefixes:
        if split in split_names:
            return role_prefix + answer_instructions

    raise ValueError(f"Split {split} not supported.")


# Document-type label for each split; the label for the chosen split is
# looked up by split name when the dataset is built.
doc_type_mapping = dict(
    books="Synopsis",
    web="Web Page Snippet",
    debate="Web Page Snippet",
    news="News Article",
    retail="Product Description",
    videogames="Game Description",
)

In [None]:
# Create the benchmark engine (from the project's `benchmark` module) with its
# default arguments. It is used further down to run the benchmark and to
# process the responses.
engine = Engine()

In [None]:
# Build the benchmark dataset for the chosen split and C-SEO method.
# The selected documents are read from data/<split>/selected_docs.json under the project root.
selected_documents_path = os.path.join(project_root, "data", split, "selected_docs.json")

dataset = Benchmark(
    data_path=dataset_path,
    split=split,
    method=method,
    doc_type=doc_type_mapping[split],
    num_docs_in_context=num_docs_in_context,
    sample_size=sample_size,
    selected_documents_path=selected_documents_path,
)

In [None]:
# Sanity check: show the user prompt of the dataset item at index 1 before
# spending money on the full run.
print(dataset[1]["user_prompt"])

In [None]:
# Launch the benchmark for the chosen split and method.
# `running_folder` (experiments/running/<split>/<method>) is handed to the
# engine; the returned id is used afterwards to poll the status and to fetch
# the results.
running_folder = os.path.join(project_root, "experiments", "running", split, method)
developer_prompt = get_developer_prompt(split)
batch_id = engine.run_benchmark(dataset, developer_prompt, llm, running_folder)

In [None]:
# Poll the status of the submitted run; re-run this cell until it reports
# "completed" before moving on to the results cell.
status = llm.get_status(batch_id)
print(f"Status for {split} x {method}: {status}")

In [None]:
# Once the status is "completed", we can download and process the results

# Build the results folder from its components rather than calling
# str.replace("running", "results") on the running folder: replace() rewrites
# every occurrence of "running" in the absolute path, so a project stored
# under e.g. ".../running-experiments/" would be redirected to a wrong location.
results_folder = os.path.join(project_root, "experiments", "results", split, method)
# exist_ok=True makes the cell safe to re-run and avoids a check-then-create race.
os.makedirs(results_folder, exist_ok=True)

# Fetch the raw responses (and the reported cost) for the submitted run,
# let the engine turn them into a DataFrame, and persist it as parquet.
results, cost = llm.retrieve_results(batch_id)
df = engine.process_benchmark_responses(results, results_folder)
df.to_parquet(
    os.path.join(results_folder, "responses.parquet"),
    index=False,
)
print(f"Results saved in {results_folder} -- Cost: {cost}")