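This notebook runs a C-SEO method on a list of documents from one domain (split) of the benchmark.
The improved documents for all C-SEO methods are already provided in `data/{partition}/selected_docs.json`, where `{partition}` is the name of the domain. Note that the document-selection cell in this notebook writes a `selected_docs.json` file under the project's `data/<split>` folder, so re-running it may overwrite the provided file.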

In [1]:
import json
from data import Benchmark
import os
import random

In [2]:
# Resolve the project root as the parent of the current working directory.
# NOTE(review): assumes the kernel is started from a direct sub-folder of the project — confirm.
notebook_dir = os.getcwd()
project_root = os.path.normpath(os.path.join(notebook_dir, os.pardir))

# Setup

In [None]:
# Location of the benchmark data (handed to Benchmark as `data_path`)
# and the names of the domain splits that can be selected.
data_path = "parameterlab/c-seo-bench"
list_splits = ["retail", "videogames", "news", "web", "debate", "books"]

# Per-split label for the kind of document it contains
# (handed to Benchmark as `doc_type`).
doc_type_mapping = dict(
    books="Synopsis",
    web="Web Page Snippet",
    debate="Web Page Snippet",
    news="News Article",
    retail="Product Description",
    videogames="Game Description",
)

# Parameters

In [4]:
# Domain (split) to process. Index 0 of `list_splits` is "retail";
# change the index to work on a different domain.
split_name = list_splits[0]

In [5]:
# Build the benchmark for the selected split with method="baseline".
dataset = Benchmark(
    # which data to load
    data_path=data_path,
    split=split_name,
    doc_type=doc_type_mapping[split_name],
    # how to build each query's context
    method="baseline",
    num_docs_in_context=10,
    sample_size=None,  # presumably "no sub-sampling" — confirm in data.Benchmark
)

Loading Benchmark - retail dataset...
retail dataset loaded.


In [6]:
# Inspect the first query record of the loaded benchmark to see its structure.
dataset[0]

{'user_prompt': "Question: holidaytraditions\n\nSearch Results:\nProduct Description 1:\nName: Graduation Ornament 2021 Guy – Class of 2021 Ornament – Personalized Christmas Ornaments – School, Teacher Ornaments – Unique Graduation Gift for Him – Polyresin Graduation Decorations 2021\nDescription:\nList of features:\nPremium Quality Polyresin Graduation Ornaments – Unlike other Holiday décor that breaks and falls apart in a matter of weeks. Our personalized graduation ornaments 2021 is durable and lightweight for lasting quality and easy hanging on the family tree for a special twinkle in your ornament tree décor! Vibrant colors and elegant details of a proud guy on his graduation day. Honor their hard work and everything that they've accomplished this year with our special custom graduation keepsake!\nExpert Artist Personalization – unlike other personalized ornaments for Christmas trees that often arrive with messy, clumsy personalization (or worse, the wrong name). Our talented in-h

In [7]:
# Pick the documents to improve with a C-SEO method and save the selection.
#
# For every query in `dataset`, NUM_DOCS_PER_QUERY entries of its "list_docs"
# are drawn at random without replacement. The result maps
#   query index -> {document index -> {"doc": <document>}}
# and is written to <project_root>/data/<split_name>/selected_docs.json.
SELECTION_SEED = 42  # fixed seed so Restart & Run All reproduces the same selection
NUM_DOCS_PER_QUERY = 1  # 1 document per query. Increase to pick more documents (i.e., multiple adopters)

# A dedicated generator keeps the draw reproducible and independent of any
# other use of the global `random` state in this kernel.
rng = random.Random(SELECTION_SEED)

query_idx2list_docs = dict()
for query_idx, query_point in enumerate(dataset):
    list_docs = query_point["list_docs"]
    # randomly pick the document(s) to improve for this query
    doc_indices = rng.sample(range(len(list_docs)), k=NUM_DOCS_PER_QUERY)
    query_idx2list_docs[query_idx] = {
        doc_idx: {"doc": list_docs[doc_idx]} for doc_idx in doc_indices
    }

# save the selected documents to a file
# NOTE: opening with "w" replaces any selected_docs.json already in that folder,
# and json.dump serialises the integer query/document indices as string keys.
output_folder = os.path.join(project_root, "data", split_name)
os.makedirs(output_folder, exist_ok=True)  # no separate existence check needed
with open(
    os.path.join(output_folder, "selected_docs.json"), "w", encoding="utf-8"
) as f:
    json.dump(query_idx2list_docs, f, indent=4)