## Exploring the dataset
First, download the dataset by running the `download.sh` script.

In [None]:
import json
from pathlib import Path


# The dataset files are looked up in the current working directory.
DATA_DIR = Path.cwd()

# One file per split, all located directly inside DATA_DIR.
DEV_DISTRACTOR = DATA_DIR.joinpath("hotpot_dev_distractor_v1.json")
DEV_FULLWIKI = DATA_DIR.joinpath("hotpot_dev_fullwiki_v1.json")
TRAIN_FILE = DATA_DIR.joinpath("hotpot_train_v1.1.json")




In [8]:

def load_jsonl_or_json(path: Path):
    """Load a file that holds either one JSON document or JSON Lines.

    The content is first parsed as a single JSON document, which covers arrays
    and objects whether they are compact or pretty-printed across many lines.
    Only when that fails, and the file has more than one non-blank line, is it
    re-parsed as JSON Lines (one JSON document per non-blank line).

    Args:
        path: File to read; it is decoded as UTF-8.

    Returns:
        ``[]`` if the file is empty or contains only whitespace; otherwise the
        decoded JSON value for a single document, or a list with one decoded
        value per non-blank line for JSON Lines.

    Raises:
        OSError: If the file cannot be opened or read.
        json.JSONDecodeError: If the content is neither valid JSON nor valid
            JSON Lines.
    """
    raw = path.read_text(encoding="utf-8").strip()
    if not raw:
        return []
    try:
        # Try the whole document first so that a multi-line (pretty-printed)
        # JSON object is not mistaken for JSON Lines.
        return json.loads(raw)
    except json.JSONDecodeError:
        lines = [line for line in raw.splitlines() if line.strip()]
        if len(lines) < 2:
            # A single line that is not valid JSON cannot be JSON Lines either.
            raise
        return [json.loads(line) for line in lines]

# Split name -> file it is read from. The order here is the order used for
# AVAILABLE_SPLITS and for the printed counts.
SPLIT_FILES = {
    "dev_distractor": DEV_DISTRACTOR,
    "dev_fullwiki": DEV_FULLWIKI,
    "train": TRAIN_FILE,
}

AVAILABLE_SPLITS = []  # names of the splits that were loaded successfully
_split_data = {}

for _split, _path in SPLIT_FILES.items():
    # A split stays empty when its file is missing or cannot be parsed.
    _split_data[_split] = []
    if not _path.exists():
        # Say so explicitly; otherwise a missing file only shows up as a count of 0.
        print(f"Skipping {_split}: {_path.name} not found in {_path.parent}")
        continue
    try:
        _split_data[_split] = load_jsonl_or_json(_path)
    except Exception as e:
        # Best effort: one unreadable file must not prevent loading the others.
        print(f"Failed to load {_split}:", e)
    else:
        AVAILABLE_SPLITS.append(_split)

# Keep the per-split module-level names used by the rest of the notebook.
DEV_DISTRACTOR_DATA = _split_data["dev_distractor"]
DEV_FULLWIKI_DATA = _split_data["dev_fullwiki"]
TRAIN_DATA = _split_data["train"]

print("Counts:", {name: len(rows) for name, rows in _split_data.items()})


Counts: {'dev_distractor': 7405, 'dev_fullwiki': 7405, 'train': 90447}


## Inspecting the splits

In [16]:
from pprint import pprint

def show_samples(data, k=3):
    """Print the first ``k`` examples of ``data`` in a readable form.

    For every example a numbered header is printed, followed by a
    pretty-printed dict of a fixed set of fields (fields absent from the
    example appear as ``None``), then up to five entries of
    ``supporting_facts`` when that field is present and non-empty, and finally
    a blank line.

    Args:
        data: Sliceable sequence of dict-like examples.
        k: Number of leading examples to print.
    """
    shown_fields = ("_id", "question", "answer", "type", "level", "context")
    for number, example in enumerate(data[:k], start=1):
        print(f"--- sample {number} ---")
        pprint({field: example.get(field) for field in shown_fields})
        facts = example.get("supporting_facts")
        if facts:
            print("supporting_facts:", facts[:5])
        print()

# Print the first example of every split that has data; empty splits are skipped.
for name, data in (
    ("dev_distractor", DEV_DISTRACTOR_DATA),
    ("dev_fullwiki", DEV_FULLWIKI_DATA),
    ("train", TRAIN_DATA),
):
    if not data:
        continue
    print(f"\n=== {name} samples ===")
    show_samples(data, k=1)



=== dev_distractor samples ===
--- sample 1 ---
{'_id': '5a8b57f25542995d1e6f1371',
 'answer': 'yes',
 'context': [['Ed Wood (film)',
              ['Ed Wood is a 1994 American biographical period comedy-drama '
               'film directed and produced by Tim Burton, and starring Johnny '
               'Depp as cult filmmaker Ed Wood.',
               " The film concerns the period in Wood's life when he made his "
               'best-known films as well as his relationship with actor Bela '
               'Lugosi, played by Martin Landau.',
               ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa '
               'Marie, and Bill Murray are among the supporting cast.']],
             ['Scott Derrickson',
              ['Scott Derrickson (born July 16, 1966) is an American director, '
               'screenwriter and producer.',
               ' He lives in Los Angeles, California.',
               ' He is best known for directing horror films such as '
      

## Inspecting a single sample

In [14]:
# Show the first dev_fullwiki record in full: as the last expression of the
# cell, its value is rendered as the cell output. The lookup fails if the
# split is empty.
DEV_FULLWIKI_DATA[0]

{'_id': '5a8b57f25542995d1e6f1371',
 'answer': 'yes',
 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?',
 'supporting_facts': [['Scott Derrickson', 0], ['Ed Wood', 0]],
 'context': [['Adam Collis',
   ['Adam Collis is an American filmmaker and actor.',
    ' He attended the Duke University from 1986 to 1990 and the University of California, Los Angeles from 2007 to 2010.',
    ' He also studied cinema at the University of Southern California from 1991 to 1997.',
    ' Collis first work was the assistant director for the Scott Derrickson\'s short "Love in the Ruins" (1995).',
    ' In 1998, he played "Crankshaft" in Eric Koyanagi\'s "Hundred Percent".']],
  ['Ed Wood (film)',
   ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.',
    " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lug

## How many sentences are in the dev_fullwiki dataset?
In other words, how many sentences do we need to embed and index?

In [27]:
# Count the paragraphs and sentences across all dev_fullwiki records.
print(f"Total questions: {len(DEV_FULLWIKI_DATA)}")

# Each record's "context" is a list of (title, sentences) pairs; only the
# sentence lists are needed for the sentence count.
fullwiki_contexts = [record["context"] for record in DEV_FULLWIKI_DATA]
total_paragraphs = sum(len(paragraphs) for paragraphs in fullwiki_contexts)
total_sentences = sum(
    len(sentences)
    for paragraphs in fullwiki_contexts
    for _title, sentences in paragraphs
)

print(f"Total paragraphs: {total_paragraphs}")  # each paragraph is a list of sentences
print(f"Total sentences: {total_sentences}")

    


Total questions: 7405
Total paragraphs: 73642
Total sentences: 314623


## Notes
- We could embed each paragraph as a whole, but we would lose sentence-level information, and in some cases the paragraph might be too long for the embedding model.
- If we embed each sentence separately, we have to prepend the paragraph title to each sentence before embedding it, so that the sentence keeps the context of the article it came from.