In [2]:
import pyterrier as pt

## Download the Collection

In [3]:
# Download the three ChroniclingAmericaQA splits from the Hugging Face Hub into
# the working directory (-o sets the local filename).
# Note: the upstream dev.json split is saved locally as validation.json.
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/test.json?download=true" -o test.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/train.json?download=true" -o train.json
!curl -L "https://huggingface.co/datasets/Bhawna/ChroniclingAmericaQA/resolve/main/dev.json?download=true" -o validation.json

import json

files = ["train.json", "validation.json", "test.json"]

# Inspect each downloaded split: preview the raw text, then parse the file and
# describe its top-level structure and first record.
for path in files:
    print(f"\n===== {path} =====")
    try:
        with open(path, "r", encoding="utf-8") as f:
            # Read a few hundred characters to see what kind of JSON it is
            head = f.read(500)
            print("Preview of first 500 characters:\n")
            print(head)
            # json.load needs the complete document, so rewind and parse the whole file
            f.seek(0)
            data = json.load(f)
    except OSError as e:
        # Missing/unreadable file: report it as such rather than as a JSON problem
        print(f"Could not read {path}: {e}")
        continue
    except ValueError as e:
        # json.JSONDecodeError and UnicodeDecodeError are both ValueError subclasses
        print(f"Could not parse {path} as JSON: {e}")
        continue

    if isinstance(data, list):
        print(f"\nLoaded {len(data)} items (list).")
        # Guard against an empty list or a first item that is not a dict
        if data and isinstance(data[0], dict):
            print("Dictionary keys:", list(data[0].keys()))
        if data:
            print(json.dumps(data[0], indent=2)[:600])
    elif isinstance(data, dict):
        print("\nTop-level is a dictionary. Keys:", list(data.keys()))
        for k, v in data.items():
            if isinstance(v, list):
                print(f"Key '{k}' contains a list of {len(v)} items.")
                if v:
                    if isinstance(v[0], dict):
                        print("First item keys:", list(v[0].keys()))
                    print(json.dumps(v[0], indent=2)[:600])
                    # Only the first non-empty list is shown in detail
                    break
    else:
        print(f"Unexpected top-level type: {type(data)}")

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1346  100  1346    0     0   4063      0 --:--:-- --:--:-- --:--:--  4078
100 71.5M  100 71.5M    0     0  10.3M      0  0:00:06  0:00:06 --:--:-- 10.6M     0  0:00:07  0:00:02  0:00:05 12.7M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1352  100  1352    0     0   4387      0 --:--:-- --:--:-- --:--:--  4389
100 1315M  100 1315M    0     0  11.3M      0  0:01:56  0:01:56 --:--:-- 12.0M    0  6809k      0  0:03:17  0:00:06  0:03:11 8541k117k      0  0:02:45  0:00:08  0:02:37 11.0M0:02:00  0:00:51  0:01:09 11.8M  0  0:01:58  0:01:04  0:00:54 12.3M11.1M      0  0:01:57  0:01:08  0:00:49 12.0M      0  0:01:56  0:01:31  0:00:25 10.3M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
 

## Create the Document Collection

In [4]:
import os

# Split files to merge into the document collection, in the order they are read
inputs = ["train.json", "validation.json", "test.json"]
# Destination file for the merged, de-duplicated collection
output = "document_collection.json"

def load_list_or_empty(path):
    """Return the JSON list stored at `path`, or [] when it cannot be used.

    An empty list is returned (and a message printed) when the file is
    missing, has zero size, is not valid JSON, or its top-level value is not
    a list.
    """
    unusable = not os.path.exists(path) or os.path.getsize(path) == 0
    if unusable:
        print(f"Skipping {path} because it is missing or empty")
        return []

    try:
        with open(path, "r", encoding="utf-8") as handle:
            payload = json.load(handle)
    except json.JSONDecodeError:
        print(f"Skipping {path} because it is not valid JSON")
        return []

    if not isinstance(payload, list):
        print(f"Skipping {path} because it is not a list at the top level")
        return []
    return payload

def project(recs):
    """Reduce each record to the four collection fields.

    Every output dict has the keys para_id, context, raw_ocr and
    publication_date (in that order); a field missing from the input record
    becomes the empty string.
    """
    kept_fields = ("para_id", "context", "raw_ocr", "publication_date")
    return [
        {field: record.get(field, "") for field in kept_fields}
        for record in recs
    ]

# Gather the projected records of every split, in the order given by `inputs`
all_recs = []
for split_path in inputs:
    split_records = load_list_or_empty(split_path)
    print(f"Loaded {len(split_records)} records from {split_path}")
    all_recs += project(split_records)

# De-duplicate on para_id: the first record seen for an id wins, and records
# without a para_id are left out.
uniq = {}
for rec in all_recs:
    pid = rec.get("para_id", "")
    if not pid:
        continue
    uniq.setdefault(pid, rec)

result = [*uniq.values()]

with open(output, "w", encoding="utf-8") as f:
    json.dump(result, f, ensure_ascii=False, indent=2)

print(f"Wrote {len(result)} records to {output}")
print(json.dumps(result[:3], indent=2))

Loaded 439302 records from train.json
Loaded 24111 records from validation.json
Loaded 24084 records from test.json
Wrote 131921 records to document_collection.json
[
  {
    "para_id": "New_Hampshire_18070804_1",
    "context": "Aiscellaneous Repository. From the Albany Register, WAR, OR A PROSPECT OF IT, From recent instances of British Outrage. BY: WILLIAM RAY, Author of the contemplated publication, entitled, \u201cHorrors of Slavery, or the American Turf in Tripoli,\u201d VOTARIES of Freedom, arm! The British Lion roars! Legions of Valor, take th\u2019 alarm\u2014; Rash, rush to guard our shores! Behold the horrid deed\u2014 Your brethren gasping lie! Beneath a tyrant\u2019s hand they bleed\u2014 They groan\u2014they faint\u2014they die. Veterans of seventy-six, Awake the slumbering sword;\u2014 Hearts of your murderous foes transfix\u2014 'Tis vengeance gives the word. Remember Lexington, And Bunker\u2019s tragic hill; \u201cThe same who spilt your blood thereon, Your blood again

## Create the Test Queries

In [5]:
import json
import re
import unicodedata
import string

# Source split and destination file for the cleaned test queries
input_file = "test.json"
output_file = "test_queries.json"

# Load the data (the parsed test split; iterated item-by-item below)
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

def clean_question(text):
    """Normalise a question string for use as a retrieval query.

    Non-string input yields "". Otherwise the text is NFKC-normalised, every
    ASCII punctuation character is replaced by a space, runs of whitespace are
    collapsed to a single space, and the result is stripped.
    """
    if not isinstance(text, str):
        return ""

    # Map each character of string.punctuation to a single space
    punctuation_to_space = str.maketrans(
        string.punctuation, " " * len(string.punctuation)
    )
    normalised = unicodedata.normalize("NFKC", text)
    without_punct = normalised.translate(punctuation_to_space)
    collapsed = re.sub(r"\s+", " ", without_punct)
    return collapsed.strip()

# Extract and clean
queries = [
    {
        "query_id": item.get("query_id", ""),
        "question": clean_question(item.get("question", "")),
    }
    for item in data
]


def _query_sort_key(entry):
    """Sort key for a query record: numeric ids first (by value), then the rest.

    The previous key returned an int for all-digit ids and the raw id
    otherwise, so a mix of the two raised TypeError when compared. Tagging
    each kind keeps both orderings as before and makes the mix comparable.
    """
    qid = entry["query_id"]
    if str(qid).isdigit():
        return (0, int(qid), "")
    return (1, 0, qid)


# Sort by query_id. The ids in this dataset look like "test_1", "test_10", ...
# (not purely numeric), so they are ordered lexicographically.
queries = sorted(queries, key=_query_sort_key)

# Keep only the first 10,000 (in the sort order above)
queries = queries[:10000]

# Save new JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(queries, f, ensure_ascii=False, indent=2)

print(f"Saved {len(queries)} entries to {output_file}")
print(json.dumps(queries[:3], indent=2))

Saved 10000 entries to test_queries.json
[
  {
    "query_id": "test_1",
    "question": "How many lots did Thomas Peirce have"
  },
  {
    "query_id": "test_10",
    "question": "Who gave Hamilton the substance of what he had proposed on the part of General Hamilton"
  },
  {
    "query_id": "test_100",
    "question": "Who informs his FRIENDS and the PUBLIC that he has taken that justly celebrated INN in this city"
  }
]


## Create the QRels

In [6]:
input_file = "test.json"
qrels_file = "test_qrels.json"
answers_file = "test_query_answers.json"

# Load the data
with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# One pass over the test split builds both outputs:
#   qrels          -> query_id, iteration=0, para_id, relevance=1
#   query_answers  -> the same fields plus answer and org_answer
qrels = []
query_answers = []
for item in data:
    judgement = {
        "query_id": item.get("query_id", ""),
        "iteration": 0,
        "para_id": item.get("para_id", ""),
        "relevance": 1,
    }
    qrels.append(judgement)
    query_answers.append({
        **judgement,
        "answer": item.get("answer", ""),
        "org_answer": item.get("org_answer", ""),
    })

# Save both files (qrels first, then the answers)
for target_path, payload in ((qrels_file, qrels), (answers_file, query_answers)):
    with open(target_path, "w", encoding="utf-8") as f:
        json.dump(payload, f, ensure_ascii=False, indent=2)

print(f"Saved {len(qrels)} entries to {qrels_file}")
print(f"Saved {len(query_answers)} entries to {answers_file}")
print("Sample qrels entry:", qrels[0])
print("Sample query_answers entry:", query_answers[0])

Saved 24084 entries to test_qrels.json
Saved 24084 entries to test_query_answers.json
Sample qrels entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1}
Sample query_answers entry: {'query_id': 'test_1', 'iteration': 0, 'para_id': 'New_Hampshire_18030125_16', 'relevance': 1, 'answer': '183', 'org_answer': '183'}


# Retrieval

In [7]:
import pyterrier as pt
import pandas as pd
import re 
# pt.started() / pt.init() are deprecated (they emit warnings on this PyTerrier
# version). Java now starts automatically on first use; pt.java.init() only
# forces that start to happen here.
if not pt.java.started():
    pt.java.init()

  if not pt.started():
Java started and loaded: pyterrier.java.colab, pyterrier.java, pyterrier.java.24, pyterrier.terrier.java [version=5.11 (build: craig.macdonald 2025-01-13 21:29), helper_version=0.0.8]
java is now started automatically with default settings. To force initialisation early, run:
pt.java.init() # optional, forces java initialisation
  pt.init()


We turn everything into a pandas dataframe for easier manipulation

In [8]:
# Wrap the in-memory query and qrel records (lists of dicts) in DataFrames
qrels_df = pd.DataFrame(qrels)
queries_df = pd.DataFrame(queries)

# Keep only rows whose cleaned question still contains at least one character
has_text = queries_df['question'].str.len() > 0
queries_df = queries_df.loc[has_text]

# Show both frames
for title, frame in (("Queries DataFrame:", queries_df), ("Qrels DataFrame:", qrels_df)):
    print(title)
    print(frame)

Queries DataFrame:
        query_id                                           question
0         test_1               How many lots did Thomas Peirce have
1        test_10  Who gave Hamilton the substance of what he had...
2       test_100  Who informs his FRIENDS and the PUBLIC that he...
3      test_1000  Who was the Secretary of the Treasury of the U...
4     test_10000     Who made a speech in front of the Brooks House
...          ...                                                ...
9995  test_18995  How many rounds does McFadden need to fight Jo...
9996  test_18996  What company has a receiver appointed at Lawre...
9997  test_18997     How much did the price go on September 13 1900
9998  test_18998  What state is Salt Lake City Territory located in
9999  test_18999  What party did Truro Crane and John P Meakin b...

[10000 rows x 2 columns]
Qrels DataFrame:
         query_id  iteration                    para_id  relevance
0          test_1          0  New_Hampshire_18030125_16

In [9]:
# 1. Load the JSON file
input_file = "document_collection.json"

# Fail loudly when the collection is missing: silently skipping would leave
# `df` undefined and only surface later as a NameError in the n-gram cells.
if not os.path.exists(input_file):
    raise FileNotFoundError(
        f"{input_file} not found - run the 'Create the Document Collection' cell first"
    )

with open(input_file, "r", encoding="utf-8") as f:
    data = json.load(f)

# 2. Convert to DataFrame
# 3. Select ONLY 'para_id' and 'context'
# 4. RENAME 'para_id' to 'docno', the document-id column used by the indexing cells
# 5. Type safety (make sure both columns are strings)
df = (
    pd.DataFrame(data)[["para_id", "context"]]
    .rename(columns={"para_id": "docno"})
    .astype({"docno": str, "context": str})
)

print(f"✅ Dataset created with {len(df)} documents.")
print(df.head())

✅ Dataset created with 131921 documents.
                      docno                                            context
0  New_Hampshire_18070804_1  Aiscellaneous Repository. From the Albany Regi...
1  New_Hampshire_18070804_4  Surely he above the rest of his fellow mortals...
2  New_Hampshire_18070804_5  At Westmoreland, Mrs. Sally Lincoln, wife of M...
3  New_Hampshire_18070804_8  Upon the correction of this remedy the stomach...
4  New_Hampshire_18070804_9  Also FOR SALE AS ABOVE, NEW GOODS, STEPHEN HAR...


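Critical fix: add the column names that the retrieval and evaluation cells below rely on (`qid`, `query`, `docno`, `label`) and force the IDs to be strings.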

In [10]:
# Add the columns the retrieval/evaluation cells work with (qid, query, docno,
# label) and force the ids to be strings in both dataframes - this is very
# important for pyterrier to read things correctly
queries_df = queries_df.assign(
    qid=lambda frame: frame['query_id'].astype(str),
    query=lambda frame: frame['question'].astype(str),
)

qrels_df = qrels_df.assign(
    qid=lambda frame: frame['query_id'].astype(str),
    docno=lambda frame: frame['para_id'].astype(str),
    label=lambda frame: frame['relevance'].astype(int),
)

## Baseline 1: Character N-Gram Representation

Define a function to convert text into character n-grams:

In this function, we remove the stop words before creating the n-grams. To implement this, we use a small hand-picked stop-word list (`STOPWORDS`) defined in the same cell, so no external library is needed.

In [None]:
# Small hand-picked English stop-word list (function words and question words).
STOPWORDS = set(["the", "and", "is", "of", "in", "to", "a", "it", "that", "for", "on", 
                 "with", "as", "by", "at", "an", "be", "this", "which", "or", "from", 
                 "what", "where", "when", "who", "how", "why"])

def to_ngrams(text, n=3):
    """
    Converts a string into a space-separated sequence of character n-grams.

    The text is lower-cased, every character outside [a-z0-9 ] is replaced by
    a space, and words that are in STOPWORDS or only one character long are
    dropped. The surviving words are joined with single spaces and a window of
    width `n` slides over that string; spaces inside an n-gram are written as
    underscores so each n-gram is one whitespace-delimited token.

    Args:
        text: input string (NaN/None or "" yields "").
        n: n-gram width in characters.

    Returns:
        The n-grams joined by spaces. If nothing survives cleaning the result
        is "". If the cleaned text is shorter than `n`, it is returned as a
        single token right-padded with underscores to length `n`.
    """
    if pd.isna(text) or text == "":
        return ""

    text = str(text).lower()

    # Remove non-alphanumeric chars
    text = re.sub(r'[^a-z0-9 ]', ' ', text)

    # Drop stop words (the STOPWORDS set above) and single-character words
    clean_words = [w for w in text.split() if w not in STOPWORDS and len(w) > 1]
    clean_text = " ".join(clean_words)

    # Nothing left after cleaning (e.g. only stop words or punctuation): emit
    # no tokens instead of an all-underscore pseudo-term such as "____".
    if not clean_text:
        return ""

    # Pad a cleaned text shorter than n so it still forms one n-length token
    if len(clean_text) < n:
        return clean_text + ("_" * (n - len(clean_text)))

    # Slide the window and replace spaces with underscores in each n-gram
    ngrams = [
        clean_text[i : i + n].replace(" ", "_")
        for i in range(len(clean_text) - n + 1)
    ]

    return " ".join(ngrams)

### 4-Grams

In [22]:
# Re-encode every document body as space-separated character 4-grams
contexts = df['context'].fillna("").astype(str)
df['text'] = contexts.apply(to_ngrams, n=4)

# One {"docno", "text"} dict per document
docs = df[["docno", "text"]].to_dict(orient="records")
print(docs[:3])

[{'docno': 'New_Hampshire_18070804_1', 'text': 'aisc isce scel cell ella llan lane aneo neou eous ous_ us_r s_re _rep repo epos posi osit sito itor tory ory_ ry_a y_al _alb alba lban bany any_ ny_r y_re _reg regi egis gist iste ster ter_ er_w r_wa _war war_ ar_p r_pr _pro pros rosp ospe spec pect ect_ ct_r t_re _rec rece ecen cent ent_ nt_i t_in _ins inst nsta stan tanc ance nces ces_ es_b s_br _bri brit riti itis tish ish_ sh_o h_ou _out outr utra trag rage age_ ge_w e_wi _wil will illi llia liam iam_ am_r m_ra _ray ray_ ay_a y_au _aut auth utho thor hor_ or_c r_co _con cont onte ntem temp empl mpla plat late ated ted_ ed_p d_pu _pub publ ubli blic lica icat cati atio tion ion_ on_e n_en _ent enti ntit titl itle tled led_ ed_h d_ho _hor horr orro rror rors ors_ rs_s s_sl _sla slav lave aver very ery_ ry_a y_am _ame amer meri eric rica ican can_ an_t n_tu _tur turf urf_ rf_t f_tr _tri trip ripo ipol poli oli_ li_v i_vo _vot vota otar tari arie ries ies_ es_f s_fr _fre free reed eedo ed

In [23]:
# New frame whose `query` column holds the 4-gram form of each query;
# queries_df itself is left untouched.
fuzzy_query = queries_df.assign(
    query=queries_df['query'].apply(to_ngrams, n=4)
)

# Sanity Check
print("fuzzy_query created:")
print(fuzzy_query.head(3))

fuzzy_query created:
   query_id                                           question       qid  \
0    test_1               How many lots did Thomas Peirce have    test_1   
1   test_10  Who gave Hamilton the substance of what he had...   test_10   
2  test_100  Who informs his FRIENDS and the PUBLIC that he...  test_100   

                                               query  
0  many any_ ny_l y_lo _lot lots ots_ ts_d s_di _...  
1  gave ave_ ve_h e_ha _ham hami amil milt ilto l...  
2  info nfor form orms rms_ ms_h s_hi _his his_ i...  


Indexing:

In [24]:
# On-disk location of the index built from the current `docs` (4-gram text)
INDEX_FUZZY_PATH = "./indices/fuzzy_index_v1"

# Disable stemming and stopwords

# overwrite=True: any index already at this path is replaced.
# meta={"docno": 50}: presumably the max stored docno length - TODO confirm.
indexer_fuzzy = pt.index.IterDictIndexer(
    INDEX_FUZZY_PATH,
    meta={"docno": 50},
    overwrite=True
)

# Set properties to disable standard English processing
indexer_fuzzy.setProperty("termpipelines", "") 
indexer_fuzzy.setProperty("tokeniser", "WhitespaceTokeniser") # Split only on the spaces we added

# Build the index from the list of {"docno", "text"} dicts and open it
indexref = indexer_fuzzy.index(docs)
index_fuzzy = pt.IndexFactory.of(indexref)

print("Indexing complete")

Indexing complete


Then we use the BM25 Retriever on the fuzzy index:

In [25]:
# BM25 retriever over the n-gram index held in `index_fuzzy`. No controls are
# passed here, unlike the retrievers defined in the fusion section.
bm25_fuzzy = pt.terrier.Retriever(
    index_fuzzy, 
    wmodel="BM25", 
    properties={"termpipelines": "", "tokeniser": "WhitespaceTokeniser"}, # must match indexing
    verbose=True # Shows a progress bar
)

In [26]:
# Restrict the run to the first `test_slice` queries ...
test_slice = 100
test_queries = fuzzy_query.iloc[:test_slice]

# ... and keep only the qrels that belong to those queries
relevant_qids = test_queries['qid'].values
filtered_qrels = qrels_df.loc[qrels_df['qid'].isin(relevant_qids)]

In [27]:
# Run the 4-gram queries in `test_queries` through the BM25 retriever
bm25_results = bm25_fuzzy.transform(test_queries)

TerrierRetr(BM25): 100%|██████████| 100/100 [00:10<00:00,  9.15q/s]


In [28]:
from pyterrier.measures import R, MAP

In [29]:
# Score the 4-gram BM25 run against the filtered qrels:
# recall at cut-offs 1 and 100, plus MAP
eval_metrics = pt.Evaluate(bm25_results, filtered_qrels, metrics=[R@1, R@100, MAP])
print(eval_metrics)

{'R@1': 0.71, 'R@100': 0.93, 'AP': 0.763444712518693}


### 5-Grams

Let's try 5-grams:

In [31]:
# Re-encode every document body as space-separated character 5-grams
# (this overwrites the 4-gram `text` column)
contexts = df['context'].fillna("").astype(str)
df['text'] = contexts.apply(to_ngrams, n=5)

# One {"docno", "text"} dict per document
docs = df[["docno", "text"]].to_dict(orient="records")
print(docs[:3])

[{'docno': 'New_Hampshire_18070804_1', 'text': 'aisce iscel scell cella ellan llane laneo aneou neous eous_ ous_r us_re s_rep _repo repos eposi posit osito sitor itory tory_ ory_a ry_al y_alb _alba alban lbany bany_ any_r ny_re y_reg _regi regis egist giste ister ster_ ter_w er_wa r_war _war_ war_p ar_pr r_pro _pros prosp rospe ospec spect pect_ ect_r ct_re t_rec _rece recen ecent cent_ ent_i nt_in t_ins _inst insta nstan stanc tance ances nces_ ces_b es_br s_bri _brit briti ritis itish tish_ ish_o sh_ou h_out _outr outra utrag trage rage_ age_w ge_wi e_wil _will willi illia lliam liam_ iam_r am_ra m_ray _ray_ ray_a ay_au y_aut _auth autho uthor thor_ hor_c or_co r_con _cont conte ontem ntemp templ empla mplat plate lated ated_ ted_p ed_pu d_pub _publ publi ublic blica licat icati catio ation tion_ ion_e on_en n_ent _enti entit ntitl title itled tled_ led_h ed_ho d_hor _horr horro orror rrors rors_ ors_s rs_sl s_sla _slav slave laver avery very_ ery_a ry_am y_ame _amer ameri meric eric

In [32]:
# New frame whose `query` column holds the 5-gram form of each query;
# queries_df itself is left untouched.
fuzzy_query = queries_df.assign(
    query=queries_df['query'].apply(to_ngrams, n=5)
)

# Sanity Check
print("fuzzy_query created:")
print(fuzzy_query.head(3))

fuzzy_query created:
   query_id                                           question       qid  \
0    test_1               How many lots did Thomas Peirce have    test_1   
1   test_10  Who gave Hamilton the substance of what he had...   test_10   
2  test_100  Who informs his FRIENDS and the PUBLIC that he...  test_100   

                                               query  
0  many_ any_l ny_lo y_lot _lots lots_ ots_d ts_d...  
1  gave_ ave_h ve_ha e_ham _hami hamil amilt milt...  
2  infor nform forms orms_ rms_h ms_hi s_his _his...  


Create the index again:

In [33]:
# On-disk location of the index built from the current `docs` (5-gram text).
# Note: this rebinds INDEX_FUZZY_PATH, indexer_fuzzy, indexref and index_fuzzy
# from the 4-gram section.
INDEX_FUZZY_PATH = "./indices/fuzzy_index_v2"

# overwrite=True: any index already at this path is replaced.
# meta={"docno": 50}: presumably the max stored docno length - TODO confirm.
indexer_fuzzy = pt.index.IterDictIndexer(
    INDEX_FUZZY_PATH,
    meta={"docno": 50},
    overwrite=True
)

# Set properties to disable standard English processing
indexer_fuzzy.setProperty("termpipelines", "") 
indexer_fuzzy.setProperty("tokeniser", "WhitespaceTokeniser") # Split only on the spaces we added

# Build the index from the list of {"docno", "text"} dicts and open it
indexref = indexer_fuzzy.index(docs)
index_fuzzy = pt.IndexFactory.of(indexref)

print("Indexing complete")

Indexing complete


In [34]:
# BM25 retriever over the index currently held in `index_fuzzy` (rebuilt above
# from the 5-gram text). No controls are passed here, unlike the retrievers
# defined in the fusion section.
bm25_fuzzy_5 = pt.terrier.Retriever(
    index_fuzzy, 
    wmodel="BM25", 
    properties={"termpipelines": "", "tokeniser": "WhitespaceTokeniser"}, # must match indexing
    verbose=True # Shows a progress bar
)

In [35]:
# Restrict the run to the first `test_slice` queries ...
test_slice = 100
test_queries = fuzzy_query.iloc[:test_slice]

# ... and keep only the qrels that belong to those queries
relevant_qids = test_queries['qid'].values
filtered_qrels = qrels_df.loc[qrels_df['qid'].isin(relevant_qids)]

In [36]:
# Run the 5-gram queries in `test_queries` through the 5-gram BM25 retriever
bm25_5_results = bm25_fuzzy_5.transform(test_queries)

TerrierRetr(BM25): 100%|██████████| 100/100 [00:12<00:00,  8.30q/s]


In [37]:
# Score the 5-gram BM25 run against the filtered qrels:
# recall at cut-offs 1 and 100, plus MAP
eval_metrics = pt.Evaluate(bm25_5_results, filtered_qrels, metrics=[R@1, R@100, MAP])

print(eval_metrics)

{'R@1': 0.72, 'R@100': 0.96, 'AP': 0.7764373180530402}


# Fusion Retrieval

Now, let's try to fuse the 4-gram and 5-gram rankings together:

In [None]:
# Load the Existing Indices
# (both must already have been built by the indexing cells above)
path_to_5g = "./indices/fuzzy_index_v2" # v2 is the 5-gram one
path_to_4g = "./indices/fuzzy_index_v1" # v1 is the 4-gram one

print(f"Loading 5-gram index from: {path_to_5g}")
index_5g_loaded = pt.IndexFactory.of(path_to_5g)

print(f"Loading 4-gram index from: {path_to_4g}")
index_4g_loaded = pt.IndexFactory.of(path_to_4g)

# Define Retrievers
# NOTE(review): unlike bm25_fuzzy / bm25_fuzzy_5 above, these retrievers pass
# explicit controls, so their scores are not directly comparable with the
# earlier single-index runs. Presumably "c" is the length-normalisation
# parameter and "bm25.k_1" is BM25's k1 - TODO confirm both are honoured.
bm25_5g = pt.terrier.Retriever(
    index_5g_loaded, 
    wmodel="BM25",
    controls={"c": 0.4, "bm25.k_1": 1.2},
    properties={"termpipelines": "", "tokeniser": "WhitespaceTokeniser"}
)

bm25_4g = pt.terrier.Retriever(
    index_4g_loaded, 
    wmodel="BM25",
    controls={"c": 0.4, "bm25.k_1": 1.2},
    properties={"termpipelines": "", "tokeniser": "WhitespaceTokeniser"}
)

# Define the Translators
# Each one rewrites a topic row's query text into character n-grams. They read
# the `query` column rather than the notebook-specific `question` column, so
# the pipelines work on any topics frame that has `query`. In queries_df,
# `query` is the string copy of `question`, so the output is unchanged here.
def format_5g(row):
    """Return the row's query text as space-separated character 5-grams."""
    return to_ngrams(row['query'], n=5)


def format_4g(row):
    """Return the row's query text as space-separated character 4-grams."""
    return to_ngrams(row['query'], n=4)

# Create the Pipelines
# Each pipeline first rewrites the query into n-grams, then retrieves from the
# index built with the same n.
pipe_5g = pt.apply.query(format_5g) >> bm25_5g
pipe_4g = pt.apply.query(format_4g) >> bm25_4g

# Fuse & Run
# Weighted combination of the two pipelines (0.8 for 5-grams, 0.2 for 4-grams).
# NOTE(review): presumably this sums the raw BM25 scores without normalisation - confirm.
fusion_pipeline = (0.8 * pipe_5g) + (0.2 * pipe_4g)

print("Running Fusion on existing indices...")
# The input is the un-transformed queries_df; the n-gram rewriting happens inside the pipelines
results = fusion_pipeline.transform(queries_df[:100])

# Evaluate
from pyterrier.measures import R, MAP

# Keep only the qrels that belong to the 100 queries that were run
fused_topics = queries_df[:100]
relevant_qids = fused_topics['qid'].values.astype(str)
filtered_qrels = qrels_df.loc[qrels_df['qid'].isin(relevant_qids)]

# Recall at cut-offs 1 and 100, plus MAP, for the fused run
eval_metrics = pt.Evaluate(results, filtered_qrels, metrics=[R@1, R@100, MAP])

print(eval_metrics)

Loading 5-gram index from: ./indices/fuzzy_index_v2
Loading 4-gram index from: ./indices/fuzzy_index_v1
Running Fusion on existing indices...
{'R@1': 0.69, 'R@100': 0.96, 'AP': 0.7489637691166716}


## RRF

Let's try reciprocal rank fusion to see if it gets better results:

In [None]:
import pyterrier_alpha as pta

In [None]:
"""
This transformer merges multiple ranking results by computing the reciprocal rank of each document in each ranking, and summing them up. 
The reciprocal rank is computed as 1/(rank + k), where k is a constant. 
The resulting score is used to rank the documents.
"""

# Create the fusion pipeline using RRF
# Reuses pipe_5g and pipe_4g from the linear-fusion cell.
rrf_pipeline = pta.fusion.RRFusion(
    pipe_5g, 
    pipe_4g, 
    k=60
)

# Same 100 un-transformed queries as the linear-fusion run
results_rrf = rrf_pipeline.transform(queries_df[:100])

# Evaluate
# `filtered_qrels` is the frame computed in the linear-fusion cell
eval_metrics = pt.Evaluate(
    results_rrf,
    filtered_qrels,
    metrics=[R@1, R@100, MAP]
)
print(eval_metrics)



{'R@1': 0.67, 'R@100': 0.94, 'AP': 0.7310782365821938}


We can also utilize the pt.Experiment method to compare our character n-gram representations:

In [None]:
# Run the Experiment
# Compares the 4-gram and 5-gram pipelines on the same 100 queries. With
# baseline=0 the first system (4-gram) is the reference, so the +/-/p-value
# columns are only filled in for the 5-gram row.
experiment = pt.Experiment(
    [pipe_4g, pipe_5g],        # List of systems to compare
    queries_df[:100],                 # The DATAFRAME with raw English queries
    filtered_qrels,                    # The qrels
    eval_metrics=["map", "recip_rank", R@1, R@100],
    names=["4-Gram Fuzzy", "5-Gram Fuzzy"], 
    baseline=0                         # Compare the second model against the first (0-index)
)

print(experiment)

           name       map  recip_rank   R@1  R@100  map +  map -  map p-value  \
0  4-Gram Fuzzy  0.717240    0.717240  0.64   0.92    NaN    NaN          NaN   
1  5-Gram Fuzzy  0.754516    0.754516  0.70   0.96   23.0    6.0     0.040295   

   recip_rank +  recip_rank -  recip_rank p-value  R@1 +  R@1 -  R@1 p-value  \
0           NaN           NaN                 NaN    NaN    NaN          NaN   
1          23.0           6.0            0.040295    7.0    1.0     0.033199   

   R@100 +  R@100 -  R@100 p-value  
0      NaN      NaN            NaN  
1      4.0      0.0       0.044935  
