# Anthology Pipeline

### Step 1: Import everything and load variables

In [None]:
import pyterrier as pt
import pandas as pd
from tira.third_party_integrations import ensure_pyterrier_is_loaded, get_input_directory_and_output_directory, persist_and_normalize_run
import json
from tqdm import tqdm

# Initialise PyTerrier through the TIRA helper before any pt.* API is used below.
ensure_pyterrier_is_loaded()

# Resolve the directory the dataset is read from and the directory the run is
# written to. 'iranthology-dataset-tira/' is the argument handed to the TIRA
# helper (presumably the dataset location used outside the TIRA sandbox -- confirm).
(input_directory, output_directory) = get_input_directory_and_output_directory(
    'iranthology-dataset-tira/'
)

### Step 2: Load the Data

In [None]:
print('Step 2: Load the data.')

# The topics (queries) are read from a TREC-XML file in the input directory.
queries = pt.io.read_topics(input_directory + '/queries.xml', format='trecxml')

# Read the JSON-lines corpus inside a context manager so the file handle is
# closed as soon as loading finishes, and decode explicitly as UTF-8 instead of
# relying on the platform default encoding.
# Only the fields used for indexing are kept; 'title' and 'abstract' come from
# the nested 'original_document' record of each line.
documents = []
with open(input_directory + '/documents.jsonl', 'r', encoding='utf-8') as documents_file:
    for line in documents_file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at the end of the file).
            continue
        record = json.loads(line)
        original = record['original_document']
        documents.append({
            'docno': record['docno'],
            'text': record['text'],
            'title': original['title'],
            'abstract': original['abstract'],
        })


### Step 3: Create the Index

In [None]:
print('Step 3: Create the Index.')

# Local import keeps this cell self-contained.
import shutil

# Remove any index left over from a previous execution so indexing starts from
# a clean directory. shutil.rmtree replaces the IPython-only `!rm -Rf ./index`
# shell escape, so the cell is also valid plain Python and does not depend on a
# POSIX shell; ignore_errors=True mirrors the "-f" behaviour when ./index is absent.
shutil.rmtree('./index', ignore_errors=True)

# 'meta' lists the fields kept as document metadata together with the maximum
# number of characters stored per field (see the PyTerrier indexing docs);
# title/abstract/text are stored so that the re-ranking scorers in Step 4 can
# read them from the retrieved results.
iter_indexer = pt.IterDictIndexer(
    "./index",
    meta={'docno': 100, 'title': 10240, 'abstract': 10240, 'text': 10240},
)

# tqdm only wraps the document list to show indexing progress.
index_ref = iter_indexer.index(tqdm(documents))

### Step 4: Create Run

In [None]:
print('Step 4: Create Run.')

# First-stage retriever. It is named `bm25`, described as BM25 in the pipeline
# comment below and persisted under the system name 'BM25', so the weighting
# model is set to BM25 accordingly (it was "TF_IDF", contradicting all three).
# The metadata fields are fetched so that the per-field scorers can re-score them.
bm25 = pt.BatchRetrieve(index_ref, wmodel="BM25", verbose=True, metadata=['docno', 'text', 'title', 'abstract'])

# Per-field BM25 scorers: each one scores the query against a single field.
bm25_title = pt.text.scorer(body_attr="title", wmodel="BM25")
bm25_abstract = pt.text.scorer(body_attr="abstract", wmodel="BM25")
bm25_text = pt.text.scorer(body_attr="text", wmodel="BM25")

# Here some "random" ranking formula that puts the highest weight on the title and
# reduces the weight of matches on the text field.
# Here is big potential for improvements :)
combined_bm25_score = ((4 * bm25_title) + (1 * bm25_abstract) + (0.5 * bm25_text))

# Per-field DPH scorers, analogous to the BM25 ones above.
dph_title = pt.text.scorer(body_attr="title", wmodel="DPH")
dph_abstract = pt.text.scorer(body_attr="abstract", wmodel="DPH")
dph_text = pt.text.scorer(body_attr="text", wmodel="DPH")

# Here some "random" ranking formula that puts the highest weight on the title;
# abstract and text matches are weighted equally.
# Here is big potential for improvements :)
combined_dph_score = ((2 * dph_title) + (1 * dph_abstract) + (1 * dph_text))

# The overall Pipeline: We retrieve the top-1000 results from BM25 that we re-rank
# using the combined BM25 and DPH scores.
# We just add the scores of BM25 and DPH.
# Here is big potential for improvements :)
# The parentheses only make Python's operator precedence explicit
# (`%` and `+` bind tighter than `>>`); the evaluation order is unchanged.
retrieval_pipeline = (bm25 % 1000) >> (combined_bm25_score + combined_dph_score)
run = retrieval_pipeline(queries)

Step 4: Create Run.


NameError: name 'pt' is not defined

In [None]:
# Bare expression: the notebook renders the run computed in Step 4 as this
# cell's output so it can be inspected before it is persisted.
run

### Step 5: Persist Run

In [None]:
print('Step 5: Persist Run.')

# Hand the run to the TIRA helper, which writes it to the resolved output
# location under the system name 'BM25', with depth=1000
# (presumably the number of results kept per query -- confirm against the TIRA docs).
persist_and_normalize_run(
    run,
    output_file=output_directory,
    system_name='BM25',
    depth=1000,
)