# Anthology Pipeline

### Step 1: Import everything and load variables

In [25]:
import json
import os
import shutil

import pandas as pd
import pyterrier as pt
from tira.third_party_integrations import ensure_pyterrier_is_loaded, get_input_directory_and_output_directory, persist_and_normalize_run
from tqdm import tqdm
# Start PyTerrier only when it is not running yet, so this cell can be
# re-executed on a live kernel without initialising it a second time.
if not pt.started():
    pt.init()

# Export the installed PyTerrier package version under both variable names
# before handing control to the TIRA helper.
# NOTE(review): both variables receive the *Python package* version; confirm
# that this is the value ensure_pyterrier_is_loaded() expects for each of them.
os.environ.update({
    "PYTERRIER_VERSION": str(pt.__version__),
    "PYTERRIER_HELPER_VERSION": str(pt.__version__),
})

ensure_pyterrier_is_loaded()

# The argument is the local fallback dataset; the helper prints which input
# and output locations it actually selected.
input_directory, output_directory = get_input_directory_and_output_directory('iranthology-dataset-tira/')

I will use a small hardcoded example located in iranthology-dataset-tira/.
The output directory is /tmp/


### Step 2: Load the Data

In [26]:
print('Step 2: Load the data.')

# os.path.join yields the same path whether or not input_directory ends with
# a path separator.
queries = pt.io.read_topics(os.path.join(input_directory, 'queries.xml'), format='trecxml')


class _JsonlDocuments:
    """Lazy, re-iterable view over a JSON Lines file.

    Every iteration re-opens the file, yields one parsed JSON value per
    non-blank line, and closes the file again once the iteration finishes.
    Unlike a bare generator expression, the object can be iterated more than
    once, so the indexing cell can be re-run without re-running this cell
    first, and no file handle is left open.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Raises:
        OSError: If the file cannot be opened when iteration starts.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """

    def __init__(self, path):
        self.path = path

    def __iter__(self):
        # Read explicitly as UTF-8 so parsing does not depend on the
        # platform's default locale encoding.
        with open(self.path, 'r', encoding='utf-8') as jsonl_file:
            for line in jsonl_file:
                # Skip blank lines (e.g. a trailing newline at end of file).
                if line.strip():
                    yield json.loads(line)


documents = _JsonlDocuments(os.path.join(input_directory, 'documents.jsonl'))


Step 2: Load the data.


### Step 3: Create the Index

In [27]:
print('Step 3: Create the Index.')

# Remove an index left over from a previous run. shutil.rmtree replaces the
# former `rm -Rf` shell escape, so the cell no longer depends on a POSIX shell
# or on IPython's `!` syntax; ignore_errors=True keeps the "missing directory
# is fine" behaviour of `rm -f`.
shutil.rmtree('./index', ignore_errors=True)

# meta={'docno': 100}: store the docno as index metadata.
# NOTE(review): per the PyTerrier docs the value is the maximum stored length
# of the field in characters - confirm 100 covers the longest docno.
iter_indexer = pt.IterDictIndexer("./index", meta={'docno' : 100})

# tqdm only wraps the document stream to report indexing progress.
index_ref = iter_indexer.index(tqdm(documents))

Step 3: Create the Index.




0it [00:00, ?it/s][A[A

1it [00:00,  3.22it/s][A[A

38it [00:00, 118.70it/s][A[A

310it [00:00, 925.54it/s][A[A

667it [00:00, 1645.35it/s][A[A

932it [00:00, 1673.22it/s][A[A

1217it [00:00, 1918.26it/s][A[A

1741it [00:01, 2732.40it/s][A[A

2368it [00:01, 3651.00it/s][A[A

2939it [00:01, 4220.56it/s][A[A

3400it [00:01, 4311.04it/s][A[A

4073it [00:01, 4876.06it/s][A[A

4678it [00:01, 5154.68it/s][A[A

5431it [00:01, 5699.22it/s][A[A

6008it [00:01, 5640.58it/s][A[A

6837it [00:01, 6311.75it/s][A[A

7473it [00:01, 5947.84it/s][A[A

8131it [00:02, 6089.29it/s][A[A

8746it [00:02, 5028.97it/s][A[A

9281it [00:02, 5007.85it/s][A[A

9804it [00:02, 4694.12it/s][A[A

10291it [00:02, 4481.41it/s][A[A

10840it [00:02, 4686.29it/s][A[A

11334it [00:02, 4702.77it/s][A[A

12352it [00:02, 5803.11it/s][A[A

12931it [00:03, 5616.07it/s][A[A

13492it [00:03, 5406.75it/s][A[A

14054it [00:03, 5438.23it/s][A[A

14598it [00:03, 5414.78it/s][A





17320it [00:03, 5582.15it/s][A[A

17899it [00:03, 5463.78it/s][A[A

18530it [00:04, 5552.22it/s][A[A

19214it [00:04, 5829.71it/s][A[A

20149it [00:04, 6156.42it/s][A[A

20767it [00:04, 5622.03it/s][A[A

21336it [00:04, 5273.78it/s][A[A

21868it [00:04, 5184.25it/s][A[A

22389it [00:04, 4196.26it/s][A[A

22836it [00:05, 3547.43it/s][A[A

23246it [00:05, 3614.02it/s][A[A

23690it [00:05, 3749.78it/s][A[A

24252it [00:05, 4095.11it/s][A[A

24715it [00:05, 4086.34it/s][A[A

0it [30:23, ?it/s]60.02it/s][A[A
0it [00:57, ?it/s]


26102it [00:05, 3275.65it/s][A[A

27092it [00:06, 4563.63it/s][A[A

27656it [00:06, 4652.26it/s][A[A

28301it [00:06, 4367.34it/s][A[A

28796it [00:06, 4078.73it/s][A[A

29244it [00:06, 3914.83it/s][A[A

30076it [00:06, 4627.04it/s][A[A

30563it [00:06, 4588.18it/s][A[A

31282it [00:06, 5166.70it/s][A[A

31820it [00:07, 5071.95it/s][A[A

32468it [00:07, 5444.20it/s][A[A

33028it [00:07, 5302.23it/s][A[A

33569i

18:16:34.695 [ForkJoinPool-2-worker-3] WARN org.terrier.structures.indexing.Indexer - Indexed 3 empty documents


### Step 4: Create Run

In [33]:
print('Step 4: Create Run.')

# Retrieve with BM25 against the index built in Step 3 and keep the five
# best-scoring documents for every query.
# NOTE(review): only 5 results per query are retrieved here while Step 5
# persists with depth=1000 - confirm that the small cut-off is intended.
run = pt.BatchRetrieve(
    index_ref,
    wmodel="BM25",
    num_results=5,
    verbose=True,
).transform(queries)

Step 4: Create Run.


BR(BM25): 100%|██████████| 6/6 [00:00<00:00, 66.44q/s]


In [34]:
# Preview only the first rows instead of rendering the whole run.
run.head(10)

Unnamed: 0,qid,docid,docno,rank,score,query
0,1,24609,2017.cikm_conference-2017.312,0,23.099142,deep neural networks
1,1,15569,2020.wsdm_conference-2020.4,1,22.204831,deep neural networks
2,1,15691,2020.wsdm_conference-2020.126,2,21.78412,deep neural networks
3,1,15884,2015.mir_conference-2015.49,3,21.565786,deep neural networks
4,1,16582,2017.mir_conference-2017.38,4,21.565786,deep neural networks
5,2,10846,2003.sigirconf_conference-2003.73,0,3.794062,information retrieval
6,2,28283,2015.ictir_conference-2015.2,1,3.699611,information retrieval
7,2,53311,2011.tois_journal-ir0anthology0volumeA29A2.0,2,3.667209,information retrieval
8,2,51553,2006.ipm_journal-ir0anthology0volumeA42A1.2,3,3.619692,information retrieval
9,2,48845,2012.sigirjournals_journal-ir0anthology0volume...,4,3.618535,information retrieval


### Step 5: Persist Run

In [30]:
print('Step 5: Persist Run.')

# Hand the run to the TIRA helper, labelled as system 'BM25' and targeting
# the output directory chosen in Step 1.
# NOTE(review): depth presumably caps the number of results kept per query -
# confirm against the helper's documentation.
persist_and_normalize_run(
    run,
    system_name='BM25',
    depth=1000,
    output_file=output_directory,
)

Step 5: Persist Run.
