# Creating indices for IIRC

In [1]:
import json

# Load the test-set items and the mapping of linked-article titles to their text.
# Context managers close each file handle as soon as parsing finishes; JSON
# interchange files are UTF-8, so decode them explicitly rather than relying on
# the platform default encoding.
with open('data/iirc_test.json', 'r', encoding='utf-8') as test_file:
    test_set = json.load(test_file)
with open("data/context_articles.json", 'r', encoding='utf-8') as articles_file:
    context_articles = json.load(articles_file)

In [2]:
from bs4 import BeautifulSoup


# Build a de-duplicated list of {"title", "content"} records: one per test-set
# item, plus one per linked article whose text is present in `context_articles`.
# Titles are compared case-insensitively (lower-cased keys).
documents = []
all_titles = []       # lower-cased titles in insertion order (kept as a list for later use)
seen_titles = set()   # mirrors `all_titles`; gives constant-time membership checks

for item in test_set:
    title_key = item['title'].lower()
    if title_key not in seen_titles:
        documents.append({
            "title": item['title'],
            "content": item["text"]
        })
        all_titles.append(title_key)
        seen_titles.add(title_key)
    for link in item["links"]:
        target_key = link['target'].lower()
        if target_key in context_articles and target_key not in seen_titles:
            documents.append({
                "title": link['target'],
                "content": context_articles[target_key]
            })
            all_titles.append(target_key)
            seen_titles.add(target_key)
        else:
            # Reached both when the target has no entry in `context_articles`
            # and when it has already been added, so a title can print repeatedly.
            print(target_key)

9th paratroopers assault regiment "col moschin"
goldfinger (film)
list of international cricket council members
icc americas championship
the rev
avenged sevenfold
fox footy
herald sun
fox footy
herald sun
united states
judeo-iraqi arabic
maya civilization
black watch
suicidal tendencies
western hockey league
national hockey league
home run
minor league baseball
colonel
colonel
massachusetts institute of technology
israel
harvard business review
american football
college football
united states
billboard 200
romeo discography
billboard 200
master p
hip hop history
billboard 200
louisiana
arizona
state farm stadium
louisiana
united states
gulf of mexico
saffir–simpson scale
forgotten realms
list of dungeons & dragons rulebooks
mexico
napoleon iii
american football
national football league
mexico
lucha libre
protagonist
double dragon
world war ii
banff, alberta
american football
quarterback
college football
2009 nfl draft
new york city
los angeles
metal massacre
metal massacre
hull city a

In [3]:
from tqdm import tqdm
import spacy

# Blank English pipeline with only a rule-based sentence splitter.
nlp = spacy.blank("en")
if int(spacy.__version__.split(".")[0]) >= 3:
    # spaCy v3+ registers built-in components by their string name.
    nlp.add_pipe("sentencizer")
else:
    # spaCy v2 expects an instantiated component object.
    nlp.add_pipe(nlp.create_pipe("sentencizer"))

# Windowing parameters, counted in sentences.
stride = 2
max_length = 3

def window(documents, stride=2, max_length=3, max_chars=10000):
    """Split each document into overlapping windows of consecutive sentences.

    Args:
        documents: iterable of dicts with a 'title' and a 'content' string.
        stride: number of sentences to advance between successive windows.
        max_length: maximum number of sentences joined into one window.
        max_chars: only the first `max_chars` characters of each document's
            content are sentence-split; the remainder is ignored.

    Returns:
        A list of dicts, one per window, with keys:
            'title'    - the source document's title,
            'contents' - the title, then ". ", then the window text,
            'segment'  - the window text alone (sentences joined by a space).
    """
    treated_documents = []

    for document in tqdm(documents):
        title = document['title']
        doc = nlp(document['content'][:max_chars])
        # `Span.text` replaces the deprecated `Span.string` (removed in spaCy v3);
        # after strip() both give the same sentence text.
        sentences = [sent.text.strip() for sent in doc.sents]
        for start in range(0, len(sentences), stride):
            segment = ' '.join(sentences[start:start + max_length])
            treated_documents.append({
                "title": title,
                "contents": title + ". " + segment,
                "segment": segment
            })
            # This window already reaches the final sentence; another step would
            # only produce a suffix of it, so stop here.
            if start + max_length >= len(sentences):
                break
    return treated_documents

# Pass the module-level settings explicitly so that editing `stride` or
# `max_length` above actually changes the windowing.
treated_documents = window(documents, stride=stride, max_length=max_length)

100%|██████████| 7028/7028 [01:53<00:00, 61.70it/s]


In [4]:
# -p: do not fail when the directory already exists, so the cell can be re-run.
!mkdir -p data/iirc_indices

In [5]:
# Write one JSON object per line, skipping entries whose segment is empty.
# The `with` block closes (and therefore flushes) the file when the loop ends,
# so the data is fully on disk before anything else reads it.
with open("data/iirc_indices/contents.jsonl", 'w') as f:
    for i, doc in enumerate(treated_documents):
        # The id is the position in `treated_documents`, assigned even to
        # skipped entries, so written ids may have gaps.
        doc['id'] = i
        if doc['segment'] != "":
            f.write(json.dumps(doc) + "\n")

In [6]:
# Run the pyserini indexer over the JSON files in data/iirc_indices (read as a
# JsonCollection) and write the index to data/iirc_index, using a single thread
# and the -storeRaw option.
!python3 -m pyserini.index -collection JsonCollection -generator DefaultLuceneDocumentGenerator -threads 1 -input data/iirc_indices -index data/iirc_index -storeRaw

2022-10-19 19:49:52,586 INFO  [main] index.IndexCollection (IndexCollection.java:650) - Setting log level to INFO
2022-10-19 19:49:52,589 INFO  [main] index.IndexCollection (IndexCollection.java:653) - Starting indexer...
2022-10-19 19:49:52,589 INFO  [main] index.IndexCollection (IndexCollection.java:655) - DocumentCollection path: data/iirc_indices
2022-10-19 19:49:52,590 INFO  [main] index.IndexCollection (IndexCollection.java:656) - CollectionClass: JsonCollection
2022-10-19 19:49:52,590 INFO  [main] index.IndexCollection (IndexCollection.java:657) - Generator: DefaultLuceneDocumentGenerator
2022-10-19 19:49:52,590 INFO  [main] index.IndexCollection (IndexCollection.java:658) - Threads: 1
2022-10-19 19:49:52,591 INFO  [main] index.IndexCollection (IndexCollection.java:659) - Stemmer: porter
2022-10-19 19:49:52,591 INFO  [main] index.IndexCollection (IndexCollection.java:660) - Keep stopwords? false
2022-10-19 19:49:52,591 INFO  [main] index.IndexCollection (IndexCollection.java:661