# Information Retrieval

In [None]:
# Imports
## General
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

## Data retrieval
import os

## Data exploration
import glob
from nltk.tokenize import wordpunct_tokenize
from wordcloud import WordCloud

## Pyserini
import pyserini
from pyserini.index import IndexReader

## Models
from transformers import BertTokenizer, BertModel
import torch

# 1. Data retrieval

First we download the MS MARCO passage collection and index its documents with Lucene.

In [None]:
# Download msmarco
if not os.path.exists('data/'):
 # Download
 print('1/4 Start download')
 !curl https://msmarco.blob.core.windows.net/msmarcoranking/collection.tar.gz -o data/msmarco_passage/collection.tar.gz --create-dirs

 # Unzip
 print('2/4 Unzip')
 !tar xvfz data/msmarco_passage/collection.tar.gz -C data/msmarco_passage

 # Map .tsv to .json
 print('3/4 Map tsv to json')
 !git clone https://github.com/castorini/anserini.git
 !cd anserini && git checkout ad5ba1c76196436f8a0e28efdb69960d4873efe3
 !cd anserini && python ./src/main/python/msmarco/convert_collection_to_jsonl.py \
 --collection_path ../data/msmarco_passage/collection.tsv --output_folder ../data/msmarco_passage/collection_jsonl

 # Clean up
 print('4/4 Clean up')
 !rm data/msmarco_passage/collection.tar.gz
 !rm data/msmarco_passage/*.tsv
 !rm -rf sample_data
 !rm -rf -v anserini

In [None]:
# Index documents in Lucene
if not os.path.exists('indexes/'):
 !python -m pyserini.index.lucene -collection JsonCollection -generator DefaultLuceneDocumentGenerator -threads 9 \
 -input data/msmarco_passage/collection_jsonl -index indexes/lucene-index-msmarco-passage -storePositions -storeDocvectors -storeRaw

Then we fetch our queries. *(TODO: the query-loading cell has not been added yet.)*

# 2. Data exploration
We will start with some data exploration, to gain a better understanding of what we are dealing with and what pre-processing steps we should consider.

First we create an index reader and look at some of the Lucene index statistics.

In [None]:
# Open the Lucene index of the passage collection for read-only inspection.
lucene_index_dir = 'indexes/lucene-index-msmarco-passage'
index_reader = IndexReader(lucene_index_dir)

In [None]:
# Report collection-level statistics stored in the Lucene index.
# Note: `unique_terms` is reported as -1 unless the index was built with `-optimize`.
lucene_stats = index_reader.stats()
n_documents = lucene_stats["documents"]
n_empty_documents = n_documents - lucene_stats["non_empty_documents"]
n_terms = lucene_stats["total_terms"]

print(f'Number of documents in dataset:       {n_documents}')
print(f'Number of empty documents in dataset: {n_empty_documents}')
print(f'Number of terms in dataset:           {n_terms}')

Now we will sample some documents to:
1. Look into the document length statistics and plot the number of words per document in a histogram
2. Plot the most common words in a wordcloud
3. Print a few documents for manual inspection

In [None]:
# Collect documents
def sample_documents(filenames, n_per_file=1000, seed=1):
    """Draw a reproducible random sample of rows from each JSON-lines file.

    Parameters
    ----------
    filenames : iterable of str
        Paths of JSON-lines files; each one is read with
        ``pd.read_json(filename, lines=True)``.
    n_per_file : int, default 1000
        Maximum number of rows sampled per file. A file with fewer rows
        contributes all of its rows instead of raising ``ValueError``.
    seed : int, default 1
        Value passed as ``random_state`` to ``DataFrame.sample``.

    Returns
    -------
    pd.DataFrame
        The per-file samples stacked with a fresh 0..n-1 index. When no
        file is given, an empty frame with the columns ``id`` and
        ``contents``.
    """
    samples = []
    for filename in filenames:
        file_docs = pd.read_json(filename, lines=True)
        # Cap the sample size so that a short file does not abort the whole run.
        n_rows = min(n_per_file, len(file_docs))
        samples.append(file_docs.sample(n_rows, random_state=seed))

    if not samples:
        # pd.concat raises on an empty list; return the expected schema instead.
        return pd.DataFrame(columns=['id', 'contents'])
    # Concatenate once instead of re-copying a growing frame on every iteration.
    return pd.concat(samples, ignore_index=True)


# glob returns paths in arbitrary order; sorting keeps the order of the sampled
# documents identical across runs and machines.
docs_filenames = sorted(glob.glob('data/msmarco_passage/collection_jsonl/*.json'))
documents = sample_documents(docs_filenames)

In [None]:
# Document length statistics
# Tokenise every sampled passage once and count the tokens per passage.
words = [wordpunct_tokenize(text) for text in documents['contents']]
lengths = np.array([len(tokens) for tokens in words])

print(f'Average document length: {lengths.mean()}')
print(f'Minimum document length: {lengths.min()}')
print(f'Maximum document length: {lengths.max()}')

In [None]:
# Histogram of the token counts per sampled document
hist_fig, hist_ax = plt.subplots()
hist_ax.hist(lengths)
hist_ax.set(
    xlabel='Number of words in document',
    ylabel='Number of documents',
    title='Document lengths',
)
plt.show()
print('Figure 1. Histogram of the number of words per sampled document')

In [None]:
# Wordcloud
# Flatten the per-document token lists into one space-separated string.
contents = ' '.join(token for tokens in words for token in tokens)

wordcloud = WordCloud(width=1600, height=400).generate(contents)
cloud_fig, cloud_ax = plt.subplots(figsize=(25, 25))
cloud_ax.imshow(wordcloud)
cloud_ax.axis("off")
plt.show()
print('Figure 2. Wordcloud of most common words in documents sample')

In [None]:
# Print the first few sampled documents, each under a bold numbered heading
# (\033[1m switches bold on, \033[0m resets the terminal style).
n_samples = 5
first_documents = documents['contents'].head(n_samples)
for idx, document in enumerate(first_documents, start=1):
    print(f'\033[1m- Document {idx}:\033[0m')
    print(document)
    print()

# 3. Data pre-processing

In [None]:
# TODO (pre-processing notes; kept as comments so this cell does not raise a SyntaxError):
# - We have the document texts.
# - We will also have the queries soon.
# - For BM25 and word2vec we will probably need stop-word removal / stemming.


# 4. Model implementation

## 4.1 Word2Vec

## 4.2 ELMo

## 4.3 BERT

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
bert_checkpoint = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(bert_checkpoint)
model = BertModel.from_pretrained(bert_checkpoint)

# 5. Model evaluation

In [None]:
# TODO: calculate the previously handled metrics
# (kept as a comment so this placeholder cell does not raise a SyntaxError)