# Prepare Evaluation Data

This notebook gathers open-source data that can be used to evaluate a model on information retrieval (IR) tasks. We will use the [DBpedia](https://github.com/iai-group/DBpedia-Entity/) dataset, which is a subset of the Text REtrieval Conference [(TREC)](https://trec.nist.gov/data.html) dataset, a common benchmarking dataset for IR models.

The approach we are using does not train per se: it simply calculates the distance between embeddings, and the sorted distances are evaluated against the labeled relevancy categories.  So all of the data gathered in this notebook is used only for evaluation.

In [1]:
import os, json, csv
import urllib.request
import typing as t
import numpy as np
import pandas as pd
from zipfile import ZipFile
from tqdm import tqdm
from more_itertools import take
from google.cloud import storage

In [2]:
# local folder the dataset is downloaded into and later loaded from
DATA_PATH = "data"
# Google Cloud Storage bucket and folder that receive the prepared evaluation files
GCS_BUCKET = "queryable-docs-artifacts-5024"
GCS_FOLDER_PATH = "ir_eval_data"

In [20]:
def download_and_unzip_dbpedia(save_path: str, dataset: str = "dbpedia-entity"):
    """
    Downloads a BEIR dataset archive (DBpedia by default) and unzips it into `save_path`.
    DBpedia documentation: https://github.com/iai-group/DBpedia-Entity/
    Download link comes from: https://github.com/beir-cellar/beir

    Args:
        save_path: folder the archive is extracted into; created if it does not exist.
        dataset: name of the BEIR dataset; used for both the download URL and the
            name of the temporary zip file.

    Returns:
        None. The extracted files are the only result.

    Raises:
        Whatever `urllib.request.urlretrieve` or `ZipFile` raise on a failed download
        or a corrupt archive. The temporary zip file is removed in either case.
    """
    url = f"https://public.ukp.informatik.tu-darmstadt.de/thakur/BEIR/datasets/{dataset}.zip"
    # name the temporary archive after the requested dataset instead of hard-coding DBpedia
    fname = f"{dataset}.zip"

    try:
        # download the dataset archive to the current working directory
        urllib.request.urlretrieve(url, fname)

        # extract to the target folder, creating it (and any parents) when missing
        os.makedirs(save_path, exist_ok=True)
        with ZipFile(fname, "r") as zipf:
            zipf.extractall(path=save_path)
    finally:
        # remove the downloaded zip file, even when the download or extraction failed
        if os.path.exists(fname):
            os.remove(fname)


def load_dbpedia(from_path: str) -> t.Tuple[t.Dict[str, t.Dict[str, str]], t.Dict[str, str], t.Dict[str, t.Dict[str, int]]]:
    """
    Loads the DBpedia dataset.
    Code adapted from: https://github.com/beir-cellar/beir/blob/main/beir/datasets/data_loader.py

    The queries dict is a key:value map of the query ID to the query text.  Here is an example of a query:
    queries['TREC_Entity-9']

    The query relations dict maps query IDs to a dict of document IDs and their relevancy scores (0, 1, 2), where
    0 = irrelevant, 1 = relevant, and 2 = highly relevant.  Not all doc IDs appear in the dict results for a given
    query.
    qrels['TREC_Entity-9']

    Corpus maps document IDs to a dict of texts and titles.
    corpus['<dbpedia:Todd_Levy>']

    Args:
        from_path: folder that contains the extracted `dbpedia-entity` directory
            (with `corpus.jsonl`, `queries.jsonl` and `qrels/test.tsv` inside it).

    Returns:
        A `(corpus, queries, qrels)` tuple, in that order.

    Raises:
        FileNotFoundError if one of the three files is missing, and whatever
        `json.loads` / `int` raise on malformed rows.
    """
    corpus, queries, qrels = {}, {}, {}

    corpus_file = f"{from_path}/dbpedia-entity/corpus.jsonl"
    queries_file = f"{from_path}/dbpedia-entity/queries.jsonl"
    qrels_file = f"{from_path}/dbpedia-entity/qrels/test.tsv"

    # count the corpus lines up front so the progress bar has a total;
    # the context manager makes sure this extra handle is closed again
    with open(corpus_file, 'rb') as f:
        num_lines = sum(1 for _ in f)

    # load the corpus
    with open(corpus_file, encoding='utf8') as f:
        for line in tqdm(f, total=num_lines):
            line = json.loads(line)
            corpus[line.get("_id")] = {
                "text": line.get("text"),
                "title": line.get("title"),
            }

    # load the queries
    with open(queries_file, encoding='utf8') as f:
        for line in f:
            line = json.loads(line)
            queries[line.get("_id")] = line.get("text")

    # load the query:doc relationships
    # (newline="" is what the csv module asks for when it is handed a file object)
    with open(qrels_file, encoding="utf-8", newline="") as f:
        reader = csv.reader(
            f,
            delimiter="\t",
            quoting=csv.QUOTE_MINIMAL
        )
        # skip the header row; the default keeps an empty file from raising StopIteration
        next(reader, None)

        for row in reader:
            query_id, corpus_id, score = row[0], row[1], int(row[2])
            qrels.setdefault(query_id, {})[corpus_id] = score

    return corpus, queries, qrels


def upload_to_cloud_storage(data: dict, bucket_name: str, folder_path: str, fname: str):
    """
    Serialises `data` to JSON and uploads it to a Google Cloud Storage bucket.

    Args:
        data: dictionary to serialise with `json.dumps`.
        bucket_name: name of the destination bucket.
        folder_path: folder prefix inside the bucket.
        fname: file name of the object; the object is stored as `<folder_path>/<fname>`.

    Returns:
        None.
    """
    # 60 * 5, i.e. five minutes if the client reads the timeout in seconds
    upload_timeout = int(60*5)
    object_name = "/".join((folder_path, fname))

    target_blob = storage.Client().bucket(bucket_name).blob(object_name)
    target_blob.upload_from_string(json.dumps(data), timeout=upload_timeout)


In [4]:
# fetch the dataset only when the corpus file is not on disk yet, then load everything into memory
corpus_jsonl = f"{DATA_PATH}/dbpedia-entity/corpus.jsonl"
already_downloaded = os.path.exists(corpus_jsonl)
if not already_downloaded:
    download_and_unzip_dbpedia(save_path=DATA_PATH)
corpus, queries, qrels = load_dbpedia(from_path=DATA_PATH)


100%|██████████| 4635922/4635922 [00:17<00:00, 269539.07it/s]


In [16]:
# totals reused as denominators in the reports below
n_qrels = len(qrels)
n_queries = len(queries)
n_docs = len(corpus)

# data validation: how many queries have no relevant documents?
# (a query is counted when every document judged for it carries the label 0)
queries_without_relevant_docs = sum(
    all(label == 0 for label in judged.values())
    for judged in qrels.values()
)
pct = round(100 * queries_without_relevant_docs / n_qrels, 2)
print(
    f"Queries without relevant documents: "
    f"{queries_without_relevant_docs} / {n_qrels} = "
    f"{pct}%"
)

# data validation: how many queries have no query relations?
q_minus_qrel = len(queries.keys() - qrels.keys())
pct = round(100 * q_minus_qrel / n_queries, 2)
print(
    f"Queries without query relations: "
    f"{q_minus_qrel} / {n_queries} = "
    f"{pct}%"
)

# data validation: how many query relations have no query data?
qrel_minus_q = len(qrels.keys() - queries.keys())
pct = round(100 * qrel_minus_q / n_qrels, 2)
print(
    f"Query relations without queries: "
    f"{qrel_minus_q} / {n_qrels} = "
    f"{pct}%"
)

# data validation: how many documents in query relations have no document data?
# (unique_docs_in_qrels collects every document ID judged for at least one query)
unique_docs_in_qrels = set()
for judged in qrels.values():
    unique_docs_in_qrels.update(judged)
n_judged_docs = len(unique_docs_in_qrels)
qrel_minus_docs = len(unique_docs_in_qrels.difference(corpus))
pct = round(100 * qrel_minus_docs / n_judged_docs, 2)
print(
    f"Documents in query relations without document data: "
    f"{qrel_minus_docs} / {n_judged_docs} = "
    f"{pct}%"
)

# data validation: how many documents have no query relations?
docs_minus_qrels = len(corpus.keys() - unique_docs_in_qrels)
pct = round(100 * docs_minus_qrels / n_docs, 2)
print(
    f"Documents without query relations: "
    f"{docs_minus_qrels} / {n_docs} = "
    f"{pct}%"
)

# assertion should be True if data validation counts are correct
assert n_docs - n_judged_docs == docs_minus_qrels

Queries without relevant documents: 0 / 400 = 0.0%
Queries without query relations: 67 / 467 = 14.35%
Query relations without queries: 0 / 400 = 0.0%
Documents in query relations without document data: 0 / 40724 = 0.0%
Documents without query relations: 4595198 / 4635922 = 99.12%


In [17]:
# remove documents without query relations - they will be of no use for evaluation
corpus = {k: v for k, v in corpus.items() if k in unique_docs_in_qrels}
print(f"New corpus size: {len(corpus)}")

# remove queries without query relations - they will be of no use for evaluation
# (membership is tested on the qrels dict directly: same result as `k in set(qrels)`,
# but without rebuilding the set of query IDs once per query)
queries = {k: v for k, v in queries.items() if k in qrels}
print(f"New query count: {len(queries)}")

New corpus size: 40724
New query count: 400


In [21]:
# save the data to be used for evaluation: one JSON object per dataset part, uploaded in this order
eval_files = (
    ("queries.json", queries),
    ("qrels.json", qrels),
    ("corpus.json", corpus),
)
for eval_fname, eval_data in eval_files:
    upload_to_cloud_storage(
        data=eval_data,
        bucket_name=GCS_BUCKET,
        folder_path=GCS_FOLDER_PATH,
        fname=eval_fname,
    )