# Calculating relevance metrics

In [None]:
import pandas as pd
import scipy as sp
import scipy.stats as st
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
import json
import random
from pprint import pprint
from time import sleep
from datetime import datetime, timedelta
from urllib import parse
from bs4 import BeautifulSoup
import requests
from irmetrics.topk import rr
# Make every matplotlib figure in this notebook default to a 16 x 9 (width, height) size.
matplotlib.rcParams['figure.figsize'] = [16, 9]

Set some useful parameters. It's important to set a custom User-Agent here, as we want to be able to exclude these searches from future relevance judgment sets when running against production. Running 20 bootstraps of 250 queries took about 25 minutes for me; adjust these values to taste!

In [None]:
# Base URL of the search results page that every sampled query is replayed against.
SERP = "https://caselaw.nationalarchives.gov.uk/judgments/search"
# Sent as the User-Agent header on every request so this traffic can be identified.
USER_AGENT = "Tim-MetricsBot/0.0.1"
# Pause, in seconds, passed to sleep() after each search request
# (despite the name, this is not an HTTP request timeout).
TIMEOUT = 0.2
# Default number of bootstrap samples to draw.
BOOTSTRAPS = 20
# Default number of queries in each bootstrap sample.
SAMPLE_SIZE = 250
# CSS selector matching the element that wraps each result's link on the results page.
RESULT_SELECTOR = ".judgment-listing__title"
# Page size assumed when a logged query carries no "per_page" parameter.
DEFAULT_PER_PAGE = 10

Load the data file we created in the 'Parse cloudfront' notebook:

In [None]:
# Load the logged searches (query parameters plus the documents visited afterwards).
# Read explicitly as UTF-8, the standard JSON encoding, instead of relying on the
# platform's locale default, which differs between operating systems.
with open("data/search_queries_and_documents.json", encoding="utf-8") as file:
    searches = json.load(file)

Define a method to randomly sample from our queries and relevance judgments. Bootstraps are sampled with replacement (the same query may appear in more than one bootstrap), but the queries within each bootstrap are sampled without replacement, so each query within a bootstrap is unique.

In [None]:
def select_searches(searches, n_bootstraps=BOOTSTRAPS, sample_size=SAMPLE_SIZE):
    """Draw ``n_bootstraps`` random samples of ``sample_size`` searches each.

    Each bootstrap is drawn independently from the full list, so the same
    search may appear in several bootstraps. Within a single bootstrap the
    searches are drawn without replacement, so no list entry is picked twice.

    Args:
        searches: Sequence of search records to sample from.
        n_bootstraps: Number of samples to draw.
        sample_size: Number of searches in each sample.

    Returns:
        A list of ``n_bootstraps`` lists, each holding ``sample_size`` searches.

    Raises:
        ValueError: If ``sample_size`` is larger than ``len(searches)``
            (raised by ``random.sample``).
    """
    # random.sample draws without replacement; random.choices (used previously)
    # draws with replacement and could repeat a query inside one bootstrap.
    return [random.sample(searches, k=sample_size) for _ in range(n_bootstraps)]

Define a method to do the business of running the search, and returning the urls of the results.

In [None]:
def get_results_for_search(search):
    """Replay one logged search against the search page and return the result URLs.

    Args:
        search: A search record whose ``"query"`` entry maps parameter names to
            values; the ``"query"`` parameter inside it holds the URL-quoted
            search text (normally as a one-element list).

    Returns:
        The ``href`` of the first link inside each element matched by
        ``RESULT_SELECTOR``, in page order, with any query string removed.
    """
    raw_query = search["query"]["query"]
    # Normally a one-element list; also accept a plain string so that a record
    # whose query text has already been flattened is not truncated to its
    # first character by the [0] lookup.
    query_text = raw_query if isinstance(raw_query, str) else raw_query[0]
    # TODO this unquoting should be done in creating the judgments file, not here.
    # Build a copy of the parameters instead of overwriting the record: the same
    # record can be sampled again in a later bootstrap and must keep its shape.
    params = dict(search["query"], query=parse.unquote(query_text))
    query_string = parse.urlencode(params, doseq=True)
    response = requests.get("%s?%s" % (SERP, query_string), headers={'User-Agent': USER_AGENT})
    soup = BeautifulSoup(response.text, 'html.parser')
    result_elements = soup.select(RESULT_SELECTOR)
    # Strip the query string so each URL can be compared with the logged document URLs.
    return [e.find("a", href=True)["href"].split("?")[0] for e in result_elements]

Define a method to compute the mean reciprocal rank for a list of 'true positives' and a corresponding list of search results

In [None]:
def mean_reciprocal_rank(search_results):
    """Return the mean of ``rr`` over all searches in one bootstrap.

    Args:
        search_results: List of dicts, each with a ``"true"`` entry (the
            expected document) and a ``"returned"`` entry (the list of
            results for that search).

    Returns:
        The ``.mean()`` of whatever ``rr`` returns for the expected documents
        and the stacked result lists (presumably one reciprocal rank per
        search - confirm against the irmetrics documentation).
    """
    expected = []
    ranked = []
    for entry in search_results:
        expected.append(entry["true"])
        ranked.append(entry["returned"])
    # Result lists differ in length; right-pad with nan so they stack into a
    # rectangular array.
    width = max(map(len, ranked))
    rows = [row + [np.nan] * (width - len(row)) for row in ranked]
    return rr(expected, np.vstack(rows)).mean()

Define a method to summarise the results across each bootstrap - returning the average MRR, std of MRR, the average rank of the true result and its confidence interval

In [None]:
def summarise_results(results):
    """Summarise the per-bootstrap MRR values.

    Args:
        results: List of dicts, each carrying an ``"mrr"`` value.

    Returns:
        A dict with:
            ``mrr_mean``: mean of the MRR values.
            ``mrr_std``: ``np.std`` of the MRR values.
            ``avg_rank``: mean of ``1 / mrr`` over the bootstraps.
            ``rank_ci``: 95% Student-t interval around ``avg_rank``, using the
                standard error of the ``1 / mrr`` values.
    """
    reciprocal_ranks = [run["mrr"] for run in results]
    # Invert each bootstrap's MRR to express it as a rank.
    ranks = [1 / value for value in reciprocal_ranks]
    mean_rank = np.mean(ranks)
    rank_interval = st.t.interval(
        0.95,
        len(ranks) - 1,
        loc=mean_rank,
        scale=st.sem(ranks),
    )
    return {
        "mrr_mean": np.mean(reciprocal_ranks),
        "mrr_std": np.std(reciprocal_ranks),
        "avg_rank": mean_rank,
        "rank_ci": rank_interval,
    }

Put it all together and print the results! This will likely take a little while, and will print the MRR of each bootstrap as it goes, so you can see the progress. At the end you will see the summary we defined above, over all runs:

In [None]:
def run_evaluation(searches, bootstraps=BOOTSTRAPS, sample_size=SAMPLE_SIZE):
    """Replay bootstrap samples of searches and print MRR statistics.

    For every bootstrap, each sampled search is re-run against the search
    page, the returned URLs are offset to account for pagination, and the
    mean reciprocal rank of the expected document is computed. Progress is
    printed per bootstrap, followed by a summary over all bootstraps.

    Args:
        searches: Sequence of search records, each with a ``"query"`` dict of
            parameters and a ``"documents"`` list.
        bootstraps: Number of bootstrap samples to run.
        sample_size: Number of searches in each bootstrap.

    Returns:
        None. All output is printed.
    """
    bootstrap_results = []
    print("--- STARTING evaluation runs: %s runs of %s queries each. ---" % (bootstraps, sample_size))
    for run_number, bootstrap in enumerate(select_searches(searches, bootstraps, sample_size), start=1):
        # "%" binds tighter than "+", so the previous `"..." % i+1` tried to add
        # an int to the formatted string and raised TypeError on the first run.
        print("STARTING bootstrap run %s" % run_number)
        search_results = []
        for search in bootstrap:
            results = get_results_for_search(search)
            page = int(search["query"].get("page", [1])[0])
            per_page = int(search["query"].get("per_page", [DEFAULT_PER_PAGE])[0])
            # When page > 1, pad the results with nan, so that the rank takes into account the pagination.
            # Pages are treated as 1-indexed: page 1 (or no page) gets no offset,
            # page 2 is offset by one full page, and so on.
            offset = max(page - 1, 0) * per_page
            padded_results = ([np.nan] * offset) + results
            # The last document in the record is used as the expected ("true") result.
            search_results.append({"true": search["documents"][-1], "returned": padded_results, "search": search})
            # Pause between requests to avoid hammering the search service.
            sleep(TIMEOUT)
        mrr = mean_reciprocal_rank(search_results)
        print("FINISHED run %s, mean reciprocal rank: %.2f" % (run_number, mrr))
        bootstrap_results.append({"searches": search_results, "mrr": mrr})
    summary = summarise_results(bootstrap_results)
    print("--- FINISHED evaluation runs ---")
    print("--- SUMMARY ---")
    print("* Bootstrap resampling runs: %s" % bootstraps)
    print("* Sample size: %s" % sample_size)
    print("* Mean Reciprocal Rank (Standard deviation): %.2f (%.2f)" % (summary["mrr_mean"], summary["mrr_std"]))
    print("* Maximum-likelihood rank for true result (Confidence interval): %.2f (%.2f, %.2f)" % (summary["avg_rank"], summary["rank_ci"][0], summary["rank_ci"][1]))

In [None]:
# Run the full evaluation with the default BOOTSTRAPS and SAMPLE_SIZE settings.
run_evaluation(searches)