In [None]:
# Pin the version the rest of the notebook was developed against (the recorded
# run resolved matchzoo-py 1.1.1) so a fresh runtime reproduces the same API.
!pip install matchzoo-py==1.1.1

Collecting matchzoo-py
[?25l  Downloading https://files.pythonhosted.org/packages/ad/25/ee41c52865d6fe60c54eb362b00bed95d87198b88c301079631e30f597d0/matchzoo-py-1.1.1.tar.gz (109kB)
[K     |████████████████████████████████| 112kB 7.1MB/s 
Collecting pytorch-transformers>=1.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/a3/b7/d3d18008a67e0b968d1ab93ad444fc05699403fa662f634b2f2c318a508b/pytorch_transformers-1.2.0-py3-none-any.whl (176kB)
[K     |████████████████████████████████| 184kB 13.1MB/s 
[?25hCollecting nltk>=3.4.3
[?25l  Downloading https://files.pythonhosted.org/packages/92/75/ce35194d8e3022203cca0d2f896dbb88689f9b3fce8e9f9cff942913519d/nltk-3.5.zip (1.4MB)
[K     |████████████████████████████████| 1.4MB 15.7MB/s 
Collecting tqdm==4.38.0
[?25l  Downloading https://files.pythonhosted.org/packages/b9/08/8505f192efc72bfafec79655e1d8351d219e2b80b0dec4ae71f50934c17a/tqdm-4.38.0-py2.py3-none-any.whl (53kB)
[K     |████████████████████████████████| 61kB 7.8MB/s

In [None]:
import torch

# Prefer the GPU when one is visible; otherwise fall back to an explicit CPU
# device so that `device` is always a real torch.device (never None) for the
# `.to(device)` calls and loaders further down.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print(device)

cuda


# Baseline and Preprocessing

This code prepares and lightly preprocesses the given TREC XML corpus and runs a basic ranking method over the corpus for each of the 100 given queries.

- We represent both q,d via TF-IDF vectors and use Cosine Similarity as ranking function. 

- We return top 1000 docs, and also calculate Precision@50, as required.

- We prepare the returned relevant docs and queries in the right data format so that they can be consumed by MatchZoo-Py DataLoaders

In [None]:
# baseline.py

import math
import os
import pickle
import re
import string
import pandas as pd
import matchzoo as mz

from collections import Counter
from bs4 import BeautifulSoup

# Root folder holding the raw project inputs (a mounted Drive folder, judging
# by the path). All other paths below are derived from it by concatenation,
# so the trailing slash is required.
data_root = "./drive/My Drive/SNLP-Project-Data/"
ans_patterns = data_root + "patterns.txt"            # "<question id> <regex>" per line
test_questions = data_root + "test_questions.txt"    # TREC-style question file
trec_corpus_xml = data_root + "trec_documents.xml"   # document collection

# Folder for the pickle caches; created on first run as a side effect.
processed_root = data_root + "processed/"
os.makedirs(processed_root, exist_ok=True)

# Cache files read (and optionally written) by the functions below.
processed_corpus = processed_root + "corpus.pkl"         # doc id -> cleaned text
processed_text_qs = processed_root + "test_qs.pkl"       # question id -> question record
processed_tfids = processed_root + "tfids.pkl"           # term -> idf
processed_tfidf_repr = processed_root + "tfrepr.pkl"     # doc id -> tf-idf weights

# Regex capturing (question number, question text) from the questions file.
# Written as a raw string so that `\d`, `\<`, `\:` and `\/` reach the regex
# engine verbatim instead of relying on Python passing unknown string escapes
# through (which emits invalid-escape warnings). The pattern matches exactly
# the same text as before.
question_extraction_pattern = r"Number: (\d+) *\n\n\<desc\> Description\:\n(\w+.*)\n\n\<\/top>"


def get_test_questions(test_questions, ans_patterns, save=False):
    """Load the test questions together with their answer patterns.

    The pickle cache at ``processed_text_qs`` is tried first. If it cannot be
    read for any reason, the questions are parsed from the raw files.

    Args:
        test_questions: path to the question file (``<num> Number: N`` /
            ``<desc> Description:`` / ``</top>`` records).
        ans_patterns: path to the answer-pattern file; every non-blank line is
            ``<question id> <regex>``.
        save: when True, pickle the freshly parsed result to
            ``processed_text_qs`` (overwriting any existing file).

    Returns:
        dict mapping the int question id to a dict with keys
        ``'raw_question'`` (original text), ``'question'`` (lower-cased,
        punctuation stripped) and ``'ans_patterns'`` (list of regex strings).

    Raises:
        KeyError: if a pattern line refers to a question id that was not found
            in the question file.
    """
    try:
        print("Loading from saved pickle")
        # NOTE: pickle executes arbitrary code on load; only use trusted caches.
        with open(processed_text_qs, "rb") as fh:
            return pickle.load(fh)

    except Exception as e:

        # Same fallback reporting as process_trec_xml, so a broken cache is
        # visible instead of being silently ignored.
        print("Data doesn't exist or other error", e)
        print("Processing from scratch")

        # get question id and text
        qs = {}
        with open(test_questions) as fh:
            questions_doc = fh.read()

        # Raw string: the escapes are meant for the regex engine, not Python.
        question_extraction_pattern = r"^\<num\> Number: (\d+) *\n\n\<desc\> Description\:\n(\w+.*)\n\n\<\/top>$"
        strip_punct = str.maketrans('', '', string.punctuation)

        for q_id, raw_q in re.findall(question_extraction_pattern, questions_doc, re.MULTILINE):
            qs[int(q_id)] = {
                'raw_question': raw_q,
                'question': raw_q.lower().translate(strip_punct),
                'ans_patterns': [],
            }

        # get associated answer patterns
        with open(ans_patterns) as fh:
            for line in fh:
                if not line.strip():
                    # tolerate blank / trailing lines instead of failing on int('')
                    continue
                q_id, _, pattern = line.partition(" ")
                qs[int(q_id)]['ans_patterns'].append(pattern.strip())

        if save:
            print("saving processed questions, existing data will be overwritten")
            with open(processed_text_qs, "wb") as fh:
                pickle.dump(qs, fh)

        return qs


def process_trec_xml(trec_corpus_xml, save=False):
    """Parse the TREC XML collection into a ``doc id -> cleaned text`` dict.

    The pickle cache at ``processed_corpus`` is tried first. If it cannot be
    read, the XML file is parsed from scratch.

    Cleaning applied to every document body: lower-casing, removal of ASCII
    punctuation, and collapsing of every whitespace run (including line
    breaks) into a single space.

    Args:
        trec_corpus_xml: path to the XML file containing ``<docno>`` and
            ``<text>`` elements.
        save: when True, pickle the parsed corpus to ``processed_corpus``
            (overwriting any existing file).

    Returns:
        dict mapping the lower-cased, stripped document number to its cleaned
        text.

    Raises:
        AssertionError: if the number of ``<text>`` and ``<docno>`` elements
            differ, since the two lists are paired positionally.
    """
    try:
        print("Loading from saved pickle")
        # NOTE: pickle executes arbitrary code on load; only use trusted caches.
        with open(processed_corpus, "rb") as fh:
            return pickle.load(fh)

    except Exception as e:

        print("Data doesn't exist or other error", e)
        print("Processing from scratch")

        corpus = {
            # doc_id -> doc_text
        }
        strip_punct = str.maketrans('', '', string.punctuation)

        with open(trec_corpus_xml, 'r') as dh:
            soup = BeautifulSoup(dh, 'html.parser')

        # Per the original author's note, reading <text> rather than <doc>
        # leaves out byline / headline / publication / page metadata.
        article_texts = soup.find_all('text')
        article_ids = soup.find_all('docno')

        assert len(article_texts) == len(article_ids)

        print("Found %d articles..." % len(article_texts))

        for a_id, a_text in zip(article_ids, article_texts):
            doc_id = a_id.get_text().lower().strip()

            text = a_text.get_text().lower().translate(strip_punct)

            # Line breaks separate words. Deleting them outright (as was done
            # before) glued the last word of one line to the first word of
            # the next, so collapse all whitespace runs to one space instead.
            corpus[doc_id] = " ".join(text.split())

        if save:
            print("saving processed corpus, existing data will be overwritten")
            with open(processed_corpus, "wb") as fh:
                pickle.dump(corpus, fh)

        return corpus


# create representation of all docs in terms of their tf-idf weights
def compute_tfidf_doc_repr(corpus, term_idfs, save=False):
    """Build a sparse tf-idf vector for every document.

    The pickle cache at ``processed_tfidf_repr`` is tried first. If it cannot
    be read, the vectors are computed from ``corpus``.

    For each term: ``weight = (count / max count in the doc) * idf``.

    Args:
        corpus: dict ``doc id -> text``; text is tokenised with
            ``split(" ")``.
        term_idfs: dict ``term -> idf``; must contain every corpus term.
        save: when True, pickle the result to ``processed_tfidf_repr``
            (overwriting any existing file).

    Returns:
        dict ``doc id -> Counter`` mapping each term to its tf-idf weight.

    Raises:
        KeyError: if a document term has no entry in ``term_idfs``.
    """
    try:
        print("Loading from saved pickle")
        # NOTE: pickle executes arbitrary code on load; only use trusted caches.
        with open(processed_tfidf_repr, "rb") as fh:
            return pickle.load(fh)

    except Exception as e:

        # Same fallback reporting as process_trec_xml.
        print("Data doesn't exist or other error", e)
        print("Processing from scratch")

        corpus_tfidf_repr = {}

        for doc_id, text in corpus.items():

            tf_repr = Counter(text.split(" "))
            doc_max = max(tf_repr.values())

            # Only values of existing keys are replaced, so iterating over the
            # Counter while assigning is safe.
            for term in tf_repr:
                # normalize by max freq, then weight tf by idf
                tf_repr[term] = (tf_repr[term] / doc_max) * term_idfs[term]

            corpus_tfidf_repr[doc_id] = tf_repr

        if save:
            print("saving tfid repr, existing data will be overwritten")
            with open(processed_tfidf_repr, "wb") as fh:
                pickle.dump(corpus_tfidf_repr, fh)

        return corpus_tfidf_repr


# turns a piece of text (e.g. a query) into its idf-weighted term vector
def get_tfidfs_repr(v, term_idfs):
    """Return the tf-idf weights of the space-separated text ``v``.

    Term frequency is normalised by the most frequent term in ``v``. Terms
    that have no entry in ``term_idfs`` keep their plain normalised tf, which
    is the same as treating their idf as 1.

    Returns:
        Counter mapping each term of ``v`` to its weight.
    """
    term_counts = Counter(v.split(" "))
    peak = max(term_counts.values())

    weighted = Counter()
    for term, count in term_counts.items():
        norm_tf = count / peak
        if term in term_idfs:
            weighted[term] = norm_tf * term_idfs[term]
        else:
            # unseen term: fall back to the bare tf value
            weighted[term] = norm_tf

    return weighted


def cosine_sim(q, d):
    """Cosine similarity between two sparse vectors given as term->weight maps.

    Args:
        q: mapping term -> weight (e.g. the query vector).
        d: mapping term -> weight (e.g. a document vector).

    Returns:
        The cosine of the angle between ``q`` and ``d``. When either vector
        has zero magnitude (empty, or all weights 0 - e.g. only terms whose
        idf is 0) the similarity is undefined and 0.0 is returned instead of
        raising ZeroDivisionError.
    """
    # only terms common b/w q and d affect the dot product
    # all other entries are either zero in query or in doc
    common_terms = set(q.keys()).intersection(set(d.keys()))

    dot_prod = 0

    for ct in common_terms:
        dot_prod += q[ct] * d[ct]

    mag_q = sum([v ** 2 for v in q.values()])
    mag_d = sum([v ** 2 for v in d.values()])

    denom = math.sqrt(mag_q) * math.sqrt(mag_d)

    if denom == 0:
        # a zero vector is not similar to anything
        return 0.0

    return dot_prod / denom


def compute_term_idfs(corpus, save=False):
    """Compute the inverse document frequency of every term in the corpus.

    The pickle cache at ``processed_tfids`` is tried first. If it cannot be
    read, the idfs are computed from ``corpus``.

    ``idf(t) = log(N / df(t))`` where ``N`` is the number of documents and
    ``df(t)`` the number of documents containing ``t`` at least once.

    Args:
        corpus: dict ``doc id -> text``; text is tokenised with
            ``split(" ")``.
        save: when True, pickle the result to ``processed_tfids``
            (overwriting any existing file).

    Returns:
        dict ``term -> idf``.
    """
    try:
        print("Loading from saved pickle")
        # NOTE: pickle executes arbitrary code on load; only use trusted caches.
        with open(processed_tfids, "rb") as fh:
            return pickle.load(fh)

    except Exception as e:

        # Same fallback reporting as process_trec_xml.
        print("Data doesn't exist or other error", e)
        print("Processing from scratch")

        N = len(corpus)

        # Document frequency: each doc contributes at most 1 per term, hence
        # the conversion to a set of distinct terms before counting.
        doc_freq = Counter()
        for text in corpus.values():
            doc_freq.update(set(text.split(" ")))

        # invert the df and apply log normalization
        term_idfs = {term: math.log(N / df) for term, df in doc_freq.items()}

        if save:
            print("saving tfids, existing data will be overwritten")
            with open(processed_tfids, "wb") as fh:
                pickle.dump(term_idfs, fh)

        return term_idfs

# Returns doc ids and scores sorted in DESCENDING order
# i.e. the best doc will be at index 0 and so on
def get_relevant_docs(q, tfidf_reprs, term_idfs, how_many=1):
    """Rank every document against one question and keep the best ones.

    Args:
        q: question record; only ``q['question']`` is read.
        tfidf_reprs: dict ``doc id -> tf-idf vector``.
        term_idfs: dict ``term -> idf`` used to weight the question terms.
        how_many: number of top documents to return; must be strictly
            smaller than the number of documents.

    Returns:
        ``(doc_ids, scores)`` - two tuples of equal length, best document
        first.
    """
    assert how_many < len(tfidf_reprs)

    query_vec = get_tfidfs_repr(q['question'], term_idfs)

    # (doc id, score) pairs in corpus order; the stable sort keeps that order
    # among documents with equal scores.
    scored = [
        (doc_id, cosine_sim(query_vec, doc_vec))
        for doc_id, doc_vec in tfidf_reprs.items()
    ]
    scored.sort(key=lambda pair: pair[1], reverse=True)

    # split the pairs into two parallel tuples
    doc_ids, scores = zip(*scored[:how_many])

    return doc_ids, scores

# Calculate precision over the returned result set and query
# precision = #(rel and ret) / #(ret)
def precision_at_r(returned_docs, q, corpus):
    """Fraction of the returned documents that match an answer pattern.

    A document counts as relevant when at least one of the question's answer
    patterns (regexes, matched case-insensitively) is found anywhere in its
    text.

    Args:
        returned_docs: sequence of exactly 50 doc ids (R is fixed at 50 for
            the report).
        q: question record; only ``q['ans_patterns']`` is read.
        corpus: dict ``doc id -> text``.

    Returns:
        relevant count / 50 as a float.

    Raises:
        AssertionError: if ``returned_docs`` does not hold exactly 50 ids.
    """
    # NOTE(review): the patterns are matched against corpus text as stored;
    # if that text had its punctuation stripped, patterns containing
    # punctuation (e.g. "\$469,000") can presumably never match - confirm.

    R = len(returned_docs)
    assert R == 50, "R is not 50"

    # Compile each pattern once per call rather than once per document.
    patterns = [
        re.compile(ap.strip(), flags=re.IGNORECASE)
        for ap in q['ans_patterns']
    ]

    relevant_count = 0
    for d in returned_docs:
        # any() stops at the first matching pattern; one hit is enough
        relevant_count += int(any(p.search(corpus[d]) for p in patterns))

    return relevant_count / R


# Diagnostic helper: tallies answer-pattern hits over the whole corpus, both
# per query and per pattern (a rough histogram of how much relevant material
# each query has; the expectation is a highly skewed distribution).
def relevant_per_query(corpus, queries):
    """Count answer-pattern matches across all documents.

    Every (document, pattern) pair that matches adds 1, so a document that
    matches several patterns of the same query is counted once per pattern.

    Args:
        corpus: dict ``doc id -> text``.
        queries: dict ``query id -> record`` with an ``'ans_patterns'`` list;
            ids are used as 1-based positions into the per-query list.

    Returns:
        ``(rel_counts, rel_anspat)`` - a list with one hit count per query
        (index ``id - 1``) and a defaultdict ``stripped pattern -> hit count``.
    """
    from collections import defaultdict

    rel_counts = [0] * len(queries)
    rel_anspat = defaultdict(int)

    for doc_text in corpus.values():
        for q_id, query in queries.items():
            for raw_pattern in query['ans_patterns']:
                pattern = raw_pattern.strip()
                matched = bool(re.search(pattern, doc_text, flags=re.IGNORECASE))
                # adding the bool keeps the totals as ints (True -> 1, False -> 0)
                rel_anspat[pattern] += matched
                rel_counts[q_id - 1] += matched

    return rel_counts, rel_anspat
     

# Input:
# [
#    {
#       "query": "q1",
#       ID's are such that we can get doc text easily by indexing
#       "rel_docs": [doc_id1, doc_id2,..... doc_idK], 
#       "rel_doc_scores": [rel_doc1_score, rel_doc2_score,..... rel_docK_score],
#    }
# ]
# 
# Purpose of this function is to correctly prepare the data so that 
# it can be preprocessed by MatchZoo lib and then passed on to DRMM model
# for getting document score

# For ref: rel_docs, scores = get_relevant_docs(test_qs[q], tfidf_reprs, term_idfs, how_many=50)
# For ref: df = pd.DataFrame(data={'text_left': list('AABC'), 'text_right': list('abbc')})

def prepare_for_reranking(queries, returned_docs):
    """Placeholder for packing baseline results for the re-ranker.

    Not implemented: the body is a bare ``pass`` and the function returns
    None. The packing it was meant to do (building ``text_left`` /
    ``text_right`` columns and calling ``mz.pack``) is currently performed
    inline in the ``__main__`` block of this cell.

    Args:
        queries: unused.
        returned_docs: unused.
    """


    # Notes for final testing on unseen data
    # if we can fit our test corpus in above paradigm, the histogram will be auto computed
    # via callbacks. Only thing it needs to calc score is text_left and text_right i.e. 
    # the X part in (X,y) tuples. Need to see how we can do this.
    # X = {text_left, text_right} . we don't need label
    
    pass

if __name__ == "__main__":

    # Size of the candidate list the baseline hands to the re-ranker, per query.
    HOW_MANY = 1000

    # Each step loads its pickle cache when present and rebuilds it otherwise.
    corpus = process_trec_xml(trec_corpus_xml, save=True)
    term_idfs = compute_term_idfs(corpus, save=True)
    tfidf_reprs = compute_tfidf_doc_repr(corpus, term_idfs, save=True)
    test_qs = get_test_questions(test_questions, ans_patterns, save=True)

    precisions = []

    # Column layout expected by MatchZoo: queries on the left, docs on the right.
    rerank_data = {'text_left': [], 'text_right': []}

    for q in test_qs.values():

        rel_docs, scores = get_relevant_docs(q, tfidf_reprs, term_idfs, how_many=HOW_MANY)

        # The re-ranker gets all HOW_MANY candidates, but the report only
        # asks for precision@50, so only the 50 best are evaluated here.
        top_50 = rel_docs[:50]
        precisions.append(precision_at_r(top_50, q, corpus))

        # One (query, doc) row per candidate: the question text is repeated
        # HOW_MANY times alongside the candidate documents' text.
        rerank_data['text_left'] += [q['question']] * HOW_MANY
        rerank_data['text_right'] += [corpus[doc] for doc in rel_docs]

    # Pack the baseline's candidates for the neural / advanced re-ranking step.
    df = pd.DataFrame(data=rerank_data)
    unseen_packed_raw = mz.pack(df, task='ranking')

    #########################################################################
    # REQUIRED FOR REPORT:
    # (g) Sort the similarity scores and output the top 50 most relevant
    # documents for a query along with their scores.
    #########################################################################

    # print(relevant_per_query(corpus, test_qs))
    # print(len(test_qs))
    print("Precision is: ", sum(precisions) / len(test_qs))


Loading from saved pickle
Loading from saved pickle
Loading from saved pickle
Loading from saved pickle
Precision is:  0.07999999999999993


In [None]:
# Sanity check on the packed baseline candidates: one row per (query, doc)
# pair, i.e. number of queries x HOW_MANY rows.
#len(rerank_data['text_left']), len(rerank_data['text_right'])
len(unseen_packed_raw)

100000

# Loading Training data (WikiQA / SQuAD)

In [None]:
# contents of init.ipynb and drmm.ipynb

########################################################################
# improve over the performance of the baseline IR model by first ranking and 
# returning the top 1000 documents for a query with the baseline retriever.
# Then, you should develop your own method to re-rank these 1000 documents to
# return the top 50 documents, which should improve over the top documents 
# returned by baseline model
########################################################################

import torch
import numpy as np
import pandas as pd
import matchzoo as mz
print('matchzoo version', mz.__version__)

# Ranking objective: cross-entropy over one positive and `num_neg` sampled
# negatives. The value 10 is repeated as `num_neg` when the training Dataset
# is built further down; keep the two in sync.
ranking_loss = mz.losses.RankCrossEntropyLoss(num_neg=10)
ranking_task = mz.tasks.Ranking(losses=ranking_loss)
ranking_task.metrics = [
    *(mz.metrics.NormalizedDiscountedCumulativeGain(k=k) for k in (3, 5)),
    mz.metrics.MeanAveragePrecision(),
]

print("`ranking_task` initialized with metrics", ranking_task.metrics)
print('data loading ...')

# Packing code src:
# https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/data_pack/pack.py
# Load data src:
# https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/datasets/wiki_qa/load_data.py
# filtered=True removes the questions without correct answers; it is applied
# to the dev and test splits only.
load_wiki_qa = mz.datasets.wiki_qa.load_data

train_pack_raw = load_wiki_qa('train', task=ranking_task)
dev_pack_raw, test_pack_raw = (
    load_wiki_qa(split, task=ranking_task, filtered=True)
    for split in ('dev', 'test')
)

# MatchZoo frames every problem as comparing a "left" text with a "right"
# text (ranking or classification). For WikiQA:
#   text_left  = Question
#   text_right = Sentence
#
# How to make datapacks from python structures:
# https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/data_pack/data_pack.py
#
# Notes for final testing on unseen data:
# if the test corpus is packed the same way, the matching histogram is
# computed automatically via callbacks. Scoring only needs text_left and
# text_right, i.e. the X part of the (X, y) tuples.

print('data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`')

matchzoo version 1.1.1
`ranking_task` initialized with metrics [normalized_discounted_cumulative_gain@3(0.0), normalized_discounted_cumulative_gain@5(0.0), mean_average_precision(0.0)]
data loading ...
Downloading data from https://download.microsoft.com/download/E/5/F/E5FCFCEE-7005-4814-853D-DAA7C66507E0/WikiQACorpus.zip
data loaded as `train_pack_raw` `dev_pack_raw` `test_pack_raw`


# Basic preprocessing with default preprocessor 

- Tokenization, Stop word removal, Lowercasing, Punctuation Removal etc.

In [None]:
import nltk
nltk.download('punkt')

# The preprocessor bundles tokenisation, lower-casing, punctuation removal,
# frequency-based filtering and vocabulary building (the progress output
# shows Tokenize => Lowercase => PuncRemoval).
# src: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/engine/base_model.py
# basic pre-processor: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/preprocessors/basic_preprocessor.py
# Freq Filter: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/preprocessors/units/frequency_filter.py
# By default truncation on both left and right is 'None', i.e. texts are not
# cut to a fixed length; that is probably not needed for DRMM anyway.
# Data pack source: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/data_pack/data_pack.py

preprocessor = mz.models.DRMM.get_default_preprocessor()

# Fit on the WikiQA training split only, then reuse the fitted state for the
# remaining packs.
# NOTE(review): the vocabulary is therefore built from WikiQA; TREC terms
# outside it are presumably treated as out-of-vocabulary - confirm.
train_pack_processed = preprocessor.fit_transform(train_pack_raw)
dev_pack_processed, test_pack_processed, unseen_pack_processed = [
    preprocessor.transform(pack)
    for pack in (dev_pack_raw, test_pack_raw, unseen_packed_raw)
]

# vocab size and other fitted state
print(preprocessor.context)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
Processing text_left with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 2118/2118 [00:00<00:00, 7612.45it/s]
Processing text_right with chain_transform of Tokenize => Lowercase => PuncRemoval: 100%|██████████| 18841/18841 [00:04<00:00, 3909.58it/s]
Processing text_right with append: 100%|██████████| 18841/18841 [00:00<00:00, 877420.55it/s]
Building FrequencyFilter from a datapack.: 100%|██████████| 18841/18841 [00:00<00:00, 124879.52it/s]
Processing text_right with transform: 100%|██████████| 18841/18841 [00:00<00:00, 135983.22it/s]
Processing text_left with extend: 100%|██████████| 2118/2118 [00:00<00:00, 392625.12it/s]
Processing text_right with extend: 100%|██████████| 18841/18841 [00:00<00:00, 647606.92it/s]
Building Vocabulary from a datapack.: 100%|██████████| 418412/418412 [00:00<00:00, 2236213.90it/s]
Processing text_left with chain_transform of 

{'filter_unit': <matchzoo.preprocessors.units.frequency_filter.FrequencyFilter object at 0x7fe20a5a8ac8>, 'vocab_unit': <matchzoo.preprocessors.units.vocabulary.Vocabulary object at 0x7fe210799f60>, 'vocab_size': 30059, 'embedding_input_dim': 30059}





# Load GloVe Embeddings

In [None]:
# GloVe loader: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/datasets/embeddings/load_glove_embedding.py
glove_embedding = mz.datasets.embeddings.load_glove_embedding(dimension=300)

# Vocab unit src: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/preprocessors/units/vocabulary.py
# The fitted vocabulary's term -> index table decides which row each term gets.
vocab_state = preprocessor.context['vocab_unit'].state
term_index = vocab_state['term_index']
embedding_matrix = glove_embedding.build_matrix(term_index)

# Scale every row to unit L2 length.
l2_norm = np.sqrt(np.square(embedding_matrix).sum(axis=1))
embedding_matrix = embedding_matrix / l2_norm.reshape(-1, 1)

Downloading data from http://nlp.stanford.edu/data/glove.6B.zip


# Create DataLoaders from pre-processed data

- This step also includes a callback which does Histogram computation for each training example

In [None]:
# Function of callbacks - Callback is used to transform / compute relevant 
# statistics on a databatch. e.g. here we compute matching histogram

# Base callback source: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/engine/base_callback.py
# Hist callback source: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/dataloader/callbacks/histogram.py

# Matching-histogram callback shared by all three datasets. bin_size=30 must
# agree with the model's `hist_bin_size` set in the model cell.
histgram_callback = mz.dataloader.callbacks.Histogram(
    embedding_matrix, bin_size=30, hist_mode='LCH'
)

# src Dataset : https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/dataloader/dataset.py
# num_dup: Number of duplications per instance
# num_neg: Number of negative samples per instance (same value as the
#          num_neg given to RankCrossEntropyLoss when the task was created)

trainset = mz.dataloader.Dataset(
    data_pack=train_pack_processed,
    mode='pair',
    num_dup=5,
    num_neg=10,
    callbacks=[histgram_callback]
)
# NOTE(review): validation below runs on the WikiQA *test* split;
# dev_pack_processed is never wrapped in a Dataset in this cell.
testset = mz.dataloader.Dataset(
    data_pack=test_pack_processed,
    callbacks=[histgram_callback]
)

# TREC candidates produced by the baseline. The pack was filled with 1000
# consecutive rows per query and shuffling is off, so with batch_size=1000
# each batch is intended to hold exactly one query's candidate list.
unseen = mz.dataloader.Dataset(
    data_pack=unseen_pack_processed,
    batch_size=1000, # one batch == the 1000 candidate docs of one query
    callbacks=[histgram_callback],
    shuffle=False
)

# padding callback src: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/engine/base_model.py#L226
padding_callback = mz.models.DRMM.get_default_padding_callback()

# DataLoader src: https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/dataloader/dataloader.py
# All loaders are given the same `device` selected at the top of the notebook.
trainloader = mz.dataloader.DataLoader(
    device=device,
    dataset=trainset,
    stage='train',
    #resample=True,
    callback=padding_callback
)
testloader = mz.dataloader.DataLoader(
    dataset=testset,
    device=device,
    stage='dev',
    callback=padding_callback
)

# create loader for unseen TREC XML data
unseenloader = mz.dataloader.DataLoader(
    dataset=unseen,
    device=device,
    #batch_size=HOW_MANY,
    stage='test', # 'test' because we don't have labels (Y) for this data
    callback=padding_callback
)

# Initialize DRMM Model

In [None]:
# DRMM src - https://github.com/NTMC-Community/MatchZoo-py/blob/master/matchzoo/models/drmm.py

model = mz.models.DRMM()

# DRMM fwd pass: https://github.com/NTMC-Community/MatchZoo-py/blob/49548ad4dd7da4c890ac786a09d9df9172c3af47/matchzoo/models/drmm.py#L70

model.params['task'] = ranking_task
model.params['mask_value'] = 0
model.params['embedding'] = embedding_matrix
# must equal the bin_size given to the Histogram callback
model.params['hist_bin_size'] = 30
model.params['mlp_num_layers'] = 1
model.params['mlp_num_units'] = 10
model.params['mlp_num_fan_out'] = 1
model.params['mlp_activation_func'] = 'tanh'
# keep the pre-trained embedding fixed; only the matching network is trained
model.params['embedding_freeze'] = True
model.build()

# Move to the target device only AFTER build(): the layers are created by
# build(), so calling .to(device) on the unbuilt model (as before) had no
# parameters to move and left the built model on the default device.
model = model.to(device)

print(model.params)
print('Trainable params: ', sum(p.numel() for p in model.parameters() if p.requires_grad))

model_class                   <class 'matchzoo.models.drmm.DRMM'>
task                          Ranking Task
out_activation_func           None
with_embedding                True
embedding                     [[-0.04967236  0.01572789  0.07462103 ... -0.07714741 -0.05622825
   0.03582055]
 [-0.08508103  0.05267619 -0.07051839 ...  0.03369272 -0.02393593
  -0.08515697]
 [ 0.022119    0.08201521  0.01799603 ... -0.14142403  0.09055382
   0.02254226]
 ...
 [ 0.01527015  0.05386136 -0.00782055 ...  0.08603887  0.07638169
  -0.07216352]
 [ 0.09764317 -0.09725617  0.08593716 ... -0.00455429 -0.08631574
   0.08131319]
 [-0.07896867  0.00017807 -0.07912631 ... -0.05847381 -0.00127889
  -0.07484334]]
embedding_input_dim           30059
embedding_output_dim          300
padding_idx                   0
embedding_freeze              True
with_multi_layer_perceptron   True
mlp_num_units                 10
mlp_num_layers                1
mlp_num_fan_out               1
mlp_activation_func           

# Checking for bugs in data loading

In [None]:
# Take only the first batch of the unseen (TREC) loader for inspection.
X, _ = next(iter(unseenloader), (None, None))
batch = X

l = batch['text_left']
r = batch['text_right']

vocab_unit = preprocessor.context['vocab_unit']
# index used for the padding token, so it can be skipped when decoding
term_idx_pad = vocab_unit.transform(['<PAD>'])[0]


# indexing the batch with a list of row positions should give 4 rows back
len(r[[1, 3, 4, 5]])
# for q,d in zip(l,r):
#   print(
#      ' '.join(
#          vocab_unit.state['index_term'][i] \
#          for i in q.tolist() \
#          if i != term_idx_pad)
#    )
#   print()
#   print(
#      ' '.join(
#          vocab_unit.state['index_term'][i] \
#          for i in d.tolist() \
#          if i != term_idx_pad)
#    )
#   print()
#   break

# for q in r:
#   print( ' '.join(
#          vocab_unit.state['index_term'][i] \
#          for i in q.tolist() \
#          if i != term_idx_pad))
  

4

In [None]:
# Decode a few rows of the batch back into words to eyeball the data.
selected = r[[1, 3, 4, 5]]
index_term = vocab_unit.state['index_term']

for s in selected:
    # drop padding positions, map the remaining indices back to terms
    tokens = [index_term[i] for i in s.tolist() if i != term_idx_pad]
    print(' '.join(tokens))
    print()

why did lady thatcher come out so strongly in support of john major is as easy as but clarke

the recovery in the world iron ore market that began last year picked in 1994 and should accelerate in 1995 according to the conference on trade and says next year should see the reversal of three of declining iron ore iron ore exports rose by nearly 8 per cent to tonnes in to a marked reduction in exporters stocks while global iron rose by 25 per cent to tonnes the stimulus from the economy more than offset a sharp drop in mining and former soviet union the report imported more than tonnes of steel last year and thus iron ore consumption in all the main iron ore imports also soared by 30 per cent confirming as the worlds most dynamic market for current and future iron the largest iron ore producer china mined tonnes last year of nearly 15 per cent over 1992 however domestic production 70 per cent of the needs points out iron rose 166 per cent to tonnes in 1993 or more than a the world says th

# Training

In [None]:
optimizer = torch.optim.Adadelta(model.parameters())

# Train on the same `device` the dataloaders were created with, instead of a
# hard-coded 'cpu', so model and batches cannot end up on different devices.
# NOTE(review): MatchZoo-py's Trainer appears to accept only torch.device /
# int / list here and to auto-detect otherwise, which would mean the previous
# 'cpu' string was silently ignored - confirm against the library source.
trainer = mz.trainers.Trainer(
    device=device,
    model=model,
    optimizer=optimizer,
    trainloader=trainloader,
    validloader=testloader,
    validate_interval=None,
    epochs=20
)

In [None]:
# Runs the training loop configured on `trainer` (up to `epochs` passes over
# `trainloader`, with validation on `validloader`). Long-running cell.
trainer.run()

HBox(children=(IntProgress(value=0, max=160), HTML(value='')))

[Iter-160 Loss-2.374]:
  Validation: normalized_discounted_cumulative_gain@3(0.0): 0.3794 - normalized_discounted_cumulative_gain@5(0.0): 0.4633 - mean_average_precision(0.0): 0.4298



HBox(children=(IntProgress(value=0, max=160), HTML(value='')))




KeyboardInterrupt: ignored

# Testing

# Get Predictions on Unseen (TREC XML) data

# Print Question and Top relevant doc for each query, for inspection

In [None]:
# Fitted vocabulary (index <-> term tables) and the index of the padding
# token; re-derived here so this cell does not depend on the debugging cell.
vocab_unit = preprocessor.context['vocab_unit']
term_idx_pad = vocab_unit.transform(['<PAD>'])[0]

# scores - the doc scores returned by the model for a single query
def get_top_K_indices(scores, K=50):
  """Return the positions of the K highest-scoring documents, best first.

  Args:
    scores: sequence of one-element sequences (``scores[i][0]`` is the score
      of document ``i``).
    K: how many positions to keep.

  Returns:
    List of at most K indices into ``scores``, ordered by descending score;
    documents with equal scores keep their original relative order.
  """
  ranked = sorted(
      range(len(scores)),
      key=lambda idx: scores[idx][0],
      reverse=True,
  )
  return ranked[:K]

# Calculate precision over the returned result set and query
# precision = #(rel and ret) / #(ret)
def precision_at_r_model(returned_docs_idx, questions):
    """Mean precision@50 of the re-ranked documents.

    Iterates the questions, the per-query index lists and the batches of the
    module-level ``unseenloader`` in lockstep (stopping at the shortest),
    decodes the selected documents back to text with the module-level
    ``vocab_unit`` and counts a document as relevant when any of the
    question's answer patterns matches it (case-insensitively).

    Also prints, per question: the question text, the first selected document
    and the number of relevant documents.

    Args:
        returned_docs_idx: list with one entry per evaluated query; each entry
            is a list of exactly 50 row positions inside that query's batch.
        questions: dict ``query id -> record`` with ``'question'`` and
            ``'ans_patterns'`` keys.

    Returns:
        The average of the per-query precisions over the queries actually
        evaluated (0.0 when none were). Previously the sum was always divided
        by a hard-coded 10, which is only right for exactly ten queries.

    Raises:
        AssertionError: if an index list does not hold exactly 50 entries.
    """
    # NOTE(review): relies on batch i of `unseenloader` holding the
    # candidates of the i-th question in `questions` - confirm ordering.

    precisions = []
    index_term = vocab_unit.state['index_term']

    paired = zip(questions, returned_docs_idx, unseenloader)
    for i, (q, docs, (X, _)) in enumerate(paired, start=1):

      assert len(docs) == 50, "R is not 50"
      print("Q %d: %s" % (i, questions[q]['question']))

      # compile each pattern once per query rather than once per document
      patterns = [
          re.compile(ap.strip(), flags=re.IGNORECASE)
          for ap in questions[q]['ans_patterns']
      ]

      returned_docs = X['text_right'][docs]
      relevant_count = 0

      for d_i, d in enumerate(returned_docs):

        # index 0 is the padding index (`padding_idx` in the model params)
        d = ' '.join(index_term[t] for t in d.tolist() if t != 0)

        if d_i == 0: print(d) # print only first doc

        # count as relevant if at least one answer pattern
        # matched somewhere in doc
        relevant_count += int(any(p.search(d) for p in patterns))

      print(relevant_count)
      precisions.append(relevant_count / len(docs))

    if not precisions:
      return 0.0

    return sum(precisions) / len(precisions)

per_query_scores = []
best_docs = []   # one list of top-50 row positions per scored query
queries = []
limit = 2        # number of queries (= batches) to score in this trial run

# no_grad() only disables gradient tracking; eval() is what switches layers
# such as dropout / batch-norm to inference behaviour, so set both.
model.eval()

with torch.no_grad():
  for b, (X, _) in enumerate(unseenloader, start=1):

    print("Batch %d" % b)

    # scores for this query's candidate docs
    pqs = model(X).tolist()

    # get indices of top-scoring 50 docs
    best_docs.append(
        get_top_K_indices(scores=pqs, K=50)
    )

    if b >= limit:
      break # stop after `limit` batches

print(
    precision_at_r_model(best_docs, test_qs)
)


Batch 1
Batch 2


TypeError: ignored

In [None]:
# Inspect the parsed questions: query id -> {'ans_patterns', 'question',
# 'raw_question'}. Displays the whole dict, so the output is long.
#unseenloader.__iter__()
test_qs

{1: {'ans_patterns': ['Young'],
  'question': 'who is the author of the book the iron lady a biography of margaret thatcher',
  'raw_question': 'Who is the author of the book, "The Iron Lady: A Biography of Margaret Thatcher"?'},
 2: {'ans_patterns': ['\\$469,000'],
  'question': 'what was the monetary value of the nobel peace prize in 1989',
  'raw_question': 'What was the monetary value of the Nobel Peace Prize in 1989?'},
 3: {'ans_patterns': ['405',
   'automobiles?',
   'diesel\\s+motors?',
   '309s?',
   '106s?',
   '504s?',
   '505s?',
   '205s?',
   '306s?',
   'vehicles?',
   'cars?',
   'Peugeots',
   'plastic\\s+components'],
  'question': 'what does the peugeot company manufacture',
  'raw_question': 'What does the Peugeot company manufacture?'},
 4: {'ans_patterns': ['Pounds\\s+12\\s*(?:m|(?:million))'],
  'question': 'how much did mercury spend on advertising in 1993',
  'raw_question': 'How much did Mercury spend on advertising in 1993?'},
 5: {'ans_patterns': ['Horne'],

In [None]:
# `best_docs` holds the per-query top-50 indices collected by the inference
# loop; the name used here before (`returned_docs_idx`) is only the
# function's parameter name and is not defined in any cell shown here.
precision_at_r_model(
    best_docs,
    test_qs)

{'ans_patterns': ['Young'],
 'question': 'who is the author of the book the iron lady a biography of margaret thatcher',
 'raw_question': 'Who is the author of the book, "The Iron Lady: A Biography of Margaret Thatcher"?'}