# Get topics

In [0]:
! wget https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml

--2020-04-18 09:40:32--  https://ir.nist.gov/covidSubmit/data/topics-rnd1.xml
Resolving ir.nist.gov (ir.nist.gov)... 129.6.24.92, 2610:20:6005:24::92
Connecting to ir.nist.gov (ir.nist.gov)|129.6.24.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 10348 (10K) [application/xml]
Saving to: ‘topics-rnd1.xml’


2020-04-18 09:40:33 (1.04 MB/s) - ‘topics-rnd1.xml’ saved [10348/10348]



## Parse the XML topics file into a TSV file

In [0]:
from xml.dom import minidom

# Parse the round-1 topics file downloaded by the wget cell above.
ftopics = minidom.parse('topics-rnd1.xml')

topics = ftopics.getElementsByTagName('topic')


def _field_text(topic, tag):
    """Return the text of the first <tag> element under `topic` as one TSV-safe line.

    All text/CDATA children are concatenated (not only the first child), and
    every run of whitespace is collapsed to a single space so that an embedded
    tab or newline cannot split the record. An empty element yields ''.
    """
    node = topic.getElementsByTagName(tag)[0]
    raw = "".join(
        child.data
        for child in node.childNodes
        if child.nodeType in (child.TEXT_NODE, child.CDATA_SECTION_NODE)
    )
    return " ".join(raw.split())


# One line per topic: number, query, question, narrative (tab-separated).
with open('topics_rnd1.tsv', 'w') as out:
    for topic in topics:
        qid = topic.attributes['number'].value
        query = _field_text(topic, 'query')
        question = _field_text(topic, 'question')
        narrative = _field_text(topic, 'narrative')

        out.write("\t".join([qid, query, question, narrative]) + "\n")

In [0]:
! wc -l topics_rnd1.tsv

30 topics_rnd1.tsv


# Retrieve top 1000 docs per topic using BM25

## Get April 10 Covidex Index

In [0]:
! wget https://www.dropbox.com/s/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz

--2020-04-18 10:13:48--  https://www.dropbox.com/s/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz
Resolving www.dropbox.com (www.dropbox.com)... 162.125.9.1, 2620:100:601f:1::a27d:901
Connecting to www.dropbox.com (www.dropbox.com)|162.125.9.1|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz [following]
--2020-04-18 10:13:48--  https://www.dropbox.com/s/raw/j55t617yhvmegy8/lucene-index-covid-2020-04-10.tar.gz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uccbc7e714481b80f7b866060d09.dl.dropboxusercontent.com/cd/0/inline/A2EVHhUIEKU3Msp-d-UxHriSixcxY5bXxKZxGtLH3eNhZoKMspZ1vs6QhwExfm_-SPmwfCLhtFRg3yK3i3Fy6Eb8K5Fk1Hzz2d1XtOyXFQ0-W1F0i51E8FtAwsEqXeUYugU/file# [following]
--2020-04-18 10:13:48--  https://uccbc7e714481b80f7b866060d09.dl.dropboxusercontent.com/cd/0/inline/A2EVHhUIEKU3Msp-d-UxHriSixcxY5bXxKZxGtLH3e

In [0]:
!tar xvfz lucene-index-covid-2020-04-10.tar.gz

lucene-index-covid-2020-04-10/
lucene-index-covid-2020-04-10/_0.si
lucene-index-covid-2020-04-10/_0.fnm
lucene-index-covid-2020-04-10/_0.fdx
lucene-index-covid-2020-04-10/_0.tvx
lucene-index-covid-2020-04-10/_0.tvd
lucene-index-covid-2020-04-10/_0.dii
lucene-index-covid-2020-04-10/_0_Lucene50_0.pos
lucene-index-covid-2020-04-10/_0_Lucene50_0.tip
lucene-index-covid-2020-04-10/_0_Lucene50_0.tim
lucene-index-covid-2020-04-10/_0.nvd
lucene-index-covid-2020-04-10/_0.nvm
lucene-index-covid-2020-04-10/_0_Lucene50_0.doc
lucene-index-covid-2020-04-10/segments_1
lucene-index-covid-2020-04-10/_0.fdt
lucene-index-covid-2020-04-10/write.lock
lucene-index-covid-2020-04-10/_0.dim
lucene-index-covid-2020-04-10/_0_Lucene80_0.dvd
lucene-index-covid-2020-04-10/_0_Lucene80_0.dvm


In [0]:
!du -h lucene-index-covid-2020-04-10

1.6G	lucene-index-covid-2020-04-10


## Valid document IDs for round 1

In [0]:
! wget https://ir.nist.gov/covidSubmit/data/docids-rnd1.txt

--2020-04-18 11:59:37--  https://ir.nist.gov/covidSubmit/data/docids-rnd1.txt
Resolving ir.nist.gov (ir.nist.gov)... 129.6.24.92, 2610:20:6005:24::92
Connecting to ir.nist.gov (ir.nist.gov)|129.6.24.92|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 459972 (449K) [text/plain]
Saving to: ‘docids-rnd1.txt’


2020-04-18 11:59:38 (655 KB/s) - ‘docids-rnd1.txt’ saved [459972/459972]



In [0]:
! wc -l /content/docids-rnd1.txt

51103 /content/docids-rnd1.txt


In [0]:
# Read the downloaded doc id list, one id per line with trailing whitespace removed.
with open('/content/docids-rnd1.txt') as f:
  l = [raw_line.rstrip() for raw_line in f]
# Building a set drops repeated ids; display the unique count next to the line count.
valid_doc_ids = set(l)
len(valid_doc_ids), len(l)

(51070, 51103)

## Retrieval with index


In [0]:
%%capture
!pip install pyserini==0.8.1.0

import json
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
# os.environ["JAVA_HOME"] = "/Library/Java/JavaVirtualMachines/jdk-13.0.2.jdk/Contents/Home"

In [0]:
from pyserini.search import pysearch

# Directory of the Lucene index unpacked by the tar cell above.
COVID_INDEX = '/content/lucene-index-covid-2020-04-10'

# Open the index, score with BM25 (k1=0.9, b=0.4) and enable the RM3 reranker
# (called with no arguments, so whatever defaults the library defines apply).
searcher = pysearch.SimpleSearcher(COVID_INDEX)
searcher.set_bm25_similarity(k1=0.9, b=0.4)
searcher.set_rm3_reranker()

In [0]:
import re

def clean_html(html):
    """
    Adapted from the NLTK package.
    Remove HTML markup from the given string and flatten it to a single line.

    :param html: the HTML string to be cleaned; ``None`` is treated as empty
    :type html: str
    :return: the text with markup removed and every whitespace run (spaces,
        tabs, newlines) collapsed to a single space
    :rtype: str
    """
    if html is None:
        # str(None) would otherwise yield the literal text "None".
        return ""
    html = str(html)
    # First we remove inline JavaScript/CSS:
    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
    # Then we remove html comments. This has to be done before removing regular
    # tags since comments can contain '>' characters.
    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
    # Next we can remove the remaining tags:
    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
    # Finally, we deal with whitespace. Collapsing whole runs in one pass
    # (instead of replacing a double space once) leaves no repeated spaces and
    # also removes tabs/newlines that would break a tab-separated record.
    cleaned = re.sub(r"&nbsp;", " ", cleaned)
    cleaned = re.sub(r"\s+", " ", cleaned)
    return cleaned.strip()

In [0]:
def _one_line(text):
    """Collapse every run of whitespace (tabs and newlines included) into one space."""
    return " ".join(text.split())


def search_document(searcher, topics, output_fn, collection='TREC-COVID', K=1000):
    """
    Run every topic through `searcher` and write two tab-separated files.

    ``{output_fn}.tsv`` gets one row per hit: qid, docid, score rounded to 11
    decimals, 1-based rank, and a constant label 0.
    ``{output_fn}_docs.tsv`` gets one row per hit: docid, title, cleaned abstract.

    :param searcher: object exposing ``search(query, K)`` whose hits provide
        ``score``, ``docid`` and ``lucene_document.get(field)``
    :param topics: mapping of query id to query text
    :param output_fn: path prefix for the two output files
    :param collection: unused; kept so existing callers keep working
    :param K: number of hits requested per topic
    """
    with open(f"{output_fn}.tsv", 'w', encoding='utf-8') as out, \
            open(f"{output_fn}_docs.tsv", 'w', encoding='utf-8') as out_docs:
        for qid, query in topics.items():
            hits = searcher.search(query, K)
            for rank, hit in enumerate(hits, start=1):
                document = hit.lucene_document
                title = document.get("title")
                abstract = document.get("abstract")

                # A missing field becomes an empty column rather than the text
                # "None", and no field may carry a tab/newline into the TSV.
                title = _one_line(str(title)) if title is not None else ""
                clean_abstract = _one_line(clean_html(abstract)) if abstract is not None else ""

                label = 0
                out.write('{}\t{}\t{}\t{}\t{}\n'.format(qid, hit.docid, round(float(hit.score), 11), rank, label))
                out_docs.write('{}\t{}\t{}\n'.format(hit.docid, title, clean_abstract))
        # No per-row flush: leaving the `with` block flushes and closes both files.

In [0]:
def load_queries(path):
    """
    Load topics from a 4-column TSV (id, query, question, narrative).

    :param path: path of the TSV file
    :return: dict mapping the topic id (str) to its *question* text
    """
    queries = {}
    with open(path) as f:
        for i, line in enumerate(f):
            # Strip only the line terminator: a bare rstrip() would also eat
            # trailing tabs and drop an empty last column.
            line = line.rstrip('\r\n')
            if not line:
                continue
            # maxsplit=3 keeps any tab inside the narrative in the last field.
            query_id, _, question, _ = line.split('\t', 3)
            queries[query_id] = question
            if i % 10 == 0:
                print('Loading queries {}'.format(i))
    return queries
topics = load_queries('/content/topics_rnd1.tsv')

Loading queries 0
Loading queries 10
Loading queries 20


In [0]:
search_document(searcher, topics, 'COVID_run', K=1000)

In [0]:
! wc -l COVID_run_docs.tsv

30000 CORD19_run_1_docs.tsv


In [0]:
import pandas as pd

docs = pd.read_csv('/content/COVID_run_docs.tsv', delimiter='\t', header =None, index_col=None, names=['did','title','abstract'])
docs.head()

Unnamed: 0,qid,did,sim,rank,rel
0,1,pyeb86on,2.0057,1,0
1,1,az1puj81,1.8659,2,0
2,1,nror02j6,1.8566,3,0
3,1,9pl7mta8,1.8374,4,0
4,1,ke5967zx,1.837399,5,0


In [0]:
docs_ = docs.drop_duplicates(['did'])
len(docs_)

9581

In [0]:
docs_[docs_['abstract'].isna()]

Unnamed: 0,did,title,abstract


In [0]:
# Replace missing abstracts with a placeholder. `assign` returns a new frame, so
# this does not write into a frame derived from `docs` (no SettingWithCopyWarning).
docs_ = docs_.assign(abstract=docs_['abstract'].fillna('.'))

In [0]:
docs_ = docs_[docs_['did'].isin(valid_doc_ids)]
len(docs_)

9581

In [0]:
docs_.to_csv('CORD19_run_1_docs.tsv', sep='\t', header=None, index=None)

In [0]:
! wc -l CORD19_run_1_docs.tsv

9581 CORD19_run_1_docs.tsv


In [0]:
run = pd.read_csv('/content/COVID_run.tsv', delimiter='\t', header =None, index_col=None, names=['qid','did','sim','rank','rel'])
run.head()

Unnamed: 0,qid,did,sim,rank,rel
0,1,pyeb86on,2.0057,1,0
1,1,az1puj81,1.8659,2,0
2,1,nror02j6,1.8566,3,0
3,1,9pl7mta8,1.8374,4,0
4,1,ke5967zx,1.837399,5,0


In [0]:
run_ = run[run['did'].isin(valid_doc_ids)]
len(run_)

30000

In [0]:
duplicateRowsDF = run[run.duplicated(['qid','did'], keep='last')]
len(duplicateRowsDF)

6

# Create the document-level run from passage-level predictions

In [0]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
preds_path= "/content/drive/My Drive/preds/cord19"

In [0]:

# For each run, join passage-level predictions with their (qid, did) ids and keep,
# per (qid, did) pair, the row with the highest prediction.
# NOTE: the loop variable rebinds the name `run`, which an earlier cell used for a DataFrame.
runs = ['base','un_pair','mu_pair']
for run in runs:
    # One row per scored item: id, prediction, label.
    preds = pd.read_csv(f'{preds_path}/predictions_cord19_1_{run}.tsv', header=None, index_col=None, delimiter='\t', names=['id','pred','label'], dtype={'id':str})
    print(len(preds))

    # Maps each id to its query id, document id and `pass` value
    # (presumably the passage index within the document - TODO confirm).
    query_doc_ids = pd.read_csv(f'{preds_path}/query_doc_ids_{run}.tsv', header=None, index_col=None, delimiter='\t', names=['id','qid','did','pass'], dtype={'id':str, 'qid':str,'did':str})
    print(len(query_doc_ids))

    preds_with_ids = pd.merge(preds, query_doc_ids, on='id')
    print(len(preds_with_ids))

    # Sort so the best-scoring row of every (qid, did) group comes first ...
    df = preds_with_ids.sort_values(by=['qid','did','pred'], ascending=[True,True,False])
    # ... attach the group maximum (string alias: passing the builtin `max` is deprecated) ...
    df['pred_max'] = df.groupby(['qid','did'], sort=False)['pred'].transform('max')
    # ... keep only that first row, then order each query by descending score.
    df = df.drop_duplicates(['qid','did'], keep='first')
    df = df.sort_values(by=['qid','pred_max'], ascending=[True, False])
    print(len(df))
    df.to_csv(f'{preds_path}/rank_predictions_cord19_1_{run}.tsv', sep='\t', header=None, index=None, columns=['qid','did','pass','pred_max'])

41941
41941
41941
29994
43359
43359
43359
29994
43359
43359
43359
29994


In [0]:
# Inspect the ranking file of the last run in `runs`, named explicitly instead of
# relying on a loop variable left over from an earlier cell.
last_run = runs[-1]
# The dtype keys must match `names`; the former key 'id' named no column, so it had no effect.
run_rank = pd.read_csv(f'{preds_path}/rank_predictions_cord19_1_{last_run}.tsv', header=None, index_col=None, delimiter='\t', names=['qid','did','pass','pred_max'], dtype={'qid':str, 'did':str})
run_rank.head()

Unnamed: 0,qid,did,pass,pred_max
0,1,jbtrdvhe,0,2.097322
1,1,msohf5oa,0,1.966619
2,1,ec8lpgl3,0,1.920891
3,1,xuczplaf,0,1.888572
4,1,dqxfcwyu,0,1.857632


In [0]:
# Convert each document-level ranking into a TREC run file with lines of the form
#   qid Q0 docid rank score tag
tag = "IRIT_marked"
runs = ['base','un_pair','mu_pair']
for run in runs:
  trec_path = f'{preds_path}/run_trec_{run}.txt'
  ranking_path = f'{preds_path}/rank_predictions_cord19_1_{run}.tsv'
  # The output file is opened first, then the ranking file it is built from.
  with open(trec_path,'w') as out, open(ranking_path) as f:
    query = ''
    for line in f:
      qid, did, pass_num, pred_max = line.rstrip().split('\t')
      # Rows arrive grouped by query, so a change of qid restarts the numbering.
      if qid != query:
        query, rank = qid, 1
      out.write(f"{qid} Q0 {did} {rank} {pred_max} {tag}_{run}\n")
      rank += 1

In [0]:
! gzip /content/drive/My\ Drive/preds/cord19/run_trec_mu_pair.txt

We use BERT-base (12 layers, hidden size 768) fine-tuned on the MS MARCO passage set. We use a two-stage full-ranking strategy: in the first stage, Anserini BM25+RM3 retrieves the top 1000 candidate documents for each topic from an index built on the title and abstract of the CORD-19 documents; in the second stage, the fine-tuned BERT model re-ranks this list.

We use BERT-base (12 layers, hidden size 768) fine-tuned on the MS MARCO passage set with a marking strategy that emphasizes exact-match signals between query and document terms. We use a two-stage full-ranking strategy: in the first stage, Anserini BM25+RM3 retrieves the top 1000 candidate documents for each topic from an index built on the title and abstract of the CORD-19 documents; in the second stage, the fine-tuned BERT model re-ranks this list.