This notebook experiments with the MSMarco dataset. There is nothing very exciting to see here right now, but it is fun to toy with.

# Setup & Indexing

In the next several cells we set up the SolrClient (run this every time), download the MSMarco dataset, and index it into Solr.

In [None]:
from ltr.client import SolrClient
from ltr.index import rebuild


# Solr client handle; later cells pass it to the evaluation helpers.
client = SolrClient()

In [None]:
# Download MSMarco Corpus and Queries -> data directory
# NOTE(review): only `download` is imported here, yet `download_msmarco` is
# the name that gets called. Unless that name is defined somewhere not shown
# in this notebook, this cell raises NameError - confirm the intended import.
from ltr import download
download_msmarco()

In [None]:
# Index to Solr: docs & questions together, for convenience

import csv
import sys

# Raise the csv module's per-field size cap as far as the platform allows.
# sys.maxsize does not fit in a C long on some platforms (e.g. Windows), where
# field_size_limit() raises OverflowError, so halve the value until accepted.
_field_limit = sys.maxsize
while True:
    try:
        csv.field_size_limit(_field_limit)
        break
    except OverflowError:
        _field_limit //= 2

def marco_docs(path='data/msmarco-docs.tsv', total=3213835):
    """Stream the MSMarco document corpus as dicts ready for indexing.

    Args:
        path: Tab-separated file whose first four columns are id, url,
            title and body. Defaults to the location used by this notebook.
        total: Expected number of rows; only used in the progress message.

    Yields:
        One dict per row with the keys ``id``, ``url``, ``title``, ``body``
        and ``type`` (always ``"document"``).
    """
    # Explicit encoding so parsing does not depend on the platform default
    # (assumes the TSV is UTF-8 - TODO confirm); newline='' is what the csv
    # module asks for so it can handle line endings itself.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for count, row in enumerate(reader, start=1):
            yield {"id": row[0],
                   "url": row[1],
                   "title": row[2],
                   "body": row[3],
                   "type": "document"}
            # Progress report every 10k rows, after the row has been consumed
            if count % 10000 == 0:
                print("Dumped (%s/%s) %s" % (count, total, row[1]))
                    
def marco_questions(path='data/msmarco-doctrain-queries.tsv', total=367013):
    """Stream the MSMarco training queries as dicts ready for indexing.

    Each query is shaped like a document so both can live in the same index:
    the id is prefixed with ``Q``, the url is empty, and the query text is
    used for both ``title`` and ``body``.

    Args:
        path: Tab-separated file whose first two columns are the query id and
            the query text. Defaults to the location used by this notebook.
        total: Expected number of rows; only used in the progress message.

    Yields:
        One dict per row with the keys ``id``, ``url``, ``title``, ``body``
        and ``type`` (always ``"question"``).
    """
    # Explicit encoding so parsing does not depend on the platform default
    # (assumes the TSV is UTF-8 - TODO confirm); newline='' is what the csv
    # module asks for so it can handle line endings itself.
    with open(path, encoding='utf-8', newline='') as tsvfile:
        reader = csv.reader(tsvfile, delimiter='\t')
        for count, row in enumerate(reader, start=1):
            yield {"id": 'Q' + row[0],
                   "url": '',
                   "title": row[1],
                   "body": row[1],
                   "type": "question"}
            # Progress report every 10k rows, after the row has been consumed
            if count % 10000 == 0:
                print("Dumped (q) (%s/%s) %s" % (count, total, row[1]))
                    

def marco_questions_and_docs():
    """Yield every question record first, then every document record."""
    yield from marco_questions()
    yield from marco_docs()
        

In [None]:
from ltr.client import SolrClient
from ltr.index import rebuild


# Hand the combined question + document stream to rebuild() for the
# 'msmarco' index.
client = SolrClient()
rebuild(client,
        index='msmarco',
        doc_type='passage',
        doc_src=marco_questions_and_docs())

# Some experiments

In [None]:
# Parse out all the msmarco qrels

from ltr.helpers.msmarco.evaluate import QRel

# Judgments keyed by query id; a later qrel with the same qid replaces an
# earlier one.
qrels = {judgment.qid: judgment for judgment in QRel.read_qrels()}

In [None]:
from ltr.helpers.solr_escape import esc_kw


def eval_one(client, qrel, params, at=50):
    """Run one judged query against the 'msmarco' index and score it.

    The caller's ``params`` dict is copied rather than modified, so the same
    dict can safely be reused for many queries without the ``fq`` filter
    accumulating from one call to the next.

    Args:
        client: Search client exposing ``query(index=..., query=...)``.
        qrel: Judgment object providing ``keywords`` and ``eval_rr(ranking)``.
        params: Base query parameters (e.g. ``qf``, ``defType``). An optional
            ``fq`` may be a single filter string or a list/tuple of filters.
        at: Number of rows to request.

    Returns:
        The value of ``qrel.eval_rr`` for the ranked list of returned ids.
    """
    query = dict(params)
    query['q'] = esc_kw(qrel.keywords)
    query['rows'] = at
    query['fl'] = 'id'

    # Always restrict results to records indexed with type "document",
    # keeping any filter(s) the caller supplied alongside it.
    if 'fq' in query:
        caller_fq = query['fq']
        if isinstance(caller_fq, (list, tuple)):
            query['fq'] = list(caller_fq) + ['type:document']
        else:
            query['fq'] = [caller_fq, 'type:document']
    else:
        query['fq'] = "type:document"

    hits = client.query(index='msmarco', query=query)

    ranking = [hit['id'] for hit in hits]
    return qrel.eval_rr(ranking)

def eval_many(client, qrels, sample_qids, params):
    """Evaluate a batch of judged queries and collect their reciprocal ranks.

    Prints the running mean reciprocal rank after every query.

    Args:
        client: Search client passed through to ``eval_one``.
        qrels: Mapping of query id to judgment object.
        sample_qids: Sized collection of query ids to evaluate; each must be
            a key of ``qrels``.
        params: Query parameters passed through to ``eval_one``.

    Returns:
        A list with one reciprocal rank per query, in ``sample_qids`` order
        (not the mean - average the list to get MRR).
    """
    print("Running %s queries" % len(sample_qids))

    all_rrs = []
    sum_rr = 0.0
    for num_evald, qid in enumerate(sample_qids, start=1):
        rr = eval_one(client, qrels[qid], params)
        all_rrs.append(rr)
        sum_rr += rr
        print("%s, last rr: %s, mrr: %s" % (num_evald, rr, sum_rr / num_evald))
    return all_rrs

In [None]:
def bigrams(terms):
    """Pair each item of a sliceable sequence with the item that follows it.

    Returns a lazy ``zip`` of ``(item, next_item)`` tuples; sequences with
    fewer than two items produce no pairs.
    """
    successors = terms[1:]
    return zip(terms, successors)

def analyze_tokenize(client, keywords):
    """Return the token texts the index's analyzer produces for ``keywords``.

    Runs ``client.analyze`` against the 'msmarco' index with the
    'text_general' field type and keeps only each token's ``'text'`` value.
    """
    analyzed = client.analyze(index='msmarco',
                              fieldtype='text_general',
                              text=keywords)
    return [token['text'] for token in analyzed]


def phrase_search(client, qrel):
    """Score every query bigram by how much filtering on it changes RR.

    The query is first run unfiltered to get a baseline reciprocal rank. Each
    bigram of the analyzed keywords is then applied as a phrase filter on
    ``body`` and the query is re-run.

    Returns:
        A list of ``(rr_delta, bigram, position)`` tuples sorted by
        ``rr_delta`` descending. Position 0 is the unfiltered baseline,
        represented by the bigram ``('', '')`` with a delta of zero.
    """
    terms = analyze_tokenize(client, qrel.keywords)
    # Slot 0 is a placeholder meaning "no phrase filter at all"
    candidates = [('', '')] + list(bigrams(terms))

    baseline = 0
    scored = []
    for position, candidate in enumerate(candidates):
        query = {"qf": "body", "defType": "edismax"}
        if position > 0:
            query['fq'] = '{!lucene df=body}"%s %s"' % candidate

        score = eval_one(client, qrel, query)
        if position == 0:
            baseline = score
        scored.append((score - baseline, candidate, position))

    return sorted(scored, key=lambda entry: entry[0], reverse=True)
    
    

In [None]:
# Shuffle the judged queries and build a bigram training set from them
import random


all_qrel_keys = list(qrels)
random.shuffle(all_qrel_keys)
# Running sums of the best (gain) and worst (loss) RR delta per query
gain = 0
loss = 0
import csv

# newline='' is what the csv module asks for when writing (otherwise blank
# rows appear on platforms that translate line endings); the encoding is
# explicit so the output does not depend on the platform default.
with open('data/train.csv', 'w', newline='', encoding='utf-8') as f:

    writer = csv.writer(f)

    for idx, qid in enumerate(all_qrel_keys):
        qrel = qrels[qid]
        sorted_bigrams = phrase_search(client, qrel)
        # (rr delta, "term1 term2", position of the bigram in the query)
        formatted_big = [(delta, "%s %s" % (bigram[0], bigram[1]), position)
                         for delta, bigram, position in sorted_bigrams]

        for big in formatted_big:
            row = (big[0], big[1], big[2], qid, qrel.keywords)
            writer.writerow(row)

        # phrase_search sorts best-first, so the ends hold the extremes
        gain += formatted_big[0][0]
        loss += formatted_big[-1][0]

        if idx % 20 == 19:
            # idx is zero-based, so idx + 1 queries have been processed
            print("Generated training data %s queries" % (idx + 1))
            print("Gain/Loss %s/%s" % (gain/(idx+1), loss/(idx+1)))

In [None]:
# NOTE(review): sample_qids is not assigned in any cell visible in this
# notebook - confirm where it is defined before running this cell.
# Same query sample evaluated against four field configurations.
all_rrs_title = eval_many(
    client, qrels, sample_qids,
    params={"qf": "title", "defType": "edismax"})
all_rrs_body = eval_many(
    client, qrels, sample_qids,
    params={"qf": "body", "defType": "edismax"})
all_rrs_all = eval_many(
    client, qrels, sample_qids,
    params={"qf": "title body url", "defType": "edismax"})
all_rrs_all_tie = eval_many(
    client, qrels, sample_qids,
    params={"qf": "title body url", "tie": 1.0, "defType": "edismax"})




In [None]:
# Matplotlib and NumPy for plotting
import matplotlib.pyplot as plt
import matplotlib
import numpy as np

%matplotlib inline

# Larger text and higher-resolution figures for every plot in the notebook
matplotlib.rcParams.update({'font.size': 18, 'figure.dpi': 200})

from IPython.core.pylabtools import figsize
# Default figure size passed to IPython's figsize helper
figsize(15, 5)


def draw_rr_dist(all_rrs, experiment_name):
    """Plot a histogram of per-query reciprocal ranks with the mean marked.

    Args:
        all_rrs: Sequence of reciprocal-rank values, one per query
            (presumably in [0, 1]; values outside the bins are not drawn).
        experiment_name: Label shown in parentheses in the plot title.
    """
    # 21 edges -> 20 equal-width buckets of 0.05 spanning 0.0 to 1.0
    bins = [edge / 20 for edge in range(21)]

    all_rrs = np.array(all_rrs)

    plt.hist(all_rrs, color='blue', edgecolor='black', bins=bins)

    # Add labels
    plt.title('RR by Query (%s)' % experiment_name)
    plt.xlabel('RR')
    plt.ylabel('Num Qs')

    # Dashed vertical line at the mean RR; skipped for empty input, where the
    # mean would be NaN.
    if all_rrs.size:
        plt.axvline(all_rrs.mean(), color='k', linestyle='dashed', linewidth=1)
    

In [None]:
# RR distribution for the title-only run
draw_rr_dist(all_rrs=all_rrs_title, experiment_name="Title Search")

In [None]:
# RR distribution for the body-only run
draw_rr_dist(all_rrs=all_rrs_body, experiment_name="Body Search")

In [None]:
# RR distribution for the title + body + url run
draw_rr_dist(all_rrs=all_rrs_all,
             experiment_name="Title Body Url Dismax Search")

In [None]:
# RR distribution for the title + body + url run with tie=1.0
draw_rr_dist(all_rrs=all_rrs_all_tie,
             experiment_name="Title Body Url Dismax Search, tie")

In [None]:
# Body search with the edismax pf2 parameter also pointed at body
all_rrs_all_pf2 = eval_many(
    client, qrels, sample_qids,
    params={"qf": "body",
            "pf2": "body",
            "tie": 1.0,
            "defType": "edismax"})

