# Evaluate model performance against insurance test data

In [1]:
import sys
# Add the parent directory to the import path so the `src` package imported
# below can be found (presumably this notebook lives one level below the
# project root -- confirm against the repository layout).
sys.path.append('..')
import numpy as np
import pandas as pd
from pathlib import Path
from src.utils import question_cleaner, display_qn_and_ans
from sklearn.metrics.pairwise import cosine_similarity
import logging
# Write log records of level DEBUG and above to evaluation.log in the current
# working directory; some scoring cells below record their scores there via
# logging.info.
logging.basicConfig(filename='evaluation.log',level=logging.DEBUG)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load QA data

In [3]:
datapath = Path('../data')

# Test questions: tab-separated, no header row.
df_query = pd.read_csv(
    datapath/'insuranceQA/V2/InsuranceQA.question.anslabel.raw.100.pool.solr.test.encoded',
    sep='\t',
    header=None,
)

# Answer documents: tab-separated, no header row.
df_doc = pd.read_csv(
    datapath/'insuranceQA/V2/InsuranceQA.label2answer.raw.encoded',
    sep='\t',
    header=None,
)

# Vocabulary file mapping token ids (column 0) to words (column 1).
# quoting=3 is csv.QUOTE_NONE, so quote characters are kept as literal text;
# keep_default_na=False stops words such as "NA" or "null" turning into NaN.
df_ind2word = pd.read_csv(
    datapath/'insuranceQA/V2/vocabulary',
    sep='\t',
    header=None,
    quotechar='',
    quoting=3,
    keep_default_na=False,
)

# Lookup table: token id -> word.
dict_ind2word = dict(zip(df_ind2word[0].values, df_ind2word[1].values))

## Extract only questions that have answers
The dataset has a quirk: questions that have no correct answer still carry entries in the answer column, but those entries are random and do not match the question. Such questions are filtered out here.
We also set the index of `df_doc` so that answers can be looked up easily with `.loc` later.

In [4]:
# question_cleaner is imported from src.utils; presumably it drops the
# questions without a valid answer and prints the total/removed/remainder
# counts -- confirm against src/utils.
df_query=question_cleaner(df_query)
# Use column 0 as the row index so answers can be fetched with df_doc.loc[...].
# NOTE: set_index consumes column 0, so re-running this cell without reloading
# the data raises a KeyError.
df_doc=df_doc.set_index(0)

total:2000, removed:677, remainder:1323


## Convert from tokens to full text

In [5]:
def wordifier(tokes, vocab=None):
    """Translate a whitespace-separated string of token ids into plain text.

    Args:
        tokes: string of vocabulary ids separated by whitespace,
            e.g. ``'idx_1 idx_2 idx_3'``. Leading, trailing and repeated
            whitespace is ignored.
        vocab: optional mapping from token id to word. Defaults to the
            notebook-level ``dict_ind2word`` lookup table.

    Returns:
        The decoded words joined by single spaces ('' for blank input).

    Raises:
        KeyError: if a token id is missing from the vocabulary.
    """
    if vocab is None:
        vocab = dict_ind2word
    # str.split() with no argument collapses runs of whitespace, so a doubled
    # space can no longer yield an empty token (which would raise KeyError).
    return ' '.join(vocab[ind] for ind in tokes.split())
# Decode the token-id strings held in column 1 of each frame into a new
# human-readable 'text' column.
df_doc['text'] = df_doc[1].map(wordifier)
df_query['text'] = df_query[1].map(wordifier)
# Show the first decoded row of each frame as a sanity check.
display(df_query.head(1), df_doc.head(1))

Unnamed: 0,0,1,2,3,text
4,medicare-insurance,idx_2363 idx_467 idx_8080 idx_31 idx_9966 idx_...,9128,9128 13322 21601 21471 6442 5412 24861 23536 2...,Will Medicare Pay For Smoking Cessation?


Unnamed: 0_level_0,1,text
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,idx_1 idx_2 idx_3 idx_4 idx_5 idx_6 idx_7 idx_...,Coverage follows the car. Example 1: If you we...


In [6]:
# Spot-check the first question: display_qn_and_ans (from src.utils) prints
# the question text together with its answer index and answer text.
display_qn_and_ans(df_query, df_doc, index=0)

Question is: Will Medicare Pay For Smoking Cessation?
Answer index:  [9128]
Answers:  ['Medicare will not pay for smoking cessation products such as nicotine substitutes (Nicorette gum, nicotine patch, etc), and Medicare will not pay for pills that reduce the craving to smoke. But Medicare will pay for up to 8 face-to-face smoking cessation counseling sessions with a qualified Medicare doctor during a 12 month period.']


# Start scoring

In [7]:
def ranker(model, question_vectors, df_query, df_doc):
    """Rank every question's candidate answers by cosine similarity.

    For the i-th question vector, column 3 of ``df_query`` supplies the
    space-separated candidate answer ids and column 2 the space-separated
    ground-truth ids. The candidates' ``'text'`` is looked up in ``df_doc``
    by id, encoded with ``model.predict`` and ordered from most to least
    similar to the question vector.

    Args:
        model: object exposing ``predict(list_of_str)`` returning one vector
            per string.
        question_vectors: iterable of question encodings, aligned row by row
            with ``df_query``.
        df_query: question frame with candidate ids in column 3 and
            ground-truth ids in column 2.
        df_doc: answer frame indexed by answer id, with a ``'text'`` column.

    Returns:
        ``(predictions, gts)``: two lists with one entry per question -- the
        ranked candidate ids and the ground-truth ids.
    """
    def _parse_ids(cell):
        # "12 7 93" -> [12, 7, 93]
        return [int(token) for token in cell.split(' ')]

    predictions, gts = [], []
    for row, question_vector in enumerate(question_vectors):
        candidates = _parse_ids(df_query[3].iloc[row])
        truth = _parse_ids(df_query[2].iloc[row])

        candidate_texts = df_doc.loc[candidates]['text'].tolist()
        doc_vectors = model.predict(candidate_texts)
        similarity = cosine_similarity(doc_vectors, question_vector.reshape(1, -1))

        # argsort is ascending; flipping puts the most similar candidate first.
        order = np.flip(similarity.argsort(axis=0)).ravel()
        predictions.append([candidates[pos] for pos in order])
        gts.append(truth)
    return predictions, gts
        
def scorer(predictions, gts, k=3):
    """Return score@k: the share of questions with a correct answer in the top k.

    Args:
        predictions: one ranked list of answer ids per question, best first.
        gts: one collection of correct answer ids per question, aligned with
            ``predictions``.
        k: how many of the top-ranked predictions to inspect per question.

    Returns:
        A float in [0, 1]. Returns 0.0 when there is nothing to score, rather
        than raising ZeroDivisionError.
    """
    hits = 0
    total = 0
    for gt, prediction in zip(gts, predictions):
        # A question counts as a hit if any ground-truth id is in the top k.
        if set(gt) & set(prediction[:k]):
            hits += 1
        total += 1
    if total == 0:
        return 0.0
    return hits / total


## USE QA

In [11]:
# One-time setup: uncomment the two lines below to download and unpack the
# multilingual USE-QA module from TF Hub into ./google_use_qa. They are left
# commented out so that re-running the notebook does not repeat the download.
# !mkdir google_use_qa
# !curl -L "https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1?tf-hub-format=compressed" | tar -zxvC ./google_use_qa

mkdir: cannot create directory ‘google_use_qa’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
assets/
saved_model.pb
tfhub_module.pb
variables/
variables/variables.index
variables/variables.data-00000-of-00001
100  317M  100  317M    0     0  82.3M      0  0:00:03  0:00:03 --:--:-- 96.8M


In [8]:
%%time
from src.model import GoldenRetriever
model = GoldenRetriever()
question_vectors = model.predict(df_query['text'].tolist(), type='query')
print('questions vectorized!')
predictions, gts = ranker(model, question_vectors, df_query, df_doc)
for k in range(5):
    print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
model.close()

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


model initiated!
questions vectorized!
Score @1: 0.3870
Score @2: 0.5193
Score @3: 0.5896
Score @4: 0.6478
Score @5: 0.6977
CPU times: user 10min 31s, sys: 4min 58s, total: 15min 29s
Wall time: 17min 14s


## USE (Universal Sentence Encoder)

In [8]:
%%time
from src.model import USEModel
model = USEModel()
question_vectors = model.predict(df_query['text'].tolist())
predictions, gts = ranker(model, question_vectors, df_query, df_doc)
for k in range(5):
    print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
model.close()

Score @1: 0.2509
Score @2: 0.3462
Score @3: 0.4271
Score @4: 0.4807
Score @5: 0.5344
CPU times: user 7min 28s, sys: 5min 4s, total: 12min 32s
Wall time: 3min 3s


## InferSent

In [10]:
%%time
# increased batch size from 32 -> 256, moved to gpu.
from src.model import InferSent
model = InferSent()
model.infersent.cuda()
model.build_vocab(df_query['text'].tolist())
model.update_vocab(df_doc['text'].tolist())
question_vectors = model.predict(df_query['text'].tolist())
predictions, gts = ranker(model, question_vectors, df_query, df_doc)
for k in range(5):
    print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
    logging.info('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))

Found 1408(/1417) words with w2v vectors
Vocab size : 1408
Found 25628(/31486) words with w2v vectors
New vocab size : 27036 (added 25628 words)
Score @1: 0.0831
Score @2: 0.1338
Score @3: 0.1814
Score @4: 0.2260
Score @5: 0.2683
CPU times: user 17min 56s, sys: 8min 13s, total: 26min 10s
Wall time: 14min 26s


# Test on a TF-IDF baseline

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [9]:
class bow_baseline:
    """Bag-of-words TF-IDF baseline with the fit/predict interface used by ranker."""

    def __init__(self):
        # A default-configured scikit-learn TfidfVectorizer does the work.
        self.vectorizer = TfidfVectorizer()

    def fit(self, text):
        """Fit the underlying vectorizer on an iterable of strings."""
        self.vectorizer.fit(text)

    def predict(self, text):
        """Return the vectorizer's transform of ``text`` (an iterable of strings)."""
        return self.vectorizer.transform(text)

In [10]:
%%time
model = bow_baseline()
model.fit(df_query['text'].tolist())
model.fit(df_doc['text'].tolist())
question_vectors = model.predict(df_query['text'].tolist())
predictions, gts = ranker(model, question_vectors, df_query, df_doc)
for k in range(5):
    print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
    logging.info('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))

Score @1: 0.2457
Score @2: 0.3492
Score @3: 0.4127
Score @4: 0.4611
Score @5: 0.4989
CPU times: user 20.9 s, sys: 0 ns, total: 20.9 s
Wall time: 20.9 s
