# Evaluate model performance against insurance test data

In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
from src.utils import question_cleaner, display_qn_and_ans
from sklearn.metrics.pairwise import cosine_similarity
import logging
# Send log records of level DEBUG and above to 'evaluation.log' (relative to the
# working directory) so scores logged by later cells are kept on disk.
logging.basicConfig(filename='evaluation.log',level=logging.DEBUG)

  return f(*args, **kwds)
  return f(*args, **kwds)


## Load QA data

In [2]:
# Root folder that holds the insuranceQA files.
datapath = Path('./data')

# Test queries. Columns as used later in this notebook: 1 = token ids of the
# question, 2 = ground-truth answer ids, 3 = candidate answer pool ids.
df_query = pd.read_csv(
    datapath/'insuranceQA/V2/InsuranceQA.question.anslabel.raw.100.pool.solr.test.encoded',
    sep='\t',
    header=None,
)

# Answers. Column 0 = answer label, column 1 = token ids of the answer text.
df_doc = pd.read_csv(
    datapath/'insuranceQA/V2/InsuranceQA.label2answer.raw.encoded',
    sep='\t',
    header=None,
)

# Vocabulary (column 0 = token id, column 1 = word). quoting=3 is csv.QUOTE_NONE,
# so quote characters are read literally; keep_default_na=False keeps entries
# such as "NA" or "null" as strings instead of turning them into NaN.
df_ind2word = pd.read_csv(
    datapath/'insuranceQA/V2/vocabulary',
    sep='\t',
    header=None,
    quotechar='',
    quoting=3,
    keep_default_na=False,
)

# Lookup table: token id -> word.
dict_ind2word = dict(zip(df_ind2word[0].values, df_ind2word[1].values))

## Extract only questions that have answers
The dataset has a quirk: questions that have no correct answer still carry arbitrary entries in the answer column, and those entries do not match the question.
Also set the index for df_doc for easy reference with .loc later.

In [3]:
# Drop the queries whose answer column does not hold a genuine answer
# (question_cleaner lives in src.utils; it prints how many rows were removed).
df_query=question_cleaner(df_query)

# Index the answers by their label (column 0) so candidate ids can be looked up
# with .loc. The guard keeps this cell re-runnable: once column 0 has been moved
# into the index, calling set_index(0) again would raise KeyError.
if 0 in df_doc.columns:
    df_doc=df_doc.set_index(0)

total:2000, removed:677, remainder:1323


## Convert from tokens to full text

In [4]:
def wordifier(tokes, vocab=None):
    """Decode a whitespace-separated string of token ids into plain text.

    Parameters
    ----------
    tokes : str
        Token ids separated by whitespace, e.g. ``'idx_1 idx_2 idx_3'``.
    vocab : mapping, optional
        Maps each token id to its word. Defaults to the notebook-level
        ``dict_ind2word`` built in the data-loading cell.

    Returns
    -------
    str
        The words joined by single spaces; ``''`` for empty or blank input.

    Raises
    ------
    KeyError
        If a token id is not present in the vocabulary.
    """
    if vocab is None:
        vocab = dict_ind2word
    # split() with no argument ignores leading/trailing/repeated whitespace, so
    # no empty-string token is ever looked up in the vocabulary.
    return ' '.join(vocab[ind] for ind in tokes.split())
# Decode the token-id column (column 1) of each frame into a readable 'text'
# column. Series.map applies wordifier element-wise, without building a row
# Series for every record as DataFrame.apply(axis=1) does.
df_doc['text']=df_doc[1].map(wordifier)
df_query['text']=df_query[1].map(wordifier)

# Show the first row of each frame to confirm the decoding worked.
display(df_query.head(1), df_doc.head(1))

Unnamed: 0,0,1,2,3,text
4,medicare-insurance,idx_2363 idx_467 idx_8080 idx_31 idx_9966 idx_...,9128,9128 13322 21601 21471 6442 5412 24861 23536 2...,Will Medicare Pay For Smoking Cessation?


Unnamed: 0_level_0,1,text
0,Unnamed: 1_level_1,Unnamed: 2_level_1
1,idx_1 idx_2 idx_3 idx_4 idx_5 idx_6 idx_7 idx_...,Coverage follows the car. Example 1: If you we...


In [5]:
# Spot check: print the first cleaned question with its answer id(s) and answer text.
display_qn_and_ans(df_query, df_doc, index=0)

Question is: Will Medicare Pay For Smoking Cessation?
Answer index:  [9128]
Answers:  ['Medicare will not pay for smoking cessation products such as nicotine substitutes (Nicorette gum, nicotine patch, etc), and Medicare will not pay for pills that reduce the craving to smoke. But Medicare will pay for up to 8 face-to-face smoking cessation counseling sessions with a qualified Medicare doctor during a 12 month period.']


# Start scoring

In [6]:
def ranker(model, question_vectors, df_query, df_doc):
    """Rank every query's candidate answers by cosine similarity to the query.

    Parameters
    ----------
    model : object
        Encoder exposing ``predict(list_of_str)``; it is called once per query
        to embed that query's candidate answer texts.
    question_vectors : iterable
        One embedding per row of ``df_query``, in the same positional order.
        Each item must support ``.reshape(1, -1)``.
    df_query : pandas.DataFrame
        Column 2 holds the whitespace-separated ground-truth answer ids and
        column 3 the whitespace-separated candidate pool ids.
    df_doc : pandas.DataFrame
        Indexed by answer id, with the answer text in column ``'text'``.

    Returns
    -------
    predictions : list of list of int
        Per query, the candidate ids ordered from most to least similar.
    gts : list of list of int
        Per query, the ground-truth answer ids.
    """
    predictions=[]
    gts=[]
    for ii, question_vector in enumerate(question_vectors):
        # str() + split() tolerate irregular whitespace and a column that pandas
        # parsed as integers (possible if every row held a single id - TODO confirm).
        kb = [int(xx) for xx in str(df_query[3].iloc[ii]).split()]
        gt = [int(xx) for xx in str(df_query[2].iloc[ii]).split()]
        # Single .loc call (rows, column) instead of chained indexing.
        doc_vectors = model.predict(df_doc.loc[kb, 'text'].tolist())
        # Shape (n_candidates, 1): similarity of each candidate to this query.
        cossim = cosine_similarity(doc_vectors, question_vector.reshape(1, -1))
        # argsort is ascending; reverse the rows for best-first and take the
        # only column to get a flat array of candidate positions.
        order = cossim.argsort(axis=0)[::-1, 0]
        predictions.append([kb[jj] for jj in order])
        gts.append(gt)
    return predictions, gts
        
def scorer(predictions, gts, k=3):
    """Return score@k: the fraction of queries with a correct answer in the top k.

    Parameters
    ----------
    predictions : iterable of sequences
        Per query, the ranked candidate ids (best first).
    gts : iterable of iterables
        Per query, the ground-truth answer ids, aligned with ``predictions``.
    k : int, default 3
        How many top-ranked candidates are inspected for each query.

    Returns
    -------
    float
        Number of queries with at least one ground-truth id among their first
        ``k`` predictions, divided by the number of queries. ``0.0`` when there
        are no queries at all.
    """
    hits = 0
    total = 0
    for gt, prediction in zip(gts, predictions):
        # A hit means the ground truth and the top-k predictions share an id.
        if not set(gt).isdisjoint(prediction[:k]):
            hits += 1
        total += 1
    # Avoid ZeroDivisionError when nothing was scored.
    if total == 0:
        return 0.0
    return hits / total


## USE QA

In [7]:
from src.model import QnaEncoderModel
model = QnaEncoderModel()
try:
    # Questions are encoded with type='query'; the candidate answers are encoded
    # inside ranker() through model.predict() without a type argument.
    question_vectors = model.predict(df_query['text'].tolist(), type='query')
    print('questions vectorized!')
    predictions, gts = ranker(model, question_vectors, df_query, df_doc)
    # Report score@1 .. score@5.
    for k in range(5):
        print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
finally:
    # Always close the model, even if encoding or ranking fails, so a failed run
    # does not leave its resources open in the kernel.
    model.close()

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Instructions for updating:
Colocations handled automatically by placer.


W0913 08:28:35.181463 140347068307200 deprecation.py:323] From /anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0913 08:28:50.823433 140347068307200 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0913 08:28:57.575173 140347068307200 saver.py:1483] Saver not created because there are no variables in the graph to restore


questions vectorized!
Score @1: 0.3870
Score @2: 0.5193
Score @3: 0.5896
Score @4: 0.6478
Score @5: 0.6977


## USE

In [7]:
from src.model import USEModel
model = USEModel()
try:
    # The same encoder call is used for the questions here and for the candidate
    # answers inside ranker().
    question_vectors = model.predict(df_query['text'].tolist())
    predictions, gts = ranker(model, question_vectors, df_query, df_doc)
    # Report score@1 .. score@5.
    for k in range(5):
        print('Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1)))
finally:
    # Always close the model, even if encoding or ranking fails, so a failed run
    # does not leave its resources open in the kernel.
    model.close()

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Instructions for updating:
Colocations handled automatically by placer.


W0913 14:48:45.433717 140079693276928 deprecation.py:323] From /anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0913 14:48:46.988466 140079693276928 saver.py:1483] Saver not created because there are no variables in the graph to restore


Score @1: 0.2509
Score @2: 0.3462
Score @3: 0.4271
Score @4: 0.4807
Score @5: 0.5344


## InferSent

In [7]:
%%time
from src.model import InferSent
model = InferSent()
# Build the vocabulary from the question text first, then extend it with the
# answer text, before any sentence is encoded.
model.build_vocab(df_query['text'].tolist())
model.update_vocab(df_doc['text'].tolist())
question_vectors = model.predict(df_query['text'].tolist())
# ranker() re-encodes every query's candidate pool, which dominates the runtime
# of this cell (the recorded run took about 3.5 hours of wall time).
predictions, gts = ranker(model, question_vectors, df_query, df_doc)
for k in range(5):
    # Compute each score once and reuse the text for the console and the log file.
    message = 'Score @{}: {:.4f}'.format(k+1, scorer(predictions, gts, k+1))
    print(message)
    logging.info(message)

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


Found 1408(/1417) words with w2v vectors
Vocab size : 1408
Found 25628(/31486) words with w2v vectors
New vocab size : 27036 (added 25628 words)
Score @1: 0.0831
Score @2: 0.1338
Score @3: 0.1814
Score @4: 0.2260
Score @5: 0.2683
CPU times: user 19h 25min 14s, sys: 25min 46s, total: 19h 51min 1s
Wall time: 3h 33min 14s
