# Testing different embedding schemes for information retrieval
## Step 1: Load sample text

In [1]:
import numpy as np

In [2]:
def read_txt(path):
    """Read a UTF-8 encoded text file and return its lines.

    Args:
        path: Location of the text file to read.

    Returns:
        A list with one string per line; line terminators are kept.
    """
    with open(path, mode='r', encoding="utf-8") as handle:
        # Iterating a text file yields the same lines as readlines().
        return list(handle)

# Load the knowledge-base document as a list of raw lines (newlines kept by read_txt).
text = read_txt('./data/fund_guide_extend.txt')
# text = [x.decode("utf-8") for x in text]

In [14]:
# Split the document into paragraphs: consecutive non-blank lines are
# concatenated (newlines kept) and a line that is exactly '\n' ends the
# current paragraph. Runs of blank lines never produce empty paragraphs.
condition_terms = []
current_lines = []
for line in text:
    if line != '\n':
        current_lines.append(line)
    elif current_lines:
        condition_terms.append(''.join(current_lines))
        current_lines = []
# Flush the last paragraph: without this, a file that does not end with a
# blank line silently loses its final paragraph.
if current_lines:
    condition_terms.append(''.join(current_lines))
# condition_terms=[x.replace('\n', '. ') for x in condition_terms]

In [1]:
# Sanity check: number of paragraphs extracted, then one sample paragraph.
print(len(condition_terms))
condition_terms[2]

# Step 2: Load sample questions

In [16]:
import pandas as pd
# Load the sample email queries (the file is read as ISO-8859-1); the questions
# are taken from the 'Email Queries' column in the cells below.
df_queries = pd.read_csv('./data/Consolidated emails 2.csv', encoding='iso-8859-1')
print(len(df_queries))

31


In [17]:
# Answer collection: each query is listed three times in a row so that the
# three answers stored per query by each experiment line up with it.
initial_list = df_queries['Email Queries'].values
modified_list = [query for query in initial_list for _copy in range(3)]
df_comparisons = pd.DataFrame({'queries': modified_list})

# Step 3: define similarity function
`cosine_sim_results` returns the cosine similarity between a query and every knowledge-base embedding, given a query string, an encoder, and an array of knowledge-base embeddings.

In [18]:
from sklearn.metrics.pairwise import cosine_similarity

In [19]:
def cosine_sim_results(query_str, encoder, kb_embeddings, **kwargs):
    """Score every knowledge-base embedding against an encoded query.

    Args:
        query_str: Input handed to ``encoder`` as its first positional
            argument (raw text, or an already-computed embedding when
            ``encoder`` is a pass-through).
        encoder: Callable that turns ``query_str`` into a 2-D array of
            query embeddings.
        kb_embeddings: 2-D array with one knowledge-base embedding per row.
        **kwargs: Options forwarded to ``encoder`` by keyword
            (e.g. ``tokenize=True``).

    Returns:
        The result of ``cosine_similarity(kb_embeddings, qn_embedding)``:
        an array of shape (n_kb_rows, n_query_rows).
    """
    # Forward options by name. Passing ``tokenize`` positionally (as before)
    # puts it in the encoder's second positional slot, which for
    # InferSent.encode is the batch size, not the tokenize flag.
    qn_embedding = encoder(query_str, **kwargs)
    return cosine_similarity(kb_embeddings, qn_embedding)

# Utility functions

In [20]:
def dummy_embed_fn(vector):
    """Pass-through "encoder": hand back ``vector`` untouched.

    Lets embeddings that were computed ahead of time be given to code that
    expects an encoder callable.
    """
    already_embedded = vector
    return already_embedded

# Exp 1: Test InferSent model

In [21]:
from InferSent.models import InferSent
import torch
# Build the InferSent encoder and restore its pretrained weights.
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
infersent = InferSent(params_model)
# NOTE: torch.load unpickles the file, so only load checkpoints from a trusted source.
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [22]:
# Point InferSent at the fastText word-vector file it uses when building its vocabulary.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [23]:
# Build the encoder vocabulary from the knowledge-base paragraphs.
infersent.build_vocab(condition_terms, tokenize=True)

Found 2646(/2721) words with w2v vectors
Vocab size : 2646


In [24]:
# NOTE(review): build_vocab was just run on the same condition_terms and the
# recorded output reports 0 words added, so this call looks redundant —
# confirm before removing.
infersent.update_vocab(condition_terms, tokenize=True)

Found 0(/75) words with w2v vectors
New vocab size : 2646 (added 0 words)


In [25]:
# Embed the knowledge-base paragraphs first, then the email queries, with InferSent.
questions = list(df_queries['Email Queries'])
response_results, question_results = (
    infersent.encode(batch, tokenize=True)
    for batch in (condition_terms, questions)
)

In [26]:
print(response_results.shape, question_results.shape)

(148, 4096) (31, 4096)


In [27]:
# sample question
# question=['how frequent can i get disbursements?']
# condition_terms[cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argmax()]

In [28]:
# question=['who is RESPONSIBLE FOR DATA CHARGES?']
# sortargs=cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argsort(axis=0)
# print(sortargs.shape)
# for ii,arg in enumerate(sortargs[::-1,0]):
#     print(ii, condition_terms[arg])
#     if ii==4:
#         break

In [29]:
# list(cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argsort(axis=0).shape[-2:])

## Test with example queries

In [2]:
# Top-3 retrieval with the InferSent embeddings: for every query, rank all
# paragraphs by cosine similarity and keep the three best.
responses = []
for index, query_text in enumerate(df_queries['Email Queries']):
    print('QN: ', query_text)
    query_embedding = question_results[index].reshape(1, -1)
    similarities = cosine_sim_results(query_embedding, dummy_embed_fn, response_results)
    # argsort is ascending; flipping puts the most similar paragraph in row 0.
    sortargs = np.flip(similarities.argsort(axis=0))
    for rank in range(3):
        best_match = condition_terms[sortargs[rank, 0]]
        responses.append(best_match)
        print('ANS: ', best_match)
df_comparisons['infersent'] = responses

In [31]:
# responses=[]
# for ii in df_queries.iterrows():
#     print('QN: ', ii[1]['Email Queries'])
#     sortargs=np.flip(cosine_sim_results([ii[1]['Email Queries']], infersent.encode, embeddings, tokenize=True).argsort(axis=0))
#     for ans in range(3):
#         print(sortargs[ans, 0])
#         responses.append(condition_terms[sortargs[ans,0]])
#         print('ANS: ', condition_terms[sortargs[ans,0]])
# #     answer = condition_terms[cosine_sim_results([ii[1]['Email Queries']], infersent.encode, embeddings, tokenize=True).argmax()]
# #     print('ANS: ', answer)
# #     responses.append(answer)
# #     print('\n')
# df_comparisons['infersent']=responses

# Exp 2: Google universal sentence encoder

In [32]:
# !pip install tensorflow-gpu
# !pip install tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [33]:
#download the model to local so it can be used again and again
# !mkdir google_use
# Download the module, and uncompress it to the destination folder. 
# !curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC ./google_use

In [34]:
# Load the Universal Sentence Encoder from the local ./google_use directory
# (see the download commands in the previous cell). use_embed() below reads
# this module-level name.
embed = hub.Module("./google_use")

Instructions for updating:
Colocations handled automatically by placer.


W0911 05:06:37.878517 139722147989248 deprecation.py:323] From /anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [35]:
def use_embed(terms):
    """Embed ``terms`` with the module-level TF-Hub module ``embed``.

    Args:
        terms: Input passed straight to ``embed(...)`` (lists of strings in
            this notebook).

    Returns:
        The value of ``session.run(embed(terms))``.
    """
    # NOTE(review): every call opens a fresh tf.Session and re-runs both
    # initializers before evaluating the embeddings.
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(terms))
    return message_embeddings

# to only load session once.
# def embed_useT(module):
#     with tf.Graph().as_default():
#         sentences = tf.placeholder(tf.string)
#         embed = hub.Module(module)
#         embeddings = embed(sentences)
#         session = tf.train.MonitoredSession()
#     return lambda x: session.run(embeddings, {sentences: x})


In [36]:
# Embed the paragraphs, then the queries, with the Universal Sentence Encoder.
# This overwrites response_results / question_results from the previous experiment.
all_questions = df_queries['Email Queries']
questions = list(all_questions)
response_results, question_results = (
    use_embed(batch) for batch in (condition_terms, questions)
)
print(response_results.shape, question_results.shape)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0911 05:06:50.658029 139722147989248 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0911 05:07:01.882642 139722147989248 saver.py:1483] Saver not created because there are no variables in the graph to restore


(148, 512) (31, 512)


In [37]:
# responses=[]
# for ii in df_queries.iterrows():
#     print('QN: ', ii[1]['Email Queries'])
#     answer = condition_terms[cosine_sim_results([ii[1]['Email Queries']], use_embed, message_embeddings).argmax()]
#     print('ANS: ', answer)
#     responses.append(answer)
#     print('\n')
# df_comparisons['use']=responses

In [3]:

# Top-3 retrieval with the Universal Sentence Encoder embeddings.
responses = []
for index, (row_label, row) in enumerate(df_queries.iterrows()):
    print('QN: ', row['Email Queries'])
    scores = cosine_sim_results(
        question_results[index].reshape(1, -1),  # one query as a single-row matrix
        dummy_embed_fn,                          # embeddings are already computed
        response_results,
    )
    # Ascending argsort, flipped so the best match comes first.
    sortargs = np.flip(scores.argsort(axis=0))
    for rank in range(3):
        hit = condition_terms[sortargs[rank, 0]]
        responses.append(hit)
        print('ANS: ', hit)
df_comparisons['use'] = responses

In [39]:
# responses=[]
# for ii in df_queries.iterrows():
#     print('QN: ', ii[1]['Email Queries'])
#     sortargs=np.flip(cosine_sim_results([ii[1]['Email Queries']], use_embed, message_embeddings).argsort(axis=0))
#     for ans in range(3):
#         responses.append(condition_terms[sortargs[ans,0]])
#         print('ANS: ', condition_terms[sortargs[ans,0]])
# df_comparisons['use']=responses

# Exp 3: Test the new QnA USE

In [40]:
# !pip install sentencepiece
# !pip install tf-sentencepiece

In [41]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tf_sentencepiece

In [42]:
# define inputs
all_questions = df_queries['Email Queries']
questions = list(all_questions)
# The knowledge-base paragraphs act as both the responses and their own context.
responses = condition_terms
response_contexts = responses  # no need to provide context

In [43]:
# Set up graph.
# The QA module is bound to its own name (`qa_embed`) so that the module-level
# `embed` read by use_embed() is not overwritten; otherwise re-running the
# Exp 2 cell after this one would call the wrong module.
g = tf.Graph()
with g.as_default():
  qa_embed = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1")
  # Questions and responses go through different signatures of the same module.
  question_embeddings = qa_embed(
    dict(input=questions),
    signature="question_encoder", as_dict=True)

  response_embeddings = qa_embed(
    dict(input=responses,
         context=response_contexts),
    signature="response_encoder", as_dict=True)

  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# Freeze the graph so no further ops can be added to it.
g.finalize()

# Initialize session.
session = tf.Session(graph=g)
session.run(init_op)



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0911 05:08:36.115073 139722147989248 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0911 05:09:54.914249 139722147989248 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [44]:
# Compute embeddings.
response_results = session.run(response_embeddings)


In [45]:
question_results = session.run(question_embeddings)

In [46]:
response_results['outputs'].shape

(148, 512)

In [47]:
# responses = []
# for index, ii in enumerate(df_queries.iterrows()):
#     print('QN: ', ii[1]['Email Queries'])
#     answer=condition_terms[cosine_sim_results(question_results['outputs'][index].reshape(1, -1), dummy_embed_fn, response_results['outputs']).argmax()]
#     print('ANS: ', answer)
#     responses.append(answer)
#     print('\n')
# df_comparisons['use_qa']=responses

In [4]:
# Top-3 retrieval with the QnA USE embeddings (the 'outputs' entry of each result dict).
responses = []
query_matrix = question_results['outputs']
passage_matrix = response_results['outputs']
for index, query_text in enumerate(df_queries['Email Queries']):
    print('QN: ', query_text)
    similarities = cosine_sim_results(query_matrix[index].reshape(1, -1), dummy_embed_fn, passage_matrix)
    # Flip the ascending argsort so the highest-scoring paragraph comes first.
    sortargs = np.flip(similarities.argsort(axis=0))
    for rank in range(3):
        answer = condition_terms[sortargs[rank, 0]]
        responses.append(answer)
        print('ANS: ', answer)
df_comparisons['use_qa'] = responses

# Exp 3a: use the same encoder for both queries and inputs
It looks like there is more to the model than just some contextual input: the query and the responses are treated differently!

In [49]:
# # Set up graph.
# g = tf.Graph()
# with g.as_default():
#   module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1")
#   question_embeddings = module(
#     dict(input=questions),
#     signature="question_encoder", as_dict=True)

#   response_embeddings = module(
#     dict(input=responses),
#     signature="question_encoder", as_dict=True)

#   init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# g.finalize()

# # Initialize session.
# session = tf.Session(graph=g)
# session.run(init_op)

# # Compute embeddings.
# response_results = session.run(response_embeddings)
# question_results = session.run(question_embeddings)

# responses = []
# for index, ii in enumerate(df_queries.iterrows()):
#     print('QN: ', ii[1]['Email Queries'])
#     answer=condition_terms[cosine_sim_results(question_results['outputs'][index].reshape(1, -1), dummy_embed_fn, response_results['outputs']).argmax()]
#     print('ANS: ', answer)
#     responses.append(answer)
#     print('\n')
# df_comparisons['use_qa_sp']=responses

# Exp 3b: test with inner product instead of cos dist
The inner product does not normalize the magnitude of the vectors. I get the same results from the USE QnA model.

In [50]:
# qna is not used like that in the examples. Instead, an inner product is taken.
# ip = np.inner(question_results["outputs"], response_results["outputs"])

In [51]:
# responses = []
# for ii in ip.argmax(axis=1):
#     responses.append(condition_terms[ii])
# df_comparisons['use_qa_dot'] = responses


# Exp 4: test with facebook LASER

In [5]:
# df_comparisons.head()

# Save to csv

In [53]:
df_comparisons.to_csv('./predictions_3_extend_batch2.csv', index=False)

In [18]:
# NOTE(review): expects './predictions_3_extend_c.csv' to exist already; the
# only write of that file in this notebook happens in a later cell — confirm
# where the initial file comes from.
df_ind = pd.read_csv('./predictions_3_extend_c.csv')

In [19]:
# NOTE(review): assigning a Series aligns on the index, so this assumes df_ind
# has the same row index as df_comparisons — confirm the row counts match.
df_ind['use_qa']=df_comparisons['use_qa']

In [34]:
df_ind.to_csv('./predictions_3_extend_c.csv', index=False)

NameError: name 'df_ind' is not defined

In [6]:
df_ind.head()

# Test insurance db

In [230]:
# Load the InsuranceQA V2 test queries and answer documents (tab-separated, no header row).
df_query = pd.read_csv(
    '../insuranceQA/V2/InsuranceQA.question.anslabel.raw.100.pool.solr.test.encoded',
    delimiter='\t',
    header=None,
)
df_doc = pd.read_csv(
    '../insuranceQA/V2/InsuranceQA.label2answer.raw.encoded',
    delimiter='\t',
    header=None,
)
# Vocabulary file: column 0 holds the token ids, column 1 the words. Quoting is
# disabled (quoting=3) and default NA parsing is switched off.
df_ind2word = pd.read_csv(
    '../insuranceQA/V2/vocabulary',
    sep='\t',
    header=None,
    quotechar='',
    quoting=3,
    keep_default_na=False,
)
# Lookup table: token id -> word.
word_by_id = pd.Series(df_ind2word[1].values, index=df_ind2word[0].values)
dict_ind2word = word_by_id.to_dict()
# dict_ind2word.update(dict_ind2word_v1)

In [231]:
def wordifier(tokes):
    """Translate a whitespace-separated string of vocabulary ids into text.

    Args:
        tokes: String of ids such as ``'idx_1285 idx_65774'``; every id must
            be a key of the module-level ``dict_ind2word``.

    Returns:
        The looked-up words joined by single spaces ('' for blank input).

    Raises:
        KeyError: If an id is missing from ``dict_ind2word``.
    """
    # split() without an argument collapses runs of whitespace, so doubled
    # spaces or blank input no longer produce an empty token (KeyError: '').
    return ' '.join(dict_ind2word[ind] for ind in tokes.split())
# Decode the id strings in column 1 of each frame into readable text.
df_doc['text'] = df_doc[1].map(wordifier)
df_query['text'] = df_query[1].map(wordifier)
df_query

Unnamed: 0,0,1,2,3,text
0,life-insurance,idx_1285 idx_65774 idx_862 idx_605 idx_448 idx...,16164 99 26337,15813 3286 22367 21353 4977 6406 24335 16681 2...,What Happens When Term Life Insurance Is Paid Up?
1,renters-insurance,idx_1285 idx_1010 idx_999 idx_136 idx_65807,22542 4380,2235 26739 24916 17855 3406 21201 70 19553 220...,What Does Renters Insurance Cover?
2,home-insurance,idx_1010 idx_17002 idx_382 idx_65840 idx_14927...,26439,23486 2424 14974 3344 7712 6220 5346 12474 558...,Does Owning A Pitbull Raise Homeowners Insurance?
3,long-term-care-insurance,idx_1285 idx_239 idx_49 idx_739 idx_31 idx_57 ...,6996,17044 11643 1904 25722 17842 12460 20363 6904 ...,What Should You Look For In Long Term Care Ins...
4,medicare-insurance,idx_2363 idx_467 idx_8080 idx_31 idx_9966 idx_...,9128,9128 13322 21601 21471 6442 5412 24861 23536 2...,Will Medicare Pay For Smoking Cessation?
5,life-insurance,idx_1285 idx_3815 idx_66759 idx_31 idx_448 idx...,14495,2922 14495 17021 19342 27115 26050 15659 9422 ...,What Is Eoi For Life Insurance?
6,life-insurance,idx_1285 idx_2837 idx_2363 idx_6434 idx_448 id...,7846 21998,1921 3228 21998 16946 4388 3091 5107 12931 104...,What Companies Will Issue Life Insurance To Th...
7,health-insurance,idx_3189 idx_3815 idx_441 idx_136 idx_1132 idx...,3183,26289 13394 4703 25717 1438 1449 25865 25950 1...,Why Is Health Insurance So Expensive In New York?
8,health-insurance,idx_862 idx_1233 idx_2684 idx_9744 idx_510 idx...,12919,4655 17619 24092 5011 19350 1140 16681 22024 2...,When Do We Have To Have Health Insurance?
9,renters-insurance,idx_1632 idx_510 idx_20062 idx_382 idx_999 idx...,19814,3705 7537 2707 14075 3620 11522 11466 5052 893...,How To Submit A Renters Insurance Claim?


In [232]:
df_doc[df_doc[0]==99]['text'].values


array(["Term life insurance is never paid up. Assuming you make your premium payments, if you are still alive at the end of the term life insurance policy's stated time period (e.g. - 10 years) the policy ends and the life insurance protection ceases. In this case the insurance company has won the bet, you didn't die during the period the policy was taken out for and they get to keep all the premium paid for taking on your risk during that time period. The only exception to this general rule is in the case where a Return of Premium rider was purchased with the term life policy. In this situation if you are still alive when the term life policy reaches the end of its stated period of coverage the policyholder is refunded the premium they have paid to the insurance company over the life of the policy. You do pay additional premium costs for this Return of Premium provision."],
      dtype=object)

In [179]:
# NOTE(review): `df_ins` is not defined anywhere in this notebook, so this cell
# fails on a fresh kernel; the recorded output presumably came from a frame
# loaded in an earlier session — confirm which file it was.
' '.join([dict_ind2word[ind] for ind in df_ins[1][3].split(' ')])

'Can I Drive A New Car Home Without Insurance?'

In [143]:
with open('../insuranceQA/V2/vocabulary') as f:
    print(f.readline().split('\t'))

['idx_17904', 'rating/result\n']
