# Testing different embedding schemes for information retrieval
## Step 1: Load sample text

In [1]:
def read_txt(path):
    """Read a UTF-8 text file and return its lines as a list.

    Every element keeps its line terminator, exactly as ``readlines()``
    yields it, so blank lines in the file appear as ``'\\n'`` entries.
    """
    with open(path, mode='r', encoding="utf-8") as handle:
        return handle.readlines()

# Raw lines of the fund guide, newlines included; the next cell relies on
# the blank ('\n') lines to find paragraph boundaries.
text = read_txt('./data/fund_guide.txt')

In [2]:
def _split_paragraphs(lines):
    """Group runs of non-blank lines into one string per paragraph.

    Args:
        lines: Iterable of text lines, each normally ending in ``'\\n'``.

    Returns:
        A list of paragraphs. Within a paragraph every newline is replaced
        by a single space, so a paragraph whose last line ended in a newline
        keeps one trailing space.
    """
    paragraphs = []
    buffer = []

    def flush():
        # Emit the paragraph collected so far, if there is one.
        if buffer:
            paragraphs.append(''.join(buffer).replace('\n', ' '))
            del buffer[:]

    for line in lines:
        if line.strip():
            buffer.append(line)
        else:
            # Blank (or whitespace-only) lines only separate paragraphs; they
            # are never added to the buffer, so repeated blank lines cannot
            # produce empty paragraphs or leading spaces.
            flush()
    # The file may not end with a blank line; keep its last paragraph too.
    flush()
    return paragraphs


condition_terms = _split_paragraphs(text)

In [3]:
# Number of knowledge-base paragraphs extracted from the guide.
print(len(condition_terms))

32


# Step 2: Load sample questions

In [4]:
import pandas as pd
# Sample email questions; the file is decoded as ISO-8859-1 rather than UTF-8.
df_queries = pd.read_csv('./data/Consolidated emails.csv', encoding='iso-8859-1')
# Number of sample queries loaded.
print(len(df_queries))

  return f(*args, **kwds)
  return f(*args, **kwds)


73


In [59]:
# Results table: starts with a copy of the queries, and each experiment
# below appends one column holding the answer it picked per query.
query_column = df_queries['Email Queries'].copy()
df_comparisons = pd.DataFrame(dict(queries=query_column))

# Step 3: define similarity function
`cosine_sim_results` encodes a query with the given encoder and returns its cosine similarity to every embedding in the knowledge-base array.

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

In [7]:
def cosine_sim_results(query_str, encoder, kb_embeddings, **kwargs):
    """Score every knowledge-base embedding against an encoded query.

    Args:
        query_str: Input handed unchanged to ``encoder``. Callers in this
            notebook pass a one-element list of strings or an already
            computed 2-D vector.
        encoder: Callable that turns ``query_str`` into a 2-D embedding array.
        kb_embeddings: 2-D array with one knowledge-base embedding per row.
        **kwargs: Extra options forwarded to ``encoder`` by keyword
            (for example ``tokenize=True``).

    Returns:
        The array produced by ``cosine_similarity(kb_embeddings, query)``:
        one row per knowledge-base entry, one column per encoded query.
    """
    # Forward options by keyword. Passing ``tokenize`` positionally put it in
    # the encoder's second positional slot, which for InferSent.encode is
    # ``bsize`` rather than ``tokenize``.
    qn_embedding = encoder(query_str, **kwargs)
    return cosine_similarity(kb_embeddings, qn_embedding)

# Exp 1: Test InferSent model

In [8]:
from InferSent.models import InferSent
import torch
# V is used both in the checkpoint file name and as the model's `version`.
V = 1
MODEL_PATH = 'encoder/infersent{}.pkl'.format(V)
params_model = dict(
    bsize=64,
    word_emb_dim=300,
    enc_lstm_dim=2048,
    pool_type='max',
    dpout_model=0.0,
    version=V,
)
# Build the encoder, then restore its weights from the checkpoint on disk.
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

<All keys matched successfully>

In [9]:
# Word-vector file InferSent reads when building its vocabulary.
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [10]:
# The vocabulary is built from the knowledge-base paragraphs only.
# NOTE(review): words that occur only in the queries are presumably not
# covered -- confirm how InferSent treats out-of-vocabulary tokens.
infersent.build_vocab(condition_terms, tokenize=True)

Found 1094(/1124) words with w2v vectors
Vocab size : 1094


In [11]:
# Encode every paragraph once; the same vectors are reused for all queries.
embeddings = infersent.encode(condition_terms, tokenize=True)

In [12]:
# Sanity check: one row per knowledge-base paragraph.
embeddings.shape

(32, 4096)

In [13]:
# sample question
# question=['how frequent can i get disbursements?']
# condition_terms[cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argmax()]

In [14]:
# question=['who is RESPONSIBLE FOR DATA CHARGES?']
# sortargs=cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argsort(axis=0)
# print(sortargs.shape)
# for ii,arg in enumerate(sortargs[::-1,0]):
#     print(ii, condition_terms[arg])
#     if ii==4:
#         break

In [15]:
# list(cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argsort(axis=0).shape[-2:])

## Test with example queries

In [70]:
# For each email query, keep the knowledge-base paragraph whose InferSent
# embedding has the highest cosine similarity to the query.
responses = []
for _, query_row in df_queries.iterrows():
    query_text = query_row['Email Queries']
    print('QN: ', query_text)
    scores = cosine_sim_results([query_text], infersent.encode, embeddings, tokenize=True)
    answer = condition_terms[scores.argmax()]
    print('ANS: ', answer)
    responses.append(answer)
    print('\n')
df_comparisons['infersent'] = responses

# Exp 2: Google Universal Sentence Encoder

In [17]:
# !pip install tensorflow-gpu
# !pip install tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub

  return f(*args, **kwds)
  from ._conv import register_converters as _register_converters


In [18]:
#download the model to local so it can be used again and again
!mkdir google_use
# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC ./google_use

mkdir: cannot create directory ‘google_use’: File exists
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
./
./tfhub_module.pb
./variables/
./variables/variables.data-00000-of-00001
 93  745M   93  696M    0     0  66.7M      0  0:00:11  0:00:10  0:00:01 70.8M./variables/variables.index
./assets/
./saved_model.pb
100  745M  100  745M    0     0  67.0M      0  0:00:11  0:00:11 --:--:-- 70.3M


In [19]:
# Load the encoder from the local ./google_use directory (the destination
# of the download cell) rather than from a remote TF Hub URL.
embed = hub.Module("./google_use")

Instructions for updating:
Colocations handled automatically by placer.


W0904 05:39:11.343397 140700019140352 deprecation.py:323] From /anaconda/envs/py35/lib/python3.5/site-packages/tensorflow/python/ops/control_flow_ops.py:3632: colocate_with (from tensorflow.python.framework.ops) is deprecated and will be removed in a future version.
Instructions for updating:
Colocations handled automatically by placer.


In [20]:
def use_embed(terms):
    """Embed a batch of sentences with the Universal Sentence Encoder.

    The embedding op, the session and the variable/table initialisation are
    created on the first call and cached on the function object. Later calls
    only feed new sentences, so the default graph no longer grows by one set
    of ops per call and the initializers are not re-run for every query.
    The cached session is intentionally left open for the kernel's lifetime.

    Args:
        terms: Sequence of strings to embed.

    Returns:
        The array returned by ``session.run``: one embedding row per string.
    """
    cache = use_embed.__dict__
    if 'session' not in cache:
        # Build the op once against a placeholder so any batch can be fed.
        sentences = tf.placeholder(tf.string, shape=[None])
        outputs = embed(sentences)
        session = tf.Session()
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        cache['sentences'] = sentences
        cache['outputs'] = outputs
        cache['session'] = session
    return cache['session'].run(
        cache['outputs'], feed_dict={cache['sentences']: list(terms)})


In [21]:
# Embed all knowledge-base paragraphs once; reused for every query below.
message_embeddings = use_embed(condition_terms)
# Sanity check: one row per knowledge-base paragraph.
message_embeddings.shape

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0904 05:39:15.498502 140700019140352 saver.py:1483] Saver not created because there are no variables in the graph to restore


(32, 512)

In [23]:
# For each email query, keep the knowledge-base paragraph whose USE
# embedding has the highest cosine similarity to the query.
responses = []
for _, query_row in df_queries.iterrows():
    query_text = query_row['Email Queries']
    print('QN: ', query_text)
    scores = cosine_sim_results([query_text], use_embed, message_embeddings)
    answer = condition_terms[scores.argmax()]
    print('ANS: ', answer)
    responses.append(answer)
    print('\n')
df_comparisons['use'] = responses

# Exp 3: Test the new QnA USE

In [None]:
# !pip install sentencepiece
# !pip install tf-sentencepiece

In [24]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tf_sentencepiece

In [25]:
# Inputs for the QA encoder graph built in the next cell.
# `responses` is only another name for `condition_terms` here (no copy is
# made); later cells rebind `responses` to a list of chosen answers.
responses = condition_terms
response_contexts = responses # the paragraphs themselves double as the context
# Plain Python list of the query strings.
all_questions = df_queries['Email Queries']
questions=list(all_questions)

In [26]:
# Set up a dedicated graph for the multilingual QA encoder.
# `questions`, `responses` and `response_contexts` are passed in as Python
# values while the graph is built, so the two embedding tensors below are
# tied to exactly those inputs.
g = tf.Graph()
with g.as_default():
  module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/1")
  # Question side of the dual encoder.
  question_embeddings = module(
    dict(input=questions),
    signature="question_encoder", as_dict=True)

  # Response side; takes the response text plus a context string for each.
  response_embeddings = module(
    dict(input=responses,
         context=response_contexts),
    signature="response_encoder", as_dict=True)

  init_op = tf.group([tf.global_variables_initializer(), tf.tables_initializer()])
# Make the graph read-only: no further ops can be added after this point.
g.finalize()

# Initialize session; it stays open because later cells call session.run.
session = tf.Session(graph=g)
session.run(init_op)



INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0904 06:25:29.440153 140700019140352 saver.py:1483] Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


I0904 06:26:54.083668 140700019140352 saver.py:1483] Saver not created because there are no variables in the graph to restore


In [27]:
# Evaluate the response encoder once for all knowledge-base paragraphs.
response_results = session.run(response_embeddings)


In [28]:
# Evaluate the question encoder for all queries the graph was built with.
question_results = session.run(question_embeddings)

In [29]:
# Sanity check: one row per knowledge-base paragraph.
response_results['outputs'].shape

(32, 512)

In [30]:
def dummy_embed_fn(vector):
    """Identity "encoder": hand back an already computed embedding as is.

    Lets precomputed vectors go through ``cosine_sim_results``, which
    expects an encoder callable.
    """
    return vector

In [31]:
def qna_use_embed(question_str):
    """Return the question-encoder output for the questions baked into the graph.

    FIXME(review): ``question_str`` has no effect on the result. The
    assignment below only binds a local name, while ``question_embeddings``
    was built from the module-level ``questions`` list when the graph was
    constructed. This therefore always returns the embeddings of those
    original questions. Encoding arbitrary input would need a string
    placeholder added to the graph before it is finalized.
    """
    # Local rebinding only; it does not change the tensor evaluated below.
    questions = question_str
    question_results = session.run(question_embeddings)
    return question_results


# np.inner(question_results["outputs"], response_result["outputs"])

In [71]:
# For each email query, take its precomputed QA question embedding and keep
# the paragraph whose response embedding has the highest cosine similarity.
responses = []
for position, (_, query_row) in enumerate(df_queries.iterrows()):
    print('QN: ', query_row['Email Queries'])
    query_vector = question_results['outputs'][position].reshape(1, -1)
    scores = cosine_sim_results(query_vector, dummy_embed_fn, response_results['outputs'])
    answer = condition_terms[scores.argmax()]
    print('ANS: ', answer)
    responses.append(answer)
    print('\n')
df_comparisons['use_qa'] = responses

# Exp 4: test with inner product instead of cos dist
Unlike cosine similarity, the inner product does not normalize by the magnitudes of the vectors, so vector length also influences the ranking.

In [54]:
# The QA encoder's examples do not score with cosine similarity; they take
# an inner product between question and response embeddings instead.
# ip[i, j] is the score of query i against knowledge-base paragraph j.
ip = np.inner(question_results["outputs"], response_results["outputs"])

In [65]:
# Best-scoring paragraph per query (row-wise argmax of the score matrix).
responses = [condition_terms[best] for best in ip.argmax(axis=1)]
df_comparisons['use_qa_dot'] = responses


# Save to csv

In [69]:
# One row per query with each experiment's chosen answer; index=False leaves
# the DataFrame index out of the file.
df_comparisons.to_csv('./predictions.csv', index=False)