# Testing different embedding schemes for information retrieval
## Step 1: Load sample text

In [8]:
def read_txt(path):
    with open(path, 'r', encoding="utf-8") as f:
        text = f.readlines()
#         text = [x.decode("utf-8") for x in f.readlines()]
    return text

text = read_txt('./data/apple.txt')
# text = [x.decode("utf-8") for x in text]

In [9]:
condition_terms = []
stringg=''
for tex in text[1:]:
    if tex=='\n':
        condition_terms.append(stringg)
        stringg=''
    else: stringg+=tex
condition_terms=[x.replace('\n', ' ') for x in condition_terms]

In [10]:
print(len(condition_terms))
condition_terms

31


['A. INTRODUCTION TO OUR SERVICES This Agreement governs your use of Apple’s services (“Services”), through which you can buy, get, license, rent or subscribe to content, apps (“Apps”), and other in-app services (collectively, “Content”). Content may be offered through the Services by Apple or a third party. Our Services are available for your use in your country of residence (“Home Country”). To use our Services, you need compatible hardware, software (latest version recommended and sometimes required) and Internet access (fees may apply). Our Services’ performance may be affected by these factors. ',
 'B. USING OUR SERVICES PAYMENTS, TAXES, AND REFUNDS You can acquire Content on our Services for free or for a charge, either of which is referred to as a “Transaction.” Each Transaction is an electronic contract between you and Apple, and/or you and the entity providing the Content on our Services. However, if you are a customer of Apple Distribution International and you acquire an App

## Test InferSent model

In [None]:
!mkdir GloVe
!curl -Lo GloVe/glove.840B.300d.zip http://nlp.stanford.edu/data/glove.840B.300d.zip
!unzip GloVe/glove.840B.300d.zip -d GloVe/
!mkdir fastText
!curl -Lo fastText/crawl-300d-2M.vec.zip https://dl.fbaipublicfiles.com/fasttext/vectors-english/crawl-300d-2M.vec.zip
!unzip fastText/crawl-300d-2M.vec.zip -d fastText/

In [3]:
!mkdir encoder
!curl -Lo encoder/infersent1.pkl https://dl.fbaipublicfiles.com/infersent/infersent1.pkl
!curl -Lo encoder/infersent2.pkl https://dl.fbaipublicfiles.com/infersent/infersent2.pkl

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  22.0M      0  0:00:06  0:00:06 --:--:-- 26.1M
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  146M  100  146M    0     0  17.5M      0  0:00:08  0:00:08 --:--:-- 25.5M


In [13]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [14]:
from InferSent.models import InferSent
import torch
V = 1
MODEL_PATH = 'encoder/infersent%s.pkl' % V
params_model = {'bsize': 64, 'word_emb_dim': 300, 'enc_lstm_dim': 2048,
                'pool_type': 'max', 'dpout_model': 0.0, 'version': V}
infersent = InferSent(params_model)
infersent.load_state_dict(torch.load(MODEL_PATH))

In [15]:
W2V_PATH = 'fastText/crawl-300d-2M.vec'
infersent.set_w2v_path(W2V_PATH)

In [16]:
infersent.build_vocab(condition_terms, tokenize=True)

Found 1454(/1492) words with w2v vectors
Vocab size : 1454


In [17]:
embeddings = infersent.encode(condition_terms, tokenize=True)

In [42]:
# infersent.visualize('A man plays an instrument.', tokenize=True)

In [19]:
embeddings.shape

(31, 4096)

### Test question

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [37]:
def cosine_sim_results(query_str, encoder, kb_embeddings, **kwargs):
    if kwargs:
        qn_embedding = encoder(question, kwargs.get('tokenize', None))
    else:
        qn_embedding = encoder(question)
    results = cosine_similarity(kb_embeddings, qn_embedding)
    return results

In [30]:
question=['is apple RESPONSIBLE FOR DATA CHARGES incurred?']
condition_terms[cosine_sim_results(question, infersent.encode, embeddings, tokenize=True).argmax()]

'APPLE IS NOT RESPONSIBLE FOR DATA CHARGES YOU MAY INCUR IN CONNECTION WITH YOUR USE OF THE SERVICES. '

## Google universal sentence encoder

In [3]:
# !pip install tensorflow-gpu
# !pip install tensorflow-hub
import tensorflow as tf
import tensorflow_hub as hub

Collecting tensorflow-hub
[?25l  Downloading https://files.pythonhosted.org/packages/b5/be/f18c352d84382d9c795a0f37eaf16d42ace7d161fbb0ad20bdcd5e550015/tensorflow_hub-0.5.0-py2.py3-none-any.whl (78kB)
[K    100% |################################| 81kB 5.9MB/s ta 0:00:011
Installing collected packages: tensorflow-hub
Successfully installed tensorflow-hub-0.5.0


In [4]:
#download the model to local so it can be used again and again
!mkdir google_use
# Download the module, and uncompress it to the destination folder. 
!curl -L "https://tfhub.dev/google/universal-sentence-encoder-large/3?tf-hub-format=compressed" | tar -zxvC ./google_use

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
./
./tfhub_module.pb
./variables/
./variables/variables.data-00000-of-00001
 94  745M   94  707M    0     0  64.9M      0  0:00:11  0:00:10  0:00:01 84.4M./variables/variables.index
./assets/
./saved_model.pb
100  745M  100  745M    0     0  21.2M      0  0:00:35  0:00:35 --:--:-- 10.6M


In [5]:
embed = hub.Module("./google_use")

In [34]:
def use_embed(terms):
    with tf.Session() as session:
        session.run([tf.global_variables_initializer(), tf.tables_initializer()])
        message_embeddings = session.run(embed(terms))
    return message_embeddings

# to only load session once.
# def embed_useT(module):
#     with tf.Graph().as_default():
#         sentences = tf.placeholder(tf.string)
#         embed = hub.Module(module)
#         embeddings = embed(sentences)
#         session = tf.train.MonitoredSession()
#     return lambda x: session.run(embeddings, {sentences: x})


In [35]:
message_embeddings = use_embed(condition_terms)
message_embeddings.shape

(31, 512)

In [41]:
question=['who is RESPONSIBLE FOR CHARGES incurred?']
condition_terms[cosine_sim_results(question, use_embed, message_embeddings).argmax()]

"WAIVER AND INDEMNITY BY USING THE SERVICES, YOU AGREE, TO THE EXTENT PERMITTED BY LAW, TO INDEMNIFY AND HOLD APPLE, ITS DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATES, AGENTS, CONTRACTORS, AND LICENSORS HARMLESS WITH RESPECT TO ANY CLAIMS ARISING OUT OF YOUR BREACH OF THIS AGREEMENT, YOUR USE OF THE SERVICES, OR ANY ACTION TAKEN BY APPLE AS PART OF ITS INVESTIGATION OF A SUSPECTED VIOLATION OF THIS AGREEMENT OR AS A RESULT OF ITS FINDING OR DECISION THAT A VIOLATION OF THIS AGREEMENT HAS OCCURRED. YOU AGREE THAT YOU SHALL NOT SUE OR RECOVER ANY DAMAGES FROM APPLE, ITS DIRECTORS, OFFICERS, EMPLOYEES, AFFILIATES, AGENTS, CONTRACTORS, AND LICENSORS AS A RESULT OF ITS DECISION TO REMOVE OR REFUSE TO PROCESS ANY INFORMATION OR CONTENT, TO WARN YOU, TO SUSPEND OR TERMINATE YOUR ACCESS TO THE SERVICES, OR TO TAKE ANY OTHER ACTION DURING THE INVESTIGATION OF A SUSPECTED VIOLATION OR AS A RESULT OF APPLE'S CONCLUSION THAT A VIOLATION OF THIS AGREEMENT HAS OCCURRED. THIS WAIVER AND INDEMNITY PROVIS