## Porting GoldenRetriever to TF2

https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3  
Official code sample for TF2  

Transfer learning in TF-hub w TF1.15  
https://colab.research.google.com/github/tensorflow/hub/blob/master/docs/tutorials/text_classification_with_tf_hub.ipynb#scrollTo=6OPyVxHuiTEE

In [2]:
# !pip install tensorflow_text
# !pip install --upgrade tensorflow-hub

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import tensorflow_text

questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]

module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')

question_embeddings = module.signatures['question_encoder'](
            tf.constant(questions))
response_embeddings = module.signatures['response_encoder'](
        input=tf.constant(responses),
        context=tf.constant(response_contexts))

np.inner(question_embeddings['outputs'], response_embeddings['outputs'])

array([[0.40884   , 0.08877401]], dtype=float32)

In [168]:
# # Keras Sequential Implementation
# module_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')
# model = tf.keras.Sequential()
# model.add(hub_layer)

# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer=keras.optimizers.RMSprop(),
#               metrics=['accuracy'])

# question_embeddings = module.signatures['question_encoder'](
#             tf.constant(questions))
# response_embeddings = module.signatures['response_encoder'](
#         input=tf.constant(responses),
#         context=tf.constant(response_contexts))

tensorflow_hub.keras_layer.KerasLayer

### Finetuning USE w GradientTape

In [36]:
%load_ext tensorboard

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import tensorflow_text

questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]

# load the module: v3 does not support finetuning? but v2 is fine
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/2')

# get trainable layers
#v=['QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
v=['QA/Final/Response_tuning/ResidualHidden_1/dense/kernel']
var_finetune=[x for x in module.variables for vv in v if vv in x.name] #get the weights we want to finetune.

# optimizer & losses
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.CosineSimilarity(axis=1)
loss_history = []

with tf.GradientTape() as tape:
    # https://www.tensorflow.org/guide/eager
    
    # get encodings
    question_embeddings = module.signatures['question_encoder'](tf.constant(questions))['outputs']
    response_embeddings = module.signatures['response_encoder'](input=tf.constant(responses), 
                                                                context=tf.constant(response_contexts))['outputs']

    # https://www.tensorflow.org/api_docs/python/tf/keras/losses/CosineSimilarity
    loss_value = loss(question_embeddings, response_embeddings)

# record and apply loss gradients    
loss_history.append(loss_value.numpy().mean())

print("BEFORE BACKPROP")
print(var_finetune)
print("")

print("...calculating and applying gradients...")
grads = tape.gradient(loss_value, var_finetune)
optimizer.apply_gradients(zip(grads, var_finetune))
print("")

print("AFTER BACKPROP")
print(var_finetune)

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard
BEFORE BACKPROP
[<tf.Variable 'QA/Final/Response_tuning/ResidualHidden_1/dense/kernel:0' shape=(320, 512) dtype=float32, numpy=
array([[ 0.11290824, -0.007661  ,  0.13388894, ..., -0.03849068,
        -0.05095735, -0.06322648],
       [-0.02622743, -0.02499395, -0.01445046, ..., -0.10326502,
         0.00695672, -0.17296325],
       [-0.02357727, -0.08032651, -0.04250011, ..., -0.04690072,
         0.01988911, -0.01170817],
       ...,
       [-0.00305502,  0.00504641,  0.01790689, ..., -0.02388328,
         0.03720526,  0.04548807],
       [ 0.04789947, -0.02582268,  0.08293641, ...,  0.0698828 ,
        -0.04037469, -0.02779369],
       [-0.05143448, -0.06723368,  0.02879738, ..., -0.04495105,
        -0.04067428, -0.01053122]], dtype=float32)>]
calculating and applying gradients
AFTER BACKPROP
[<tf.Variable 'QA/Final/Response_tuning/ResidualHidden_1/dense/kernel:0' shape=(320, 512) dtype=float3

The last few values in the array has changed, indicating that there is successful tuning.

In [39]:
# https://www.tensorflow.org/tensorboard/graphs
# %tensorboard --logdir logs

TODO list:
 <font color=green>
1. Init function
2. Predict
3. make_query   
    
</font>

4. Finetune 
5. Contrastive loss

In [10]:
import sys
sys.path.append('../src')
# import os
# os.listdir('../src')

In [17]:
import tensorflow as tf
import tensorflow_hub as hub
import numpy as np
import tensorflow_text
#from metric_learning import triplet_loss, contrastive_loss
#from tensorflow.train import Saver
from utils import split_txt, read_txt, clean_txt, read_kb_csv
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.optimizers import Adam


In [113]:
class GoldenRetriever:
    """GoldenRetriever model for information retrieval prediction and finetuning.
    Parameters
    ----------
    lr: Learning rate (default 0.6)
    loss: loss function to use. Options are 'cosine'(default), 'contrastive', or 'triplet' which is a triplet loss based on cosine distance.
    margin: margin to be used if loss='triplet' (default 0.1)

    Example:
    >>> gr = GoldenRetriever()
    >>> text_list = ['I love my chew toy!', 'I hate Mondays.']
    >>> gr.load_kb(text_list=text_list)
    >>> gr.make_query('what do you not love?', top_k=1)
    ['I hate Mondays.']
    """
    
    
    def __init__(self, lr=0.6, margin=0.3, loss='triplet'):
        # self.v=['module/QA/Final/Response_tuning/ResidualHidden_1/dense/kernel','module/QA/Final/Response_tuning/ResidualHidden_0/dense/kernel', 'module/QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
        self.v=['module/QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
        self.lr = lr
        self.margin = margin
        self.loss = loss
        self.vectorized_knowledge = {}
        self.text = {}
        self.questions = {}
        
        # init saved model
        self.embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')
        self.question_encoder = module.signatures['question_encoder']
        self.response_encoder = module.signatures['response_encoder']
        print('model initiated!')
        
        # to-confirm: question response placeholders really not needed?
        # TODO: negative response encoder for contrastive loss
        
        
    def predict(self, text, context=None, type='response'):
        """Return the tensor representing embedding of input text.
        Type can be 'query' or 'response' """
        if type=='query':
            return self.question_encoder(tf.constant([text]))['outputs']
            # return self.session.run(self.question_embeddings, feed_dict={self.question:text})['outputs']
        elif type=='response':
            if not context:
                context = text
            return self.response_encoder(input=tf.constant(text),
                                         context=tf.constant(context))['outputs']
        else: print('Type of prediction not defined')
        
    def make_query(self, querystring, top_k=5, index=False, predict_type='query', kb_name='default_kb'):
        """Make a query against the stored vectorized knowledge. 
        Choose index=True to return sorted index of matches.
        type can be 'query' or 'response' if you are comparing statements
        """
        similarity_score=cosine_similarity(self.vectorized_knowledge[kb_name], self.predict([querystring], type=predict_type))
        sortargs=np.flip(similarity_score.argsort(axis=0))
        sortargs=[x[0] for x in sortargs]
        sorted_ans=[self.text[kb_name][i] for i in sortargs]
        if index:
            return sorted_ans[:top_k], sortargs[:top_k]
        return sorted_ans[:top_k], similarity_score[sortargs[:top_k]] 
        
        
    def load_kb(self, path_to_kb=None, text_list=None, question_list=None, 
                raw_text=None, is_faq=False, kb_name='default_kb'):
        r"""Give either path to .txt document or list of clauses.
        For text document, each clause is separated by 2 newlines ('\\n\\n')"""
        if text_list:
            self.text[kb_name] = text_list
            if is_faq:
                self.questions[kb_name] = question_list
        elif path_to_kb:
            if is_faq:
                self.text[kb_name], self.questions[kb_name] = split_txt(read_txt(path_to_kb), is_faq)
            else:
                self.text[kb_name] = split_txt(read_txt(path_to_kb), is_faq)
        elif raw_text:
            delim = '\n'
            self.text[kb_name] = split_txt([front+delim for front in raw_text.split('\n')])
        else: raise NameError('invalid kb input!')
        self.vectorized_knowledge[kb_name] = self.predict(clean_txt(self.text[kb_name]), type='response')
        print('knowledge base lock and loaded!')
        
    def load_csv_kb(self, path_to_kb=None, kb_name='default_kb', meta_col='meta', answer_col='answer', 
                    query_col='question', answer_str_col='answer', cutoff=None):
        self.text[kb_name], self.questions[kb_name] = read_kb_csv(path_to_kb, meta_col=meta_col, answer_col=answer_col, 
                            query_col=query_col, answer_str_col=answer_str_col, cutoff=None)
        self.vectorized_knowledge[kb_name] = self.predict(clean_txt(self.text[kb_name]), type='response')
        print('knowledge base (csv) lock and loaded!')

In [115]:
%time gr = GoldenRetriever()

# encode 1 question
encoded_ques = gr.predict('How old are you?', 
                          type='query')

# encode multiple questions
encoded_ques = gr.predict(['How old are you?', 'What time is it?'], 
                          type='query')

# one response w context
encoded_res = gr.predict("I am 20 years old.", 
                         context="I will be 21 next year.", 
                         type='response')

# multiple responses w/0 context
encoded_res = gr.predict(["I am 20 years old.", "I love apple cider"], 
                         type='response')

# load knowledge bases
gr.load_kb(path_to_kb='../data/aiap.txt', is_faq=True, kb_name='aiap')
# gr.load_kb(path_to_kb='./data/resale_tnc.txt', kb_name='resale_tnc')
# gr.load_kb(path_to_kb='./data/fund_guide_tnc_full.txt', kb_name='nrf')
# gr.load_csv_kb(path_to_kb='./data/pdpa.csv', cutoff=196, kb_name='pdpa')

# make query
gr.make_query('What kind of candidates are you looking for?', top_k=2, kb_name='aiap')

model initiated!
CPU times: user 18 s, sys: 769 ms, total: 18.8 s
Wall time: 18.6 s
knowledge base lock and loaded!


(['We are looking for candidates who possess a keen interest in the area of machine learning and data science. We believe that candidates can come from any area of specialisation, and our requirements are as follow:\ni)   Singaporean with a polytechnic diploma or university degree,\nii) Proficient in Python or R and iii) Is able to implement Machine Learning Algorithms or have a background in Mathematics / Statistics / Computer Science. \nBeyond that, demonstrated statistical fundamentals and programming ability will be helpful for the technical tests, but a keen learning attitude will be the most important to carry you through the programme. \n',
  'Candidates can expect to be equipped in some or all of the following skills: data modelling/tuning, data engineering, data product-related software engineering, cloud applications. It ranges between individuals, but candidates can be adequately prepared in fields of data science, engineering and consultancy\n'],
 array([[0.3085595 ],
     

In [116]:
hub.load?