## Porting GoldenRetriever to TF2

### 1. USE-QA in TF2
https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3
Official code sample for TF2  

In [None]:
# !pip install tensorflow-addons
# !pip install tensorflow_text
# !pip install --upgrade tensorflow-hub

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import tensorflow_text

# Toy inputs: one question and two candidate responses, each response paired
# with a context string at the same index.
questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]

# Load the USE-QA module and grab its two named signatures.
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')
encode_question = module.signatures['question_encoder']
encode_response = module.signatures['response_encoder']

question_embeddings = encode_question(tf.constant(questions))
response_embeddings = encode_response(
    input=tf.constant(responses),
    context=tf.constant(response_contexts),
)

# Inner product of every question embedding with every response embedding.
# Left as the last expression so the notebook displays the result.
np.inner(question_embeddings['outputs'], response_embeddings['outputs'])

### 2. Golden Retriever in TF2

In [None]:
# https://www.tensorflow.org/tensorboard/graphs
# %tensorboard --logdir logs

In [6]:
import sys
sys.path.append('../src')

import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_hub as hub
import datetime
import numpy as np
import tensorflow_text
from utils import split_txt, read_txt, clean_txt, read_kb_csv
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.optimizers import Adam


def triplet_loss(anchor_vector, positive_vector, negative_vector, metric='cosine_dist', margin=0.009):
    """Compute a hinge triplet loss over explicit (anchor, positive, negative) triplets.

    For every row i the loss is
    ``max(0, margin + d(anchor_i, positive_i) - d(anchor_i, negative_i))``
    where ``d`` is the cosine distance ``1 - cos(u, v)``. The per-triplet
    losses are then averaged over the batch. No negative mining is done:
    the caller supplies the negatives.

    Args:
        anchor_vector: 2-D float `Tensor` of anchor embeddings; axis 1 is
            the embedding axis.
        positive_vector: 2-D float `Tensor` of embeddings that should be
            close to the anchors (same shape, or broadcastable on axis 0).
        negative_vector: 2-D float `Tensor` of embeddings that should be
            far from the anchors (same shape, or broadcastable on axis 0).
        metric: only 'cosine_dist' (default) is supported.
        margin: Float, margin term in the loss definition. default based on
            https://arxiv.org/pdf/1508.01585.pdf

    Returns:
        triplet_loss: scalar float `Tensor`, mean of the per-triplet hinge losses.

    Raises:
        ValueError: if `metric` is not 'cosine_dist'.
    """
    if metric != 'cosine_dist':
        raise ValueError(
            "Unsupported metric {!r}; only 'cosine_dist' is implemented.".format(metric))

    # Normalise once so a row-wise dot product is the cosine similarity.
    anchor = tf.math.l2_normalize(anchor_vector, axis=1)
    positive = tf.math.l2_normalize(positive_vector, axis=1)
    negative = tf.math.l2_normalize(negative_vector, axis=1)

    # Per-row distances. A Keras Loss *object* would already average over the
    # batch, which would apply the hinge to batch means instead of to each
    # triplet, so the distances are computed explicitly here.
    d_pos = 1.0 - tf.reduce_sum(anchor * positive, axis=1)
    d_neg = 1.0 - tf.reduce_sum(anchor * negative, axis=1)

    per_triplet = tf.maximum(0., margin + d_pos - d_neg)
    return tf.reduce_mean(per_triplet)



class GoldenRetriever:
    """GoldenRetriever model for information retrieval prediction and finetuning.

    Wraps the USE multilingual QA module loaded from TF Hub, keeps one or more
    named knowledge bases (lists of text clauses plus their response
    embeddings) and can fine-tune a selected subset of the module's variables.

    Parameters
    ----------
    margin: margin used by the 'contrastive' and 'triplet' losses (default 0.3)
    loss: loss function to use. Options are 'cosine', 'contrastive', or 'triplet'(default) which is a triplet loss based on cosine distance.
    **kwargs: keyword arguments for Adam() optimizer

    Example:
    >>> gr = GoldenRetriever()
    >>> text_list = ['I love my chew toy!', 'I hate Mondays.']
    >>> gr.load_kb(text_list=text_list)
    >>> answers, scores = gr.make_query('what do you not love?', top_k=1)
    >>> answers
    ['I hate Mondays.']
    """

    # Loss names accepted by finetune().
    SUPPORTED_LOSSES = ('cosine', 'contrastive', 'triplet')

    def __init__(self, margin=0.3, loss='triplet', **kwargs):
        # Substrings of the variable names that finetune() is allowed to update.
        # self.v=['QA/Final/Response_tuning/ResidualHidden_1/dense/kernel','QA/Final/Response_tuning/ResidualHidden_0/dense/kernel', 'QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
        self.v = ['QA/Final/Response_tuning/ResidualHidden_1/dense/kernel']
        self.margin = margin
        self.loss = loss
        # All three dicts are keyed by knowledge-base name.
        self.vectorized_knowledge = {}
        self.text = {}
        self.questions = {}

        # init saved model
        self.embed = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/2')
        self._bind_model()
        print('model initiated!')

        # optimizer & losses
        self.optimizer = tf.keras.optimizers.Adam(**kwargs)
        self.cost_history = []

    def _bind_model(self):
        """Point the encoder handles and the fine-tunable variables at self.embed."""
        signatures = self.embed.signatures
        self.question_encoder = signatures['question_encoder']
        self.response_encoder = signatures['response_encoder']
        self.neg_response_encoder = signatures['response_encoder']
        # get the weights we want to finetune.
        self.var_finetune = [x for x in self.embed.variables for vv in self.v if vv in x.name]

    @staticmethod
    def _as_list(value):
        """Wrap a single string in a list; return any other input unchanged."""
        return [value] if isinstance(value, str) else value

    @staticmethod
    def _cosine_distance(left, right):
        """Return the row-wise cosine distance (1 - cosine similarity) along axis 1."""
        left = tf.math.l2_normalize(left, axis=1)
        right = tf.math.l2_normalize(right, axis=1)
        return 1.0 - tf.reduce_sum(left * right, axis=1)

    def predict(self, text, context=None, type='response'):
        """Return the tensor representing embedding of input text.

        text: a single string or a list of strings.
        context: optional string / list of strings used as the response
            context; when falsy the text itself is used as its own context.
        type: 'query' (question encoder) or 'response' (response encoder).

        For any other `type` a message is printed and None is returned.
        """
        if type == 'query':
            # Normalise to a flat list so both 'a' and ['a', 'b'] reach the
            # encoder as a rank-1 string tensor.
            return self.question_encoder(tf.constant(self._as_list(text)))['outputs']
        elif type == 'response':
            responses = self._as_list(text)
            contexts = self._as_list(context) if context else responses
            return self.response_encoder(input=tf.constant(responses),
                                         context=tf.constant(contexts))['outputs']
        else:
            print('Type of prediction not defined')

    def make_query(self, querystring, top_k=5, index=False, predict_type='query', kb_name='default_kb'):
        """Make a query against the stored vectorized knowledge.

        Returns a tuple ``(answers, scores)`` holding the `top_k` best matching
        clauses of knowledge base `kb_name` and their cosine similarities.
        Choose index=True to get ``(answers, indices)`` instead, where indices
        are the positions of the matches in the knowledge base.
        predict_type can be 'query' or 'response' if you are comparing statements.
        """
        query_vector = self.predict([querystring], type=predict_type)
        # Shape (n_clauses, 1): one similarity per stored clause.
        similarity_score = cosine_similarity(self.vectorized_knowledge[kb_name], query_vector)
        # argsort is ascending; flip to get best match first.
        sortargs = np.flip(similarity_score.argsort(axis=0))
        sortargs = [x[0] for x in sortargs]
        sorted_ans = [self.text[kb_name][i] for i in sortargs]
        if index:
            return sorted_ans[:top_k], sortargs[:top_k]
        return sorted_ans[:top_k], similarity_score[sortargs[:top_k]]

    def finetune(self, question, answer, context, neg_answer=None, neg_answer_context=None, label=None):
        """Run one optimisation step on self.var_finetune and return the loss.

        question, answer, context: lists of strings fed to the question and
            response encoders.
        neg_answer, neg_answer_context: lists of strings for the negative
            responses; required when loss='triplet'.
        label: binary labels (1 = matching pair, 0 = non-matching), one per
            response; required when loss='contrastive'.

        Returns the loss value as a numpy float and appends it to
        self.cost_history.

        Raises ValueError for an unknown self.loss or when the inputs the
        selected loss needs are missing.
        """
        # Validate up front so a misconfiguration fails with a clear message
        # instead of an unbound variable or an opaque encoder error.
        if self.loss not in self.SUPPORTED_LOSSES:
            raise ValueError("Unknown loss {!r}; expected one of {}".format(
                self.loss, self.SUPPORTED_LOSSES))
        if self.loss == 'triplet' and (not neg_answer or not neg_answer_context):
            raise ValueError("loss='triplet' requires neg_answer and neg_answer_context")
        if self.loss == 'contrastive' and (label is None or len(label) == 0):
            raise ValueError("loss='contrastive' requires label")

        with tf.GradientTape() as tape:
            # https://www.tensorflow.org/guide/eager

            # get encodings
            question_embeddings = self.question_encoder(tf.constant(question))['outputs']
            response_embeddings = self.response_encoder(input=tf.constant(answer),
                                                        context=tf.constant(context))['outputs']

            if self.loss == 'cosine':
                # https://www.tensorflow.org/api_docs/python/tf/keras/losses/CosineSimilarity
                self.cost = tf.keras.losses.CosineSimilarity(axis=1)
                cost_value = self.cost(question_embeddings, response_embeddings)

            elif self.loss == 'contrastive':
                # https://www.tensorflow.org/addons/api_docs/python/tfa/losses/ContrastiveLoss
                # y_true is a vector of binary labels, y_pred the respective
                # distances. The distances must be per pair and non-negative,
                # so they are computed explicitly rather than through a
                # batch-reduced Keras loss object.
                distances = self._cosine_distance(question_embeddings, response_embeddings)
                self.cost = tfa.losses.contrastive.ContrastiveLoss(margin=self.margin)
                cost_value = self.cost(tf.cast(label, tf.float32), distances)

            else:  # 'triplet'
                neg_response_embeddings = self.neg_response_encoder(
                    input=tf.constant(neg_answer),
                    context=tf.constant(neg_answer_context))['outputs']
                # Use the margin configured on this instance rather than the
                # helper's own default.
                cost_value = triplet_loss(question_embeddings, response_embeddings,
                                          neg_response_embeddings, margin=self.margin)

        # record loss
        mean_cost = cost_value.numpy().mean()
        self.cost_history.append(mean_cost)

        # apply gradient
        grads = tape.gradient(cost_value, self.var_finetune)
        self.optimizer.apply_gradients(zip(grads, self.var_finetune))

        return mean_cost

    def load_kb(self, path_to_kb=None, text_list=None, question_list=None,
                raw_text=None, is_faq=False, kb_name='default_kb'):
        """Load a knowledge base under `kb_name` and embed its clauses.

        Give exactly one source; they are tried in this order:
        text_list: list of clauses (with question_list as the matching
            questions when is_faq=True).
        path_to_kb: path to a .txt document; each clause is separated by
            2 newlines.
        raw_text: the document contents as a single string.

        Raises NameError when none of the three sources is given.
        """
        if text_list:
            self.text[kb_name] = text_list
            if is_faq:
                self.questions[kb_name] = question_list
        elif path_to_kb:
            if is_faq:
                self.text[kb_name], self.questions[kb_name] = split_txt(read_txt(path_to_kb), is_faq)
            else:
                self.text[kb_name] = split_txt(read_txt(path_to_kb), is_faq)
        elif raw_text:
            # Re-attach the newline to every line before handing the lines to split_txt.
            delim = '\n'
            self.text[kb_name] = split_txt([front+delim for front in raw_text.split('\n')])
        else:
            raise NameError('invalid kb input!')
        self.vectorized_knowledge[kb_name] = self.predict(clean_txt(self.text[kb_name]), type='response')
        print('knowledge base lock and loaded!')

    def load_csv_kb(self, path_to_kb=None, kb_name='default_kb', meta_col='meta', answer_col='answer',
                    query_col='question', answer_str_col='answer', cutoff=None):
        """Load a knowledge base from a csv file under `kb_name` and embed its answers.

        The column names and `cutoff` are forwarded unchanged to read_kb_csv.
        """
        self.text[kb_name], self.questions[kb_name] = read_kb_csv(
            path_to_kb, meta_col=meta_col, answer_col=answer_col,
            query_col=query_col, answer_str_col=answer_str_col,
            cutoff=cutoff)  # forward the caller's cutoff instead of discarding it
        self.vectorized_knowledge[kb_name] = self.predict(clean_txt(self.text[kb_name]), type='response')
        print('knowledge base (csv) lock and loaded!')

    def export(self, savepath=None):
        """Save the (fine-tuned) module as a SavedModel directory.

        savepath: target directory; defaults to 'fine_tuned'.
        """
        if savepath is None:
            savepath = 'fine_tuned'
        # Export the encoder signatures explicitly so that restore() can look
        # them up by name on the reloaded object.
        signatures = {name: fn for name, fn in self.embed.signatures.items()}
        tf.saved_model.save(self.embed, savepath, signatures=signatures)

    def restore(self, savepath):
        """Load a SavedModel written by export() and switch this instance over to it."""
        self.embed = tf.saved_model.load(savepath)
        # Without rebinding, predict()/finetune() would keep using the
        # encoders and variables of the previously loaded model.
        self._bind_model()

### 2.1 Testing key functions

In [None]:
# import sys
# sys.path.append('../src')

# %time gr = GoldenRetriever()

# # encode 1 question
# encoded_ques = gr.predict('How old are you?', 
#                           type='query')

# # encode multiple questions
# encoded_ques = gr.predict(['How old are you?', 'What time is it?'], 
#                           type='query')

# # one response w context
# encoded_res = gr.predict("I am 20 years old.", 
#                          context="I will be 21 next year.", 
#                          type='response')

# # multiple responses w/0 context
# encoded_res = gr.predict(["I am 20 years old.", "I love apple cider"], 
#                          type='response')

# # load knowledge bases
# gr.load_kb(path_to_kb='../data/aiap.txt', is_faq=True, kb_name='aiap')
# # gr.load_kb(path_to_kb='./data/resale_tnc.txt', kb_name='resale_tnc')
# # gr.load_kb(path_to_kb='./data/fund_guide_tnc_full.txt', kb_name='nrf')
# # gr.load_csv_kb(path_to_kb='./data/pdpa.csv', cutoff=196, kb_name='pdpa')

# # make query
# gr.make_query('What kind of candidates are you looking for?', top_k=2, kb_name='aiap')




# """
# 1. Cosine loss
# """
# gr = GoldenRetriever(loss='cosine')

# print("BEFORE BACKPROP")
# print(gr.var_finetune)
# print("")

# questions = ["What is your age?"]
# responses = ["I am 20 years old.", "good morning"]
# response_contexts = ["I will be 21 next year.", "great day."]
# gr.finetune(questions, responses, response_contexts, label=[1,0])

# print("AFTER BACKPROP")
# print(gr.var_finetune)


# """
# 2. Contrastive loss
# """
# gr = GoldenRetriever(loss='contrastive')

# print("BEFORE BACKPROP")
# print(gr.var_finetune)
# print("")

# questions = ["What is your age?"]
# responses = ["I am 20 years old.", "good morning"]
# response_contexts = ["I will be 21 next year.", "great day."]
# gr.finetune(questions, responses, response_contexts, label=[1,0])

# print("AFTER BACKPROP")
# print(gr.var_finetune)

# """
# 3. Triplet loss
# """
# gr = GoldenRetriever(loss='triplet')

# print("BEFORE BACKPROP")
# print(gr.var_finetune)
# print("")

# questions = ["What is your age?"]
# responses = ["The top section of the spine is damaged."]
# response_contexts = ["Call the nurse."]
# %time gr.finetune(questions, responses, response_contexts, neg_answer = ["I will be 21 years old."], neg_answer_context = ["Time is running out for the elderly and the young."])

# print("AFTER BACKPROP")
# print(gr.var_finetune)

The last few values in the array have changed, indicating that the fine-tuning step successfully updated the weights.

### 3. Testing src saved model

In [1]:
import sys
sys.path.append('..')

from src.model import GoldenRetriever
# Smoke test of the GoldenRetriever imported from src.model: time construction
# and encoding, load a knowledge base, run a query, then run one finetune step
# per loss type and print the tunable variables before and after.
print("timing init")
# NOTE(review): the dict is passed positionally. With the GoldenRetriever class
# defined earlier in this notebook that would bind it to `margin`, not to the
# Adam keyword arguments — confirm against the signature in src.model.
%time gr = GoldenRetriever({'learning_rate':0.001, 'beta_1':0.9, 'beta_2':0.999})
print("")

# encode 1 question
print("timing ques encoding")
%time encoded_ques = gr.predict('How old are you?', type='query')
print("")

# encode multiple questions
encoded_ques = gr.predict(['How old are you?', 'What time is it?'], 
                          type='query')

# one response w context
print("timing response encoding")
%time encoded_res = gr.predict("I am 20 years old.", context="I will be 21 next year.", type='response')
print("")

# multiple responses w/o context
encoded_res = gr.predict(["I am 20 years old.", "I love apple cider"], type='response')

# load knowledge bases
gr.load_kb(path_to_kb='../data/aiap.txt', is_faq=True, kb_name='aiap')
# gr.load_kb(path_to_kb='./data/resale_tnc.txt', kb_name='resale_tnc')
# gr.load_kb(path_to_kb='./data/fund_guide_tnc_full.txt', kb_name='nrf')
# gr.load_csv_kb(path_to_kb='./data/pdpa.csv', cutoff=196, kb_name='pdpa')

# make query (result is discarded here because it is not the last expression of the cell)
gr.make_query('What kind of candidates are you looking for?', top_k=2, kb_name='aiap')


"""
Testing loss functions
"""
# Same name filter as the one used to pick the tunable variables elsewhere in
# this notebook. Note: this module-level var_finetune is not read again in this
# cell; the prints below use gr.var_finetune.
v=['QA/Final/Response_tuning/ResidualHidden_1/dense/kernel']
var_finetune=[x for x in gr.embed.variables for vv in v if vv in x.name] #get the weights we want to finetune.

"""
1. Cosine loss
"""
# A fresh model per loss type, so each test starts from newly loaded weights.
gr = GoldenRetriever(loss='cosine')

print("BEFORE BACKPROP")
print(gr.var_finetune)
print("")

# One question against two responses (each with its context).
questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]
# NOTE(review): `label` is passed for symmetry with the contrastive test; in the
# class defined earlier in this notebook only the 'contrastive' branch reads it
# — confirm for src.model.
%time gr.finetune(questions, responses, response_contexts, label=[1,0])

print("AFTER BACKPROP")
print(gr.var_finetune)


"""
2. Contrastive loss
"""
gr = GoldenRetriever(loss='contrastive')

print("BEFORE BACKPROP")
print(gr.var_finetune)
print("")

questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]
# label=[1,0]: first response is tagged as the matching one, second as non-matching.
%time gr.finetune(questions, responses, response_contexts, label=[1,0])

print("AFTER BACKPROP")
print(gr.var_finetune)


"""
3. Triplet loss
"""
gr = GoldenRetriever(loss='triplet')

print("BEFORE BACKPROP")
print(gr.var_finetune)
print("")

# Here the supplied "positive" is unrelated to the question while the negative
# is age-related.
questions = ["What is your age?"]
responses = ["The top section of the spine is damaged."]
response_contexts = ["Call the nurse."]
%time gr.finetune(questions, responses, response_contexts, neg_answer = ["I will be 21 years old."], neg_answer_context = ["Time is running out for the elderly and the young."])

print("AFTER BACKPROP")
print(gr.var_finetune)

timing init
model initiated!
CPU times: user 19.2 s, sys: 1.88 s, total: 21.1 s
Wall time: 30.7 s

timing ques encoding
CPU times: user 2.04 s, sys: 66.8 ms, total: 2.11 s
Wall time: 2.12 s

timing response encoding
CPU times: user 2.25 s, sys: 13.4 ms, total: 2.26 s
Wall time: 2.24 s

knowledge base lock and loaded!
model initiated!
BEFORE BACKPROP
[<tf.Variable 'QA/Final/Response_tuning/ResidualHidden_1/dense/kernel:0' shape=(320, 512) dtype=float32, numpy=
array([[ 0.11290824, -0.007661  ,  0.13388894, ..., -0.03849068,
        -0.05095735, -0.06322648],
       [-0.02622743, -0.02499395, -0.01445046, ..., -0.10326502,
         0.00695672, -0.17296325],
       [-0.02357727, -0.08032651, -0.04250011, ..., -0.04690072,
         0.01988911, -0.01170817],
       ...,
       [-0.00305502,  0.00504641,  0.01790689, ..., -0.02388328,
         0.03720526,  0.04548807],
       [ 0.04789947, -0.02582268,  0.08293641, ...,  0.0698828 ,
        -0.04037469, -0.02779369],
       [-0.05143448, -0.

In [2]:

"""
3. Triplet loss
"""
# Fresh model configured for the triplet loss.
gr = GoldenRetriever(loss='triplet')

# Print the tunable variables before the update so they can be compared with
# the values printed after it.
print("BEFORE BACKPROP")
print(gr.var_finetune)
print("")

# One triplet: the question is the anchor, the age-related answer the positive
# and the greeting the negative.
questions = ["What is your age?"]
responses = ["I am 20 years old."]
response_contexts = ["I will be 21 next year."]
%time gr.finetune(questions, responses, response_contexts, neg_answer = ["good morning"], neg_answer_context = ["great day."])

print("AFTER BACKPROP")
print(gr.var_finetune)

model initiated!
BEFORE BACKPROP
[<tf.Variable 'QA/Final/Response_tuning/ResidualHidden_1/dense/kernel:0' shape=(320, 512) dtype=float32, numpy=
array([[ 0.11290824, -0.007661  ,  0.13388894, ..., -0.03849068,
        -0.05095735, -0.06322648],
       [-0.02622743, -0.02499395, -0.01445046, ..., -0.10326502,
         0.00695672, -0.17296325],
       [-0.02357727, -0.08032651, -0.04250011, ..., -0.04690072,
         0.01988911, -0.01170817],
       ...,
       [-0.00305502,  0.00504641,  0.01790689, ..., -0.02388328,
         0.03720526,  0.04548807],
       [ 0.04789947, -0.02582268,  0.08293641, ...,  0.0698828 ,
        -0.04037469, -0.02779369],
       [-0.05143448, -0.06723368,  0.02879738, ..., -0.04495105,
        -0.04067428, -0.01053122]], dtype=float32)>]

CPU times: user 27.9 s, sys: 165 ms, total: 28 s
Wall time: 27.8 s
AFTER BACKPROP
[<tf.Variable 'QA/Final/Response_tuning/ResidualHidden_1/dense/kernel:0' shape=(320, 512) dtype=float32, numpy=
array([[ 0.11290824, -0.007661

# Appendix

### Testing finetune()

In [None]:
%load_ext tensorboard

import tensorflow as tf
import tensorflow_hub as hub
from tensorflow import keras
import numpy as np
import tensorflow_text

# Stand-alone version of one fine-tuning step, without the GoldenRetriever
# class: encode under a GradientTape, compute a cosine-similarity loss and
# apply the gradients to one selected kernel.
questions = ["What is your age?"]
responses = ["I am 20 years old.", "good morning"]
response_contexts = ["I will be 21 next year.", "great day."]

# load the module: v3 does not support finetuning? but v2 is fine
module = hub.load('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/2')
question_encoder = module.signatures['question_encoder']
response_encoder = module.signatures['response_encoder']
        
# get trainable layers: every module variable whose name contains one of the
# substrings in v
#v=['QA/Final/Response_tuning/ResidualHidden_1/AdjustDepth/projection/kernel']
v=['QA/Final/Response_tuning/ResidualHidden_1/dense/kernel']
var_finetune=[x for x in module.variables for vv in v if vv in x.name] #get the weights we want to finetune.

# optimizer & losses
optimizer = tf.keras.optimizers.Adam()
loss = tf.keras.losses.CosineSimilarity(axis=1)
loss_history = []

# The forward pass has to run inside the tape so that tape.gradient() below can
# differentiate loss_value with respect to var_finetune.
with tf.GradientTape() as tape:
    # https://www.tensorflow.org/guide/eager
    
    # get encodings
    question_embeddings = question_encoder(tf.constant(questions))['outputs']
    response_embeddings = response_encoder(input=tf.constant(responses), 
                                           context=tf.constant(response_contexts))['outputs']

    # https://www.tensorflow.org/api_docs/python/tf/keras/losses/CosineSimilarity
    loss_value = loss(question_embeddings, response_embeddings)

# record the loss value of this step
loss_history.append(loss_value.numpy().mean())

print("BEFORE BACKPROP")
print(var_finetune)
print("")

print("...calculating and applying gradients...")
grads = tape.gradient(loss_value, var_finetune)
optimizer.apply_gradients(zip(grads, var_finetune))
print("")

# var_finetune holds the variable objects themselves, so printing the same list
# again shows the values after the optimizer step.
print("AFTER BACKPROP")
print(var_finetune)



### TF2.Keras Sequential() implementation

In [None]:
# # Keras Sequential Implementation
# module_layer = hub.KerasLayer('https://tfhub.dev/google/universal-sentence-encoder-multilingual-qa/3')
# model = tf.keras.Sequential()
# model.add(module_layer)

# model.compile(loss='sparse_categorical_crossentropy',
#               optimizer=keras.optimizers.RMSprop(),
#               metrics=['accuracy'])

# question_embeddings = module.signatures['question_encoder'](
#             tf.constant(questions))
# response_embeddings = module.signatures['response_encoder'](
#         input=tf.constant(responses),
#         context=tf.constant(response_contexts))

### Connecting to Azure SQL Server

In [None]:
# Import first: the driver listing below needs the module to be in scope.
import pyodbc

print("poodbc drivers")
print(pyodbc.drivers())

# Connection string is left blank here; it has to be filled in before this
# cell can connect.
odbc_str = ""

conn = pyodbc.connect(odbc_str)
try:
    cursor = conn.cursor()
    cursor.execute('SELECT * FROM dbo.users')

    for row in cursor:
        print(row)
finally:
    # Always release the connection, even if the query or the iteration fails.
    conn.close()

### Testing tensorflow_addons' triplet loss class

In [None]:
from tensorflow_addons.losses import metric_learning

# Relies on `gr` (a GoldenRetriever), `tf` and `tfa` from earlier cells.

# anchor: one question
encoded_ques = gr.predict(["How old are you?"], 
                         type='query')

# positive: one response w/o context
encoded_res1 = gr.predict(["I am 20 years old."], #, "I love apple cider"], 
                         type='response')

# negative: one response w/o context
encoded_res2 = gr.predict([ "I love apple cider"], 
                         type='response')

# Each encoding is a [1, dim] matrix. Concatenate along axis 0 to get the 2-D
# [3, dim] embedding matrix that pairwise_distance expects; tf.stack would add
# a new leading axis and produce a rank-3 tensor instead.
APN = tf.concat([encoded_ques, encoded_res1, encoded_res2], axis=0)

tfa_triplet_cost = tfa.losses.TripletSemiHardLoss(margin = 0.5)
# cost_value = tfa_triplet_cost([1], APN)

# Last expression of the cell: the 3x3 matrix of squared pairwise distances.
metric_learning.pairwise_distance(APN, squared=True)

In [None]:
# NOTE(review): this cell is a function *body*, not a runnable cell. It ends in
# a bare `return` and reads y_true, y_pred, margin, _masked_minimum and
# _masked_maximum, none of which are defined in the visible notebook.
# Presumably pasted from tensorflow_addons' semi-hard triplet loss for
# reference — confirm the source before reusing it.
# NOTE(review): the assignment to `triplet_loss` near the end would rebind the
# name of the triplet_loss() helper defined earlier in this notebook if this
# code were ever executed at module level.
labels, embeddings = y_true, y_pred
# Reshape label tensor to [batch_size, 1].
lshape = tf.shape(labels)
labels = tf.reshape(labels, [lshape[0], 1])

# Build pairwise squared distance matrix.
pdist_matrix = metric_learning.pairwise_distance(embeddings, squared=True)
# Build pairwise binary adjacency matrix (True where two rows share a label).
adjacency = tf.math.equal(labels, tf.transpose(labels))
# Invert so we can select negatives only.
adjacency_not = tf.math.logical_not(adjacency)

batch_size = tf.size(labels)

# Compute the mask: the distance matrix is tiled batch_size times and each
# entry is kept only if it belongs to a different-label pair and is greater
# than the distance it is compared against.
pdist_matrix_tile = tf.tile(pdist_matrix, [batch_size, 1])
mask = tf.math.logical_and(
    tf.tile(adjacency_not, [batch_size, 1]),
    tf.math.greater(pdist_matrix_tile,
                    tf.reshape(tf.transpose(pdist_matrix), [-1, 1])))
# mask_final[i, j] is True when at least one masked entry exists for that pair.
mask_final = tf.reshape(
    tf.math.greater(
        tf.math.reduce_sum(
            tf.cast(mask, dtype=tf.dtypes.float32), 1, keepdims=True),
        0.0), [batch_size, batch_size])
mask_final = tf.transpose(mask_final)

# Cast the boolean masks to float so they can be passed to the masked min/max helpers.
adjacency_not = tf.cast(adjacency_not, dtype=tf.dtypes.float32)
mask = tf.cast(mask, dtype=tf.dtypes.float32)

# negatives_outside: smallest D_an where D_an > D_ap.
negatives_outside = tf.reshape(
    _masked_minimum(pdist_matrix_tile, mask), [batch_size, batch_size])
negatives_outside = tf.transpose(negatives_outside)

# negatives_inside: largest D_an.
negatives_inside = tf.tile(
    _masked_maximum(pdist_matrix, adjacency_not), [1, batch_size])
# Use the "outside" negative where one exists, otherwise fall back to the "inside" one.
semi_hard_negatives = tf.where(mask_final, negatives_outside,
                               negatives_inside)

loss_mat = tf.math.add(margin, pdist_matrix - semi_hard_negatives)

# Positive pairs: same label, excluding each row paired with itself (the diagonal).
mask_positives = tf.cast(
    adjacency, dtype=tf.dtypes.float32) - tf.linalg.diag(
        tf.ones([batch_size]))

# In lifted-struct, the authors multiply 0.5 for upper triangular
#   in semihard, they take all positive pairs except the diagonal.
num_positives = tf.math.reduce_sum(mask_positives)

# Sum of the hinged losses over the positive pairs, divided by their count.
triplet_loss = tf.math.truediv(
    tf.math.reduce_sum(
        tf.math.maximum(tf.math.multiply(loss_mat, mask_positives), 0.0)),
    num_positives)

return triplet_loss

In [None]:
# Streamlit front end for GoldenRetriever: pick a knowledge base, type a query
# and render the top matches as HTML tables.
import html

import streamlit as st
# from src.model2 import GoldenRetriever

print("import success")

# @st.cache(allow_output_mutation=True)
# def init():
#     retriever = GoldenRetriever()
#     #retriever.restore('./google_use_nrf_pdpa_tuned/variables-0')
#     retriever.load_csv_kb(path_to_kb='./data/pdpa.csv', cutoff=196, kb_name='pdpa')
#     retriever.load_kb(path_to_kb='./data/aiap.txt', is_faq=True, kb_name='aiap')
#     retriever.load_kb(path_to_kb='./data/resale_tnc.txt', kb_name='resale_tnc')
#     # retriever.load_kb(path_to_kb='./data/fund_guide_tnc_full.txt', kb_name='nrf')
#     return retriever

gr = GoldenRetriever()
gr.load_csv_kb(path_to_kb='../data/pdpa.csv', cutoff=196, kb_name='pdpa')
gr.load_kb(path_to_kb='../data/aiap.txt', is_faq=True, kb_name='aiap')
print("init success")

st.title('GoldenRetriever')
st.header('This Information Retrieval demo allows you to query FAQs, T&Cs, or your own knowledge base in natural language.')
st.markdown('View the source code [here](https://github.com/nickyeolk/info_retrieve)!')
st.markdown('Visit our [community](https://makerspace.aisingapore.org/community/ai-makerspace/) and ask us a question!')

# Default (starter) question shown in the query box for each knowledge base.
kb_to_starqn = {'pdpa':"Can an organization retain the physical NRIC?",
                'resale_tnc':"How much is the option fee?",
                'aiap':"Do I need to pay for the program?",
                # 'nrf':"Can I vire from EOM into travel?",
                'raw_kb':"What do you not love?"}


def format_func(kb_name):
    """Map an internal knowledge-base key to the label shown in the select box."""
    namedicts={'pdpa':'PDPA',
                'resale_tnc':'HDB Resale',
                'aiap':'AIAP',
                # 'nrf':'NRF',
                'raw_kb':'Paste Raw Text'}
    return namedicts[kb_name]


# Only offer knowledge bases that were actually loaded above ('raw_kb' is loaded
# on demand from the text area); selecting an unloaded one would fail with a
# KeyError inside make_query.
kb_options = [name for name in ['pdpa', 'resale_tnc', 'aiap', 'raw_kb']
              if name == 'raw_kb' or name in gr.text]
kb = st.selectbox('Select Knowledge Base', options=kb_options,
                    format_func=format_func)
if kb=='raw_kb':
    kb_raw = st.text_area(label='Paste raw text (terms separated by empty line)', 
                        value="""I love my chew toy!\n\nI hate Mondays.\n""")
top_k = st.radio('Number of Results', options=[1,2,3], index=2)
data = st.text_input(label='Input query here', value=kb_to_starqn[kb])
if st.button('Fetch') or (data != kb_to_starqn[kb]): #So the answer will not appear right away
    if kb=='raw_kb':
        gr.load_kb(raw_text=kb_raw, kb_name='raw_kb')
    prediction, scores = gr.make_query(data, top_k=int(top_k), kb_name=kb)
    # The query and the knowledge-base text are rendered with
    # unsafe_allow_html=True, so escape them before interpolating into markup.
    qn_string="""<h3><text>Question: </text>{}</h3>""".format(html.escape(data))
    st.markdown(qn_string, unsafe_allow_html=True)

    for ansnum, result in enumerate(prediction):
        anshead_string = """<h3><text>Answer {}</text></h3>""".format(ansnum+1)
        st.markdown(anshead_string, unsafe_allow_html=True)
        rows = []
        # One table row per non-empty line; ';;' separates the cells of a row.
        for line in (ln for ln in result.split('\n') if ln):
            tabledatas = line.split(';;')
            # A line without ';;' is a single cell stretched across the table.
            if len(tabledatas)>1:
                cell_template = """<td>{}</td>"""
            else:
                cell_template = """<td colspan=42>{}</td>"""
            cells = "".join(cell_template.format(html.escape(tabledata))
                            for tabledata in tabledatas)
            rows.append("""<tr>""" + cells + """</tr>""")
        reply_string = """<table>""" + "".join(rows) + """</table><br>"""
        st.markdown(reply_string, unsafe_allow_html=True)

st.markdown(
"""
<details><summary>Sample sentences</summary>
<strong>PDPA</strong>
<p>How long can an organisation retain its customers' personal data?</p>
<strong>HDB resale terms and conditions</strong>
<p>Do I need to pay back CPF?</p>
<strong>AIAP</strong>
<p>What will be covered during the program?</p>
<strong>Raw text </strong><a href="https://www.straitstimes.com/asia/east-asia/china-wants-centralised-digital-currency-after-bitcoin-crackdown" target="_blank">China Digital Currency</a><i> (Select all, copy, and paste into raw text box)</i>
<p>Which electronic payment gateways support the currency?</p>
</details>"""
, unsafe_allow_html=True)
