<a href="https://colab.research.google.com/github/mohammedterry/cpop_tests/blob/master/Robust_Sentences.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
from scipy.spatial.distance import cosine, euclidean #sample metrics to measure vector similarity
import tensorflow as tf
import tensorflow_hub as hub
tf.logging.set_verbosity(tf.logging.ERROR) #hide log messages from tensorflow

class RobustSentences():
  """Find the stored sentences closest in meaning to a new sentence.

  Every sentence is turned into a vector by the Universal Sentence Encoder
  loaded from TF-Hub. A query sentence is embedded the same way and compared
  against the stored vectors with a distance function (lower = more similar).

  The encoder graph and its session are created once in ``__init__`` and
  reused for every query; call ``close()`` to release the session.
  """

  # Distance functions selectable via the ``metric`` argument of ``__call__``.
  # The position in this tuple is the metric id (0: cosine, 1: euclidean).
  _METRICS = (cosine, euclidean)

  def __init__(self, sentences): #initialise with a list of sentences
    """Load the encoder and embed ``sentences`` up front.

    Args:
      sentences: iterable of strings that later queries are matched against.
    """
    # Build the encoder in its own graph with a single placeholder-fed
    # embedding op. Calling the hub module inside _embed would add new ops to
    # the graph and re-run the initialisers on every single query.
    self._graph = tf.Graph()
    with self._graph.as_default():
      self.universal_encoder = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-large/3") #large Universal Sentence Encoder
      self._text_input = tf.placeholder(tf.string, shape=[None])
      self._embeddings = self.universal_encoder(self._text_input)
      init_ops = [tf.global_variables_initializer(), tf.tables_initializer()]
    self._session = tf.Session(graph=self._graph)
    self._session.run(init_ops) #initialise variables and lookup tables exactly once
    # Keep a private copy so sentences and vectors stay index-aligned even if
    # the caller later mutates the list that was passed in.
    self.sentences = list(sentences)
    self.vectors = self._embed(self.sentences)

  def __call__(self,new_sentence, metric = 0, n_most_similar = 1):
    """Return the stored sentences most similar to ``new_sentence``.

    Args:
      new_sentence: the query string.
      metric: 0 for cosine distance (default), 1 for euclidean distance.
      n_most_similar: maximum number of sentences to return (>= 0).

    Returns:
      A list of at most ``n_most_similar`` stored sentences, closest first.
      Ties are broken by the order the sentences were given in.

    Raises:
      IndexError: if ``metric`` is not 0 or 1.
      ValueError: if ``n_most_similar`` is negative.
    """
    # Validate before running the encoder so bad arguments fail fast.
    if metric not in (0, 1):
      raise IndexError("metric must be 0 (cosine) or 1 (euclidean), got %r" % (metric,))
    if n_most_similar < 0:
      raise ValueError("n_most_similar must be >= 0, got %r" % (n_most_similar,))
    distance = self._METRICS[metric]
    vector = self._embed([new_sentence])[0]
    scores = [distance(vector, vec) for vec in self.vectors] #distance of every stored sentence to the query
    # sorted() is stable, so equal scores keep their original index order.
    ranked_idx = sorted(range(len(scores)), key=scores.__getitem__)
    return [self.sentences[idx] for idx in ranked_idx[:n_most_similar]]

  def _embed(self, sentences):
    """Return one embedding vector per string in ``sentences``."""
    return self._session.run(self._embeddings, feed_dict={self._text_input: list(sentences)})

  def close(self):
    """Release the TensorFlow session held by this instance."""
    self._session.close()

In [0]:
# Build the matcher over a small demo corpus. The constructor embeds every
# sentence immediately, so this cell pays the model-loading cost.
# The corpus mixes near-paraphrases ("How old are you?" / "what is your age?")
# with word-order swaps ("Cats chase dogs" / "Dogs chase cats").
rs = RobustSentences([  #candidate sentences that queries are matched against
    "I like my phone",
    "My phone is not good.",
    "Your cellphone looks great.",
    "Will it snow tomorrow?",
    "Recently a lot of hurricanes have hit the US",
    "Global warming is real",
    "An apple a day, keeps the doctors away",
    "Eating strawberries is healthy",
    "Is paleo better than keto?",
    "How old are you?",
    "what is your age?",
    "Cats chase dogs",
    "Dogs chase cats",
])

In [26]:
rs("how young are you")  #query with a sentence that is not in the list; returns the single closest stored sentence (cosine distance by default)

['How old are you?']

In [32]:
rs("how young are you",metric = 1) #same query ranked by euclidean distance instead (metric 0: cosine, 1: euclidean)

['How old are you?']

In [33]:
rs("i like puppies", n_most_similar = 3) #return the three closest stored sentences, most similar first

['Dogs chase cats', 'Cats chase dogs', 'I like my phone']