# TL;DR
This notebook demonstrates how powerful sentence embeddings from Universal Sentence Encoder are.  
These sentence representations can be used in a variety of NLP tasks.

In [None]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import nltk

import re
import pandas as pd
import json

In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/ so the CSV
# paths used further down resolve. Only works inside Google Colab.
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


## Universal Sentence Encoder
It is a model for encoding sentences into embedding vectors that specifically target transfer learning to other NLP tasks. The model is efficient and results in accurate performance on diverse transfer tasks.  

**References**  
- [arxiv](https://arxiv.org/abs/1803.11175)
- [tensorflow hub](https://tfhub.dev/google/universal-sentence-encoder-large/3)
- [colab notebook](https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder.ipynb)
- [my personal usecase](https://github.com/NISH1001/lyrics2vec/blob/master/lyrics2vec.ipynb)


### Load Embedding Tensor

In [None]:
# https://github.com/NISH1001/machine-learning-into-the-void/blob/master/nlp/universal-sentence-encocder-semantic-similarity.ipynb
# TensorFlow Hub module URL for the Universal Sentence Encoder.
# The trailing "#@param" annotation is a Colab form field offering the
# listed module URLs as choices; keep it on the same line as the assignment.
module_url = "https://tfhub.dev/google/universal-sentence-encoder-large/3" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]

In [None]:
# Instantiate the hub module selected by `module_url`. `embed` is called
# with a list of strings inside a tf.Session by get_features().
# NOTE(review): hub.Module together with tf.Session is the TF1-style API;
# presumably this notebook needs a TensorFlow 1.x runtime - confirm.
embed = hub.Module(module_url)

## Feature Extractor

This is just a simple function that wraps the TensorFlow call.


In [None]:
def get_features(texts):
    """Embed one or more texts with the module-level ``embed`` hub module.

    Args:
        texts: A single string, or a sequence of strings. A lone string is
            wrapped in a one-element list so the result is always 2-D.

    Returns:
        The value produced by running ``embed(texts)`` in a fresh session:
        one embedding row per input text.

    Note:
        Every call opens a new ``tf.Session``, re-runs the variable and
        table initializers and evaluates a new ``embed(texts)`` op, so
        batching many texts into one call is much cheaper than calling
        this function once per text.
    """
    # isinstance (rather than an exact type comparison) also wraps
    # subclasses of str instead of passing them through as a bare scalar.
    if isinstance(texts, str):
        texts = [texts]
    with tf.Session() as sess:
        sess.run([tf.global_variables_initializer(), tf.tables_initializer()])
        return sess.run(embed(texts))
    

## Preprocess Textual Mess

In [None]:
def remove_stopwords(stop_words, tokens):
    """Return the tokens that are not in ``stop_words``, order preserved.

    Args:
        stop_words: Any container supporting ``in`` (a set is fastest).
        tokens: Iterable of tokens to filter.

    Returns:
        A new list with every token found in ``stop_words`` dropped.
    """
    return [word for word in tokens if word not in stop_words]

# Ordered (pattern, replacement) rules applied by process_text. Order
# matters: specific contractions ("won't", "can't") must be expanded before
# the generic "n't" rule, and all apostrophe rules must run before the
# catch-all non-word rule strips the apostrophes away.
# All patterns are raw strings so regex escapes such as \W and \s are not
# treated as (invalid) Python string escapes.
_TEXT_SUBSTITUTIONS = (
    (r'http\S+', ' '),               # URLs
    (r'#+', ' '),                    # hash signs (the tag word is kept)
    (r'@[A-Za-z0-9]+', ' '),         # @mentions, including the name
    (r"([A-Za-z]+)'s", r"\1 is"),    # "it's" -> "it is" (also possessives)
    (r"\'ve", " have "),
    (r"won't", "will not "),
    (r"isn't", "is not "),
    (r"can't", "can not "),
    (r"n't", " not "),
    (r"i'm", "i am "),
    (r"\'re", " are "),
    (r"\'d", " would "),
    (r"\'ll", " will "),
    (r'\W', ' '),                    # any remaining non-word character
    (r'\d+', ' '),                   # digit runs
    (r'\s+', ' '),                   # collapse whitespace
)


def process_text(text):
    """Normalise raw text into lowercase, space-separated ASCII words.

    Steps, in order: drop non-ASCII characters, lowercase, remove URLs,
    hash signs and @mentions, expand common English contractions, replace
    every remaining non-word character and digit run with a space, collapse
    whitespace and strip the ends.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string, e.g. ``"Hello! Who are you?"`` becomes
        ``"hello who are you"``.

    Note:
        The ``'s`` rule cannot tell a contraction from a possessive, so
        ``"john's"`` becomes ``"john is"``.
    """
    text = text.encode('ascii', errors='ignore').decode()
    text = text.lower()
    for pattern, replacement in _TEXT_SUBSTITUTIONS:
        text = re.sub(pattern, replacement, text)
    return text.strip()

def lemmatize(tokens):
    """Lemmatize each token, trying it as a verb first.

    A token is first lemmatized with part-of-speech ``'v'``. If that leaves
    it unchanged, it is lemmatized again with the lemmatizer's default
    part of speech.

    Args:
        tokens: Iterable of word strings.

    Returns:
        A list of lemmas, one per input token, in the same order.
    """
    wordnet = nltk.stem.WordNetLemmatizer()

    def _lemma_of(word):
        as_verb = wordnet.lemmatize(word, 'v')
        # Only fall back to the default POS when the verb pass was a no-op.
        return wordnet.lemmatize(word) if as_verb == word else as_verb

    return [_lemma_of(word) for word in tokens]


def process_all(text):
    """Clean ``text`` with process_text and drop stop words.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned text with stop words removed, joined by single spaces.
    """
    # NOTE(review): `stop_words` is read from module scope but is not
    # defined anywhere in the visible part of this notebook - it must be
    # assigned before this function is called.
    cleaned_tokens = process_text(text).split()
    kept_tokens = remove_stopwords(stop_words, cleaned_tokens)
    return ' '.join(kept_tokens)

# Quick sanity check: punctuation is stripped and the text is lowercased.
process_text("Hello! Who are you?")

'hello who are you'

## Load Data
Here, I am using some dummy texts of mine.  
Most of the data is taken from [here](https://github.com/NISH1001/rnn-for-text/blob/master/data/input.txt)

In [None]:
# Load the annotated privacy-policy segments from the mounted Drive.
df = pd.read_csv(
    '/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/data/Annotated_privacy_policy_segments_100.csv',
    encoding='utf-8',
)
# Keep only the rows whose annotation is something other than 'OTHER'.
filtredData = df.loc[df['annotated_class'] != 'OTHER']
# The raw statement text of the retained rows.
data = filtredData['statement']
data

## Preprocess Data

In [None]:
# Clean every retained statement with process_text.
data_processed = [process_text(statement) for statement in data]
len(data_processed)
data_processed

#### Peek Data

In [None]:
# Preview the first 100 characters of each cleaned statement.
[cleaned[:100] for cleaned in data_processed]

## Create Sentence Embedding
Here, we use Universal Sentence Encoder to featurize each text.  
This will create some type of representation of text in latent space.  
The length of each vector is 512.

In [None]:
# Embed all retained statements in one get_features call.
# NOTE(review): this embeds the raw `data` statements, whereas the search
# functions clean the query with process_text before embedding it and
# `data_processed` is not used here - confirm the asymmetry is intended.
BASE_VECTORS = get_features(data)

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [None]:
# Shape check: one row per embedded statement, one column per embedding dim.
BASE_VECTORS.shape

(53, 512)

## Define Similarity Metric
We use cosine similarity to measure the similarity between two vectors.

In [None]:
def cosine_similarity(v1, v2):
    """Return the cosine of the angle between two vectors.

    Args:
        v1: First vector (array-like accepted by numpy).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (|v1| * |v2|)``, or ``0`` when either vector has
        zero magnitude (avoids dividing by zero).
    """
    norms = (np.linalg.norm(v1), np.linalg.norm(v2))
    if not all(norms):
        return 0
    return np.dot(v1, v2) / (norms[0] * norms[1])

In [None]:
def test_similiarity(text1, text2):
    """Embed two texts and return their cosine similarity.

    Each text is embedded with its own get_features call and the first
    embedding row of each result is compared. The shape of the first
    embedding is printed as a side effect.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        The cosine similarity between the two embeddings.
    """
    first_vec, second_vec = (get_features(text)[0] for text in (text1, text2))
    print(first_vec.shape)
    return cosine_similarity(first_vec, second_vec)

## Semantic Matching/Search
Use the data we defined earlier

In [None]:
def semantic_search(query, data, vectors):
    """Rank every item of ``data`` by similarity to ``query``.

    The query is cleaned with process_text, embedded with get_features and
    compared by cosine similarity against ``vectors[i]`` for the i-th item
    of ``data``.

    Args:
        query: Free-text query string.
        data: Iterable of sliceable items (e.g. strings); item ``i`` must
            correspond to ``vectors[i]``.
        vectors: Indexable collection of embedding arrays.

    Returns:
        A list of ``(similarity, first 100 characters of item, position)``
        tuples, sorted by similarity in descending order.
    """
    cleaned_query = process_text(query)
    print("Extracting features...")
    query_vec = get_features(cleaned_query)[0].ravel()
    scored = [
        (cosine_similarity(query_vec, vectors[position].ravel()), item[:100], position)
        for position, item in enumerate(data)
    ]
    scored.sort(key=lambda entry: entry[0], reverse=True)
    return scored

def semantic_search_requirements(query, data, vectors, threshold=0.5):
    """Find rows of ``data`` whose embedding is similar enough to ``query``.

    The query is cleaned with process_text, embedded with get_features and
    compared by cosine similarity against the embedding of every row.

    Args:
        query: Free-text query string.
        data: DataFrame with ``clean_sentence`` and ``annotated_class``
            columns. Its n-th row (by position) must correspond to
            ``vectors[n]``.
        vectors: Indexable collection of embedding arrays, one per row of
            ``data`` in row order.
        threshold: Minimum cosine similarity for a row to be reported.
            Defaults to 0.5.

    Returns:
        A list of ``(similarity, clean_sentence, annotated_class, index
        label)`` tuples for the rows scoring at least ``threshold``, sorted
        by similarity in descending order. Empty if nothing qualifies.
    """
    query = process_text(query)
    print("Extracting features...")
    query_vec = get_features(query)[0].ravel()
    res = []
    # iterrows() yields index *labels*, which only equal row positions when
    # the frame has a fresh 0..n-1 index. Look the vector up by position so
    # a filtered frame whose index was not reset still lines up correctly.
    for position, (i, row) in enumerate(data.iterrows()):
        sim = cosine_similarity(query_vec, vectors[position].ravel())
        if sim >= threshold:
            res.append((sim, row['clean_sentence'], row['annotated_class'], i))
    return sorted(res, key=lambda x: x[0], reverse=True)

#### Query 1

In [None]:
# Give the filtered frame a fresh 0..n-1 index so that row labels match row
# positions in BASE_VECTORS. The non-inplace reset_index returns a new
# DataFrame, so the column assignment below does not write into a slice of
# `df` (which is what triggers pandas' chained-assignment warning).
filtredData = filtredData.reset_index(drop=True)

# Add the cleaned text of every statement in one vectorised pass instead of
# writing cell by cell inside an iterrows() loop.
filtredData['clean_sentence'] = filtredData['statement'].map(process_text)

In [None]:
# Load the statements to be matched against the annotated segments.
df_requirements = pd.read_csv(
    '/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/data/processed_privacy_policy_segments_sample_100_3.csv',
    encoding='utf-8',
)
print(df_requirements)
semantic_results = []
for i, row in df_requirements.iterrows():
    sentence = row['statement']
    # Skip rows whose statement is missing or whitespace-only.
    if pd.isna(sentence) or sentence.strip() == "":
        continue
    print(sentence)
    result = semantic_search_requirements(sentence, filtredData, BASE_VECTORS)
    print(result)
    # Store the full ranked match list as text alongside the statement.
    df_requirements.at[i, 'semantic_similarity'] = str(result)
    if result:
        # Matches are sorted best-first; element 2 of a match tuple is the
        # annotated class of the matched segment.
        df_requirements.at[i, 'sim_class'] = str(result[0][2])

# Write the enriched table back to Drive.
df_requirements.to_csv(
    '/content/drive/My Drive/research_project/nlp_privacy_policy_analyze/data/semenatic_sim_privacy_policy_segments_sample_100_3.csv',
    sep=',',
)