### Tag generation prototype v0.2
Created: 12 July 2022
v0.2 created: 15 July 2022

parrot: https://github.com/PrithivirajDamodaran/Parrot_Paraphraser <br>
rasa prototype: https://colab.research.google.com/drive/1RGWrQv3e0CRDPDROQ3ZmUWTmlRljasGi#scrollTo=776uG9Q6DTnf <br>

In [67]:
import re
import random
import itertools
import warnings
warnings.filterwarnings("ignore")

# numpy
import numpy as np
from numpy.linalg import norm

# nltk, spacy, gensim
import spacy
import nltk
from nltk.corpus import wordnet as wn
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from gensim.models.word2vec import Word2Vec
import gensim.downloader as api

# pytorch, parrot
from parrot import Parrot
import torch

# sentence transformers
from sentence_transformers import SentenceTransformer, util

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/matthewstachyra/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/matthewstachyra/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthewstachyra/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## what will make up tags?
#### current philosophy is to create as many unique but semantically related tokens as possible, and use those as tags

#### WIP tags generated
1. tokens from words in note
2. tokens from paraphrases of note
3. synonyms of nouns in note
4. synonyms of nouns in each paraphrase

## using `parrot` paraphraser to generate paraphrases, demo below

In [4]:
parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")

In [5]:
phrases = ["Can you recommed some upscale restaurants in Newyork?",
           "I was sad that I failed my exam but I knew that I could score better next time."
]

for phrase in phrases:
    print("-"*100)
    print("Input_phrase: ", phrase)
    print("-"*100)
    para_phrases = parrot.augment(input_phrase=phrase, use_gpu=False)
    for para_phrase in para_phrases:
        print(para_phrase)

----------------------------------------------------------------------------------------------------
Input_phrase:  Can you recommed some upscale restaurants in Newyork?
----------------------------------------------------------------------------------------------------
('list some upscale restaurants in newyork?', 27)
('can you recommend some of the best restaurants in newyork?', 23)
('can you recommend some highend restaurants in newyork?', 20)
('can you recommend some good upscale restaurants in new york?', 19)
('can you recommend some upscale restaurants in new york?', 14)
('can you recommend some upscale restaurants in newyork?', 13)
----------------------------------------------------------------------------------------------------
Input_phrase:  I was sad that I failed my exam but I knew that I could score better next time.
----------------------------------------------------------------------------------------------------
('i was sad to miss the exam but i knew i could score a 

## creating classes to establish pipeline: preprocessing text, generating synonyms, generating paraphrases, generating tags

In [9]:
!python -m spacy download en_core_web_sm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Collecting en-core-web-sm==3.4.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.4.0/en_core_web_sm-3.4.0-py3-none-any.whl (12.8 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m26.8 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m0:01[0m01[0m
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.4.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')


In [6]:
class Preprocessor:
    def __init__(self, note, remove_stopwords=False, lemmatize=False):
        if not note: raise ValueError("Error: Input is invalid. It should be a string.")
        self.note = note
        self.stopwords = remove_stopwords
        self.lemmatize = lemmatize
        
    def __call__(self):
        tokenizer = RegexpTokenizer(r'\w+')
        re_strip = re.compile('<.*?>')
        lemmatizer = WordNetLemmatizer()
        
        self.note = self.note.lower()
        self.note = re.sub(re_strip, '', self.note)
        self.note = re.sub('[0-9]+', '', self.note)
        self.note = " ".join(tokenizer.tokenize(self.note))
        
        if self.lemmatize:
            self.note = "".join([lemmatizer.lemmatize(word) for word in self.note])
        
        if self.stopwords:
            return self.remove_stopwords()
        else:
            return self.note
    
    def remove_stopwords(self):
        stopwords = nltk.corpus.stopwords.words('english')
        return " ".join([word for word in self.note.split() if word not in stopwords])

In [7]:
p = Preprocessor("This is a test note!@$#!@#!")
p()

'this is a test note'

In [8]:
p = Preprocessor("This is a test note!@$#!@#!", remove_stopwords=True)
p()

'test note'

In [15]:
class Synonymizer:
    def __init__(self, note):
        self.preprocessor = Preprocessor(note)
        self.note = self.preprocessor()
        self.glovemodel = api.load("glove-wiki-gigaword-100")
        self.spacymodel = spacy.load("en_core_web_sm")
        self.posmap = {'VERB':'v', 'NOUN':'n', 'PRON':'n', 'PROPN':'n', 'ADJ':'a', 'ADV':'r'}
        
    def __call__(self):
        '''return dictionary of words : synonym(s) pairs.
        '''
        # NOTE current version removes n grams.
        d = {}

        for word in self.note.split():
            synonyms = self.synonyms_by_word(word)

            if synonyms: synonyms = list(set([synonym
                                     for synonym in synonyms
                                     if len(synonym.split("_"))==1]))

            if synonyms: d[word] = synonyms
            
        return d
        
    def pos_by_word(self, word):
        for w in self.spacymodel(self.note):
            if str(w)==word: return w.pos_

    def similarities_by_word(self, word, synonyms):
        def cosinesim(v1, v2):
            return (np.dot(v1, v2 / (norm(v1) * norm(v2))))
        
        def embed(vector, model):
            try:
                vec = self.glovemodel.get_vector(vector)
            except:
                return np.empty(0)
            return vec

        sims = {word: 1.0}
        ref  = embed(word, self.glovemodel)

        for s in synonyms:
            vec = embed(s, self.glovemodel)
            if vec.any():
                sim     = cosinesim(ref, vec)
                sims[s] = sim

        return sims

    def print_similarities(self, similarities):
        for synonym, similarity in similarities.items():
            print(f"word: {synonym}, cosine similarity: {similarity}")

    def synonyms_by_word(self, word):
        pos = self.pos_by_word(word)

        if pos not in ['VERB', 'NOUN', 'PRON', 'PROPN', 'ADJ', 'ADV']: return

        # get full set of synonyms
        synonyms = set(list(itertools.chain([synonym
                                             for synset in wn.synsets(word, pos=self.posmap[pos])
                                             for synonym in synset.lemma_names()
                                             if len(word)>1])))

        # filter this set using cosine similarities
        similarities = self.similarities_by_word(word, synonyms)

        return [synonym
                for synonym, similarity in similarities.items()
                if similarity>=0.70]
    

In [16]:
s = Synonymizer("I was sad that I failed my exam but I knew that I could score better next time.")

In [17]:
s()

{'i': ['i'],
 'sad': ['sorry', 'sad'],
 'failed': ['fail', 'failed'],
 'my': ['my'],
 'exam': ['examination', 'exam'],
 'knew': ['know', 'knew'],
 'score': ['score'],
 'better': ['best', 'good', 'better', 'well'],
 'next': ['next', 'future'],
 'time': ['time']}

In [18]:
class Paraphraser:
    def __init__(self, note, with_gpu=False):
        self.paraphrases = []
        self.preprocessor = Preprocessor(note)
        self.synonymizer = Synonymizer(note)
        self.parrot = Parrot(model_tag="prithivida/parrot_paraphraser_on_T5")
        self.note = self.preprocessor()
        self.synonyms = self.synonymizer() 
        self.gpu = with_gpu
    
    def __call__(self):
        '''return list of paraphrases of the note.
        '''
        self.transformer_phrases()
        self.synonym_phrases()
        random.shuffle(self.paraphrases)
        return self.paraphrases
        
    def transformer_phrases(self):
        phrases = [tup[0] for tup in parrot.augment(input_phrase=self.note, 
                                                 use_gpu=self.gpu)]
        self.paraphrases.extend(phrases)
        
    def synonym_phrases(self):
        genlist = []
        tokens  = []
        prev    = 0

        for word in self.note.split():
            if word in self.synonyms:
                tokens.append(list(itertools.chain(*[[word], self.synonyms[word]])))
            else:
                tokens.append([word])

        # use tokens to return new utterances
        for i in range(len(tokens)):
            word  = tokens[i][0]
            slist = tokens[i]

            for j in range(len(slist)):
                start = self.note.find(word, prev)
                end   = start + len(word)
                gen   = self.note[:start] + slist[j] + self.note[end:]
                genlist.append(gen)

            prev = end

        self.paraphrases.extend(list(set(genlist)))

In [19]:
p = Paraphraser("I was sad that I failed my exam but I knew that I could score better next time.")

In [141]:
p()

['i was sad to fail an exam but i knew i could get better next time', 'i was sad that i failed my exams but i knew i would be able to score better next time', 'i was sad that i failed the exams but i knew i could score better next time', 'i was sad that i failed my exam but i knew i could score better the next time', 'i was sad that i failed my exams but i knew i could score better next time', 'i was sad that i failed my exam but i knew i could score better next time', 'i was sad i failed my exam but i knew that i could score better next time', 'i was sad that i failed my exam but i knew that i could score better next time']


['i was sad to fail an exam but i knew i could get better next time',
 'i was sad that i failed my exams but i knew i would be able to score better next time',
 'i was sad that i failed the exams but i knew i could score better next time',
 'i was sad that i failed my exam but i knew i could score better the next time',
 'i was sad that i failed my exams but i knew i could score better next time',
 'i was sad that i failed my exam but i knew i could score better next time',
 'i was sad i failed my exam but i knew that i could score better next time',
 'i was sad that i failed my exam but i knew that i could score better next time',
 'i was sad that i failed my exam but i knew that i could score better future time',
 'i was sad that i failed my exam but i knew that i could score better next time',
 'i was sorry that i failed my exam but i knew that i could score better next time',
 'i was sad that i failed my exam but i knew that i could score well next time',
 'i was sad that i failed 

Tags
- note
- cleaned note split
- paraphrases
- cleaned paraphrases split


In [60]:
class Tagger:
    def __init__(self, note, with_gpu=False):
        self.tags = [note]
        self.paraphraser = Paraphraser(note, with_gpu)
        self.note = self.paraphraser.note
        self.preprocessor = Preprocessor(note, lemmatize=True, remove_stopwords=True)
        
    def __call__(self):
        self.note_tags()
        self.paraphrase_tags()
        return list(set(self.tags))

    def note_tags(self):
        self.tags.append(self.preprocessor())
        
    def paraphrase_tags(self):
        for p in self.paraphraser():
            self.tags.append(p)
            c = Preprocessor(p, lemmatize=True, remove_stopwords=True)
            self.tags.extend(c().split())
     

In [61]:
t = Tagger("My uncle took me to get ice cream. I learned that I love chocolate")

In [62]:
t()

['my uncle have me to get ice cream i learned that i love chocolate',
 'my uncle took me out for ice cream i discovered that i love chocolates',
 'my uncle took me to get ice cream i learn that i love chocolate',
 'start',
 'uncle took get ice cream learned love chocolate',
 'come',
 'my uncle took me to let ice cream i learned that i love chocolate',
 'my uncle took me to go ice cream i learned that i love chocolate',
 'let',
 'bring',
 'find',
 'took',
 'love',
 'chocolates',
 'my uncle took me to find ice cream i learned that i love chocolate',
 'learn',
 'my uncle took me to take ice cream i learned that i love chocolate',
 'chocolate',
 'my uncle took me to the ice cream shop and i learned that i love chocolate',
 'shop',
 'learned',
 'get',
 'uncle',
 'my uncle took me to ice cream and i learned that i love chocolate',
 'make',
 'My uncle took me to get ice cream. I learned that I love chocolate',
 'ice',
 'my uncle take me to get ice cream i learned that i love chocolate',
 'my 

### trial `sentence-transformers` for asymmetric semantic search

In [63]:
embedder = SentenceTransformer('all-MiniLM-L6-v2')

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Downloading:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/612 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/350 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/13.2k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [84]:
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'A cheetah is running behind its prey.',
          'apples, oranges, eggs, chicken'
          ]

In [85]:
corpus_embeddings = embedder.encode(corpus, convert_to_tensor=True)

In [86]:
corpus_embeddings

tensor([[ 0.0332,  0.0044, -0.0063,  ...,  0.0692, -0.0246, -0.0376],
        [ 0.0525,  0.0552, -0.0112,  ..., -0.0162, -0.0602, -0.0412],
        [-0.0363, -0.0357, -0.0272,  ..., -0.0386,  0.1057, -0.0013],
        ...,
        [ 0.0235, -0.0585,  0.0560,  ...,  0.0583,  0.0377,  0.0410],
        [ 0.0228,  0.1041, -0.0340,  ...,  0.0029,  0.0386,  0.0438],
        [ 0.0042,  0.0007, -0.0168,  ...,  0.0888,  0.1127,  0.0361]])

In [87]:
query = 'grocery list' 

In [88]:
query_embedding = embedder.encode(query, convert_to_tensor=True)

In [89]:
hits = util.semantic_search(query_embedding, corpus_embeddings, top_k=5, score_function=util.dot_score)

In [90]:
hits

[[{'corpus_id': 9, 'score': 0.3955247104167938},
  {'corpus_id': 0, 'score': 0.21664908528327942},
  {'corpus_id': 1, 'score': 0.19876854121685028},
  {'corpus_id': 8, 'score': 0.06794281303882599},
  {'corpus_id': 2, 'score': 0.050490930676460266}]]

In [91]:
# hit 1
print(corpus[hits[0][0]['corpus_id']])

# hit 2
print(corpus[hits[0][1]['corpus_id']])

apples, oranges, eggs, chicken
A man is eating food.
