# Finding Similar Tags

## Preparation

### Packages

In [87]:
import pandas as pd
import numpy as np

#progress bar
from tqdm import tqdm
tqdm.pandas(desc="progress-bar")

#NLP Toolkit
import nltk
from nltk.corpus import stopwords
nltk.download('punkt')

#Embeddings
import gensim
#from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models.doc2vec import TaggedDocument

#plotting
import matplotlib.pyplot as plt

None

  from pandas import Panel
[nltk_data] Downloading package punkt to /home/sevi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Functions

In [137]:
def tokenize_text_by_sentence(text, stopwords):
    '''creates a list of list where each list contains the tokens of a sentence.'''
    list_all = []
    for sent in nltk.sent_tokenize(text):
        curr_sent = []
        for word in nltk.word_tokenize(sent):
            if(word in stopwords):
                continue
            curr_sent.append(word.lower())
        list_all.append(curr_sent)
    return list_all

### Sample Data

In [131]:
doc_1 = '''I opened up a money market account at First Niagara Bank in XXXX XXXX CT on XXXX/XXXX/2016. I was told that the APY interest rate was 1.01 % based on the advertisement in the XXXX XXXX as well as a verbal discussion. There was no mention that the rate was related to compounding and this was confirmed verbally with the bank officer. And that taking out the interest monthly would not affect the rate that I was to receive. After 33 days of " interest '', from XX/XX/XXXX to XX/XX/XXXX, I received not XXXX of 1.01 % but rather XXXX of 0.81 % and my printed statement says that my APY is now 0.81 % not the 1.01 % I was expecting. 
The interest I received was XXXX % LESS that I was expecting. The difference in the amount of money is not very much but First Niagara, in my opinion, lied to me with false advertisements and false verbal discussions, apparently to get me to put my money into a seemingly high interest money market account. Multiply this by XXXX customers and XXXX dollars and the amount of money they defrauded consumers could be very substantial. In my opinion, First Niagara deceived me and its customers and violated honest banking practices.
Product: Bank account or service'''
doc_2 = '''In this post we covered different approaches for word representation in NLP tasks (BOW, TF-IDF and Word Embeddings), learnt how to learn word representation from its context using Word2Vec, saw how we can extract meaningful phrases from a given corpus (NPMI and data-driven approach) and how to transform a given corpus in order to learn similar terms/words for each one of extracted terms/words using Word2Vec algorithm. The results of this process can be used in a downstream task, like Query Expansion in Information Extraction tasks, Document Classification, Clustering, Question-Answering and many more.'''
doc_3 = '''On our 1.6 billion words corpus, it took us 1 hour to construct bi-grams and another 2 hours to train Word2Vec (with batch Skip-Gram, 300 dimension, 10 epochs, context of k=5 , negative sampling of 5, learning rate of 0.01 and minimum word count of 5) on a machine with 16 CPUs and 64 RAM using AWS Sagemaker service. A great Notebook example of how to use AWS Sagemaker service to train Word2Vec can be found here.'''

In [132]:
sample_text = pd.DataFrame(data = [doc_1, doc_2, doc_3])

In [133]:
sample_text

Unnamed: 0,0
0,I opened up a money market account at First Ni...
1,In this post we covered different approaches f...
2,"On our 1.6 billion words corpus, it took us 1 ..."


## Creating AdHoc Word2Vec Word Embedding

### Tokenize

In [139]:
STOPWORDS = ['.']

In [140]:
text_tokens = sample_text.apply(lambda row: tokenize_text_by_sentence(row[0], STOPWORDS), axis=1)# tokenize_text(sample_text)

In [141]:
text_tokens = list(text_tokens)

In [143]:
corpus = [sent_tokens for sent_tokens_lists in text_tokens for sent_tokens in sent_tokens_lists]

### Embedding

In [200]:
embedding_dim = 100
window_size = 20
min_term_occurence = 5

embedding = Word2Vec(
                        sentences = corpus, 
                        size=embedding_dim, 
                        window=window_size, 
                        min_count=min_term_occurence, 
                        workers=-1,
                        seed = 42
                        ) 

In [131]:
doc_1 = '''I opened up a money market account at First Niagara Bank in XXXX XXXX CT on XXXX/XXXX/2016. I was told that the APY interest rate was 1.01 % based on the advertisement in the XXXX XXXX as well as a verbal discussion. There was no mention that the rate was related to compounding and this was confirmed verbally with the bank officer. And that taking out the interest monthly would not affect the rate that I was to receive. After 33 days of " interest '', from XX/XX/XXXX to XX/XX/XXXX, I received not XXXX of 1.01 % but rather XXXX of 0.81 % and my printed statement says that my APY is now 0.81 % not the 1.01 % I was expecting. 
The interest I received was XXXX % LESS that I was expecting. The difference in the amount of money is not very much but First Niagara, in my opinion, lied to me with false advertisements and false verbal discussions, apparently to get me to put my money into a seemingly high interest money market account. Multiply this by XXXX customers and XXXX dollars and the amount of money they defrauded consumers could be very substantial. In my opinion, First Niagara deceived me and its customers and violated honest banking practices.
Product: Bank account or service'''
doc_2 = '''In this post we covered different approaches for word representation in NLP tasks (BOW, TF-IDF and Word Embeddings), learnt how to learn word representation from its context using Word2Vec, saw how we can extract meaningful phrases from a given corpus (NPMI and data-driven approach) and how to transform a given corpus in order to learn similar terms/words for each one of extracted terms/words using Word2Vec algorithm. The results of this process can be used in a downstream task, like Query Expansion in Information Extraction tasks, Document Classification, Clustering, Question-Answering and many more.'''
doc_3 = '''On our 1.6 billion words corpus, it took us 1 hour to construct bi-grams and another 2 hours to train Word2Vec (with batch Skip-Gram, 300 dimension, 10 epochs, context of k=5 , negative sampling of 5, learning rate of 0.01 and minimum word count of 5) on a machine with 16 CPUs and 64 RAM using AWS Sagemaker service. A great Notebook example of how to use AWS Sagemaker service to train Word2Vec can be found here.'''

In [132]:
sample_text = pd.DataFrame(data = [doc_1, doc_2, doc_3])

In [133]:
sample_text

Unnamed: 0,0
0,I opened up a money market account at First Ni...
1,In this post we covered different approaches f...
2,"On our 1.6 billion words corpus, it took us 1 ..."


## Creating AdHoc Word2Vec Word Embedding

### Tokenize

In [139]:
STOPWORDS = ['.']

In [140]:
text_tokens = sample_text.apply(lambda row: tokenize_text_by_sentence(row[0], STOPWORDS), axis=1)# tokenize_text(sample_text)

In [141]:
text_tokens = list(text_tokens)

In [143]:
corpus = [sent_tokens for sent_tokens_lists in text_tokens for sent_tokens in sent_tokens_lists]

## Finding most similar terms

In [190]:
def n_most_similar_words(term, number_of_similar):
    similar = embedding.wv.most_similar(positive=[term], topn = number_of_similar)
    return [term_tuple[0] for term_tuple in similar], [term_tuple[1] for term_tuple in similar]   

In [191]:
similar = embedding.wv.most_similar(positive=['money'])

In [192]:
[term_tuple[0] for term_tuple in similar]

['that', 'interest', 'in', 'the', 'my', '%', ',', 'xxxx', 'and', 'a']

## Continuous Training as new data comes in

### Example

In [197]:

#Current words in corpus
print(embedding.corpus_total_words)
#which is equal to the total tokens in our original set of tokens:
all_tokens_in_corpus = [tokens for sub_corp in corpus for tokens in sub_corp]
print(len(all_tokens_in_corpus))

18
411


In [168]:
new_doc_tokens = [['This', 'is', 'the', 'first', 'sentence', 'of', 'the', 'new', 'document'],
                 ['This', 'is', 'the', 'second', 'sentence', 'of', 'the', 'new', 'document']]

In [196]:
#at least one of them is not already in the corpus:
'sentence' in all_tokens_in_corpus

False

In [195]:
#adding new words to vocabulary
embedding.build_vocab(new_doc_tokens, update=True)
#retraining weights of NN
embedding.train(new_doc_tokens, total_examples=2, epochs=1)

#embedding.train(new_doc_tokens, total_examples=2, epochs=10)

ValueError: negative dimensions are not allowed

In [193]:
print(embedding.corpus_total_words)

411


In [182]:
embedding.wv.most_similar(positive=['sentence'])

KeyError: "word 'sentence' not in vocabulary"

In [201]:
len(embedding.wv.vocab)

15

In [202]:
embedding.build_vocab([['potoatoes', 'and', 'farmers']], update=True)
len(embedding.wv.vocab)

15