In [64]:
import pandas as pd
pd.options.mode.chained_assignment = None 
import numpy as np
import re
import nltk
import os
import gensim
import gzip
import heapq
import tensorflow as tf
import sentencepiece as spm

from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Embedding, Dense, Flatten, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from absl import logging

from gensim.models import word2vec, KeyedVectors
from gensim.test.utils import common_texts, get_tmpfile

from nltk.stem import PorterStemmer
from nltk import word_tokenize

from sklearn.metrics.pairwise import cosine_similarity

from time import time

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [57]:
os.chdir('/Users/patrickrs/Documents/GitLab/patrick-steiner/revealapp/00_exploration/Pat')

data = pd.read_csv('/Users/patrickrs/Documents/Gitlab/patrick-steiner/Exercises/data/job_ads_eng.csv')  # .sample(50000, random_state=23)
data_original = data

In [58]:
#Function for cleaning and stemming the data
nltk.download('stopwords')
STOP_WORDS = nltk.corpus.stopwords.words('english')

def clean_sentence(val):
    "remove chars that are not letters or numbers, downcase, then remove stop words"
    regex = re.compile('([^\s\w]|_)+')
    sentence = regex.sub('', val).lower()
    sentence = re.sub("xxxx", "", sentence)
    sentence = re.sub("xxx", "", sentence)
    sentence = re.sub("xx", "", sentence)
    sentence = re.sub("\s\s+", " ", sentence)
       
    # stemming of words (seems not to affect accuracy, but should make things faster
   # porter = PorterStemmer()
   # words = word_tokenize(sentence)
   # sentence = " ".join([porter.stem(word) for word in words])
      
    sentence = sentence.split(" ")
    for word in list(sentence):
        if word in STOP_WORDS:
            sentence.remove(word)  
    sentence = " ".join(sentence)
    
    return sentence


def clean_dataframe(data):
    "drop nans, then apply 'clean_sentence' function to question1 and 2"
    data = data[data['Content'] == data['Content']]  # removes nan since nan == nan -> False
    
    for col in ['Content']:
        data[col] = data[col].apply(clean_sentence)
    
    return data

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/patrickrs/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [59]:
data = clean_dataframe(data)
data = data['Content']

In [60]:
#Build Corpus:
def build_corpus(data):
    "Creates a list of lists containing words from each sentence"
    corpus = []
    for content in data:
        corpus_temp = nltk.word_tokenize(content)
        corpus.append(corpus_temp)
    return corpus

corpus = build_corpus(data)              

# Word2Vec Implementation
This section compares a pre-trained model to a model trained on the data alone.

In [67]:
# Model 1 is trained only on the available data
model_1 = word2vec.Word2Vec(corpus, size=300, min_count=1)

In [61]:
# Importing pre-trained model, updating vocab to include only words present in current dataset.
# and training the model (takes long to run)
model_2 = word2vec.Word2Vec(size=300, min_count=1)
model_2.build_vocab(corpus)
total_examples = model_2.corpus_count
model = gensim.models.KeyedVectors.load_word2vec_format('/Users/patrickrs/Documents/Gitlab/patrick-steiner/revealapp/00_exploration/Pat/GoogleNews-vectors-negative300.bin', binary=True)
model_2.build_vocab([list(model.vocab.keys())], update=True)
model_2.intersect_word2vec_format('/Users/patrickrs/Documents/Gitlab/patrick-steiner/revealapp/00_exploration/Pat/GoogleNews-vectors-negative300.bin', binary=True, lockf=1.0)
# intersect_word2vec_format() will let you bring vectors from an external file into a model that's already had its own vocabulary initialized
# see https://tedboy.github.io/nlps/generated/generated/gensim.models.Word2Vec.intersect_word2vec_format.html
model_2.train(corpus, total_examples=total_examples, epochs=model_2.iter)

  # This is added back by InteractiveShellApp.init_path()


(14306495, 14306495)

In [70]:
# Let's save this model to disk:
wordvectors = model_2.wv
path = get_tmpfile("wordvectors.kv")
model_2.wv.save(path)
wv = KeyedVectors.load("model_2.wv", mmap='r')

FileNotFoundError: [Errno 2] No such file or directory: 'model_2.wv'

In [7]:
swords = model_2.most_similar('system', restrict_vocab=None)

  """Entry point for launching an IPython kernel.


In [123]:
swords

[('systems', 0.9131548404693604),
 ('maintenance', 0.8251392841339111),
 ('control', 0.8240404725074768),
 ('integration', 0.8239598870277405),
 ('etc', 0.8228959441184998),
 ('software', 0.8206909894943237),
 ('operation', 0.818752110004425),
 ('monitoring', 0.813651442527771),
 ('implementation', 0.8078340291976929),
 ('component', 0.8076769709587097)]

In [8]:
swords = [word[0] for word in swords]
swords

['systems',
 'maintenance',
 'integration',
 'control',
 'etc',
 'operation',
 'monitoring',
 'implementation',
 'software',
 'database']

In [118]:
model_1.most_similar('system', restrict_vocab=None)

  """Entry point for launching an IPython kernel.


[('systems', 0.7568676471710205),
 ('configuration', 0.6172717809677124),
 ('onmandatory', 0.6038140058517456),
 ('workflow', 0.5684077739715576),
 ('upgrades', 0.5452710390090942),
 ('operation', 0.5377562046051025),
 ('maintenance', 0.533307671546936),
 ('security', 0.5248088836669922),
 ('storage', 0.5245387554168701),
 ('deployment', 0.5200053453445435)]

In [64]:
#model['system'] # word vectror with cosine distance

In [78]:
#model.wv.word_vec('system', use_norm = True) # Word vector with euclidian distances

In [126]:
def return_similar_rows(sim_words, corpus):
    indexes = []
    for word in sim_words:
        for i in range(len(corpus)):
            if str(corpus[i]).find(word) > (-1):
                indexes.append(i)
    similar_rows = data_original['Content'][indexes]
    return similar_rows

In [129]:
similar_rows = return_similar_rows(sim_words = swords, corpus = corpus)
similar_rows

1       Ihre Herausforderung  You plan and implement A...
2         Since 1989 - more than a quarter of a centur...
8       Website Innovation Manager       ABOUT LA PRAI...
11      Senior Software Engineer Senior Software Engin...
13      Two Scientific researchers in Photovoltaic Mod...
                              ...                        
9945    Global Medical Director   Back to Previous Pag...
9948    Control Systems Engineer/Programmer (m/f)Dutie...
9975    SPS ENGINEER AUTOMATION (m/w) - 100%RHI Magnes...
9978    Strategic Product Portfolio Architect - Workfo...
9986    SWIFT Alliance / Financial Messaging Specialis...
Name: Content, Length: 20721, dtype: object

# Applying word2vec to whole sentences
This section applies word2vec to whole sentences by simply using the average of the word vectors.

In [134]:
def avg_sentence_vector(sentence, model):
    #function to average all words vectors in a given paragraph/sentence
    words = word_tokenize(sentence)
    sentenceVec = model[words[0]]
    for i in range(1, len(words)):
        sentenceVec = sentenceVec + model[words[i]]
    return sentenceVec

In [142]:
sentenceVec1 = avg_sentence_vector("hello people", model = model_2)
sentenceVec2 = avg_sentence_vector("howdy guys", model = model_2)
sen1_sen2_similarity =  cosine_similarity(sentenceVec1.reshape(1, -1),sentenceVec2.reshape(1, -1))
sen1_sen2_similarity

  after removing the cwd from sys.path.
  


array([[0.28418714]], dtype=float32)

In [148]:
sentenceVec1 = avg_sentence_vector(data[0], model = model_2)
sentenceVec2 = avg_sentence_vector(data[1], model = model_2)
sen1_sen2_similarity =  cosine_similarity(sentenceVec1.reshape(1, -1),sentenceVec2.reshape(1, -1))
sen1_sen2_similarity

  after removing the cwd from sys.path.
  


array([[0.95182484]], dtype=float32)

 This clearly doesnt work for long sentences or paragraphs because of the LLN.... Will have to find alternate solution

# Applying RNN for similarity calculation
inspired by: https://github.com/eliorc/Medium/blob/master/MaLSTM.ipynb

In [114]:
from time import time
import itertools
import datetime

from gensim.models import KeyedVectors
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras.layers import Input, Embedding, LSTM, Lambda
import tensorflow.keras.backend as K
from tensorflow.keras.optimizers import Adadelta
from tensorflow.keras.callbacks import ModelCheckpoint

In [19]:
s2n =[None] * len(data)  # s2n -> sentence to numbers representation
for i in range(len(data)):
        s2n[i]= [model_2[word] for word in word_tokenize(data[i])]
# this huge thing is created super fast

  This is separate from the ipykernel package so we can avoid doing imports until


In [28]:
# Define max length of sentence so we can pad shorter ones later. 
max_seq_length = max(map(len, s2n))
max_seq_length

3329

In [41]:
padded = pad_sequences(s2n[0], maxlen=max_seq_length)
padded.shape

(378, 3329)

In [43]:
len(s2n[0])

378

In [46]:
ex = [1, 2, 3]
pad_sequences(ex, maxlen = 5)

ValueError: `sequences` must be a list of iterables. Found non-iterable: 1

In [49]:
type(s2n[0])

list

# Let's try using the  TF-Hub Universal Encoder
see https://ai.googleblog.com/2019/07/multilingual-universal-sentence-encoder.html


In [7]:
import tensorflow_hub as hub

In [8]:
module_url = "https://tfhub.dev/google/universal-sentence-encoder/2" #@param ["https://tfhub.dev/google/universal-sentence-encoder/2", "https://tfhub.dev/google/universal-sentence-encoder-large/3"]
# see https://tfhub.dev/google/universal-sentence-encoder/4

In [9]:
# Import the Universal Sentence Encoder's TF Hub module (can take a few mins the first time around)
embed = hub.Module(module_url)

Instructions for updating:
Colocations handled automatically by placer.


Instructions for updating:
Colocations handled automatically by placer.


## Semantic Textual Similarity
The embeddings produced by the Universal Sentence Encoder are approximately normalized. The semantic similarity of two sentences can be trivially computed as the inner product of the encodings.

In [13]:
# Each row in data can be embeded
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    data_embeddings = session.run(embed(list(data)))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [14]:
data_embeddings[0]  

array([-0.05130013, -0.00565712,  0.03568609,  0.05133351, -0.01646298,
       -0.05262072,  0.00854534, -0.00012991, -0.05174257, -0.04823807,
       -0.02331096, -0.041965  , -0.04048616,  0.04619381, -0.05205887,
        0.05172755,  0.04799028, -0.01058761,  0.05254794, -0.0494557 ,
        0.05048445, -0.05253791, -0.05041573,  0.05268097, -0.03591403,
        0.04785269, -0.0223821 ,  0.04875357,  0.05069131, -0.05192769,
        0.03304809,  0.02723596,  0.01256995, -0.05118216, -0.04427535,
        0.05248671,  0.05168652, -0.05252631, -0.05268215, -0.04344384,
       -0.0386739 , -0.05268234, -0.05266885,  0.05268234, -0.05245078,
        0.05246299,  0.04202251, -0.02781977,  0.02364252, -0.05268234,
        0.05093616,  0.05235935,  0.05268234, -0.04649564, -0.03376671,
        0.05268233, -0.04624163, -0.05268233, -0.05227803,  0.02614941,
        0.0479455 ,  0.00782102,  0.01165644, -0.04419516, -0.05088451,
       -0.0526678 ,  0.05242542, -0.00879739, -0.05205579,  0.05

In [15]:
# And a similarity matrix (sim_mat) can be calculated using the inner product:
sim_mat = np.inner(data_embeddings, data_embeddings)

In [16]:
sim_mat

array([[1.0000002 , 0.4655776 , 0.8773799 , ..., 0.6845747 , 0.7198032 ,
        0.6262263 ],
       [0.4655776 , 1.0000001 , 0.49171638, ..., 0.435234  , 0.45266008,
        0.38144305],
       [0.8773799 , 0.49171638, 0.99999994, ..., 0.6869832 , 0.7954156 ,
        0.6045858 ],
       ...,
       [0.6845747 , 0.435234  , 0.6869832 , ..., 0.9999996 , 0.6660189 ,
        0.5737752 ],
       [0.7198032 , 0.45266008, 0.7954156 , ..., 0.6660189 , 0.99999976,
        0.6639155 ],
       [0.6262263 , 0.38144305, 0.6045858 , ..., 0.5737752 , 0.6639155 ,
        1.0000002 ]], dtype=float32)

### Comparison to word2vec

In [14]:
swords # See word2vec above

['systems',
 'maintenance',
 'integration',
 'control',
 'etc',
 'operation',
 'monitoring',
 'implementation',
 'software',
 'database']

In [41]:
# Lets first calculate the embeddings for each word in the corpus using the USE:
corpus_list = [word for sublist in corpus for word in sublist]
corpus_list= list(dict.fromkeys(corpus_list)) # transform in dict to remove duplicates

In [42]:
with tf.Session() as session:
    session.run([tf.global_variables_initializer(), tf.tables_initializer()])
    use_corpus_emb = session.run([embed([word]) for word in corpus_list])
# This takes forwever, might be a good idea to use the lite version next time
# see https://github.com/tensorflow/tfjs-models/tree/master/universal-sentence-encoder for this

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


KeyboardInterrupt: 

In [44]:
# The embedding of the word we are comparing:
start = time()
use_system = embed(list('system'))
end = time()
print(end - start)


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


4.126125812530518


In [None]:
# And the similarity:
system_sim = [cosine_similarity(use_system, word_emb) for word_emb in use_corpus_emb]

In [None]:
# The 10 largest similarities and their indexes 
# (https://stackoverflow.com/questions/13070461/get-indices-of-the-top-n-values-of-a-list)
top10_use_index = zip(*heapq.nlargest(10, enumerate(system_sim), key=operator.itemgetter(1)))[0]

In [None]:
# And, finaly, the words:
corpus_list[top10_use_index]

### Similarity matrix works well, but what about learning? 
In order to implement the feedback we need to add layers to the pre-treained model.
(See https://www.dlology.com/blog/keras-meets-universal-sentence-encoder-transfer-learning-for-text-data/
https://towardsdatascience.com/using-use-universal-sentence-encoder-to-detect-fake-news-dfc02dc32ae9 for examples)

In [112]:
def UniversalEmbedding(x):
    return embed(tf.squeeze(tf.cast(x, tf.string)), signature="default", as_dict=True)["default"]
# tf.squeeze: Given a tensor input, this operation returns a tensor of the same type with all dimensions 
# of size 1 removed. If you don't want to remove all size 1 dimensions, you can remove specific size 1 dimensions 
# by specifying axis
# tf.cast(x, dtype): Casts a tensor to a new type.


In [None]:
embed_size = 512
input_text = layers.Input(shape=(1,), dtype=tf.string)
embedding = layers.Lambda(UniversalEmbedding, output_shape=(embed_size,))(input_text)
dense = layers.Dense(256, activation='relu')(embedding)
pred = layers.Dense(2, activation='sigmoid')(dense) # 2 catregorie
model = Model(inputs=[input_text], outputs=pred)
model.compile(loss='binary_crossentropy', 
	optimizer='nadam', metrics=['mse', 'accuracy'])

In [None]:
model.summary()

# Trying USE Lite
https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/semantic_similarity_with_tf_hub_universal_encoder_lite.ipynb

In [22]:
import tensorflow.compat.v1 as tf
tf.disable_v2_behavior()

Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [23]:
l_module = hub.Module("https://tfhub.dev/google/universal-sentence-encoder-lite/2")

In [24]:
input_placeholder = tf.sparse_placeholder(tf.int64, shape=[None, None])
encodings = l_module(inputs=dict(values=input_placeholder.values,
                                 indices=input_placeholder.indices,
                                 dense_shape=input_placeholder.dense_shape))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [25]:
with tf.Session() as sess:
  spm_path = sess.run(l_module(signature="spm_path"))

sp = spm.SentencePieceProcessor()
sp.Load(spm_path)
print("SentencePiece model loaded at {}.".format(spm_path))

INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


SentencePiece model loaded at b'/var/folders/w8/9j9hhb7d2f77hqdbf_btrxn00000gn/T/tfhub_modules/539544f0a997d91c327c23285ea00c37588d92cc/assets/universal_encoder_8k_spm.model'.


In [26]:
def process_to_IDs_in_sparse_format(sp, sentences):
  # An utility method that processes sentences with the sentence piece processor
  # 'sp' and returns the results in tf.SparseTensor-similar format:
  # (values, indices, dense_shape)
  ids = [sp.EncodeAsIds(x) for x in sentences]
  max_len = max(len(x) for x in ids)
  dense_shape=(len(ids), max_len)
  values=[item for sublist in ids for item in sublist]
  indices=[[row,col] for row in range(len(ids)) for col in range(len(ids[row]))]
  return (values, indices, dense_shape)

In [48]:
# Compute a representation for each message, showing various lengths supported.
word = "Elephant"
sentence = "I am a sentence for which I would like to get its embedding."
paragraph = (
    "Universal Sentence Encoder embeddings also support short paragraphs. "
    "There is no hard limit on how long the paragraph is. Roughly, the longer "
    "the more 'diluted' the embedding will be.")
messages = [word, sentence, paragraph]

values, indices, dense_shape = process_to_IDs_in_sparse_format(sp, ['system'])



In [49]:
start = time()
with tf.Session() as session:
  session.run([tf.global_variables_initializer(), tf.tables_initializer()])
  message_embeddings = session.run(
      encodings,
      feed_dict={input_placeholder.values: values,
                input_placeholder.indices: indices,
                input_placeholder.dense_shape: dense_shape})
end = time()
print(start - end)

-216.4357509613037


In [39]:
len(corpus_list)

2861299

In [50]:
len(list(dict.fromkeys(corpus_list)))

76075

# NN for word2vec embeddings

In [62]:
# see https://sturzamihai.com/how-to-use-pre-trained-word-vectors-with-keras/
embedding_matrix = np.zeros((len(model_2.wv.vocab) + 1, 300))
for i, vec in enumerate(model_2.wv.vectors):
  embedding_matrix[i] = vec

In [66]:
# how many features should the tokenizer extract
features = 500
tokenizer = Tokenizer(num_words = features)
# fit the tokenizer on our text
tokenizer.fit_on_texts(data)

In [67]:
# get all words that the tokenizer knows
word_index = tokenizer.word_index

In [68]:
# put the tokens in a matrix
X = tokenizer.texts_to_sequences(data)
X = pad_sequences(X)

In [72]:
# init model
model = Sequential()
# emmbed word vectors
model.add(Embedding(len(model_2.wv.vocab) + 1 ,
                    300,
                    input_length  = X.shape[1],
                    weights = [embedding_matrix],
                    trainable=False))
# learn the correlations
model.add(LSTM(300,return_sequences=False))
model.add(Dense(1,activation="sigmoid")) 
# output model skeleton
model.summary()
model.compile(optimizer="nadam",loss="binary_crossentropy",metrics=['acc'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 755, 300)          916212300 
_________________________________________________________________
lstm (LSTM)                  (None, 300)               721200    
_________________________________________________________________
dense (Dense)                (None, 1)                 301       
Total params: 916,933,801
Trainable params: 721,501
Non-trainable params: 916,212,300
_________________________________________________________________


In [73]:
data.shape

(10000,)

In [75]:
len(model_2.wv.vocab) * 300

916212000