<table class="tfo-notebook-buttons" align="left">
<td>
<a target="_blank"  href="https://colab.research.google.com/github/mlai-demo/TextExplore/blob/master/RePlutarch_TFembPub.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
</td><td>
<a target="_blank"  href="https://github.com/mlai-demo/TextExplore/blob/master/RePlutarch_TFembPub.ipynb"><img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td></table>

In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Request TensorFlow 2.x through the %tensorflow_version magic (Colab only).
# Any exception raised here is swallowed, so the `import tensorflow` below
# falls back to whatever TensorFlow the environment already provides.
try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import tensorflow as tf

In [2]:
# Show which TensorFlow version was actually imported.
print(tf.__version__)

2.0.0-rc0


In [0]:
import os

# Working directory of the kernel; the corpus file is read from here later.
fpath = os.getcwd()
fpath

In [0]:
# If running in Google Colab, prompt for the corpus file and upload it.
# Outside Colab the google.colab package cannot be imported, so the upload is
# skipped (same guard as the download cell further down) and the corpus file
# is expected to already sit in the working directory.
try:
    from google.colab import files
except ImportError:
    pass
else:
    uploaded = files.upload()

    # Report every uploaded file together with its size in bytes.
    for fn in uploaded.keys():
        print('User uploaded file "{name}" with length {length} bytes'.format(
            name=fn, length=len(uploaded[fn])))

# Click the Files tab - the uploaded file(s) will be there

In [0]:
import re

# Read the corpus as bytes and decode it explicitly as UTF-8. The context
# manager closes the file handle as soon as the content has been read.
with open(fpath + '/Plutarch.txt', 'rb') as corpus_file:
    # Lower-case *after* decoding: bytes.lower() only converts ASCII letters,
    # so lower-casing the raw bytes would leave non-ASCII capitals unchanged.
    corpus = corpus_file.read().decode(encoding='utf-8').lower()

# Replace every line feed and carriage return with a space.
corpus = re.sub('[\r\n]', ' ', corpus)

In [0]:
import nltk
from nltk.tokenize import sent_tokenize

# The 'punkt' resource has to be downloaded again in Colab after a runtime reset.
nltk.download('punkt')

# Split the corpus into sentences and report how many were found.
sentences = sent_tokenize(corpus)
print("The number of sentences is {}".format(len(sentences)))

In [0]:
from nltk.tokenize import word_tokenize


def word_count(sentence):
    """Return the number of tokens word_tokenize produces for `sentence`."""
    return len(word_tokenize(sentence))


# Find the sentence with the most tokens and report its token count.
longest_sentence = max(sentences, key=word_count)
length_longest_sentence = word_count(longest_sentence)
print("The longest sentence has {} words".format(length_longest_sentence))

In [0]:
from tensorflow.keras.preprocessing.text import Tokenizer
# Fit a Keras Tokenizer (default settings) on the sentences, then convert
# every sentence into its sequence of integer word ids.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)
sent_numeric = tokenizer.texts_to_sequences(sentences)

In [0]:
# Number of distinct words the tokenizer has indexed.
len(tokenizer.word_index)

In [0]:
# Work on a copy of the tokenizer's word -> id mapping and add id 0 for padding.
word_index = dict(tokenizer.word_index)
word_index["<PAD>"] = 0

# The vocabulary size includes the padding entry added above.
vocab_size = len(word_index)
vocab_size

In [0]:
# Spot-check the integer ids assigned to a handful of words.
for word in ('considering', 'therefore', 'great', 'oppose'):
    word_id = word_index[word]
    print('{}: {}'.format(word, word_id))

In [0]:
# Integer-encoded form of the third and fourth sentences.
sent_numeric[2:4]

In [0]:
# Raw text of the third and fourth sentences.
sentences[2:4]

In [0]:
# Inverse lookup table: integer id -> word.
reverse_word_index = {index: token for token, index in word_index.items()}

def decode_data(text):
    """Turn a sequence of integer ids back into a space-separated string.

    Ids that are missing from reverse_word_index are rendered as '?'.
    """
    decoded_tokens = (reverse_word_index.get(token_id, '?') for token_id in text)
    return ' '.join(decoded_tokens)
  
#print(reverse_word_index)

In [0]:
# Decode the fourth encoded sentence back into words.
decode_data(sent_numeric[3])

In [0]:
# Integer ids of the fourth sentence.
sent_numeric[3]

In [0]:
# Pad every encoded sentence to the length of the longest *encoded* sentence.
# The width is measured on sent_numeric itself rather than taken from the NLTK
# word count: the Keras Tokenizer and NLTK split text differently, and
# pad_sequences silently truncates any sequence longer than maxlen.
maxLen = max(len(sequence) for sequence in sent_numeric)
data = tf.keras.preprocessing.sequence.pad_sequences(sent_numeric,
                                                     value=word_index["<PAD>"],
                                                     padding='post',
                                                     maxlen=maxLen)

In [0]:
# First sentence after padding.
data[0]

In [0]:
# The padded first sentence decoded back to words; id 0 decodes to "<PAD>".
decode_data(data[0])

In [0]:
# embedding layer by itself

import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding

embedding_dim = 100

# A model whose only layer is an Embedding over the padded sentences.
model_justembed = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=maxLen),
])

model_justembed.compile(optimizer='adam', loss='mse')
model_justembed.summary()

In [0]:
# Run the padded sentences through the embedding-only model.
output_array = model_justembed.predict(data)
#output_array

In [0]:
from tensorflow.keras import layers
from tensorflow.keras.layers import Dense

In [0]:
embedding_dim = 100

# Embedding (id 0 masked) -> average over the sequence -> single sigmoid unit.
model = tf.keras.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxLen, mask_zero=True))
model.add(layers.GlobalAveragePooling1D())
#model.add(layers.Dense(100, activation='relu'))  #uncomment to compare the versions
model.add(layers.Dense(1, activation='sigmoid'))

model.summary()

In [0]:
import numpy as np

# The optimizer object (not the string 'adam') is handed to compile() so that
# the hyper-parameters set here are the ones actually used for training.
# `learning_rate` is the current name of the argument formerly spelled `lr`.
adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                                epsilon=1e-07, amsgrad=False)
model.compile(optimizer=adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Train on the whole data set as a single batch. The size is taken from the
# data itself so the cell keeps working when the corpus (and therefore the
# number of sentences) changes.
batch_size = len(data)
# Every sentence gets the same target value, 0.
data_labels = np.zeros([batch_size, 1])

history = model.fit(
    data,
    data_labels,
    epochs=200,
    batch_size=batch_size,
    verbose=0)

In [0]:
import matplotlib.pyplot as plt

# Per-epoch training loss recorded by the last fit() call.
history_dict = history.history
loss = history_dict['loss']
epochs = range(1, len(loss) + 1)

# Draw the loss curve on an explicit Axes object.
fig, ax = plt.subplots(figsize=(12, 9))
ax.plot(epochs, loss, 'bo', label='Training loss')
ax.set_title('Training loss')
ax.set_xlabel('Epochs')
ax.set_ylabel('Loss')
ax.legend()
plt.show()

In [0]:
# The first layer of `model` is the Embedding layer; take its first weight
# array and print its shape.
e = model.layers[0]
weights = e.get_weights()[0]
print(weights.shape)

In [0]:
import io

# Export the embeddings as two parallel TSV files: vecs.tsv holds one
# tab-separated vector per line and meta.tsv holds the matching word on the
# same line number. The context manager closes both files even if a lookup or
# a write fails part-way through.
with io.open('vecs.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

Go to https://projector.tensorflow.org and upload the two files: `vecs.tsv` as the vectors and `meta.tsv` as the metadata.

In [0]:
# Start the text file that gensim loads later. Its first line is
# "<number of vectors> <vector size>"; vocab_size - 1 is the number of words in
# tokenizer.word_index (vocab_size also counts the "<PAD>" entry).
# The encoding is set explicitly so that writing non-ASCII words does not
# depend on the platform default. The handle stays open on purpose: the next
# cell writes the vectors and closes it.
f = open('vectors.tsv', 'w', encoding='utf-8')
f.write('{} {}\n'.format(vocab_size-1, embedding_dim))

In [0]:
# Write one "<word> <v1> <v2> ..." line per word known to the tokenizer, using
# the first weight array of `model` as the lookup table, then close the file
# opened in the previous cell. try/finally releases the handle even if a write
# fails part-way through.
vectors = model.get_weights()[0]
try:
    for words, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors[i, :]))
        f.write('{} {}\n'.format(words, str_vec))
finally:
    f.close()

In [0]:
# if running in Colab, this will download files to the local machine (if double-click does not work)
try:
    from google.colab import files
except ImportError:
    # Not running in Colab: nothing to download.
    pass
else:
    # Send each exported file to the local machine.
    for exported_name in ('vecs.tsv', 'meta.tsv', 'vectors.tsv'):
        files.download(exported_name)

In [0]:
import gensim

In [0]:
# Load the exported vectors.tsv (text format, not binary) as gensim KeyedVectors.
w2v = gensim.models.KeyedVectors.load_word2vec_format('./vectors.tsv', binary=False)

In [0]:
# Words gensim ranks as most similar to 'rome' in the first model's embeddings.
w2v.most_similar('rome')

In [0]:
# Similarity score between 'rome' and 'caesar', rounded to 4 decimals.
round(w2v.similarity('rome', 'caesar'),4)

In [0]:
# Similarity score between 'pompey' and 'caesar', rounded to 4 decimals.
round(w2v.similarity('pompey', 'caesar'),4)

In [0]:
embedding_dim = 100

# Embedding (id 0 masked) -> bidirectional LSTM -> dense ReLU -> single sigmoid unit.
model2 = tf.keras.Sequential()
model2.add(layers.Embedding(vocab_size, embedding_dim, mask_zero=True))
#model2.add(layers.Bidirectional(layers.LSTM(64, return_sequences=True)))  #another LSTM layer - uncomment to compare
model2.add(layers.Bidirectional(layers.LSTM(64)))
model2.add(layers.Dense(64, activation='relu'))
model2.add(layers.Dense(1, activation='sigmoid'))

model2.summary()

In [0]:
import numpy as np

# The optimizer object (not the string 'adam') is handed to compile() so that
# the hyper-parameters set here are the ones actually used for training.
# `learning_rate` is the current name of the argument formerly spelled `lr`.
adam = tf.keras.optimizers.Adam(learning_rate=0.001, beta_1=0.9, beta_2=0.999,
                                epsilon=1e-07, amsgrad=False)
model2.compile(optimizer=adam,
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Size the label array from the data itself so the cell keeps working when the
# corpus (and therefore the number of sentences) changes. Note that batch_size
# is only used for that purpose here: fit() below is called without a
# batch_size argument.
batch_size = len(data)
# Every sentence gets the same target value, 0.
data_labels = np.zeros([batch_size, 1])

history = model2.fit(
    data,
    data_labels,
    epochs=20,
    verbose=0)

In [0]:
# The first layer of `model2` is the Embedding layer; take its first weight
# array and print its shape.
e2 = model2.layers[0]
weights2 = e2.get_weights()[0]
print(weights2.shape)

(20242, 100)


In [0]:
import io

# Export the second model's embeddings as two parallel TSV files: vecs2.tsv
# holds one tab-separated vector per line and meta2.tsv holds the matching word
# on the same line number. The context manager closes both files even if a
# lookup or a write fails part-way through.
with io.open('vecs2.tsv', 'w', encoding='utf-8') as out_v, \
        io.open('meta2.tsv', 'w', encoding='utf-8') as out_m:
    for word_num in range(vocab_size):
        word = reverse_word_index[word_num]
        embeddings = weights2[word_num]
        out_m.write(word + "\n")
        out_v.write('\t'.join([str(x) for x in embeddings]) + "\n")

Go to https://projector.tensorflow.org and upload the two files: `vecs2.tsv` as the vectors and `meta2.tsv` as the metadata.

In [0]:
# Start the text file that gensim loads later. Its first line is
# "<number of vectors> <vector size>"; vocab_size - 1 is the number of words in
# tokenizer.word_index (vocab_size also counts the "<PAD>" entry), and the
# vector size comes from embedding_dim instead of a repeated literal.
# The encoding is set explicitly so that writing non-ASCII words does not
# depend on the platform default. The handle stays open on purpose: the next
# cell writes the vectors and closes it.
f = open('vectors2.tsv', 'w', encoding='utf-8')
f.write('{} {}\n'.format(vocab_size-1, embedding_dim))

In [0]:
# Write one "<word> <v1> <v2> ..." line per word known to the tokenizer, using
# the first weight array of `model2` as the lookup table, then close the file
# opened in the previous cell. try/finally releases the handle even if a write
# fails part-way through.
vectors2 = model2.get_weights()[0]
try:
    for words, i in tokenizer.word_index.items():
        str_vec = ' '.join(map(str, vectors2[i, :]))
        f.write('{} {}\n'.format(words, str_vec))
finally:
    f.close()

In [0]:
# Load the exported vectors2.tsv (text format, not binary) as gensim KeyedVectors.
w2v2 = gensim.models.KeyedVectors.load_word2vec_format('./vectors2.tsv', binary=False)

In [0]:
# Words gensim ranks as most similar to 'rome' in the second model's embeddings.
w2v2.most_similar('rome')

In [0]:
# Words gensim ranks as most similar to 'caesar' in the second model's embeddings.
w2v2.most_similar('caesar')

In [0]:
# Similarity score between 'pompey' and 'caesar', rounded to 4 decimals.
round(w2v2.similarity('pompey', 'caesar'),4)

In [0]:
# Similarity score between 'rome' and 'caesar', rounded to 4 decimals.
round(w2v2.similarity('rome', 'caesar'),4)