Copyright 2019 Almintas Povilaitis

Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at

https://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.

<table class="tfo-notebook-buttons" align="left">
<td>
<a target="_blank"  href="https://colab.research.google.com/github/mlai-demo/TextExplore/blob/master/RePlutarch_EmbedPub.ipynb">
    <img src="https://www.tensorflow.org/images/colab_logo_32px.png" />Run in Google Colab</a>
</td><td>
<a target="_blank"  href="https://github.com/mlai-demo/TextExplore/blob/master/RePlutarch_EmbedPub.ipynb"><img width=32px src="https://www.tensorflow.org/images/GitHub-Mark-32px.png" />View source on GitHub</a></td></table>

## The basics

### Download the libraries and dataset

Check the current directory and upload the text file:

In [0]:
import os
import re
# Working directory of the notebook; the text files are read from and written to it.
fpath = os.getcwd()
fpath

In [0]:
# Only when running in Google Colab: open the browser upload dialog.
from google.colab import files

uploaded = files.upload()

# Report each uploaded file together with the size of its content.
for name, content in uploaded.items():
    message = 'User uploaded file "{name}" with length {length} bytes'.format(
        name=name, length=len(content))
    print(message)

# Click the Files tab - the uploaded file(s) will be listed there

### Pre-process the text
Tokenize, convert to lower case, remove some punctuation while preserving the sentence structure, then save the new text for future reference:

In [0]:
# Clean the raw text: lower-case it, keep only letters and sentence-level
# punctuation, print a few statistics, and save the cleaned copy for reuse.
with open(fpath + '/Plutarch.txt') as f, open(fpath + '/Plutarch2.txt', 'w') as out_f:
    text = f.read().lower()
    # Replace every character outside a-z . ? ! - ' : ; with a space. Newlines,
    # tabs and digits are outside that set, so they become spaces here as well
    # and no separate newline pass is needed.
    new_text = re.sub(r"[^a-z.?!\-':;]", ' ', text)
    new_text = re.sub(' +', ' ', new_text)  # collapse runs of spaces into one
    # Only single spaces separate tokens at this point, so a plain split suffices.
    items = new_text.split()
    unique_items = set(items)
    # Guard the average so an empty input file does not raise ZeroDivisionError.
    avg_len = round(sum(len(word) for word in items) / len(items), 2) if items else 0
    print("The text is {} words long, has {} unique items and {} characters on average\n".format
      (len(items), len(unique_items), avg_len))
    print("First 1000 characters of the text:\n", new_text[:1000])
    out_f.write(new_text)

## Word embeddings

In [0]:
import nltk
nltk.download('punkt') #used in Colab
from nltk.tokenize import word_tokenize, sent_tokenize

# Split the cleaned text into sentences, then each sentence into word tokens:
# `data` is a list with one token list per sentence.
data = [word_tokenize(sentence) for sentence in sent_tokenize(new_text)]

In [0]:
#uncomment models you'd like to run
import gensim
from gensim.models import Word2Vec

# Train on the tokenised sentences in `data`. Judging by the variable names,
# sg=0 selects CBOW and sg=1 skip-gram, and hs=1 marks the "_hs" variants.
# NOTE(review): `size` and `iter` are the gensim 3.x keyword names; gensim 4 is
# understood to have renamed them to `vector_size` and `epochs` - confirm
# against the installed version before upgrading.
model_cbow = Word2Vec(
    data,
    min_count=1,
    size=100,
    window=8,
    iter=5,
    sg=0,
    hs=0,
)
#model_sgram = Word2Vec(data, min_count = 1,  size = 100, window = 8, iter=5, sg=1, hs=0) 
#model_cbow_hs = Word2Vec(data, min_count = 1,  size = 100, window = 8, iter=5, sg=0, hs=1) 
#model_sgram_hs = Word2Vec(data, min_count = 1,  size = 100, window = 8, iter=5, sg=1, hs=1) 

In [0]:
# Similarity of the two word vectors, rounded to four decimals for display.
caesar_king_similarity = model_cbow.wv.similarity('caesar', 'king')
print("Cosine similarity between 'caesar' and 'king' - CBOW : ",
      round(caesar_king_similarity, 4))

In [0]:
# Words whose nearest neighbours we want to inspect.
search_terms = ['caesar', 'god', 'rome', 'greece', 'alexander',
                'gaul', 'truth', 'king', 'hundred', 'sparta']

# Map each search term to its five most similar words, keeping only the first
# element of each pair returned by most_similar.
similar_words = {
    term: [neighbour for neighbour, _score in model_cbow.wv.most_similar([term], topn=5)]
    for term in search_terms
}
similar_words

In [0]:
# Look up the embedding vector of a single word and display it.
caesar = model_cbow.wv['caesar']
caesar

## Dimensionality reduction

In [0]:
import pandas as pd
from sklearn.decomposition import PCA

# All vocabulary words and their vectors, in matching order.
vocab = list(model_cbow.wv.vocab)
X = model_cbow.wv[vocab]

# Project the word vectors onto two principal components.
pca2 = PCA(n_components=2)
X_pca2 = pca2.fit_transform(X)

# Tabulate as word / x / y, with the word label as the first column.
df_pca2 = pd.DataFrame(X_pca2, columns=['x', 'y'])
df_pca2.insert(0, 'word', vocab)
df_pca2.head(10)

In [0]:
from sklearn.manifold import TSNE
import numpy as np

# `vocab` and `X` are reused from the PCA cell; to rebuild them here:
#vocab = list(model_cbow.wv.vocab)
#X = model_cbow.wv[vocab]

# Two-dimensional t-SNE embedding of every word vector.
tsne2 = TSNE(
    n_components=2,
    random_state=0,
    n_iter=4000,
    perplexity=30,
)
# Alternative setting kept for reference:
#tsne2 = TSNE(n_components=2, random_state=0, n_iter=10000, perplexity=55) #verbose=1?
X_tsne2 = tsne2.fit_transform(X)

# Tabulate as word / x / y, with the word label as the first column.
df_tsne2 = pd.DataFrame(X_tsne2, columns=['x', 'y'])
df_tsne2.insert(0, 'word', vocab)
df_tsne2.head(10)

## Visualize

### Matplotlib

In [0]:
from matplotlib.pyplot import figure
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
from sklearn.decomposition import PCA

# Flatten {term: [neighbours]} into one list: each search term followed by its
# neighbours. A nested comprehension avoids the quadratic sum(list_of_lists, []).
simwords = [word
            for term, neighbours in similar_words.items()
            for word in [term] + neighbours]
wvs = model_cbow.wv[simwords]

# Reduce the selected word vectors to two principal components.
pca_wvs = PCA(n_components=2, random_state=0)
np.set_printoptions(suppress=True)
Tpca = pca_wvs.fit_transform(wvs)
labels = simwords

# Create exactly one figure and keep a handle to it in `fig`. Resizing the
# result of gcf() first would only produce an extra empty figure, because
# plt.figure() below starts a new one anyway.
fig = plt.figure(figsize=(16, 12))
plt.scatter(Tpca[:, 0], Tpca[:, 1], c='purple', edgecolors='purple')
# Label every point with its word.
for label, x, y in zip(labels, Tpca[:, 0], Tpca[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

In [0]:
from matplotlib.pyplot import figure
import matplotlib
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
import numpy as np

# Flatten {term: [neighbours]} into one list: each search term followed by its
# neighbours. A nested comprehension avoids the quadratic sum(list_of_lists, []).
simwords = [word
            for term, neighbours in similar_words.items()
            for word in [term] + neighbours]
wvs = model_cbow.wv[simwords]

# Embed the selected word vectors in two dimensions with t-SNE.
tsne_wvs = TSNE(n_components=2, random_state=0, n_iter=4000, perplexity=30)
np.set_printoptions(suppress=True)
Ttsne = tsne_wvs.fit_transform(wvs)
labels = simwords

# Create exactly one figure and keep a handle to it in `fig`. Resizing the
# result of gcf() first would only produce an extra empty figure, because
# plt.figure() below starts a new one anyway.
fig = plt.figure(figsize=(16, 12))
plt.scatter(Ttsne[:, 0], Ttsne[:, 1], c='purple', edgecolors='purple')
# Label every point with its word.
for label, x, y in zip(labels, Ttsne[:, 0], Ttsne[:, 1]):
    plt.annotate(label, xy=(x, y), xytext=(0, 0), textcoords='offset points')

### TensorFlow Projector

In [0]:
# Build a pandas DataFrame from the word2vec model (CBOW in this case):
# one row per vocabulary term, ordered by the term's `count` attribute.
import pandas as pd

# (term, index, count) triple for every vocabulary entry.
project_wvs = [
    (term, entry.index, entry.count)
    for term, entry in model_cbow.wv.vocab.items()
]
project_wvs.sort(key=lambda triple: triple[2])  # ascending by count

# Transpose the triples into three parallel tuples.
ordered_terms, term_indices, term_counts = zip(*project_wvs)

# Pick the vector rows in that order and label them with their terms.
ordered_vectors = model_cbow.wv.vectors[term_indices, :]
df_cbow100 = pd.DataFrame(ordered_vectors, index=ordered_terms)

In [0]:
# Preview the first ten rows of the embedding table.
df_cbow100.head(10)

In [0]:
#df_cbow100.to_csv('df_cbow100.tsv', sep='\t', encoding='utf-8', index=True) #only if want to download labels and vectors in one file

In [0]:
# Copy the index (the terms) into a regular 'word' column so the labels can be
# exported on their own.
df_cbow100['word'] = df_cbow100.index

In [0]:
# Write the labels, one per line, with no index column and no header row.
word_labels = df_cbow100['word']
word_labels.to_csv(
    'df_cbow100word.tsv',
    sep='\t',
    encoding='utf-8',
    index=False,
    header=False,
)

In [0]:
# Keep only the embedding columns, leaving out the 'word' label column that
# follows them. The width is taken from the model instead of a hard-coded 100,
# so the export stays correct if the embedding size is changed.
df_cbow100vector = df_cbow100.iloc[:, :model_cbow.wv.vector_size].copy()

In [0]:
# Write the vectors as tab-separated values with no index column and no header row.
df_cbow100vector.to_csv(
    'df_cbow100vector.tsv',
    sep='\t',
    encoding='utf-8',
    index=False,
    header=False,
)

#### Upload the two TSV files (vectors and labels) to the TensorFlow Projector