In [1]:
import tensorflow as tf
import numpy as np
import re
import glob
import collections
import random
import math
import matplotlib.pyplot as plt
import pickle
from sklearn.manifold import TSNE
from tensorflow.contrib.tensorboard.plugins import projector

In [14]:
class Corpus:
    def __init__(self):
        self.embedding_size = 100
        self.batch_size = 8
        self.num_skips = 2
        self.skip_window = 1
        self.num_epochs = 30
        self.learning_rate = 0.1

        self.current_index = 0
        self.words = []

        self.dictionary = {}
        self.final_embeddings = None

    def build_dataset(self):
        new_word_id = 0
        self.words = []
        self.dictionary = {}

        
        for filename in glob.glob("./corpus/*.txt"):
            with open(filename, "r", encoding="utf-8") as f:
        
                text = f.read()
                text = text.lower().replace("\n", " ")
                text = re.sub(r"[^a-z '\-]", "", text)
                text = re.sub(r"[ ]+", " ", text)

                for word in text.split():
        
                    if word.startswith("-"): continue 
                    if word not in self.dictionary:
                        self.dictionary[word] = new_word_id
                        new_word_id += 1
                    self.words.append(self.dictionary[word])

        
        self.vocabulary_size = new_word_id
        print("# of distinct words:", new_word_id)
        print("# of total words:", len(self.words))

    
    def generate_batch(self):
        
        assert self.batch_size % self.num_skips == 0
        assert self.num_skips <= 2 * self.skip_window
        
        self.current_index = 0
        batch = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(self.batch_size, 1), dtype=np.int32)

        
        span = 2 * self.skip_window + 1
        if self.current_index + span >= len(self.words):
            raise StopIteration

        
        buffer = collections.deque(maxlen=span)
        for _ in range(span):
            buffer.append(self.words[self.current_index])
            self.current_index += 1

        
        for _ in range(len(self.words) // self.batch_size):
            
            for i in range(self.batch_size // self.num_skips):
                target = self.skip_window
                targets_to_avoid = [self.skip_window]
                
                for j in range(self.num_skips):
                    while target in targets_to_avoid:
                        target = random.randint(0, span - 1)
                    targets_to_avoid.append(target)
                    batch[i * self.num_skips + j] = buffer[self.skip_window]
                    labels[i * self.num_skips + j, 0] = buffer[target]

                
                buffer.append(self.words[self.current_index])
                self.current_index += 1
                if self.current_index >= len(self.words):
                    raise StopIteration
            yield batch, labels
        raise StopIteration
        

    def train(self):
        
        embeddings = tf.Variable(
            tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))
        print("Start train")
        
        nce_weights = tf.Variable(
            tf.truncated_normal([self.vocabulary_size, self.embedding_size],
                                stddev=1.0 / math.sqrt(self.embedding_size)))
        print("1")
        nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
        print("2")
        
        train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

        
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, embed, train_labels, self.batch_size // 2, self.vocabulary_size)
        )

        
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)
        
        # For similarities
        # valid_examples = np.random.choice(100, 16, replace=False)
        # valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = int(embeddings / norm)
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        print(valid_embeddings, normalized_embeddings)
        similarity = tf.matmul(int(valid_embeddings), int(normalized_embeddings), transpose_b=True)

        logdir = "./corpus/log"
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()

        
            for epoch in range(self.num_epochs):
                epoch_loss = 0
                # generate_batch()
                for batch_x, batch_y in self.generate_batch():
                    _, loss_value = sess.run([optimizer, loss], feed_dict={train_inputs: batch_x, train_labels: batch_y})
                    epoch_loss += loss_value

                print("Epoch", epoch, "completed out of", self.num_epochs, "-- loss:", epoch_loss)

                # Embeddings Visualization
                saver.save(sess, logdir + "/blog.ckpt", epoch)

            
            self.final_embeddings = normalized_embeddings.eval() # <class 'numpy.ndarray'>

            # Embeddings Visualiation
            summary_writer = tf.summary.FileWriter(logdir)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = embeddings.name 
            embedding.metadata_path = "./corpus/model/blog.metadata.tsv"
            projector.visualize_embeddings(summary_writer, config)

        # self.plot()

        
        with open("./corpus/model/blog.dic", "wb") as f:
            pickle.dump(self.dictionary, f)
        print("Dictionary was saved to", "./corpus/model/blog.dic")
        np.save("./corpus/model/blog.npy", self.final_embeddings)
        print("Embeddings were saved to", "./corpus/model/blog.npy/")

        # Embeddings Visualization
        
        sorted_dict = sorted(self.dictionary.items(), key=lambda x: x[1])
        words = ["{}\n".format(x[0]) for x in sorted_dict]
        with open("./corpus/model/blog.metadata.tsv", "w", encoding="utf-8") as f:
            f.writelines(words)
        print("Embeddings metadata was saved to ./corpus/model/blog.metadata.tsv")

    def plot(self, filename="./corpus/model/blog.png"):
        tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
        plot_only=500
        low_dim_embeddings = tsne.fit_transform(self.final_embeddings[:plot_only, :])
        reversed_dictionary = dict(zip(self.dictionary.values(), self.dictionary.keys()))
        labels = [reversed_dictionary[i] for i in range(plot_only)]

        plt.figure(figsize=(18, 18))
        for i, label in enumerate(labels):
            x, y = low_dim_embeddings[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                        xy=(x, y),
                        xytext=(5, 2),
                        textcoords="offset points",
                        ha="right",
                        va="bottom")
        plt.savefig(filename)
        print("Scatter plot was saved to", filename)

In [15]:
corpus = Corpus()

In [4]:
corpus.build_dataset()

# of distinct words: 253854
# of total words: 17005207


In [None]:
corpus.generate_batch()

In [16]:
corpus.train()

AttributeError: 'Corpus' object has no attribute 'vocabulary_size'