In [1]:
import tensorflow as tf
import numpy as np
import re
import glob
import collections
import random
import math
import matplotlib.pyplot as plt
import pickle
from sklearn.manifold import TSNE
from tensorflow.contrib.tensorboard.plugins import projector

In [56]:
class Corpus:
    def __init__(self):
        self.embedding_size = 100
        self.batch_size = 8
        self.num_skips = 2
        self.skip_window = 1
        self.num_epochs = 30
        self.learning_rate = 0.1

        self.current_index = 0
        self.words = []

        self.dictionary = {}
        self.final_embeddings = None

    def build_dataset(self):
        new_word_id = 0
        self.words = []
        self.dictionary = {}

        
        for filename in glob.glob("./corpus/*.txt"):
            with open(filename, "r", encoding="utf-8") as f:
        
                text = f.read()
                text = text.lower().replace("\n", " ")
                text = re.sub(r"[^a-z '\-]", "", text)
                text = re.sub(r"[ ]+", " ", text)

                for word in text.split():
        
                    if word.startswith("-"): continue 
                    if word not in self.dictionary:
                        self.dictionary[word] = new_word_id
                        new_word_id += 1
                    self.words.append(self.dictionary[word])

        
        self.vocabulary_size = new_word_id
        print("# of distinct words:", new_word_id)
        print("# of total words:", len(self.words))

    
    def generate_batch(self):
        
        assert self.batch_size % self.num_skips == 0
        assert self.num_skips <= 2 * self.skip_window
        
        self.current_index = 0
        batch = np.ndarray(shape=(self.batch_size), dtype=np.int32)
        labels = np.ndarray(shape=(self.batch_size, 1), dtype=np.int32)

        
        span = 2 * self.skip_window + 1
        if self.current_index + span >= len(self.words):
            raise StopIteration

        
        buffer = collections.deque(maxlen=span)
        for _ in range(span):
            buffer.append(self.words[self.current_index])
            self.current_index += 1

        
        for _ in range(len(self.words) // self.batch_size):
            
            for i in range(self.batch_size // self.num_skips):
                target = self.skip_window
                targets_to_avoid = [self.skip_window]
                
                for j in range(self.num_skips):
                    while target in targets_to_avoid:
                        target = random.randint(0, span - 1)
                    targets_to_avoid.append(target)
                    batch[i * self.num_skips + j] = buffer[self.skip_window]
                    labels[i * self.num_skips + j, 0] = buffer[target]

                
                buffer.append(self.words[self.current_index])
                self.current_index += 1
                if self.current_index >= len(self.words):
                    raise StopIteration
            yield batch, labels
        raise StopIteration
        

    def train(self):
        
        embeddings = tf.Variable(
            tf.random_uniform([self.vocabulary_size, self.embedding_size], -1.0, 1.0))
        print("Start train")
        
        nce_weights = tf.Variable(
            tf.truncated_normal([self.vocabulary_size, self.embedding_size],
                                stddev=1.0 / math.sqrt(self.embedding_size)))
        
        nce_biases = tf.Variable(tf.zeros([self.vocabulary_size]))
        
        
        train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
        train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])
        
                
        embed = tf.nn.embedding_lookup(embeddings, train_inputs)
        
        print("Features:")
        print("nce_weights:", nce_weights,
              "nce_biases:",nce_biases,
              "embed:", embed, 
              "train_labels:", train_labels, 
              "self.batch_size // 2: ", self.batch_size // 2,
              "self.batch_size:", self.batch_size, 
              "self.vocabulary_size: ", self.vocabulary_size)
        loss = tf.reduce_mean(
            tf.nn.nce_loss(nce_weights, nce_biases, train_labels, embed, self.batch_size // 2, self.vocabulary_size)
        )
        
        print("Loss:", loss)
        optimizer = tf.train.GradientDescentOptimizer(learning_rate=self.learning_rate).minimize(loss)
        
        # For similarities
        valid_examples = np.random.choice(100, 16, replace=False)
        valid_dataset = tf.constant(valid_examples, dtype=tf.int32)
        norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
        normalized_embeddings = embeddings / norm
        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, valid_dataset)
        similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)
        
        print("Session: Start")
        logdir = "./corpus/log"
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            saver = tf.train.Saver()
            
        
            for epoch in range(self.num_epochs):
                epoch_loss = 0
                
                #generate_batch()
                print("Generating Batch: Started")
                
                for batch_x, batch_y in self.generate_batch():
                    _, loss_value = sess.run([optimizer, loss], feed_dict={train_inputs: batch_x, train_labels: batch_y})
                    epoch_loss += loss_value
                print("Generating Batch: End")
                print("Epoch", epoch, "completed out of", self.num_epochs, "-- loss:", epoch_loss)
                
                print("Saving Saver Session", logdir)
                # Embeddings Visualization
                saver.save(sess, logdir + "/blog.ckpt", epoch)

            
            self.final_embeddings = normalized_embeddings.eval() # <class 'numpy.ndarray'>

            # Embeddings Visualiation
            summary_writer = tf.summary.FileWriter(logdir)
            config = projector.ProjectorConfig()
            embedding = config.embeddings.add()
            embedding.tensor_name = embeddings.name 
            embedding.metadata_path = "./corpus/model/blog.metadata.tsv"
            projector.visualize_embeddings(summary_writer, config)

            self.plot()

        print("Dumping Dictionary")
        with open("./corpus/model/blog.dic", "wb") as f:
            pickle.dump(self.dictionary, f)
        print("Dictionary was saved to", "./corpus/model/blog.dic")
        np.save("./corpus/model/blog.npy", self.final_embeddings)
        print("Embeddings were saved to", "./corpus/model/blog.npy/")

        # Embeddings Visualization
        
        sorted_dict = sorted(self.dictionary.items(), key=lambda x: x[1])
        words = ["{}\n".format(x[0]) for x in sorted_dict]
        with open("./corpus/model/blog.metadata.tsv", "w", encoding="utf-8") as f:
            f.writelines(words)
        print("Embeddings metadata was saved to ./corpus/model/blog.metadata.tsv")

    def plot(self, filename="./corpus/model/blog.png"):
        tsne = TSNE(perplexity=30, n_components=2, init="pca", n_iter=5000)
        plot_only=500
        low_dim_embeddings = tsne.fit_transform(self.final_embeddings[:plot_only, :])
        reversed_dictionary = dict(zip(self.dictionary.values(), self.dictionary.keys()))
        labels = [reversed_dictionary[i] for i in range(plot_only)]

        plt.figure(figsize=(18, 18))
        for i, label in enumerate(labels):
            x, y = low_dim_embeddings[i, :]
            plt.scatter(x, y)
            plt.annotate(label,
                        xy=(x, y),
                        xytext=(5, 2),
                        textcoords="offset points",
                        ha="right",
                        va="bottom")
        plt.savefig(filename)
        print("Scatter plot was saved to", filename)

In [57]:
corpus = Corpus()

In [58]:
corpus.build_dataset()

# of distinct words: 253854
# of total words: 17005207


In [None]:
corpus.generate_batch()

In [59]:
corpus.train()

Start train
Features:
nce_weights: <tf.Variable 'Variable_40:0' shape=(253854, 100) dtype=float32_ref> nce_biases: <tf.Variable 'Variable_41:0' shape=(253854,) dtype=float32_ref> embed: Tensor("embedding_lookup_16:0", shape=(8, 100), dtype=float32) train_labels: Tensor("Placeholder_27:0", shape=(8, 1), dtype=int32) self.batch_size // 2:  4 self.batch_size: 8 self.vocabulary_size:  253854
Loss: Tensor("Mean_5:0", shape=(), dtype=float32)
Session: Start
Generating Batch: Started
Generating Batch: End
Epoch 0 completed out of 30 -- loss: 13135643.502
Saving Saver Session ./corpus/log
Generating Batch: Started
Generating Batch: End
Epoch 1 completed out of 30 -- loss: 5344041.84242
Saving Saver Session ./corpus/log
Generating Batch: Started
Generating Batch: End
Epoch 2 completed out of 30 -- loss: 4230715.36069
Saving Saver Session ./corpus/log
Generating Batch: Started
Generating Batch: End
Epoch 3 completed out of 30 -- loss: 3771777.57246
Saving Saver Session ./corpus/log


UnknownError: Failed to WriteFile: ./corpus/log/blog.ckpt-3.data-00000-of-00001.tempstate9185156088408496772 : There is not enough space on the disk.
; Unknown error
	 [[Node: save_3/SaveV2 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save_3/Const_0_0, save_3/SaveV2/tensor_names, save_3/SaveV2/shape_and_slices, Variable, Variable_1, Variable_10, Variable_11, Variable_12, Variable_13, Variable_14, Variable_15, Variable_16, Variable_17, Variable_18, Variable_19, Variable_2, Variable_20, Variable_21, Variable_22, Variable_23, Variable_24, Variable_25, Variable_26, Variable_27, Variable_28, Variable_29, Variable_3, Variable_30, Variable_31, Variable_32, Variable_33, Variable_34, Variable_35, Variable_36, Variable_37, Variable_38, Variable_39, Variable_4, Variable_40, Variable_41, Variable_5, Variable_6, Variable_7, Variable_8, Variable_9)]]

Caused by op 'save_3/SaveV2', defined at:
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\traitlets\config\application.py", line 658, in launch_instance
    app.start()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelapp.py", line 477, in start
    ioloop.IOLoop.instance().start()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\ioloop.py", line 177, in start
    super(ZMQIOLoop, self).start()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tornado\ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 440, in _handle_events
    self._handle_recv()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 472, in _handle_recv
    self._run_callback(callback, msg)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\zmq\eventloop\zmqstream.py", line 414, in _run_callback
    callback(*args, **kwargs)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tornado\stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 235, in dispatch_shell
    handler(stream, idents, msg)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\ipkernel.py", line 196, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\ipykernel\zmqshell.py", line 533, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2698, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2808, in run_ast_nodes
    if self.run_code(code, result):
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\IPython\core\interactiveshell.py", line 2862, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-59-3682e42dd59a>", line 1, in <module>
    corpus.train()
  File "<ipython-input-56-e47c15c2d00d>", line 133, in train
    saver = tf.train.Saver()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\saver.py", line 1139, in __init__
    self.build()
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\saver.py", line 1170, in build
    restore_sequentially=self._restore_sequentially)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\saver.py", line 689, in build
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\saver.py", line 276, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\training\saver.py", line 219, in save_op
    tensors)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\ops\gen_io_ops.py", line 745, in save_v2
    tensors=tensors, name=name)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 767, in apply_op
    op_def=op_def)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 2506, in create_op
    original_op=self._default_original_op, op_def=op_def)
  File "C:\Users\dhavalma\AppData\Local\Continuum\Anaconda3\envs\tensorflow\lib\site-packages\tensorflow\python\framework\ops.py", line 1269, in __init__
    self._traceback = _extract_stack()

UnknownError (see above for traceback): Failed to WriteFile: ./corpus/log/blog.ckpt-3.data-00000-of-00001.tempstate9185156088408496772 : There is not enough space on the disk.
; Unknown error
	 [[Node: save_3/SaveV2 = SaveV2[dtypes=[DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT, DT_FLOAT], _device="/job:localhost/replica:0/task:0/cpu:0"](_arg_save_3/Const_0_0, save_3/SaveV2/tensor_names, save_3/SaveV2/shape_and_slices, Variable, Variable_1, Variable_10, Variable_11, Variable_12, Variable_13, Variable_14, Variable_15, Variable_16, Variable_17, Variable_18, Variable_19, Variable_2, Variable_20, Variable_21, Variable_22, Variable_23, Variable_24, Variable_25, Variable_26, Variable_27, Variable_28, Variable_29, Variable_3, Variable_30, Variable_31, Variable_32, Variable_33, Variable_34, Variable_35, Variable_36, Variable_37, Variable_38, Variable_39, Variable_4, Variable_40, Variable_41, Variable_5, Variable_6, Variable_7, Variable_8, Variable_9)]]


In [55]:
%pwd

'C:\\Users\\dhavalma\\AnacondaProjects\\Document Classification'