In [128]:
#load dictionaries and batches
import json
import math

import numpy as np
import tensorflow as tf



class W2V:
    """Embedding model trained with NCE loss on (input id, label id) pairs.

    The pairs are read from a CSV file; column 0 is fed as the input ids and
    column 1 as the label ids. The TensorFlow graph is built once in
    ``__init__`` and reused by ``train`` and ``gen_embedding``.
    """

    def __init__(self,
                 dictionary_fname='../data/protein/dictionaries_1.json',
                 csv_fname='../data/protein/test2.csv',
                 save_path='test.ckpt',
                 batch_size=128,
                 embedding_size=100,
                 num_sampled=5,
                 seed=0):
        """Load the dictionaries and training pairs, then build the graph.

        Args:
            dictionary_fname: path to a text file whose first three lines are
                JSON documents, loaded into ``id2w``, ``w2id`` and
                ``word_count`` respectively.
            csv_fname: path to a comma-delimited file of integers, loaded
                fully into memory as an int32 array.
            save_path: stored on the instance as ``self.save_path``; it is not
                used anywhere else in this class.
            batch_size: number of rows fed per training step; also fixes the
                shape of the input placeholders.
            embedding_size: number of columns of the embedding matrix.
            num_sampled: ``num_sampled`` argument passed to ``tf.nn.nce_loss``.
            seed: graph-level TensorFlow random seed.
        """
        with open(dictionary_fname) as f_in:
            self.id2w = json.loads(f_in.readline())
            self.w2id = json.loads(f_in.readline())
            self.word_count = json.loads(f_in.readline())

        # this uses lots of memory
        self.csv = np.genfromtxt(csv_fname, delimiter=",", dtype=np.int32)

        # JSON object keys are always strings, so turn the ids back into ints.
        self.id2w = {int(k): v for k, v in self.id2w.items()}
        self.batch_size = batch_size
        self.every_2000_losses = []
        self.save_path = save_path
        vocabulary_size = len(self.w2id)
        self._data_index = 0

        # build the computation graph
        self.session = tf.Session()
        with self.session.graph.as_default():
            tf.set_random_seed(seed)

            self.train_inputs = tf.placeholder(tf.int32, shape=[self.batch_size])
            self.train_labels = tf.placeholder(tf.int32, shape=[self.batch_size, 1])

            with tf.device('/cpu:0'):
                # This is a matrix that holds the embeddings, random initialization
                embeddings = tf.Variable(
                    tf.random_uniform([vocabulary_size, embedding_size], -1.0, 1.0))

                # This is a view into that matrix for the inputs in the batch
                embed = tf.nn.embedding_lookup(embeddings, self.train_inputs)

                # Weights and Biases for Log Reg on embeddings
                nce_weights = tf.Variable(
                    tf.truncated_normal([vocabulary_size, embedding_size],
                                        stddev=1.0 / math.sqrt(embedding_size))
                )
                nce_biases = tf.Variable(tf.zeros([vocabulary_size]))

            # Compute the average NCE loss for the batch. Automatically
            # does negative sampling too.
            # Keyword arguments are used on purpose: the positional order of
            # `inputs` and `labels` differs between TensorFlow releases.
            self.loss = tf.reduce_mean(
                tf.nn.nce_loss(weights=nce_weights,
                               biases=nce_biases,
                               inputs=embed,
                               labels=self.train_labels,
                               num_sampled=num_sampled,
                               num_classes=vocabulary_size))

            # Construct the SGD optimizer (minimizer) using a learning rate of 1.0.
            self.optimizer = tf.train.GradientDescentOptimizer(1.0).minimize(self.loss)

            # Scale every embedding row to unit L2 norm, so that dot products
            # between rows are cosine similarities.
            norm = tf.sqrt(tf.reduce_sum(tf.square(embeddings), 1, keep_dims=True))
            self.normalized_embeddings = embeddings / norm

            # Initialise inside the graph context so that the variables
            # collected are exactly the ones created above.
            tf.initialize_all_variables().run(session=self.session)

    def gen_batch(self):
        """Return the next ``batch_size`` rows of the CSV as (inputs, labels).

        The read position wraps back to row 0 when fewer than ``batch_size``
        rows remain, so trailing rows that do not fill a batch are skipped.
        If the CSV has fewer than ``batch_size`` rows in total, the returned
        arrays are shorter than ``batch_size``.

        Returns:
            A tuple ``(batch, labels)``: ``batch`` is column 0 as a 1-D array,
            ``labels`` is column 1 with a trailing axis of length 1.
        """
        if (self._data_index + self.batch_size) > self.csv.shape[0]:
            self._data_index = 0

        start = self._data_index
        stop = start + self.batch_size

        batch = self.csv[start:stop, 0]
        # The trailing None adds the second axis expected by train_labels.
        labels = self.csv[start:stop, 1, None]

        self._data_index = stop

        return batch, labels

    def train(self, num_batches):
        """Run ``num_batches`` optimisation steps.

        At step 0 and then after every 2000th step, the mean loss since the
        previous report is appended to ``self.every_2000_losses`` (the entry
        for step 0 is the loss of that single batch).

        Args:
            num_batches: number of batches to train on; 0 is a no-op.
        """
        report_every = 2000
        average_loss = 0
        for step in range(num_batches):
            batch_inputs, batch_labels = self.gen_batch()
            feed_dict = {self.train_inputs: batch_inputs,
                         self.train_labels: batch_labels}
            _, loss_val = self.session.run([self.optimizer, self.loss], feed_dict=feed_dict)
            average_loss += loss_val

            # This bookkeeping must run inside the loop; otherwise it runs
            # only once, after the final step.
            if step % report_every == 0:
                if step > 0:
                    average_loss /= report_every
                # The average loss is an estimate of the loss over the last 2000 batches.
                self.every_2000_losses.append(average_loss)
                average_loss = 0

    def gen_embedding(self):
        """Evaluate and return the row-normalized embedding matrix."""
        return self.normalized_embeddings.eval(session=self.session)

In [129]:
# Build a small model on the test data, run 100 training batches, and show
# the row-normalized embedding matrix as the cell output.
w2v_kwargs = dict(
    dictionary_fname='../data/protein/dictionaries_1.json',
    csv_fname='../data/protein/test2.csv',
    save_path='test.ckpt',
    batch_size=8,
    embedding_size=100,
    num_sampled=5,
)
w2v = W2V(**w2v_kwargs)

w2v.train(100)
w2v.gen_embedding()

array([[ -1.12862423e-01,  -4.83706035e-02,   4.85960618e-02, ...,
          7.32316077e-02,   1.08804017e-01,  -1.88616887e-01],
       [ -9.55439806e-02,  -4.81215082e-02,  -1.66858345e-01, ...,
          5.56581877e-02,  -4.32461537e-02,  -1.09289713e-01],
       [ -1.10054888e-01,   6.62492439e-02,  -1.89714476e-01, ...,
          3.04480847e-02,   1.52271971e-01,  -7.61605278e-02],
       ..., 
       [ -3.87193188e-02,   2.09836736e-02,   1.67491392e-03, ...,
          2.09090840e-02,  -8.79947692e-02,  -6.67165369e-02],
       [  1.19331688e-01,  -1.29233167e-01,   1.58526748e-01, ...,
          1.41413420e-01,   1.20201983e-01,  -1.61211208e-01],
       [ -1.59572631e-01,  -2.16572024e-02,  -7.15490532e-05, ...,
          7.69217610e-02,  -8.05691406e-02,   1.48426533e-01]], dtype=float32)