# Temporal lda2vec Model

We will implement both the lda2vec-TM model as well as a RNN to predict future document embeddings.

In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
import numpy as np
import pickle
import pandas as pd
import keras

Found GPU at: /device:GPU:0Metal device set to: Apple M1



2021-11-28 10:42:57.400025: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-28 10:42:57.400111: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## Retrieve Embeddings

We will retrieve back the embeddings and dictionary.

In [3]:
def load_embed():
    doc_embed   = tf.constant(np.load(f"doc_embed_col.npy"))
    word_embed  = tf.constant(np.load(f"word_embed_col.npy"))
    topic_embed = tf.constant(np.load(f"topic_embed_col.npy"))
    return (doc_embed, word_embed, topic_embed)

In [4]:
def load_word_dict(idx_to_word):
    if idx_to_word:
        file = open(f"doc_collect/idx_to_word.pickle", "rb")
        word_dict = pickle.load(file)
    else:
        file = open(f"doc_collect/word_to_idx.pickle", "rb")
        word_dict = pickle.load(file)
    return word_dict

In [5]:
def word_embed_lookup(word):
    word_embed = load_embed()[1]
    word_dict  = load_word_dict(False)
    word_ind   = word_dict[word]
    return word_embed[word_ind]

In [6]:
def word_idx_lookup(word):
    word_dict = load_word_dict(False)
    return word_dict[word]

## Modeling Topic Distribution

We now model the topic distribution as a linear model to predict future documents.

In [7]:
def topic_dist():
    embeds = load_embed()
    doc_embed = embeds[0]
    topic_embed = embeds[2]
    norm_doc = tf.nn.softmax(doc_embed)
    topic_dist = tf.linalg.matmul(norm_doc, topic_embed)
    return topic_dist

## Linear Topic Regression
We will model the topic distribution over time as a linear model to predict future documents

In [8]:
# Get transition matrix for training set
def transition():
    dist = topic_dist()[:240]
    old_embed = dist[:239]
    new_embed = dist[1:]
    rhs = tf.linalg.matmul(tf.transpose(old_embed), old_embed)
    lhs = tf.linalg.matmul(tf.transpose(old_embed), new_embed)
    return tf.linalg.lstsq(lhs, rhs, l2_regularizer=0.1)

In [9]:
# Test transition on testing set
test = topic_dist()[240:]
input_embed = test[:59]
out_embed = test[1:60].numpy()
pred = tf.linalg.matmul(input_embed, transition()).numpy()
print("Mean square error:", np.mean((pred - out_embed)**2))
norm_pred = pred/np.reshape(np.linalg.norm(pred,axis=1), (59,1))
norm_out = out_embed/np.reshape(np.linalg.norm(out_embed,axis=1), (59,1))
cosine = 0
for i in range(59):
    cosine += np.dot(norm_pred[i], norm_out[i].T)
print("Average cosine similarity:", cosine/59)

Mean square error: 0.609071
Average cosine similarity: 0.13105160689328688


2021-11-28 10:42:57.605998: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-28 10:42:57.606018: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


## Neural Network Regression

We will implement a recurrent neural network to obtain document vectors.

In [10]:
# convert into dataset matrix
def convertToMatrix(data, step):
    X, Y = [], []
    for i in range(data.shape[0]-step):
        d = i+step
        X.append(data[i:d])
        Y.append(data[d])
    return tf.convert_to_tensor(X), tf.convert_to_tensor(Y)

In [11]:
dist = topic_dist()
idxs_train = list(range(240))
idxs_test = list(range(240,300))
train = tf.gather(dist, idxs_train)
test = tf.gather(dist, idxs_test)
step = 10

In [12]:
trainX, trainY = convertToMatrix(train, step)
testX, testY = convertToMatrix(test, step)

In [13]:
model = tf.keras.Sequential()
#model.add(tf.keras.layers.Input(shape=(step,300)))
#model.add(tf.keras.layers.Bidirectional(tf.keras.layers.GRU(300, return_sequences=True)))
model.add(tf.keras.layers.GRU(300, input_shape=(step,300), return_sequences=True))
model.add(tf.keras.layers.SimpleRNN(units=300))
#model.add(tf.keras.layers.Dense(300, activation="tanh"))
model.add(tf.keras.layers.Dense(300))
model.compile(loss='cosine_similarity', optimizer='Adam')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
gru (GRU)                    (None, 10, 300)           541800    
_________________________________________________________________
simple_rnn (SimpleRNN)       (None, 300)               180300    
_________________________________________________________________
dense (Dense)                (None, 300)               90300     
Total params: 812,400
Trainable params: 812,400
Non-trainable params: 0
_________________________________________________________________


In [14]:
model.fit(trainX,trainY, epochs=15, batch_size=20, validation_split=0.1)

Epoch 1/15


2021-11-28 10:42:59.055529: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-28 10:42:59.055730: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<keras.callbacks.History at 0x294c165b0>

In [15]:
trainScore = model.evaluate(trainX, trainY)



In [16]:
testScore = model.evaluate(testX, testY)



In [17]:
pred = tf.expand_dims(dist[-step:],0)

In [18]:
next_doc = tf.expand_dims(model.predict(pred)[0],1)

In [19]:
next_doc_weights = tf.linalg.lstsq(tf.transpose(load_embed()[2]), next_doc)

In [20]:
norm_doc = next_doc/tf.linalg.norm(next_doc)

In [21]:
cosine = tf.linalg.matmul(load_embed()[1], norm_doc).numpy()

In [22]:
cosine = np.squeeze(cosine)

In [23]:
word_doc_idxs = np.argsort(cosine)

In [24]:
word_dict = load_word_dict(True)

In [25]:
for i in range(1,30):
    print(word_dict[word_doc_idxs[-i]])

algorithm
given
learning
training
model
figure
method
note
based
models
estimation
nips
following
methods
algorithms
consider
corresponding
optimal
time
similar
respectively
step
shown
linear
data
optimization
general
approach
gradient
