# Temporal lda2vec Model

We will implement both the lda2vec-TM model as well as a bidirectional RNN to predict future document embeddings.

In [1]:
import tensorflow as tf
device_name = tf.test.gpu_device_name()
if device_name != '/device:GPU:0':
  raise SystemError('GPU device not found')
print('Found GPU at: {}'.format(device_name))
import numpy as np
import pickle
import pandas as pd

Found GPU at: /device:GPU:0Metal device set to: Apple M1



2021-11-22 14:37:17.723598: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-22 14:37:17.723989: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

In [12]:
from tensorflow.keras.layers import Embedding, Dense, GRU, Bidirectional
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.datasets import imdb

## Retrieve Embeddings

We will retrieve back the embeddings and dictionary.

In [4]:
def load_embed(ind):
    doc_embed   = tf.constant(np.load(f"embeds/doc_embed_{ind}.npy"))
    word_embed  = tf.constant(np.load(f"embeds/word_embed_{ind}.npy"))
    topic_embed = tf.constant(np.load(f"embeds/topic_embed_{ind}.npy"))
    return (doc_embed, word_embed, topic_embed)

In [5]:
def load_word_dict(ind, idx_to_word):
    if idx_to_word:
        file = open(f"preprocess/paper_set_{ind}/idx_to_word.pickle", "rb")
        word_dict = pickle.load(file)
    else:
        file = open(f"preprocess/paper_set_{ind}/word_to_idx.pickle", "rb")
        word_dict = pickle.load(file)
    return word_dict

In [6]:
def word_embed_lookup(s_ind, word):
    word_embed = load_embed(s_ind)[1]
    word_dict  = load_word_dict(s_ind, False)
    word_ind   = word_dict[word]
    return word_embed[word_ind]

In [7]:
def word_idx_lookup(s_ind, word):
    word_dict = load_word_dict(s_ind, False)
    return word_dict[word]

## Modeling Topic Distribution

We now model the topic distribution as a linear model to predict future documents.

In [10]:
def topic_dist(ind):
    embeds = load_embed(ind)
    doc_embed = embeds[0]
    topic_embed = embeds[2]
    norm_doc = tf.nn.softmax(doc_embed)
    topic_dist = tf.linalg.matmul(norm_doc, topic_embed)
    return topic_dist

## Bidirectional RNN

We will implement a bidirection RNN to obtain document vectors.

In [19]:
model = tf.keras.Sequential()
model.add(Embedding(300, 300))
model.add(Bidirectional(GRU(300)))

loss_function = tf.keras.losses.CosineSimilarity()
optimizer = Adam()
additional_metrics = ['accuracy']

inputs = topic_dist(0)[:80]
outputs = topic_dist(0)[1:81]

model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)

model.summary()

history = model.fit(inputs, outputs, epochs=5)

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 300)         90000     
_________________________________________________________________
bidirectional_4 (Bidirection (None, 600)               1083600   
Total params: 1,173,600
Trainable params: 1,173,600
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5


2021-11-22 14:57:53.098292: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-22 14:57:53.101056: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


ValueError: in user code:

    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/engine/training.py:853 train_function  *
        return step_function(self, iterator)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/engine/training.py:842 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:1286 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:2849 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/distribute/distribute_lib.py:3632 _call_for_each_replica
        return fn(*args, **kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/engine/training.py:835 run_step  **
        outputs = model.train_step(data)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/engine/training.py:788 train_step
        loss = self.compiled_loss(
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/engine/compile_utils.py:201 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/losses.py:141 __call__
        losses = call_fn(y_true, y_pred)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/losses.py:245 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/keras/losses.py:1968 cosine_similarity
        return -tf.reduce_sum(y_true * y_pred, axis=axis)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1383 binary_op_wrapper
        raise e
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1367 binary_op_wrapper
        return func(x, y, name=name)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:1710 _mul_dispatch
        return multiply(x, y, name=name)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/util/dispatch.py:206 wrapper
        return target(*args, **kwargs)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/ops/math_ops.py:530 multiply
        return gen_math_ops.mul(x, y, name)
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/ops/gen_math_ops.py:6245 mul
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/op_def_library.py:748 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/func_graph.py:599 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:3561 _create_op_internal
        ret = Operation(
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:2041 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /Users/keanl/miniforge3/lib/python3.9/site-packages/tensorflow/python/framework/ops.py:1883 _create_c_op
        raise ValueError(str(e))

    ValueError: Dimensions must be equal, but are 300 and 600 for '{{node cosine_similarity/mul}} = Mul[T=DT_FLOAT](cosine_similarity/l2_normalize, cosine_similarity/l2_normalize_1)' with input shapes: [?,300], [?,600].


In [77]:
# Pad all sequences
padded_inputs = pad_sequences(x_train, maxlen=max_sequence_length, value = 0.0) # 0.0 because it corresponds with <PAD>
padded_inputs_test = pad_sequences(x_test, maxlen=max_sequence_length, value = 0.0) # 0.0 because it corresponds with <PAD>

In [78]:
# Define the Keras model
model = Sequential()
model.add(Embedding(num_distinct_words, embedding_output_dims, input_length=max_sequence_length))
model.add(Bidirectional(LSTM(10), merge_mode='sum'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer=optimizer, loss=loss_function, metrics=additional_metrics)

# Give a summary
model.summary()

# Train the model
history = model.fit(padded_inputs, y_train, batch_size=batch_size, epochs=number_of_epochs, verbose=verbosity_mode, validation_split=validation_split)

# Test the model after training
test_results = model.evaluate(padded_inputs_test, y_test, verbose=False)
print(f'Test results - Loss: {test_results[0]} - Accuracy: {100*test_results[1]}%')

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 300, 15)           75000     
_________________________________________________________________
bidirectional (Bidirectional (None, 10)                2080      
_________________________________________________________________
dense (Dense)                (None, 1)                 11        
Total params: 77,091
Trainable params: 77,091
Non-trainable params: 0
_________________________________________________________________


2021-11-22 09:31:43.657729: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-22 09:31:43.661203: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 1/5


2021-11-22 09:31:44.381042: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:31:44.587089: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:31:44.597498: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:31:46.174542: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:31:46.188401: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.




2021-11-22 09:32:14.045058: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:32:14.086876: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.
2021-11-22 09:32:14.094386: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:112] Plugin optimizer for device_type GPU is enabled.


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test results - Loss: 0.6848894357681274 - Accuracy: 55.42800426483154%


In [55]:
def linear_reg(old_embed, new_embed, reg):
    rhs = tf.linalg.matmul(old_embed, old_embed, transpose_a=True)
    lhs = tf.linalg.matmul(old_embed, new_embed, transpose_a=True)
    return tf.linalg.lstsq(rhs, lhs, l2_regularizer=reg)

In [73]:
dist = topic_dist(0)
old_embed = dist[:90]
new_embed = dist[1:91]
trans = linear_reg(old_embed, new_embed, 0.3)

In [74]:
pred = tf.linalg.matmul(trans, tf.expand_dims(dist[90], 1))

In [77]:
tf.math.reduce_sum((pred - tf.expand_dims(dist[91], 1))**2)

<tf.Tensor: shape=(), dtype=float32, numpy=14.248801>

In [78]:
(pred - tf.expand_dims(dist[91], 1))**2

<tf.Tensor: shape=(300, 1), dtype=float32, numpy=
array([[7.26384451e-06],
       [2.48449482e-02],
       [6.56153709e-02],
       [9.37536359e-02],
       [1.96923763e-01],
       [9.95805413e-02],
       [4.94867470e-03],
       [9.13792551e-02],
       [7.67794400e-02],
       [3.01532168e-03],
       [1.22324377e-02],
       [1.47417188e-01],
       [1.89990193e-01],
       [8.49751830e-02],
       [9.94711882e-04],
       [1.45701200e-01],
       [1.15279993e-03],
       [4.35182708e-04],
       [3.61793861e-02],
       [3.52018774e-02],
       [1.39161712e-02],
       [1.21387728e-02],
       [8.70667025e-02],
       [1.88042340e-03],
       [1.10588856e-02],
       [3.34285409e-03],
       [2.63946742e-01],
       [1.17867902e-01],
       [3.79502773e-02],
       [3.09049040e-02],
       [3.35333794e-02],
       [4.25203890e-03],
       [1.48263752e-01],
       [9.49576944e-02],
       [8.94311294e-02],
       [4.36869524e-02],
       [3.95513363e-02],
       [2.46011559e-03],
