# Temporal lda2vec Model

We will implement both the lda2vec-TM model and an RNN to predict future document embeddings.

In [1]:
import tensorflow as tf
# Fail fast unless TensorFlow reports the first GPU as available.
device_name = tf.test.gpu_device_name()
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
import numpy as np
import pickle
import pandas as pd
import keras

Found GPU at: /device:GPU:0Metal device set to: Apple M1



2021-11-26 20:22:42.193010: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-26 20:22:42.193091: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [2]:
import os
# Ask TensorFlow's C++ backend to log errors only.
# NOTE(review): tensorflow was already imported in an earlier cell, and the
# INFO/WARNING lines still printed by later cells suggest this setting is not
# taking effect; it presumably needs to be set before `import tensorflow` — confirm.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

## Retrieve Embeddings

We will load the saved embeddings and the word dictionary.

In [3]:
def load_embed():
    """Load the saved document, word and topic embedding arrays.

    Reads ``doc_embed_col.npy``, ``word_embed_col.npy`` and
    ``topic_embed_col.npy`` (paths relative to the current working directory)
    and wraps each loaded array in a constant tensor.

    Returns:
        tuple: ``(doc_embed, word_embed, topic_embed)`` in that order.
    """
    file_names = ("doc_embed_col.npy", "word_embed_col.npy", "topic_embed_col.npy")
    return tuple(tf.constant(np.load(name)) for name in file_names)

In [4]:
def load_word_dict(idx_to_word):
    """Load one of the pickled vocabulary dictionaries from ``doc_collect/``.

    Args:
        idx_to_word: If truthy, load ``doc_collect/idx_to_word.pickle``;
            otherwise load ``doc_collect/word_to_idx.pickle``.

    Returns:
        The object stored in the selected pickle file.

    Raises:
        OSError: If the selected file cannot be opened.
    """
    name = "idx_to_word" if idx_to_word else "word_to_idx"
    # The context manager closes the handle even if unpickling fails; the
    # previous version left the file open.
    with open(f"doc_collect/{name}.pickle", "rb") as file:
        # NOTE: pickle can execute arbitrary code on load; only use this with
        # files produced by this project.
        return pickle.load(file)

In [5]:
def word_embed_lookup(word):
    """Return the row of the word-embedding tensor that belongs to ``word``.

    The embeddings and the word-to-index dictionary are reloaded from disk on
    every call.

    Args:
        word: Key to look up in the word-to-index dictionary.

    Returns:
        The slice of the word-embedding tensor at the index stored for ``word``.

    Raises:
        KeyError: If ``word`` is not a key of the dictionary.
    """
    _, word_vectors, _ = load_embed()
    row = load_word_dict(False)[word]
    return word_vectors[row]

In [6]:
def word_idx_lookup(word):
    """Return the value stored for ``word`` in the word-to-index dictionary.

    Raises:
        KeyError: If ``word`` is not a key of the dictionary.
    """
    return load_word_dict(False)[word]

## Modeling Topic Distribution

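We now model each document's topic distribution as a softmax over its document embedding and combine it linearly with the topic embeddings; the resulting vectors are the series used to predict future documents.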

In [7]:
def topic_dist():
    """Mix the topic embeddings using softmax-normalised document embeddings.

    Loads the saved embeddings, applies a softmax to the document embeddings
    and matrix-multiplies the result with the topic embeddings.

    Returns:
        The product ``softmax(doc_embed) @ topic_embed``.
    """
    doc_embed, _, topic_embed = load_embed()
    doc_weights = tf.nn.softmax(doc_embed)
    return tf.linalg.matmul(doc_weights, topic_embed)

## Neural Network Regression

We will implement a recurrent neural network (a SimpleRNN layer followed by dense layers) that predicts the next document vector from the preceding `step` vectors.

In [8]:
# convert into dataset matrix
def convertToMatrix(data, step):
    """Turn a sequence into sliding-window inputs and next-step targets.

    For every start position ``i`` in ``range(data.shape[0] - step)`` the input
    is ``data[i:i + step]`` and the target is ``data[i + step]``.

    Args:
        data: Indexable object with a ``shape`` attribute; it is sliced along
            its first axis.
        step: Number of consecutive rows in each input window.

    Returns:
        tuple: ``(X, Y)`` tensors holding the windows and their targets.
    """
    starts = range(data.shape[0] - step)
    windows = [data[start:start + step] for start in starts]
    targets = [data[start + step] for start in starts]
    return tf.convert_to_tensor(windows), tf.convert_to_tensor(targets)

In [9]:
# Number of consecutive rows fed to the model for each prediction.
step = 10

# The first 240 rows are used for training; the remaining rows are held out for testing.
dist = topic_dist()
train, test = dist[:240, :], dist[240:, :]

2021-11-26 20:22:42.395344: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
2021-11-26 20:22:42.395372: I tensorflow/core/common_runtime/pluggable_device/pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


In [10]:
# Build (window, next-row) pairs for the training split first, then the test split.
(trainX, trainY), (testX, testY) = (
    convertToMatrix(split, step) for split in (train, test)
)

In [11]:
# Recurrent regressor: each sample is `step` consecutive 300-dimensional rows,
# and the output is a single 300-dimensional vector.
model = tf.keras.Sequential([
    tf.keras.layers.SimpleRNN(units=300, input_shape=(step, 300), activation="tanh"),
    tf.keras.layers.Dense(300, activation="tanh"),
    tf.keras.layers.Dense(300),  # linear output layer
])
model.compile(loss='cosine_similarity', optimizer='Adam')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn (SimpleRNN)       (None, 300)               180300    
_________________________________________________________________
dense (Dense)                (None, 300)               90300     
_________________________________________________________________
dense_1 (Dense)              (None, 300)               90300     
Total params: 360,900
Trainable params: 360,900
Non-trainable params: 0
_________________________________________________________________


In [12]:
# Train for 10 epochs in mini-batches of 20, holding out 10% of the training
# windows for validation.
model.fit(
    trainX,
    trainY,
    epochs=10,
    batch_size=20,
    validation_split=0.1,
)

Epoch 1/10


2021-11-26 20:22:43.662821: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)
2021-11-26 20:22:43.662983: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x288173130>

In [19]:
# Predicted next-row vectors for every training window.
trainPredict = model.predict(trainX)

In [20]:
# Predicted next-row vectors for every held-out test window.
testPredict = model.predict(testX)

In [21]:
# Stack the training predictions on top of the test predictions (row-wise).
predicted = tf.concat([trainPredict, testPredict], axis=0)

In [22]:
# Loss on the training windows; the model was compiled with loss='cosine_similarity'.
trainScore = model.evaluate(trainX, trainY, verbose=0)

In [23]:
# Same loss, computed on the held-out test windows.
testScore = model.evaluate(testX, testY, verbose=0)

In [24]:
# Keras' cosine_similarity loss is the negated cosine similarity, so a value
# closer to -1 means the predictions point in nearly the same direction as the targets.
testScore

-0.683212161064148