In [1]:
import pickle
import tensorflow as tf
import numpy as np
from sklearn.metrics import mean_squared_error
from time import time
from gensim.models import LdaModel
from gensim import corpora

In [51]:
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Dataset root; every pickle and checkpoint below lives under it.
# (Other roots that appeared here before: "data_cnn/amazon_home/".)
DIR = "data_cnn/tcenr_data/"
# Directory holding the gensim dictionary / corpus / LDA model files.
LDA_DIR = "data_cnn/lda/tcenr_data/"
# Checkpoint written when the test MSE passes the save criterion.
OUTPUT_FILE = DIR + "cnn_best.ckpt"
# Checkpoint overwritten at the end of every epoch.
CHECKPOINT_FILE = DIR + "cnn_curr.ckpt"

# ---------------------------------------------------------------------------
# Dataset dimensions (must match the pickled data for the dataset in DIR)
# ---------------------------------------------------------------------------
# Values seen for other datasets:
#   TRAIN_SIZE: 172244, 1869470, 3959195, 2534578, 5397476, 621600
#   TEST_SIZE:  19232, 206717, 440201, 281164, 598485, 69228
#   DICT_SIZE:  57463, 125410, 125410, 148240, 148870, 83586
TRAIN_SIZE = 125376
TEST_SIZE = 13852
DICT_SIZE = 58488       # number of rows of the word-embedding matrix
REVIEW_WORDS = 500      # fixed number of word ids fed per review
WORD_EMB = 50           # width of each word embedding

# ---------------------------------------------------------------------------
# Network architecture
# ---------------------------------------------------------------------------
FILTER_SIZES = [3]      # convolution window heights, in words
NUM_FILTERS = 100       # filters per window size
LATENT_SIZE = 32        # width of the hidden layer before the output unit
CLASSES = 5             # not referenced by the cells in this notebook

# ---------------------------------------------------------------------------
# Training schedule
# ---------------------------------------------------------------------------
LR = 0.01                   # Adam learning rate
EPOCHS = 100                # upper bound on training epochs
BATCH_SIZE = 8192
MIN_EPOCH_TO_SAVE = 3       # no "best" checkpoint before this epoch
MSE_DIFF_TO_SAVE = 1.002    # save when test_mse / best_mse is below this ratio
EARLY_STOP_INTERVAL = 10    # stop after this many epochs without a save

# ---------------------------------------------------------------------------
# Run switches
# ---------------------------------------------------------------------------
RESTORE = True              # True: load OUTPUT_FILE instead of training
EXPORT_TOPIC_EMB = True     # True: run the LDA topic-embedding export cell
LDA_TOPICS = 50             # topic count used in LDA artefact file names

In [52]:
# Start from an empty default graph so that re-running the graph-building
# cells below does not collide with variables created by an earlier run.
tf.reset_default_graph()

In [53]:
# Graph inputs: a batch of reviews as fixed-width rows of word ids, and the
# integer score attached to each review.
in_review = tf.placeholder(dtype=tf.int32, shape=[None, REVIEW_WORDS], name="in_review")
in_score = tf.placeholder(dtype=tf.int32, shape=[None], name="in_score")

random_uniform_initializer = tf.random_uniform_initializer(minval=-1.0, maxval=1.0)

# Placeholder for an L2 penalty term; nothing in this notebook adds to it.
l2_loss = tf.constant(0.0)

with tf.name_scope("review_embedding"):
    # Word-embedding table, one row per dictionary entry.
    W1 = tf.get_variable(
        "W_emb",
        shape=[DICT_SIZE, WORD_EMB],
        dtype=tf.float32,
        initializer=random_uniform_initializer,
    )
    # Look up the embedding of every word id in the batch ...
    embedded_review = tf.nn.embedding_lookup(W1, in_review)
    # ... and append a trailing axis of size 1 so the result can be fed to conv2d.
    embedded_reviews = tf.expand_dims(embedded_review, axis=-1)

In [54]:
truncated_normal_initializer = tf.truncated_normal_initializer(stddev=0.1)
constant_initializer_filter = tf.constant(0.1, shape=[NUM_FILTERS])

# One pooled feature tensor is collected per entry of FILTER_SIZES.
pooled_outputs = []

for i, filter_size in enumerate(FILTER_SIZES):
    with tf.name_scope("conv-maxpool-%s" % filter_size):
        # Convolution: each filter spans `filter_size` words and the full
        # embedding width.
        filter_shape = [filter_size, WORD_EMB, 1, NUM_FILTERS]
        W = tf.get_variable(
            "W_"+str(i),
            shape=filter_shape,
            dtype=tf.float32,
            initializer=truncated_normal_initializer,
        )
        b = tf.get_variable(
            "b_"+ str(i),
            dtype=tf.float32,
            initializer=constant_initializer_filter,
        )
        conv = tf.nn.conv2d(
            embedded_reviews,
            W,
            strides=[1, 1, 1, 1],
            padding="VALID",
            name="conv_{}".format(i),
        )
        # Bias + ReLU non-linearity.
        h = tf.nn.relu(tf.nn.bias_add(conv, b), name="relu")
        # Max-pool over every position the VALID convolution produced, leaving
        # a single value per filter.
        pool_window = [1, REVIEW_WORDS - filter_size + 1, 1, 1]
        pooled = tf.nn.max_pool(
            h,
            ksize=pool_window,
            strides=[1, 1, 1, 1],
            padding='VALID',
            name="pool_{}".format(i),
        )
        pooled_outputs.append(pooled)

# Join the pooled features of all filter sizes along the channel axis and
# flatten them to one feature row per review.
num_filters_total = NUM_FILTERS * len(FILTER_SIZES)
h_pool = tf.concat(pooled_outputs, axis=3)
h_pool_flat = tf.reshape(h_pool, shape=[-1, num_filters_total])

In [55]:
xavier_initializer = tf.contrib.layers.xavier_initializer()

with tf.name_scope("dropout"):
    # The keep probability is 1.0, so no units are dropped.
    h_drop = tf.nn.dropout(h_pool_flat, 1.0)

with tf.name_scope("get_fea"):
    # Dense layer projecting the pooled convolution features to LATENT_SIZE.
    Wh = tf.get_variable(
        "Wh",
        shape=[num_filters_total, LATENT_SIZE],
        initializer=xavier_initializer,
        dtype=tf.float32,
    )
    bh = tf.get_variable(
        "bh",
        initializer=tf.constant(0.1, shape=[LATENT_SIZE]),
        dtype=tf.float32,
    )
    f_fea = tf.matmul(h_drop, Wh) + bh
    h_final = tf.nn.relu(f_fea)

In [56]:
# Regression head: a single linear unit on top of the latent features.
Wout = tf.get_variable("Wout",shape=[LATENT_SIZE,1],initializer=xavier_initializer,dtype=tf.float32)
bout = tf.get_variable("bout",initializer = tf.constant(0.1, shape=[1]),dtype=tf.float32)
predictions_raw = tf.matmul(h_final,Wout) + bout
# Drop only the trailing unit axis. A bare tf.squeeze() removes *every*
# size-1 dimension, so a batch holding one review would collapse to a scalar
# and the static shape of `predictions` would be of unknown rank.
predictions = tf.squeeze(predictions_raw, axis=1)
# Mean squared error between the predicted and the fed scores, minimised with Adam.
loss = tf.losses.mean_squared_error(labels=in_score,predictions=predictions)
train_step = tf.train.AdamOptimizer(LR).minimize(loss)

In [57]:
def get_feed_dict(dataset,start,end,pad_size):
    """Build the feed dict for the examples ``dataset[start:end]``.

    Args:
        dataset: mapping with a ``"reviews"`` sequence (each review a list of
            word ids) and a parallel ``"scores"`` sequence.
        start: index of the first example in the batch (inclusive).
        end: index one past the last example in the batch.
        pad_size: number of word ids every review row must contain.

    Returns:
        A dict mapping the ``in_review`` placeholder to the batch of reviews,
        each exactly ``pad_size`` ids long, and the ``in_score`` placeholder to
        the matching slice of scores.

    Reviews shorter than ``pad_size`` are right-padded with id 0. Reviews
    longer than ``pad_size`` are truncated; previously they were passed
    through unchanged, which yields a ragged batch that cannot be fed to the
    fixed-width placeholder.
    """
    batch_reviews = []
    for review in dataset["reviews"][start:end]:
        # Slicing is a no-op for reviews that already fit.
        review = review[:pad_size]
        missing = pad_size - len(review)
        if missing > 0:
            review = review + [0] * missing
        batch_reviews.append(review)
    return {in_review: batch_reviews, in_score: dataset["scores"][start:end]}

# Session and saver shared by every cell below.
sess = tf.Session()
saver = tf.train.Saver()

if RESTORE:
    # Load the previously saved best weights instead of training.
    print("Restoring model from " + OUTPUT_FILE)
    saver.restore(sess,OUTPUT_FILE)
else:
    print("Training the model from scratch")
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    # The files are opened with `with` so the handles are closed right after
    # loading instead of being leaked.
    with open(DIR + "glove_file.pkl","rb") as glove_file:
        glove = pickle.load(glove_file)
    glove_init = np.array(glove)
    with open(DIR + "train.pkl","rb") as train_file:
        train = pickle.load(train_file)
    with open(DIR + "test.pkl","rb") as test_file:
        test = pickle.load(test_file)
    sess.run(tf.global_variables_initializer())
    sess.run(tf.local_variables_initializer())
    # Overwrite the randomly initialised embedding table with the loaded vectors.
    sess.run(W1.assign(glove_init))
    best_mse=9999
    best_mse_epoch=0

    for epoch in range(1,EPOCHS+1):
        t1 = time()
        # Shuffle the order in which batches are visited (the examples inside
        # each batch stay the same from epoch to epoch).
        training_indexes = list(range(0, TRAIN_SIZE, BATCH_SIZE))
        total_batches = len(training_indexes)
        np.random.shuffle(training_indexes)
        train_ratings=np.zeros(TRAIN_SIZE)
        train_outputs=np.zeros(TRAIN_SIZE)
        train_losses=np.zeros(total_batches)
        curr_iter=0
        # One optimisation step per batch.
        for start in training_indexes:
            end = min(start + BATCH_SIZE,TRAIN_SIZE)
            feed_dict = get_feed_dict(train,start,end,REVIEW_WORDS)
            # Run the training step and keep the labels/predictions so the
            # epoch-level train MSE can be computed afterwards.
            _,curr_loss,curr_ratings, curr_output = sess.run([train_step,loss,in_score,predictions],feed_dict)
            train_ratings[start:end] = curr_ratings
            train_outputs[start:end] = curr_output
            train_losses[curr_iter] = curr_loss
            curr_iter+=1

        # Always keep the latest weights, independent of the "best" checkpoint.
        saver.save(sess, CHECKPOINT_FILE)
        train_mse = mean_squared_error(y_true=train_ratings, y_pred=train_outputs)

        # Evaluate on the test set (no train_step is run here).
        test_indexes = list(range(0, TEST_SIZE, BATCH_SIZE))
        test_ratings = np.zeros(TEST_SIZE)
        test_outputs = np.zeros(TEST_SIZE)
        for start in test_indexes:
            end = min(start+BATCH_SIZE, TEST_SIZE)
            feed_dict = get_feed_dict(test,start,end,REVIEW_WORDS)
            curr_test_ratings, curr_test_output = sess.run([in_score, predictions],feed_dict)
            test_ratings[start:end] = curr_test_ratings
            test_outputs[start:end] = curr_test_output
        test_mse = mean_squared_error(y_true=test_ratings, y_pred=test_outputs)
        # time() returns seconds, so the elapsed time is reported in seconds.
        print("epoch {}:{} s. avg loss {:.4f}. train MSE: {:.4f}, test MSE: {:.4f}".format(epoch,time()-t1,
                                                                                            np.average(train_losses),
                                                                                            train_mse, test_mse))

        # Save when the test MSE is below MSE_DIFF_TO_SAVE times the best one.
        # With a ratio above 1 this also accepts a slightly worse MSE, and
        # `best_mse` then moves to that value.
        if (test_mse/best_mse)<MSE_DIFF_TO_SAVE and epoch>=MIN_EPOCH_TO_SAVE:
            save_path = saver.save(sess, OUTPUT_FILE)
            print("MSE improved from {:.4f} to {:.4f}. Model saved to {}".format(best_mse,test_mse,save_path))
            best_mse = test_mse
            best_mse_epoch = epoch

        # Stop once more than EARLY_STOP_INTERVAL epochs passed without a save.
        if (epoch - best_mse_epoch)>EARLY_STOP_INTERVAL:
            print("Early stop due to no imporvement since epoch {}".format(best_mse_epoch))
            break

Restoring model from data_cnn/tcenr_data/cnn_best.ckpt
INFO:tensorflow:Restoring parameters from data_cnn/tcenr_data/cnn_best.ckpt


In [58]:
# Number of scored examples in the test set.
# NOTE(review): `test` is only assigned in the train-from-scratch branch or by
# the evaluation cell that reloads test.pkl; with RESTORE = True on a fresh
# kernel one of those has to run before this cell.
len(test['scores'])

13852

In [59]:
# Evaluate the current weights on the test set, one batch at a time.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# `with` closes the file handle as soon as the data is loaded.
with open(DIR + "test.pkl","rb") as test_file:
    test = pickle.load(test_file)
test_indexes = list(range(0, TEST_SIZE, BATCH_SIZE))
test_ratings = np.zeros(TEST_SIZE)
test_outputs = np.zeros(TEST_SIZE)
for start in test_indexes:
    end = min(start+BATCH_SIZE, TEST_SIZE)
    feed_dict = get_feed_dict(test,start,end,REVIEW_WORDS)
    # Fetch the fed labels together with the predictions so both arrays are
    # filled at the same positions.
    curr_test_ratings, curr_test_output = sess.run([in_score, predictions],feed_dict)
    test_ratings[start:end] = curr_test_ratings
    test_outputs[start:end] = curr_test_output
test_mse = mean_squared_error(y_true=test_ratings, y_pred=test_outputs)


In [60]:
# Report the test-set MSE computed by the evaluation loop.
print(test_mse)

0.5485044573951946


In [28]:
import gc

# Drop the reference to the test set and collect garbage before loading the
# training set.
test = None
gc.collect()
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
# `with` closes the file handle as soon as the data is loaded.
with open(DIR + "train.pkl","rb") as train_file:
    train = pickle.load(train_file)
# Evaluation only: no train_step is run. Results are written by index, so the
# batches are visited in order rather than shuffled.
training_indexes = list(range(0, TRAIN_SIZE, BATCH_SIZE))
total_batches = len(training_indexes)
train_ratings=np.zeros(TRAIN_SIZE)
train_outputs=np.zeros(TRAIN_SIZE)
curr_iter=0
for start in training_indexes:
    end = min(start + BATCH_SIZE,TRAIN_SIZE)
    feed_dict = get_feed_dict(train,start,end,REVIEW_WORDS)
    # Forward pass only: fetch the fed labels and the model predictions.
    curr_ratings, curr_output = sess.run([in_score,predictions],feed_dict)
    train_ratings[start:end] = curr_ratings
    train_outputs[start:end] = curr_output
    curr_iter+=1
    print("batch {} / {}".format(curr_iter,total_batches))
train_mse = mean_squared_error(y_true=train_ratings, y_pred=train_outputs)
print(train_mse)

batch 1 / 229
batch 2 / 229
batch 3 / 229
batch 4 / 229
batch 5 / 229
batch 6 / 229
batch 7 / 229
batch 8 / 229
batch 9 / 229
batch 10 / 229
batch 11 / 229
batch 12 / 229
batch 13 / 229
batch 14 / 229
batch 15 / 229
batch 16 / 229
batch 17 / 229
batch 18 / 229
batch 19 / 229
batch 20 / 229
batch 21 / 229
batch 22 / 229
batch 23 / 229
batch 24 / 229
batch 25 / 229
batch 26 / 229
batch 27 / 229
batch 28 / 229
batch 29 / 229
batch 30 / 229
batch 31 / 229
batch 32 / 229
batch 33 / 229
batch 34 / 229
batch 35 / 229
batch 36 / 229
batch 37 / 229
batch 38 / 229
batch 39 / 229
batch 40 / 229
batch 41 / 229
batch 42 / 229
batch 43 / 229
batch 44 / 229
batch 45 / 229
batch 46 / 229
batch 47 / 229
batch 48 / 229
batch 49 / 229
batch 50 / 229
batch 51 / 229
batch 52 / 229
batch 53 / 229
batch 54 / 229
batch 55 / 229
batch 56 / 229
batch 57 / 229
batch 58 / 229
batch 59 / 229
batch 60 / 229
batch 61 / 229
batch 62 / 229
batch 63 / 229
batch 64 / 229
batch 65 / 229
batch 66 / 229
batch 67 / 229
batc

In [71]:
# Export one embedding vector per LDA topic: the topic's word-probability
# weighted sum of the CNN's learned word embeddings.
if EXPORT_TOPIC_EMB:
    topics_lst = [50]
    # The loop variable gets its own name so the LDA_TOPICS config constant is
    # not overwritten.
    for num_topics in topics_lst:
        dictionary_path = LDA_DIR + "dictionary_" + str(num_topics) + ".dict"
        corpus_path = LDA_DIR + "corpus"+ ".lda-c"
        lda_model_path = LDA_DIR + "lda_model_" + str(num_topics) + "_topics.lda"
        print(corpus_path)

        # Each gensim artefact is loaded once.
        dictionary = corpora.Dictionary.load(dictionary_path)
        corpus = corpora.BleiCorpus(corpus_path)
        lda = LdaModel.load(lda_model_path)

        # NOTE: pickle.load can execute arbitrary code; only load trusted files.
        with open(DIR + "word_dictionary.pkl","rb") as word_dict_file:
            word_dict = pickle.load(word_dict_file)
        with open(DIR + "word_dictionary_reverse.pkl","rb") as word_dict_rev_file:
            word_dict_rev = pickle.load(word_dict_rev_file)

        # Current values of the CNN word-embedding table.
        word_embeddings = sess.run(W1)

        # Raw topic -> word estimates, one row per topic.
        topics_terms = lda.state.get_lambda()

        # Normalise every row so it sums to 1.
        topics_terms_proba = topics_terms / topics_terms.sum(axis=1, keepdims=True)

        # Word string for every column of the topic/word matrix.
        words = [lda.id2word[i] for i in range(topics_terms_proba.shape[1])]

        # Embedding of every LDA word, in LDA column order. Words missing from
        # the CNN dictionary keep an all-zero row and so contribute nothing.
        lda_word_vectors = np.zeros((len(words), WORD_EMB))
        for col, word in enumerate(words):
            if word in word_dict:
                lda_word_vectors[col] = word_embeddings[word_dict[word]]

        # (topics x words) . (words x WORD_EMB) is the probability-weighted sum
        # of word vectors per topic. This replaces building and summing a
        # (topics x words x WORD_EMB) array with a Python double loop.
        topic_embeddings = np.dot(topics_terms_proba, lda_word_vectors)

        with open(DIR + "topic_embeddings_" +str(num_topics) + ".pkl",'wb') as file:
            pickle.dump(topic_embeddings,file)

data_cnn/lda/tcenr_data/corpus.lda-c


In [10]:
# Close the TensorFlow session; `sess` cannot be used for further run() calls
# after this cell.
sess.close()

In [16]:
# Inspect the shape of the topic-embedding matrix produced by the export cell.
topic_embeddings.shape

(50, 50)