# LDA2Vec
Adapted from tensorflow implementation: 

(Current) https://github.com/nateraw/Lda2vec-Tensorflow

(Old) https://github.com/meereeum/lda2vec-tf

(Original) https://github.com/cemoody/lda2vec

In [1]:
import pandas as pd
import numpy as np
import pyLDAvis
import pickle

In [2]:
import sys
sys.path.append('./Lda2vec-Tensorflow-master')

  and should_run_async(code)


In [3]:
from lda2vec import *

  and should_run_async(code)


Instructions for updating:
non-resource variables are not supported in the long term


In [4]:
import tensorflow as tf
from tensorflow.keras import layers
#device_name = tf.test.gpu_device_name()
#if device_name != '/device:GPU:0':
#  raise SystemError('GPU device not found')
#print('Found GPU at: {}'.format(device_name))

## Find Ending Index of Documents for Each Year

In [5]:
data = pd.read_csv("papers.csv")

In [6]:
data = data.sort_values(by=["year"], kind='mergesort')

In [7]:
data

Unnamed: 0,id,year,title,event_type,pdf_name,abstract,paper_text
0,1,1987,Self-Organization of Associative Database and ...,,1-self-organization-of-associative-database-an...,Abstract Missing,767\n\nSELF-ORGANIZATION OF ASSOCIATIVE DATABA...
1,10,1987,A Mean Field Theory of Layer IV of Visual Cort...,,10-a-mean-field-theory-of-layer-iv-of-visual-c...,Abstract Missing,683\n\nA MEAN FIELD THEORY OF LAYER IV OF VISU...
111,11,1987,Microelectronic Implementations of Connectioni...,,11-microelectronic-implementations-of-connecti...,Abstract Missing,515\n\nMICROELECTRONIC IMPLEMENTATIONS OF CONN...
219,12,1987,Using Neural Networks to Improve Cochlear Impl...,,12-using-neural-networks-to-improve-cochlear-i...,Abstract Missing,783\n\nUSING NEURAL NETWORKS TO IMPROVE\nCOCHL...
328,13,1987,Temporal Patterns of Activity in Neural Networks,,13-temporal-patterns-of-activity-in-neural-net...,Abstract Missing,297\n\nTEMPORAL PATTERNS OF ACTIVITY IN\nNEURA...
...,...,...,...,...,...,...,...
6943,7280,2017,"On Separability of Loss Functions, and Revisit...",Poster,7280-on-separability-of-loss-functions-and-rev...,We revisit the classical analysis of generativ...,"On Separability of Loss Functions, and Revisit..."
6944,7281,2017,Maxing and Ranking with Few Assumptions,Poster,7281-maxing-and-ranking-with-few-assumptions.pdf,PAC maximum ...,Maxing and Ranking with Few Assumptions\nMoein...
6945,7282,2017,On clustering network-valued data,Poster,7282-on-clustering-network-valued-data.pdf,"Community detection, which focuses on clusteri...",On clustering network-valued data\n\nSoumendu ...
6946,7283,2017,A General Framework for Robust Interactive Lea...,Poster,7283-a-general-framework-for-robust-interactiv...,We propose a general framework for interactive...,A General Framework for Robust Interactive\nLe...


In [8]:
years = list(data["year"].unique())
counts = list(data["year"].value_counts())[::-1]
for i in range(1,len(counts)):
  counts[i] += counts[i-1]
year_index = {years[i]:counts[i] for i in range(len(years))}

In [9]:
print(year_index)
for i in range(1988,1987+len(year_index)):
    print(i, year_index[i]-year_index[i-1])

{1987: 90, 1988: 184, 1989: 285, 1990: 412, 1991: 552, 1992: 695, 1993: 839, 1994: 989, 1995: 1139, 1996: 1290, 1997: 1442, 1998: 1594, 1999: 1746, 2000: 1904, 2001: 2101, 2002: 2299, 2003: 2503, 2004: 2710, 2005: 2917, 2006: 3124, 2007: 3341, 2008: 3591, 2009: 3853, 2010: 4145, 2011: 4451, 2012: 4811, 2013: 5179, 2014: 5582, 2015: 5993, 2016: 6562, 2017: 7241}
1988 94
1989 101
1990 127
1991 140
1992 143
1993 144
1994 150
1995 150
1996 151
1997 152
1998 152
1999 152
2000 158
2001 197
2002 198
2003 204
2004 207
2005 207
2006 207
2007 217
2008 250
2009 262
2010 292
2011 306
2012 360
2013 368
2014 403
2015 411
2016 569
2017 679


## Preprocessing

$\tt cleaned.txt$ is a txt file with all NIPS papers with all stop words and words with $\rm{length} \leq 3$ removed.

Preprocess incorporates tokenization (splitting sentences into words), creating a vocabulary to save mappings from tokens to integer indices, and generating skip-grams.

In [10]:
df = pd.read_csv("cleaned.txt")
df = df[df["stop_removed_paper_text"].notnull()]

In [15]:
#Aggregate 100 papers based on groupings of 10 from the years 2008-2017
aggregate_data = pd.DataFrame(index=[0], columns=['index','paper_text'])

offset_2008 = 3341
offset_2009 = 3591
offset_2010 = 3853
offset_2011 = 4145
offset_2012 = 4451
offset_2013 = 4811
offset_2014 = 5179
offset_2015 = 5582
offset_2016 = 5993
offset_2017 = 6562

offset = 0

for i in range(0,10):
    
    aggregate_data.loc[i+0, 'index'] = offset_2008+offset
    aggregate_data.loc[i+10, 'index'] = offset_2009+offset
    aggregate_data.loc[i+20, 'index'] = offset_2010+offset
    aggregate_data.loc[i+30, 'index'] = offset_2011+offset
    aggregate_data.loc[i+40, 'index'] = offset_2012+offset
    aggregate_data.loc[i+50, 'index'] = offset_2013+offset
    aggregate_data.loc[i+60, 'index'] = offset_2014+offset
    aggregate_data.loc[i+70, 'index'] = offset_2015+offset
    aggregate_data.loc[i+80, 'index'] = offset_2016+offset
    aggregate_data.loc[i+90, 'index'] = offset_2017+offset
    
    aggregate_data.loc[i+0, 'paper_text'] = df.iat[i,offset_2008+offset]
    aggregate_data.loc[i+10, 'paper_text'] = df.iat[i,offset_2009+offset]
    aggregate_data.loc[i+20, 'paper_text'] = df.iat[i,offset_2010+offset]
    aggregate_data.loc[i+30, 'paper_text'] = df.iat[i,offset_2011+offset]
    aggregate_data.loc[i+40, 'paper_text'] = df.iat[i,offset_2012+offset]
    aggregate_data.loc[i+50, 'paper_text'] = df.iat[i,offset_2013+offset]
    aggregate_data.loc[i+60, 'paper_text'] = df.iat[i,offset_2014+offset]
    aggregate_data.loc[i+70, 'paper_text'] = df.iat[i,offset_2015+offset]
    aggregate_data.loc[i+80, 'paper_text'] = df.iat[i,offset_2016+offset]
    aggregate_data.loc[i+90, 'paper_text'] = df.iat[i,offset_2017+offset]
    
    offset += 1
    

print(aggregate_data)
aggregate_data.to_excel("test.xlsx")

   index paper_text
0   3341        NaN
10  3591        NaN
20  3853        NaN
30  4145        NaN
40  4451        NaN
..   ...        ...
59  4820        NaN
69  5188        NaN
79  5591        NaN
89  6002        NaN
99  6571        NaN

[100 rows x 2 columns]


In [12]:
# Initialize a preprocessor
P = nlppipe.Preprocessor(aggregate_data, "paper_text", max_features=30000, maxlen=10000, min_count=30)

In [13]:
# Run the preprocessing on your dataframe
P.preprocess()

AttributeError: 'float' object has no attribute 'split'

In [None]:
# Should we load pretrained embeddings from file
load_embeds = True

# Load embeddings from file if we choose to do so
if load_embeds:
    # Load embedding matrix from file path - change path to where you saved them
    embedding_matrix = P.load_glove("glove.6B.300d.txt")
else:
    embedding_matrix = None

In [None]:
# Save data to data_dir
P.save_data("clean_data", embedding_matrix=embedding_matrix)

## Using the LDA2Vec Model

Using the LDA2Vec model on preprocessed data.

In [None]:
# Path to preprocessed data
data_path  = "clean_data"
# Whether or not to load saved embeddings file
load_embeds = True

# Load data from files
(idx_to_word, word_to_idx, freqs, pivot_ids,
 target_ids, doc_ids, embed_matrix) = utils.load_preprocessed_data(data_path, load_embed_matrix=load_embeds)

In [None]:
# Number of unique documents
num_docs = doc_ids.max() + 1
# Number of unique words in vocabulary (int)
vocab_size = len(freqs)
# Embed layer dimension size
# If not loading embeds, change 128 to whatever size you want.
embed_size = embed_matrix.shape[1] if load_embeds else 128
# Number of topics to cluster into
num_topics = 20
# Amount of iterations over entire dataset
num_epochs = 50
# Batch size - Increase/decrease depending on memory usage
batch_size = 4096
# Epoch that we want to "switch off" regularization
switch_loss_epoch = 0
# Pretrained embeddings value
pretrained_embeddings = embed_matrix if load_embeds else None
# If True, save logdir, otherwise don't
save_graph = False

In [None]:
# Initialize the model
m = model(num_docs,
          vocab_size,
          num_topics,
          embedding_size=embed_size,
          pretrained_embeddings=pretrained_embeddings,
          freqs=freqs,
          batch_size = batch_size,
          save_graph_def=save_graph)

In [None]:
# Train the model
m.train(pivot_ids,
        target_ids,
        doc_ids,
        len(pivot_ids),
        num_epochs,
        idx_to_word=idx_to_word,
        switch_loss_epoch=switch_loss_epoch)

## Get Word and Topic Embeddings

Visualize topics

In [None]:
idx_to_word

In [None]:
doc_embed = m.sesh.run(m.mixture.doc_embedding)

In [None]:
topic_embed = m.sesh.run(m.mixture.topic_embedding)

In [None]:
word_embed = m.sesh.run(m.w_embed.embedding)

In [None]:
# Extract all unique words in order of index 0-vocab_size
vocabulary = []
for k,v in idx_to_word.items():
    vocabulary.append(v)

In [None]:
doc_lengths = np.load("clean_data" + "/doc_lengths.npy")

In [None]:
vis_data = utils.prepare_topics(doc_embed, topic_embed, word_embed, np.array(vocabulary), doc_lengths=doc_lengths,
                              term_frequency=freqs, normalize=True)

In [None]:
prepared_vis_data = pyLDAvis.prepare(**vis_data)

In [None]:
pyLDAvis.display(prepared_vis_data)

In [None]:
np.save("doc_embed", doc_embed)

In [None]:
np.save("word_embed", word_embed)

In [None]:
np.save("topic_embed", topic_embed)

In [None]:
def closest(embed_idxs, embed_type):
    if embed_type == 0:  # Topics
        norm_embeds = topic_embed / np.linalg.norm(topic_embed, axis=1).reshape((topic_embed.shape[0],1))
        embed_vec = topic_embed[embed_idxs[2]] - topic_embed[embed_idxs[0]] + topic_embed[embed_idxs[1]]
        embed_norm = (embed_vec) / np.linalg.norm(embed_vec)
    else:  # Words
        norm_embeds = word_embed / np.linalg.norm(word_embed, axis=1).reshape((word_embed.shape[0],1))
        embed_vec = word_embed[embed_idxs[2]] - word_embed[embed_idxs[0]] + word_embed[embed_idxs[1]]
        embed_norm = (embed_vec) / np.linalg.norm(embed_vec)
    cos_sim = np.dot(norm_embeds, embed_norm)
    sort = (-cos_sim).argsort()
    for i in range(3): sort=sort[sort != embed_idxs[i]]
    return sort[0]

In [None]:
def cosadd(embed1, embed2, embed3, embed4, embed_type):
    if embed_type == 0:
        norm_embed1 = topic_embed[embed1] / np.linalg.norm(topic_embed[embed1])
        norm_embed2 = topic_embed[embed2] / np.linalg.norm(topic_embed[embed2])
        norm_embed3 = topic_embed[embed3] / np.linalg.norm(topic_embed[embed3])
        norm_embed4 = topic_embed[embed4] / np.linalg.norm(topic_embed[embed4])
    else:
        norm_embed1 = word_embed[embed1] / np.linalg.norm(word_embed[embed1])
        norm_embed2 = word_embed[embed2] / np.linalg.norm(word_embed[embed2])
        norm_embed3 = word_embed[embed3] / np.linalg.norm(word_embed[embed3])
        norm_embed4 = word_embed[embed4] / np.linalg.norm(word_embed[embed4])
    return np.dot(norm_embed4, norm_embed3) - np.dot(norm_embed4, norm_embed1) + np.dot(norm_embed4, norm_embed2)

In [None]:
from itertools import *
from math import comb
combine = list(combinations(list(range(word_embed.shape[0])), 3))
score = np.zeros(comb(word_embed.shape[0],3))
for i in range(score.shape[0]):
    (embed_idx1, embed_idx2, embed_idx3) = combine[i]
    score[i] = cosadd(embed_idx1, embed_idx2, embed_idx3, closest([embed_idx1, embed_idx2, embed_idx3], 1), 1)
top_analogies = [list(combine[idx])+[closest([combine[idx][0], combine[idx][1], combine[idx][2]], 1)] for idx in (-score).argsort()]
top_scores = [score[idx] for idx in (-score).argsort()]

In [None]:
top_analogies[:5]

In [None]:
top_scores[:5]

In [None]:
cosadd(10, 14, 33, 167, 1)

In [None]:
word_embed.shape

In [None]:
m.sesh.get_k_closest([], in_type='word', vs_type='word', k=10, idx_to_word=None, verbose=False):