In [1]:
import logging
import os
from imp import reload

import numpy as np
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.models import Phrases
from gensim.corpora import Dictionary
from gensim.models import LdaModel
from gensim.models import ldamodel
from gensim.models import LdaModelOld
from gensim.models import ldamodelold

from bokeh.io import output_notebook
from bokeh.layouts import layout
from bokeh.models import Title, Legend, Div
from bokeh.plotting import figure, show

In [2]:
# Configure Bokeh so that later show() calls render inside the notebook output.
output_notebook()

In [3]:
# Configure logging: route DEBUG-and-above records from the root logger to a log file.

# NOTE: despite its name, `log_dir` holds the path of the log *file*.
log_dir = '../../../log_files/log.log'

# FileHandler does not create missing parent directories, so make sure it exists.
os.makedirs(os.path.dirname(log_dir), exist_ok=True)

logger = logging.getLogger()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')

# Re-running this cell must not attach a second handler for the same file,
# otherwise every record would be written to the log several times.
fhandler = next(
    (handler for handler in logger.handlers
     if isinstance(handler, logging.FileHandler)
     and handler.baseFilename == os.path.abspath(log_dir)),
    None,
)
if fhandler is None:
    fhandler = logging.FileHandler(filename=log_dir, mode='a')
    logger.addHandler(fhandler)
fhandler.setFormatter(formatter)
logger.setLevel(logging.DEBUG)

In [4]:
# Read data: load every paper of the selected NIPS years into `docs` as raw text.

# Folder containing all NIPS papers.
data_dir = '../../../data/nipstxt/'

# Folders containing individual NIPS papers.
#yrs = ['00', '01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12']
yrs = ['00']
dirs = ['nips' + yr for yr in yrs]

# Read all texts into a list.
docs = []
for yr_dir in dirs:
    yr_path = os.path.join(data_dir, yr_dir)
    # os.listdir returns entries in arbitrary order; sort them so the document
    # order (and everything trained on it with a fixed seed) is reproducible.
    files = sorted(os.listdir(yr_path))
    for filen in files:
        file_path = os.path.join(yr_path, filen)
        if not os.path.isfile(file_path):
            # Skip sub-directories and other non-file entries instead of crashing in open().
            continue
        # Note: ignoring characters that cause encoding errors.
        with open(file_path, errors='ignore') as fid:
            txt = fid.read()
        docs.append(txt)

In [5]:
# Tokenize the documents.

# Lowercase each raw text and split it into runs of word characters.
tokenizer = RegexpTokenizer(r'\w+')
docs = [tokenizer.tokenize(raw_text.lower()) for raw_text in docs]

# Drop purely numeric tokens (tokens that merely contain digits are kept)
# and tokens that are only one character long.
docs = [
    [tok for tok in doc_tokens if not tok.isnumeric() and len(tok) > 1]
    for doc_tokens in docs
]

In [6]:
# Lemmatize the documents.

# Replace every token by the result of WordNetLemmatizer.lemmatize (default arguments).
lemmatizer = WordNetLemmatizer()
docs = [list(map(lemmatizer.lemmatize, doc)) for doc in docs]

In [7]:
# Compute bigrams.

# Fit a Phrases model (min_count=20) and append the detected bigrams to each
# document. Only a single Phrases pass is run here.
bigram = Phrases(docs, min_count=20)
for doc in docs:
    # A token containing '_' is treated as a bigram and added to the document.
    # NOTE: r'\w+' also matches '_', so an original token that already contains
    # an underscore is appended a second time as well.
    doc.extend([token for token in bigram[doc] if '_' in token])



In [8]:
# Remove rare and common tokens.

# Build the dictionary from the tokenized documents.
dictionary = Dictionary(documents=docs)

# Drop tokens that occur in fewer than 20 documents, or in more than 50% of the documents.
dictionary.filter_extremes(no_above=0.5, no_below=20)

In [9]:
# Vectorize data.

# Convert every document to its bag-of-words representation via the dictionary.
corpus = list(map(dictionary.doc2bow, docs))

In [10]:
# Report the vocabulary size and the number of documents after filtering.
n_unique_tokens = len(dictionary)
n_documents = len(corpus)
print('Number of unique tokens: %d' % n_unique_tokens)
print('Number of documents: %d' % n_documents)

Number of unique tokens: 681
Number of documents: 90


In [None]:
# Re-import the ldamodelold module and rebind LdaModelOld to the freshly loaded class
# (reload() returns the reloaded module object).
LdaModelOld = reload(ldamodelold).LdaModelOld

In [None]:
# Train LDA model.

# Set training parameters.
num_topics = 10
chunksize = 1
passes = 10
iterations = 1
eval_every = 100
update_every=1

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModelOld(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='symmetric', eta='symmetric', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every, random_state=0)

In [14]:
# Re-import the ldamodel module and rebind LdaModel to the freshly loaded class
# (reload() returns the reloaded module object).
LdaModel = reload(ldamodel).LdaModel

In [15]:
# Train LDA model.

# Set training parameters.
num_topics = 10
chunksize = 1
passes = 10
iterations = 1
eval_every = 100
update_every=1

# Make a index to word dictionary.
temp = dictionary[0]  # This is only to "load" the dictionary.
id2word = dictionary.id2token

%time model = LdaModel(corpus=corpus, id2word=id2word, chunksize=chunksize, \
                       alpha='symmetric', eta='asymmetric', \
                       iterations=iterations, num_topics=num_topics, \
                       passes=passes, eval_every=eval_every, update_every=update_every, random_state=0)

-6.9379088891
-6.92155478444
-6.90435565647
-6.89286126587
-6.88981479734
-6.89648824302
-6.85361481472
-6.82566448766
-6.81444976494
-6.7921686612
CPU times: user 12.6 s, sys: 92 ms, total: 12.7 s
Wall time: 12.7 s


## Data

### chunksize=100

In [203]:
# Recorded bound values (ten per series) for the "new" runs in the chunksize=100 section.
# Plotted below as 'sym after'.
lda_new_sym = [
    -7.20780852768, -6.48027491729, -6.39355881137, -6.35073216324, -6.32447008592,
    -6.30691083434, -6.29370869898, -6.28376425986, -6.27617922491, -6.27016617593,
]

# Plotted below as 'auto after'.
lda_new_auto = [
    -7.21197266652, -6.40968603967, -6.29680956267, -6.24068462426, -6.2089305,
    -6.18930375001, -6.17680866761, -6.16854461189, -6.16271270432, -6.15845521829,
]

In [204]:
# Recorded bound values (ten per series) for the "old" runs in the chunksize=100 section.
# Plotted below as 'sym before'.
lda_old_sym = [
    -7.25247867382, -6.49981081176, -6.41634676867, -6.37629961919, -6.35376327852,
    -6.3387396089, -6.32790036692, -6.31953285676, -6.31267610035, -6.30721028608,
]

# Plotted below as 'auto before'.
lda_old_auto = [
    -7.25507185865, -6.53067814813, -6.47428823682, -6.48390241381, -6.52770945854,
    -6.58897195146, -6.65807185302, -6.72785922104, -6.79348132306, -6.85222778345,
]

In [211]:
# Plot the four recorded bound series against their index (0..9).
iterations = range(10)

p1 = figure(title='Variational lower bound', x_axis_label='Iterations', y_axis_label='Per word bound')

# (legend label, recorded values, colour) for each series, in drawing order.
series = [
    ('sym before', lda_old_sym, 'red'),
    ('auto before', lda_old_auto, 'blue'),
    ('sym after', lda_new_sym, 'green'),
    ('auto after', lda_new_auto, 'black'),
]

# Draw each series as a line with circle markers; only the line renderers are
# kept because the legend entries refer to them.
line_renderers = []
for _label, bounds, colour in series:
    line_renderers.append(p1.line(iterations, bounds, color=colour))
    p1.circle(iterations, bounds, color=colour)
s1, s2, s3, s4 = line_renderers

legend = Legend(
    items=[(entry[0], [renderer]) for entry, renderer in zip(series, line_renderers)],
    location=(-150.0, -200.0),
)
p1.add_layout(legend, 'right')

# Figure size and toolbar.
p1.plot_height = 400
p1.plot_width = 600
p1.toolbar_location = None

show(p1)

### chunksize=1

In [201]:
# Recorded bound values (ten per series) for the "new" runs in the chunksize=1 section.
lda_new_sym1 = [
    -6.80134796317, -6.75097531201, -6.73886070304, -6.72962711303, -6.72267638585,
    -6.71606341239, -6.70971257012, -6.70394757735, -6.69955095496, -6.69904909566,
]

lda_new_auto1 = [
    -6.79151637522, -6.69692823872, -6.6743960144, -6.66393359524, -6.6590651321,
    -6.65639088794, -6.65541702115, -6.65517477998, -6.65463825675, -6.65438399488,
]

In [202]:
# Recorded bound values (ten per series) for the "old" runs in the chunksize=1 section.
lda_old_sym1 = [
    -6.90661157714, -6.85064402263, -6.83076125336, -6.82225105487, -6.8195549754,
    -6.81626842372, -6.81311867382, -6.80953109107, -6.80673987741, -6.80428929604,
]

lda_old_auto1 = [
    -6.88633178128, -6.81238227988, -6.79358619197, -6.78386386551, -6.77778353483,
    -6.77287024956, -6.76896059942, -6.76567813313, -6.7632620623, -6.76098877617,
]