In [1]:
import random, sys
import numpy as np
from time import time

from gensim.models import atmodel
from gensim.models import AuthorTopicModel


In [2]:
from bokeh.io import output_notebook
from bokeh.plotting import figure, show
from bokeh.models import Legend, Div, PrintfTickFormatter
from bokeh.layouts import column, row

# Set up Bokeh's notebook output so the show() calls in later cells
# display their figures inline (presumably; see the bokeh.io documentation).
output_notebook()

# Tests of the algorithm on artificially generated data

## Test difference between blocking VB and non-blocking VB

In [2]:
# Load the non-blocking variant of the algorithm by executing its source file
# in this notebook's namespace.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on the original author's machine; exec() runs the file's code verbatim.
atfilename = '/home/olavur/Dropbox/my_folder/workstuff/DTU/thesis/code/gensim/gensim/models/temp/blocking_vb_tests/atnonblocking.py'
with open(atfilename) as f:
    source_text = f.read()
code = compile(source_text, atfilename, 'exec')
exec(code)

In [3]:
# Load the blocking variant of the algorithm by executing its source file
# in this notebook's namespace.
# NOTE(review): the path is absolute and machine-specific, so this cell only
# runs on the original author's machine; exec() runs the file's code verbatim.
atfilename = '/home/olavur/Dropbox/my_folder/workstuff/DTU/thesis/code/gensim/gensim/models/temp/blocking_vb_tests/atblocking.py'
with open(atfilename) as f:
    source_text = f.read()
code = compile(source_text, atfilename, 'exec')
exec(code)

In [71]:
# Parameters of the synthetic bag-of-words corpus.
vocab_size = 1000    # Word ids are drawn from range(vocab_size).
num_docs = 100       # Number of documents to generate.
words_per_doc = 10   # Number of distinct word ids sampled for each document.
word_freq = 10       # Mean of the normal distribution the raw counts come from.
word_std = 10.0      # Standard deviation of that distribution.

# Each document is a list of (word_id, count) pairs.
corpus = []
for doc_idx in range(num_docs):
    word_ids = random.sample(range(vocab_size), words_per_doc)
    raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
    # Fold negative draws to positive and round up to whole counts.
    counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
    corpus.append(list(zip(word_ids, counts)))

num_authors = 100
def make_author2doc(docs_per_author=10):
    """Build a random author -> document-ids mapping.

    Every author id in ``range(num_authors)`` gets ``docs_per_author`` distinct
    document ids sampled from ``range(num_docs)``. Both ``num_authors`` and
    ``num_docs`` are read from the notebook's global namespace.
    """
    return {
        author: random.sample(range(num_docs), docs_per_author)
        for author in range(num_authors)
    }

def make_doc2author(authors_per_doc=10):
    """Build a random document -> author-ids mapping.

    Every document id in ``range(num_docs)`` gets ``authors_per_doc`` distinct
    author ids sampled from ``range(num_authors)``. Both ``num_docs`` and
    ``num_authors`` are read from the notebook's global namespace.
    """
    return {
        doc_id: random.sample(range(num_authors), authors_per_doc)
        for doc_id in range(num_docs)
    }

In [72]:
# Draw 5 random authors for every document, then build the author -> documents
# dictionary from it (presumably the inverse mapping; see gensim's
# construct_author2doc).
num_topics = 5
doc2author = make_doc2author(5)
author2doc = atmodel.construct_author2doc(corpus, doc2author)

# Construct both variants with identical arguments and the same random_state
# and let %time report how long each constructor call takes.
# AtNonBlocking / AtBlocking are not defined in this notebook; presumably they
# come from the scripts exec'd in the cells above.
%time nb_model = AtNonBlocking(corpus=corpus, num_topics=num_topics, \
                  author2doc=author2doc, doc2author=doc2author, iterations=5, random_state=0)

%time b_model = AtBlocking(corpus=corpus, num_topics=num_topics, \
                  author2doc=author2doc, doc2author=doc2author, iterations=5, random_state=0)


CPU times: user 6.13 s, sys: 0 ns, total: 6.13 s
Wall time: 6.13 s
CPU times: user 11.6 s, sys: 32 ms, total: 11.6 s
Wall time: 11.6 s


In [58]:
# Mean number of authors per document, as stored on the non-blocking model.
authors_per_doc = list(map(len, nb_model.doc2author.values()))
sum(authors_per_doc) / len(authors_per_doc)

5.0

In [59]:
# Mean number of documents per author, as stored on the non-blocking model.
docs_per_author = list(map(len, nb_model.author2doc.values()))
sum(docs_per_author) / len(docs_per_author)

50.0

In [70]:
# Plot the per-word bound of both models against the iteration index.
iterations = range(nb_model.iterations)
# The first entry of each bound history is skipped.
nb_bound = nb_model.perwordbound[1:]
b_bound = b_model.perwordbound[1:]

p1 = figure(title='', x_axis_label='Iterations', y_axis_label='Per word bound')
p1.plot_height = 400
p1.plot_width = 600
p1.toolbar_location = None

# Red: non-blocking model. Blue: blocking model.
s1 = p1.line(iterations, nb_bound, color='red')
p1.circle(iterations, nb_bound, color='red')
s2 = p1.line(iterations, b_bound, color='blue')
p1.circle(iterations, b_bound, color='blue')

show(p1)

In [73]:
# Mean number of authors per document, as stored on the non-blocking model.
authors_per_doc = list(map(len, nb_model.doc2author.values()))
sum(authors_per_doc) / len(authors_per_doc)

5.0

In [74]:
# Mean number of documents per author, as stored on the non-blocking model.
docs_per_author = list(map(len, nb_model.author2doc.values()))
sum(docs_per_author) / len(docs_per_author)

5.05050505050505

In [75]:
# Plot the per-word bound of both models against the iteration index.
iterations = range(nb_model.iterations)
# The first entry of each bound history is skipped.
nb_bound = nb_model.perwordbound[1:]
b_bound = b_model.perwordbound[1:]

p1 = figure(title='', x_axis_label='Iterations', y_axis_label='Per word bound')
p1.plot_height = 400
p1.plot_width = 600
p1.toolbar_location = None

# Red: non-blocking model. Blue: blocking model.
s1 = p1.line(iterations, nb_bound, color='red')
p1.circle(iterations, nb_bound, color='red')
s2 = p1.line(iterations, b_bound, color='blue')
p1.circle(iterations, b_bound, color='blue')

show(p1)

## Scalability

### W.r.t. number of authors

In [5]:
# Parameters of the synthetic bag-of-words corpus.
vocab_size = 1000    # Word ids are drawn from range(vocab_size).
num_docs = 100       # Number of documents to generate.
words_per_doc = 10   # Number of distinct word ids sampled for each document.
word_freq = 10       # Mean of the normal distribution the raw counts come from.
word_std = 10.0      # Standard deviation of that distribution.

# Each document is a list of (word_id, count) pairs.
corpus = []
for doc_idx in range(num_docs):
    word_ids = random.sample(range(vocab_size), words_per_doc)
    raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
    # Fold negative draws to positive and round up to whole counts.
    counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
    corpus.append(list(zip(word_ids, counts)))
    
def make_author2doc(docs_per_author=10, num_authors=100):
    """Build a random author -> document-ids mapping.

    Every author id in ``range(num_authors)`` gets ``docs_per_author`` distinct
    document ids sampled from ``range(num_docs)``; ``num_docs`` is read from
    the notebook's global namespace.
    """
    assignments = (
        (author, random.sample(range(num_docs), docs_per_author))
        for author in range(num_authors)
    )
    return dict(assignments)

def make_doc2author(authors_per_doc=10, num_authors=100):
    """Build a random document -> author-ids mapping.

    Every document id in ``range(num_docs)`` gets ``authors_per_doc`` distinct
    author ids sampled from ``range(num_authors)``; ``num_docs`` is read from
    the notebook's global namespace.
    """
    assignments = (
        (doc_id, random.sample(range(num_authors), authors_per_doc))
        for doc_id in range(num_docs)
    )
    return dict(assignments)

In [14]:
# Settings for the "number of authors" scalability experiment.
num_topics = 5
# Total author counts that the timing loop in the next cell iterates over.
num_authors_list = [100, 200, 400, 800, 1000]
# Number of authors sampled for each document.
authors_per_doc = 5
# One more than the corpus length; presumably chosen so the model handles the
# whole corpus as a single chunk (TODO confirm against AuthorTopicModel docs).
chunksize = len(corpus) + 1

In [33]:
%%time
# Time the entire process.

train_time, eval_time = [], []
for num_authors in num_authors_list:
    # Fresh random authorship for this author count, plus the matching
    # author -> documents dictionary.
    doc2author = make_doc2author(authors_per_doc, num_authors)
    author2doc = atmodel.construct_author2doc(corpus, doc2author)

    # Training: mean wall-clock time over 10 model constructions.
    total_elapsed = 0.0
    for _ in range(10):
        tic = time()
        model = AuthorTopicModel(
            corpus=corpus, num_topics=num_topics,
            author2doc=author2doc, doc2author=doc2author, chunksize=chunksize,
            iterations=10, passes=10, eval_every=0, random_state=1,
        )
        total_elapsed += time() - tic
    train_time.append(total_elapsed / 10)

    # Evaluation: mean wall-clock time over 10 bound computations on the
    # most recently trained model.
    total_elapsed = 0.0
    for _ in range(10):
        tic = time()
        perwordbound = model.bound(corpus, author2doc=author2doc, doc2author=doc2author)
        total_elapsed += time() - tic
    eval_time.append(total_elapsed / 10)

CPU times: user 1min 12s, sys: 16 ms, total: 1min 12s
Wall time: 1min 12s


In [32]:
# Left panel: mean training time for each total author count.
p1 = figure(title='Train time', x_axis_label='Num authors', y_axis_label='Time (sec)')
p1.plot_height = 400
p1.plot_width = 400
p1.toolbar_location = None
s1 = p1.line(num_authors_list, train_time)
p1.circle(num_authors_list, train_time)

# Right panel: mean bound-evaluation time for each total author count.
p2 = figure(title='Evaluation time', x_axis_label='Num authors', y_axis_label='Time (sec)')
p2.plot_height = 400
p2.plot_width = 400
p2.toolbar_location = None
s2 = p2.line(num_authors_list, eval_time)
p2.circle(num_authors_list, eval_time)

# Show the two panels side by side.
plots = row(p1, p2)
show(plots)

### W.r.t. number of authors PER document

The total number of authors is held constant while the number of authors per document is varied.

In [122]:
# Parameters of the synthetic bag-of-words corpus.
vocab_size = 1000    # Word ids are drawn from range(vocab_size).
num_docs = 100       # Number of documents to generate.
words_per_doc = 10   # Number of distinct word ids sampled for each document.
word_freq = 10       # Mean of the normal distribution the raw counts come from.
word_std = 10.0      # Standard deviation of that distribution.

# Each document is a list of (word_id, count) pairs.
corpus = []
for doc_idx in range(num_docs):
    word_ids = random.sample(range(vocab_size), words_per_doc)
    raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
    # Fold negative draws to positive and round up to whole counts.
    counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
    corpus.append(list(zip(word_ids, counts)))
    
def make_author2doc(docs_per_author=10, num_authors=100):
    """Build a random author -> document-ids mapping.

    One list of ``docs_per_author`` distinct document ids, sampled from
    ``range(num_docs)``, is drawn per author; authors are numbered from 0.
    ``num_docs`` is read from the notebook's global namespace.
    """
    sampled_docs = [
        random.sample(range(num_docs), docs_per_author)
        for _ in range(num_authors)
    ]
    return dict(enumerate(sampled_docs))

def make_doc2author(authors_per_doc=10, num_authors=100):
    """Build a random document -> author-ids mapping.

    One list of ``authors_per_doc`` distinct author ids, sampled from
    ``range(num_authors)``, is drawn per document; documents are numbered
    from 0. ``num_docs`` is read from the notebook's global namespace.
    """
    sampled_authors = [
        random.sample(range(num_authors), authors_per_doc)
        for _ in range(num_docs)
    ]
    return dict(enumerate(sampled_authors))

In [125]:
# Settings for the "authors per document" scalability experiment.
num_topics = 5
# Total number of authors; fixed for this experiment.
num_authors = 1000
# Authors-per-document values to iterate over: 1, 5, 25, 125.
authors_per_doc_list = [5**i for i in range(4)]
# One more than the corpus length; presumably chosen so the model handles the
# whole corpus as a single chunk (TODO confirm against AuthorTopicModel docs).
chunksize = len(corpus) + 1

In [126]:
%%time
# Time the entire process.

train_time, eval_time = [], []
for authors_per_doc in authors_per_doc_list:
    # Fresh random authorship with this many authors per document, plus the
    # matching author -> documents dictionary.
    doc2author = make_doc2author(authors_per_doc, num_authors)
    author2doc = atmodel.construct_author2doc(corpus, doc2author)

    # Training: wall-clock time of a single model construction.
    total_elapsed = 0.0
    for _ in range(1):
        tic = time()
        model = AuthorTopicModel(
            corpus=corpus, num_topics=num_topics,
            author2doc=author2doc, doc2author=doc2author, chunksize=chunksize,
            iterations=10, passes=10, eval_every=0, random_state=1,
        )
        total_elapsed += time() - tic
    train_time.append(total_elapsed / 1)

    # Evaluation: wall-clock time of a single bound computation.
    total_elapsed = 0.0
    for _ in range(1):
        tic = time()
        perwordbound = model.bound(corpus, author2doc=author2doc, doc2author=doc2author)
        total_elapsed += time() - tic
    eval_time.append(total_elapsed / 1)

96
383
924
1000
CPU times: user 12.1 s, sys: 36 ms, total: 12.1 s
Wall time: 12.1 s


In [127]:
# Both panels use the number of authors PER DOCUMENT (authors_per_doc_list) on
# the x axis, so the axis label says so rather than 'Num authors'.

# Left panel: training time.
p1 = figure(title='Train time', x_axis_label='Num authors per document', y_axis_label='Time (sec)')
s1 = p1.line(authors_per_doc_list, train_time)
p1.circle(authors_per_doc_list, train_time)
p1.plot_height = 400
p1.plot_width = 400
p1.toolbar_location = None

# Right panel: bound-evaluation time.
p2 = figure(title='Evaluation time', x_axis_label='Num authors per document', y_axis_label='Time (sec)')
s2 = p2.line(authors_per_doc_list, eval_time)
p2.circle(authors_per_doc_list, eval_time)
p2.plot_height = 400
p2.plot_width = 400
p2.toolbar_location = None

plots = row(p1, p2)

# Heading above the two side-by-side panels (spelling of "Scalability" fixed).
show(column(Div(text='<h3>Scalability w.r.t. number of authors per document</h3>'), plots))

### W.r.t. number of documents

In [208]:
%%time

# Settings for the "number of documents" scalability experiment.
num_authors = 1000
authors_per_doc = 5
num_topics = 5
vocab_size = 1000
words_per_doc = 10  # Number of distinct word ids sampled for each document.
word_freq = 10  # Mean of the normal distribution the raw counts are drawn from.
word_std = 10.0  # Standard deviation of that distribution.
# Corpus sizes to time; the last entry is also the size of the full generated corpus.
num_docs_list = [100, 1000, 10000, 100000]

def make_doc2author(authors_per_doc=10, num_authors=100):
    """Build a random document -> author-ids mapping.

    Every document id in ``range(num_docs)`` gets ``authors_per_doc`` distinct
    author ids sampled from ``range(num_authors)``. ``num_docs`` is read from
    the notebook's global namespace at call time.
    """
    return {
        doc_id: random.sample(range(num_authors), authors_per_doc)
        for doc_id in range(num_docs)
    }


# Generate the full corpus once, sized to the largest entry of num_docs_list.
# Each document is a list of (word_id, count) pairs.
corpus_big = []
for doc_idx in range(num_docs_list[-1]):
    word_ids = random.sample(range(vocab_size), words_per_doc)
    raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
    # Fold negative draws to positive and round up to whole counts.
    counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
    corpus_big.append(list(zip(word_ids, counts)))

# Time training and bound evaluation for each corpus size in num_docs_list.
train_time = []
eval_time = []
memory_footprint = []
for num_docs in num_docs_list:
    # Run tests.

    # Draw the sub-corpus FIRST, so the dictionaries below are built for the
    # corpus that is actually trained on in this iteration (previously
    # construct_author2doc received the corpus left over from the previous
    # iteration or cell).
    corpus = random.sample(corpus_big, num_docs)
    # NOTE: sys.getsizeof is shallow; it measures only the outer list object,
    # not the documents it references, so this is a proxy for corpus size.
    memory_footprint.append(sys.getsizeof(corpus))

    # Construct dictionaries. make_doc2author reads the loop variable
    # num_docs from the global namespace.
    doc2author = make_doc2author(authors_per_doc, num_authors)
    author2doc = atmodel.construct_author2doc(corpus, doc2author)

    chunksize = len(corpus) + 1

    # Get training time (single run).
    avg_elapsed = 0.0
    for _ in range(1):
        start = time()
        model = AuthorTopicModel(corpus=corpus, num_topics=num_topics,
                        author2doc=author2doc, doc2author=doc2author, chunksize=chunksize,
                        iterations=10, passes=10, eval_every=0, random_state=1)
        avg_elapsed += time() - start
    avg_elapsed /= 1
    train_time.append(avg_elapsed)

    # Get evaluation time (single run).
    avg_elapsed = 0.0
    for _ in range(1):
        start = time()
        perwordbound = model.bound(corpus, author2doc=author2doc, doc2author=doc2author)
        avg_elapsed += time() - start
    avg_elapsed /= 1
    eval_time.append(avg_elapsed)

CPU times: user 34min 38s, sys: 4.3 s, total: 34min 42s
Wall time: 34min 48s


In [207]:
# The x values are the memory_footprint entries recorded with sys.getsizeof
# in the timing cell; both panels use a logarithmic x axis.

# Left panel: training time.
p1 = figure(title='Train time', x_axis_label='Size of corpus (bytes)', y_axis_label='Time (sec)',
            x_axis_type='log', x_range=(10**2, 10**6))
s1 = p1.line(memory_footprint, train_time)
p1.circle(memory_footprint, train_time)
p1.plot_height = 400
p1.plot_width = 400
p1.toolbar_location = None

# Right panel: bound-evaluation time.
p2 = figure(title='Evaluation time', x_axis_label='Size of corpus (bytes)', y_axis_label='Time (sec)',
            x_axis_type='log', x_range=(10**2, 10**6))
s2 = p2.line(memory_footprint, eval_time)
p2.circle(memory_footprint, eval_time)
p2.plot_height = 400
p2.plot_width = 400
p2.toolbar_location = None

plots = row(p1, p2)

# Heading above the two side-by-side panels (spelling of "Scalability" fixed).
show(column(Div(text='<h3>Scalability w.r.t. number of documents</h3>'), plots))

In [None]:
# Show the corpus sizes next to the recorded list sizes, one list per line.
print(num_docs_list, memory_footprint, sep='\n')

### W.r.t. size of vocab

In [53]:
%%time

# Settings for the "vocabulary size" scalability experiment.
num_authors = 1000
authors_per_doc = 5
num_topics = 5
# Vocabulary sizes to time; a new corpus is generated for each one.
vocab_size_list = [10, 100, 1000, 10000, 100000]
words_per_doc = 10  # Number of distinct word ids sampled for each document.
word_freq = 10  # Mean of the normal distribution the raw counts are drawn from.
word_std = 10.0  # Standard deviation of that distribution.
num_docs = 100

def make_doc2author(authors_per_doc=10, num_authors=100):
    """Build a random document -> author-ids mapping.

    Every document id in ``range(num_docs)`` gets ``authors_per_doc`` distinct
    author ids sampled from ``range(num_authors)``; ``num_docs`` is read from
    the notebook's global namespace.
    """
    assignments = (
        (doc_id, random.sample(range(num_authors), authors_per_doc))
        for doc_id in range(num_docs)
    )
    return dict(assignments)


train_time, eval_time = [], []
for vocab_size in vocab_size_list:
    # Generate a fresh corpus of (word_id, count) documents for this
    # vocabulary size.
    corpus = []
    for doc_idx in range(num_docs):
        word_ids = random.sample(range(vocab_size), words_per_doc)
        raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
        # Fold negative draws to positive and round up to whole counts.
        counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
        corpus.append(list(zip(word_ids, counts)))

    # Random authorship and the matching author -> documents dictionary.
    doc2author = make_doc2author(authors_per_doc, num_authors)
    author2doc = atmodel.construct_author2doc(corpus, doc2author)

    chunksize = len(corpus) + 1

    # Training: wall-clock time of a single model construction.
    total_elapsed = 0.0
    for _ in range(1):
        tic = time()
        model = AuthorTopicModel(
            corpus=corpus, num_topics=num_topics,
            author2doc=author2doc, doc2author=doc2author, chunksize=chunksize,
            iterations=10, passes=10, eval_every=0, random_state=1,
        )
        total_elapsed += time() - tic
    train_time.append(total_elapsed / 1)

    # Evaluation: mean wall-clock time over 10 bound computations.
    total_elapsed = 0.0
    for _ in range(10):
        tic = time()
        perwordbound = model.bound(corpus, author2doc=author2doc, doc2author=doc2author)
        total_elapsed += time() - tic
    eval_time.append(total_elapsed / 10)

CPU times: user 20.8 s, sys: 64 ms, total: 20.9 s
Wall time: 20.9 s


In [56]:
# Both panels are log-log plots of time against vocabulary size.

# Left panel: training time.
p1 = figure(title='Train time', x_axis_label='Size of vocab', y_axis_label='Time (sec)',
            x_axis_type='log', x_range=(1e0, 1e6), y_axis_type='log')
s1 = p1.line(vocab_size_list, train_time)
p1.circle(vocab_size_list, train_time)
p1.plot_height = 400
p1.plot_width = 400
p1.toolbar_location = None

# Right panel: bound-evaluation time. The line handle is bound to s2 (it was
# previously assigned to s1, overwriting the left panel's handle).
p2 = figure(title='Evaluation time', x_axis_label='Size of vocab', y_axis_label='Time (sec)',
            x_axis_type='log', x_range=(1e0, 1e6), y_axis_type='log')
s2 = p2.line(vocab_size_list, eval_time)
p2.circle(vocab_size_list, eval_time)
p2.plot_height = 400
p2.plot_width = 400
p2.toolbar_location = None

plots = row(p1, p2)

# Heading above the two side-by-side panels (spelling of "Scalability" fixed).
show(column(Div(text='<h3>Scalability w.r.t. size of vocabulary</h3>'), plots))

### W.r.t. number of topics

In [62]:
%%time

# Settings for the "number of topics" scalability experiment.
num_authors = 1000
authors_per_doc = 5
# Topic counts to time.
num_topics_list = [5, 25, 125, 625]
vocab_size = 1000
words_per_doc = 10  # Number of distinct word ids sampled for each document.
word_freq = 10  # Mean of the normal distribution the raw counts are drawn from.
word_std = 10.0  # Standard deviation of that distribution.
num_docs = 100

def make_doc2author(authors_per_doc=10, num_authors=100):
    """Build a random document -> author-ids mapping.

    One list of ``authors_per_doc`` distinct author ids, sampled from
    ``range(num_authors)``, is drawn per document; documents are numbered
    from 0. ``num_docs`` is read from the notebook's global namespace.
    """
    sampled_authors = [
        random.sample(range(num_authors), authors_per_doc)
        for _ in range(num_docs)
    ]
    return dict(enumerate(sampled_authors))

# Generate the corpus: each document is a list of (word_id, count) pairs.
corpus = []
for doc_idx in range(num_docs):
    word_ids = random.sample(range(vocab_size), words_per_doc)
    raw_counts = np.random.normal(word_freq, word_std, len(word_ids))
    # Fold negative draws to positive and round up to whole counts.
    counts = np.ceil(np.abs(raw_counts)).astype(int).tolist()
    corpus.append(list(zip(word_ids, counts)))

# Random authorship and the matching author -> documents dictionary; both are
# shared by every run in the timing loop that follows.
doc2author = make_doc2author(authors_per_doc, num_authors)
author2doc = atmodel.construct_author2doc(corpus, doc2author)

chunksize = len(corpus) + 1

train_time = []
for num_topics in num_topics_list:
    # Training: wall-clock time of a single model construction with this
    # number of topics.
    total_elapsed = 0.0
    for _ in range(1):
        tic = time()
        model = AuthorTopicModel(
            corpus=corpus, num_topics=num_topics,
            author2doc=author2doc, doc2author=doc2author, chunksize=chunksize,
            iterations=10, passes=10, eval_every=0, random_state=1,
        )
        total_elapsed += time() - tic
    train_time.append(total_elapsed / 1)


CPU times: user 25.7 s, sys: 56 ms, total: 25.8 s
Wall time: 25.8 s


In [66]:
# Log-log plot of training time against the number of topics.
p1 = figure(title='Train time', x_axis_label='Number of topics', y_axis_label='Time (sec)',
            x_axis_type='log', x_range=(1e0, 2e3), y_axis_type='log')
s1 = p1.line(num_topics_list, train_time)
p1.circle(num_topics_list, train_time)
p1.plot_height = 400
p1.plot_width = 400
p1.toolbar_location = None


# Heading above the plot (spelling of "Scalability" fixed).
show(column(Div(text='<h3>Scalability w.r.t. number of topics</h3>'), p1))