In [48]:
import xml.etree.ElementTree as ET
from imp import reload
from pprint import pprint
import os, shutil, re, random, logging, pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities import MatrixSimilarity
from gensim.matutils import sparse2full, hellinger
from gensim.models import Phrases, LdaModel
from gensim.models import AuthorTopicModel
from gensim.models import atmodel
from gensim.models import ldamodel

import spacy

In [2]:
# Configure logging.

log_dir = '../../../log_files/log.log'  # On my own machine.
#log_dir = '../../../../log_files/log.log'  # On Hetzner

logger = logging.getLogger()
fhandler = logging.FileHandler(filename=log_dir, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

## Load and pre-process data

In [3]:
nlp = spacy.load('en')

In [17]:
data_folder = '../../../../data/stackexchange/cooking/'
input_fname = data_folder + 'Posts.xml'
output_fname = '/tmp/cooking_docs.txt'
tree = ET.parse(input_fname)
root = tree.getroot()
num_docs = 1000

In [18]:
post_ids = []
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    post_ids.append(int(item.get('Id')))

print('Number of posts in dataset:', len(post_ids))

Number of posts in dataset: 54566


In [19]:
def doc_generator(root, num_docs=None):
    '''
    This generator parses the XML data, do some preliminary
    pre-processing and yields the documents.
    
    '''
    num_posts = 0
    for post_id in post_ids:
        post_text = ''
        for i, item in enumerate(root.iter()):
            if i == 0:
                # This is the <posts> XML element.
                continue
            elif int(item.get('Id')) == post_id:
                # This is the post.
                post_text += item.get('Body')
            elif item.get('ParentId') is not None and int(item.get('ParentId')) == post_id:
                # This is an answer to the post.
                post_text += item.get('Body')
            else:
                # Neither post "post_id" or answer to it.
                continue

            # Remove any HTML tags, such as <p>.
            post_text = re.sub('<[^<]+?>', '', post_text)

            # Replace any whitespace (newline, tabs, etc.) by a single space.
            post_text = re.sub('\s', ' ', post_text)

        if num_docs is not None and num_posts >= num_docs:
            break
            
        num_posts += 1
        
        yield post_text


In [20]:
# Use the default SpaCy NLP pipeline to process the documents in parallel.
# Then use the output of the pipeline to transform the text.
# Write the resulting text to a file.
entity_freq = {}
postid = 0
with open(output_fname, 'w') as fid:
    for doc in nlp.pipe(doc_generator(root, num_docs=num_docs), n_threads=4):
        # Process post text.
        
        # NOTE: the doc_generator is probably the bottleneck here.
        
        ents = doc.ents  # Named entities.

        # Keep only words (no numbers, no punctuation).
        # Lemmatize tokens, remove punctuation and remove stopwords.
        #doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        
        # Remove stopwords and punctuation, and lemmatized tokens.
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        #tokens = [str(token) for token in doc if not token.is_stop and not token.is_punct]
        #tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
        
        # Add named entities, but only if they are a compound of more than word.
        #doc.extend([str(entity) for entity in ents if len(entity) > 1])
        
        #for entity in ents:
        #    if entity_freq.get(entity):
        #        entity_freq[entity] += 1
        #    else:
        #        entity_freq[entity] = 1
        
        # Write the doc to file.
        fid.write(' '.join(tokens) + '\n')

In [21]:
# Get the tags of each post.
postid = 0
postid2tagname = dict()
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    if item.get('Tags') is None:
        # There are many posts with no tags.
        continue
    
    if num_docs is not None and postid >= num_docs:
        break

    tags = item.get('Tags')
    tags = re.findall('<(.+?)>', tags)
    # NOTE: consider using a tag that is common for all posts, and/or
    # a tag that is only for this particular post. 
    # NOTE: also consider including posts with no tags, and tag them with
    # post ID or "SUPER_TAG", maybe both, maybe an extra "NO_TAG" tag.
    #tags.append('SUPER_TAG')
    tags.append('POST_ID' + str(postid))
    postid2tagname[postid] = tags
    for tag in tags:
        tag_set.add(tag)

    postid += 1

**TODO:** these names aren't great, "doc_generator" and "docs_generator".

In [22]:
def docs_generator(fname):
    '''
    This generator reads the processed text one line
    at a time and yields documents (lists of words).
    
    '''
    with open(fname, 'r') as fid:
        for line in fid:
            line = line.strip()  # Remove newline ("\n").
            doc = line.split(' ')  # Split line text into words.
            yield doc

In [23]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
docs = docs_generator(output_fname)
bigram = Phrases(docs, min_count=20)
docs = docs_generator(output_fname)
with open(output_fname + '.tmp', 'w') as fid:
    for doc in docs:
        for token in bigram[doc]:
            if '_' in token:
                doc.append(token)
        fid.write(' '.join(doc) + '\n')


shutil.copyfile(output_fname + '.tmp', output_fname)
os.remove(output_fname + '.tmp')



In [24]:
# Vectorize data.

# Create a dictionary representation of the documents.
docs = docs_generator(output_fname)
dictionary = Dictionary(docs)

# Filter out words that occur too frequently or too rarely.
# Disregarding stop words, this dataset has a very high number of low frequency words.
max_freq = 0.5
min_count = 5
dictionary.filter_extremes(no_below=min_count, no_above=max_freq)

dict0 = dictionary[0]  # This sort of "initializes" dictionary.id2token.

# Bag-of-words representation of the documents.
docs = docs_generator(output_fname)
corpus = [dictionary.doc2bow(doc) for doc in docs]

num_docs = len(corpus)  # In case num_docs was set to None.

# Serialize the corpus.
#MmCorpus.serialize('/tmp/corpus.mm', corpus)
#corpus = MmCorpus('/tmp/corpus.mm')

In [25]:
tagname2postid = atmodel.construct_author2doc(corpus, postid2tagname)

In [26]:
# FIXME: how to do this with MmCorpus.

train_corpus = corpus[100:]
test_corpus = corpus[:100]
train_postid2tagname = {i: postid2tagname[j] for i, j in enumerate(range(100, num_docs))}
test_postid2tagname = {i: postid2tagname[j] for i, j in enumerate(range(100))}

train_tagname2postid = atmodel.construct_author2doc(train_corpus, train_postid2tagname)
test_tagname2postid = atmodel.construct_author2doc(test_corpus, test_postid2tagname)

train_tag_set = set()
for d, tags in train_postid2tagname.items():
    for tag in tags:
        train_tag_set.add(tag)

docs = docs_generator(output_fname)
test_docs = []
for d, doc in enumerate(docs):
    if d > 100:
        break
    test_docs.append(doc)

In [27]:
def display_doc(doc):
    for token in doc:
        if token in dictionary.values():
            print(token)

In [28]:
#docs = list(docs_generator(output_fname))
#display_doc(docs[0])

## Train model

In [29]:
print('Train data dimensionality:')
print('Number of authors: %d (%d in total)' % (len(train_tag_set), len(tag_set)))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(train_corpus))

Train data dimensionality:
Number of authors: 1344 (1462 in total)
Number of unique tokens: 2220
Number of documents: 900


In [41]:
A = [1,2,3]
B = {1:1, 2:2}

In [42]:
new = False
for a in A:
    if not B.get(a):
        new = True
print(new)

True


In [43]:
len(set(A).intersection(B)) < len(A)

True

In [194]:
#with open('model.pickle', 'wb') as fid:
#    pickle.dump(model, fid)

In [195]:
# list(dictionary.values())

In [59]:
reload(atmodel)
AuthorTopicModel = atmodel.AuthorTopicModel

In [60]:
num_topics = 20
%time model = AuthorTopicModel(corpus=train_corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                author2doc=None, doc2author=train_postid2tagname, var_lambda=None,  \
                chunksize=1000, passes=10, update_every=1, \
                alpha='symmetric', eta='symmetric', decay=0.5, offset=1.0, \
                eval_every=1, iterations=1, gamma_threshold=1e-10, \
                minimum_probability=0.01, random_state=0, ns_conf={},\
                serialized=False, serialization_path='/tmp/model_serializer.mm')

-9.02640308098
-7.46027696685
-7.32860734589
-7.19242777313
-7.0608323278
-6.94086803102
-6.83188425744
-6.73251887124
-6.64267204902
-6.56241965265
CPU times: user 18.4 s, sys: 10.8 s, total: 29.2 s
Wall time: 17.1 s


In [234]:
# Compute the per-word bound.
# Number of words in corpus.
corpus_words = sum(cnt for document in train_corpus for _, cnt in document)

# Compute bound and divide by number of words.
perwordbound = model.bound(train_corpus, author2doc=train_tagname2postid, \
                           doc2author=train_postid2tagname) / corpus_words
print(perwordbound)

-13.4469052387


In [670]:
os.remove('/tmp/model_serializer.mm')

In [204]:
model.show_topics(num_topics=10)

[(8,
  '0.046*"chicken" + 0.038*"skin" + 0.034*"fish" + 0.032*"breast" + 0.023*"hour" + 0.014*"juice" + 0.012*"freeze" + 0.012*"rub" + 0.012*"eat" + 0.012*"sear"'),
 (13,
  '0.019*"crack" + 0.018*"paper" + 0.017*"container" + 0.017*"smell" + 0.016*"powder" + 0.016*"towel" + 0.015*"bacon" + 0.015*"fruit" + 0.014*"coconut" + 0.013*"line"'),
 (10,
  '0.117*"garlic" + 0.047*"clove" + 0.036*"pizza" + 0.021*"slice" + 0.021*"peel" + 0.021*"crush" + 0.019*"skin" + 0.015*"knife" + 0.014*"break" + 0.014*"press"'),
 (6,
  '0.061*"iron" + 0.055*"cast" + 0.055*"cast_iron" + 0.025*"cheese" + 0.017*"seasoning" + 0.017*"chocolate" + 0.016*"melt" + 0.016*"fry" + 0.016*"pot" + 0.015*"apart"'),
 (2,
  '0.042*"cake" + 0.037*"bake" + 0.030*"measure" + 0.029*"baking" + 0.027*"pop" + 0.026*"layer" + 0.024*"double" + 0.021*"test" + 0.021*"center" + 0.020*"bubble"'),
 (4,
  '0.147*"knife" + 0.044*"steel" + 0.032*"blade" + 0.032*"cheese" + 0.030*"grate" + 0.028*"sharpen" + 0.020*"edge" + 0.018*"french" + 0.018*

In [205]:
for i in range(5):
    tag = random.choice(list(train_tagname2postid.keys()))
    while tag[:7] == 'POST_ID':
        tag = random.choice(list(train_tagname2postid.keys()))
    print('\n%s' % tag)
    print('#Docs:', len(model.author2doc[tag]))
    pprint(model.get_author_topics(tag))


kitchen
#Docs: 3
[(15, 0.989037136854092)]

seafood
#Docs: 2
[(12, 0.9561081217189048)]

vegan
#Docs: 4
[(8, 0.93230970897091492)]

pan
#Docs: 2
[(17, 0.97213802935976956)]

tartare
#Docs: 1
[(0, 0.049999997552185323),
 (1, 0.049999997498422474),
 (2, 0.05000000872233331),
 (3, 0.050000001886338089),
 (4, 0.04999999499943094),
 (5, 0.049999997405850212),
 (6, 0.049999995267154818),
 (7, 0.049999997092325013),
 (8, 0.050000004100872561),
 (9, 0.049999997159923648),
 (10, 0.050000007295847075),
 (11, 0.050000002003677026),
 (12, 0.050000000746224099),
 (13, 0.049999997126121382),
 (14, 0.050000002752668145),
 (15, 0.050000004326004793),
 (16, 0.04999999896722946),
 (17, 0.050000000602347314),
 (18, 0.049999994878697052),
 (19, 0.049999999616347358)]


In [206]:
model.show_topic(2)

[('cake', 0.041798281064494305),
 ('bake', 0.036799056345563914),
 ('measure', 0.029646700246982956),
 ('baking', 0.029190879027789197),
 ('pop', 0.026582234678151219),
 ('layer', 0.026354052842399108),
 ('double', 0.024050735528188368),
 ('test', 0.020978781252733003),
 ('center', 0.020774919563634326),
 ('bubble', 0.019779515912893848)]

In [47]:
tag = 'baking'
print('%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'eggs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'pasta'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'herbs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'beef'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'salmon'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))


baking
#Docs: 74
[(14, 0.99944123182339661)]

eggs
#Docs: 38
[(15, 0.78797064392893468), (19, 0.21131357754063784)]

pasta
#Docs: 19
[(14, 0.99793584780062705)]

herbs
#Docs: 13
[(4, 0.99214435647682309)]

beef
#Docs: 15
[(3, 0.9980177727998798)]

salmon
#Docs: 6
[(11, 0.75261915362288345), (19, 0.23653737575723341)]


## Similarity queries

Discrete Hellinger distance:

$$
H(p, q) = \frac{1}{\sqrt{2}} \sqrt{\sum_{i=1}^K (\sqrt{p_i} - \sqrt{q_i})^2}
$$

where $p$ and $q$ are both topic distributions for two different tags. We define the similarity as
$$
S(p, q) = \frac{1}{1 + H(p, q)}
$$

In [89]:
def similarity(vec1, vec2):
    dist = hellinger(sparse2full(vec1, num_topics), sparse2full(vec2, num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec, tag_vecs):
    sims = [similarity(vec, vec2) for vec2 in tag_vecs]
    return sims

In [90]:
tag_vecs = [model.get_author_topics(tag, minimum_probability=0.0) for tag in train_tag_set]

In [91]:
id2tag = dict(zip(range(len(train_tag_set)), list(train_tag_set)))

In [99]:
tag = random.choice(list(train_tagname2postid.keys()))
while tag[:7] == 'POST_ID':
    tag = random.choice(list(train_tagname2postid.keys()))
sims = get_sims(model.get_author_topics(tag, minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
158,ingredient-selection,1.0
22,nutrient-composition,0.753888
236,smoking,0.748792
218,celery,0.743314
240,sauteing,0.72202
39,carrots,0.701454
279,stir-fry,0.698036
429,cookware,0.693311
172,asian-cuisine,0.683168
391,smell,0.66368


In [92]:
sims = get_sims(model.get_author_topics('beef', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
274,beef,1.0
189,brownies,0.983048
70,apples,0.947878
290,professional,0.944643
401,meatballs,0.925438
316,candy,0.912623
185,ribs,0.906636
26,gelling-agents,0.905063
140,texture,0.90473
406,liver,0.900559


In [93]:
sims = get_sims(model.get_author_topics('baking', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
212,baking,1.0
76,meat,0.992634
412,steak,0.991649
437,sugar,0.987753
418,oil,0.984446
331,pasta,0.980935
18,italian-cuisine,0.979905
148,slow-cooking,0.974038
74,tomatoes,0.970183
136,food-preservation,0.969655


In [94]:
sims = get_sims(model.get_author_topics('salmon', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
367,salmon,1.0
417,yeast,0.980547
200,sous-vide,0.966504
286,vegetarian,0.965828
428,cleaning,0.951663
284,chocolate,0.949653
98,food-science,0.945865
294,tzatziki,0.945838
420,flavor,0.945496
12,bread,0.94157


## Predicting the tag of a new document

In [696]:
lda = LdaModel(corpus=None, num_topics=num_topics, id2word=dictionary.id2token)
lda.state.sstats = model.state.sstats
lda.iterations = 100  # Make sure training converges on document when calling lda[doc].

In [701]:
postid = 1
doc = test_corpus[postid]
print('Post tags:\n', test_postid2tagname[postid])
print('Post body:\n', test_docs[postid])

for tag in test_postid2tagname[postid]:
    if tag not in train_tag_set:
        print('Tag "', tag, '" not in training data.')

Post tags:
 ['oven', 'cooking-time', 'bacon']
Post body:
 ["'ve", 'heard', 'people', 'cooking', 'bacon', 'oven', 'laying', 'strips', 'cookie', 'sheet', 'method', 'long', 'cook', 'bacon', 'temperature', 'place', 'bacon', 'cold', 'oven', 'turn', 'oven', '400F.', '', '', 'takes', '15', '20', 'minutes', 'slightly', 'crisp', 'bacon', '', '', '', "'ve", 'cooked', 'aluminum', 'foil', '350°F', '~175°C', '20', 'minutes', 'Flipping', 'half', 'way', 'point', 'prefer', 'crispier', '25', 'minutes', 'cookie', 'sheet', 'Use', 'high', 'temp', '375F+', '10', '20', 'minutes', 'depending', 'desired', 'crispness', '', '', 'easier', 'cleaning', 'cookie', 'sheet', 'line', 'aluminum', 'foil', '', '', 'let', 'grease', 'drain', 'corrugate', 'foil', 'far', 'prefer', 'remember', 'tear', 'sheet', 'roll', "'ll", 'need', '~2x', 'foil', 'area', 'use', 'tinfoil', 'non', 'stick', 'kind', 'works', 'old', 'baking', 'sheet', 'added', 'boost', 'sprinkle', 'dark', 'brown', 'sugar', 'coursely', 'ground', 'pepper', 'Cook', '

In [702]:
sims = get_sims(lda.get_document_topics(doc, minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
257,juice,0.65425
246,baking-soda,0.646869
352,soymilk,0.645439
27,tenderizing,0.641415
239,quinoa,0.64086
309,stainless-steel,0.639342
381,jicama,0.638536
191,reheating,0.636994
328,cherries,0.63562
266,pan,0.628226


In [911]:
pred_threshold = 0.6
pred = sims_df.loc[sims_df.Score > pred_threshold]
pred_tags = list(pred.Tag)
pred_prob = list(pred.Score)
pred

Unnamed: 0,Tag,Score
75,catering,0.681661
289,melting-chocolate,0.832565


In [912]:
tp = 0
fp = 0
fn = 0
for tag in pred_tags:
    if tag in test_postid2tagname[postid]:
        tp += 1
    else:
        fp += 1

for tag in test_postid2tagname[postid]:
    if tag not in pred_tags:
        fn += 1
        
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if precision + recall == 0:
    f1_score = 0.0
else:
    f1_score = 2 * precision * recall / (precision + recall)

print('F1 score: ', f1_score)

F1 score:  0.0


## Cosine similarity

In [47]:
# Generate a similarity object for the transformed corpus.
index = MatrixSimilarity(model[list(train_tag_set)])

In [48]:
# Get similarities to some tag.
tag_name = 'baking'
sims = index[model[tag_name]]

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
0,baking,1.0
88,aging,0.994557
24,crepe,0.987024
96,ganache,0.983886
386,sauce,0.980548
38,puree,0.978956
191,roast-beef,0.975619
243,roasting,0.971207
233,taffy,0.960213
418,rye,0.959389


## LDA

In [483]:
%time lda = LdaModel(corpus, num_topics=10, id2word=dictionary.id2token, iterations=1, \
                     passes=100, eval_every=0, chunksize=1000)

CPU times: user 41.6 s, sys: 0 ns, total: 41.6 s
Wall time: 41.6 s


In [484]:
lda.show_topics()

[(0,
  '0.032*"sauce" + 0.031*"like" + 0.026*"taste" + 0.021*"eat" + 0.017*"rice" + 0.014*"dish" + 0.014*"ingredient" + 0.013*"love" + 0.012*"look" + 0.012*"tip"'),
 (1,
  '0.042*"egg" + 0.027*"milk" + 0.025*"difference" + 0.020*"use" + 0.017*"flavor" + 0.014*"day" + 0.014*"white" + 0.014*"question" + 0.013*"bit" + 0.012*"wine"'),
 (2,
  '0.036*"meat" + 0.025*"chicken" + 0.023*"cook" + 0.020*"pan" + 0.019*"cut" + 0.015*"like" + 0.015*"use" + 0.014*"hear" + 0.013*"food" + 0.012*"stick"'),
 (3,
  '0.064*"recipe" + 0.032*"use" + 0.022*"bread" + 0.020*"substitute" + 0.019*"bake" + 0.017*"baking" + 0.016*"cake" + 0.014*"chocolate" + 0.012*"problem" + 0.011*"dough"'),
 (4,
  '0.036*"way" + 0.029*"good" + 0.023*"temperature" + 0.021*"cheese" + 0.019*"time" + 0.018*"cook" + 0.016*"long" + 0.014*"beef" + 0.014*"vegetable" + 0.013*"use"'),
 (5,
  '0.028*"oil" + 0.022*"oven" + 0.020*"use" + 0.017*"knife" + 0.016*"good" + 0.013*"prepare" + 0.013*"wonder" + 0.011*"skin" + 0.011*"cook" + 0.011*"plac

## List docs

In [381]:
root = ET.parse(input_fname)
postid2tagname = dict()
postid = 0
posts = []
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        continue
    if item.get('Tags') is not None:
        tags = item.get('Tags')
        tags = re.findall('<(.+?)>', tags)
        # NOTE: consider using a tag that is common for all posts, or
        # a tag that is only for this particular post.
        #tags.append('SUPER_TAG')
        #tags.append('POST_ID' + str(postid))
        postid2tagname[postid] = tags
        posts.append(item.get('Body'))
        for tag in tags:
            tag_set.add(tag)
        postid += 1

In [489]:
num_posts = len(posts)
docs = []
for post in posts[:]:
    # Remove any HTML tags, such as <p>.
    text = re.sub('<[^<]+?>', '', post)
    doc = nlp(text)
    ents = doc.ents  # Named entities.
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens.
    doc = [token.lemma_ for token in doc if token.is_alpha]
    # Remove common words from a stopword list.
    doc = [token for token in doc if token not in STOPWORDS]
    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    docs.append(doc)

In [490]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [492]:
# Vectorize data.

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur too frequently or too rarely.
#max_freq = 0.5
#min_wordcount = 20
#dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

dict0 = dictionary[0]  # This sort of "initializes" dictionary.id2token.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Serialize the corpus.
#MmCorpus.serialize('/tmp/corpus.mm', corpus)
#corpus = MmCorpus('/tmp/corpus.mm')

## Blocking VB tests

In [86]:
print('Train data dimensionality:')
print('Number of authors: %d' % len(tagname2postid))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(corpus))

Train data dimensionality:
Number of authors: 237
Number of unique tokens: 434
Number of documents: 100


In [87]:
atfilename = '/home/olavur/Dropbox/my_folder/workstuff/DTU/thesis/code/gensim/gensim/models/temp/blocking_vb_tests/atnonblocking.py'
with open(atfilename) as f:
    code = compile(f.read(), atfilename, 'exec')
    exec(code)

In [88]:
model = AtNonBlocking(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                      author2doc=tagname2postid, doc2author=postid2tagname, threshold=1e-10, \
                      iterations=10, alpha='symmetric', eta='symmetric', minimum_probability=0.01, \
                      eval_every=1, random_state=0)

In [89]:
atfilename = '/home/olavur/Dropbox/my_folder/workstuff/DTU/thesis/code/gensim/gensim/models/temp/blocking_vb_tests/atblocking.py'
with open(atfilename) as f:
    code = compile(f.read(), atfilename, 'exec')
    exec(code)

In [90]:
model = AtBlocking(corpus=corpus, num_topics=10, id2word=dictionary.id2token, \
                      author2doc=tagname2postid, doc2author=postid2tagname, threshold=1e-10, \
                      iterations=10, alpha='symmetric', eta='symmetric', minimum_probability=0.01, \
                      eval_every=1, random_state=0)

In [110]:
tags_per_doc = [len(docs) for docs in postid2tagname.values()]
sum(tags_per_doc) / len(tags_per_doc)

3.91

    2016-12-30 16:08:40,106 - gensim.models.atmodel - WARNING - no author id mapping provided; initializing from corpus, assuming identity
    2016-12-30 16:08:40,107 - gensim.models.atmodel - INFO - Starting inference. Training on 100 documents.
    2016-12-30 16:08:40,586 - gensim.models.atmodel - INFO - perwordbound: -8.431e+00.
    2016-12-30 16:08:44,144 - gensim.models.atmodel - INFO - perwordbound: -7.543e+00.
    2016-12-30 16:08:47,576 - gensim.models.atmodel - INFO - perwordbound: -7.514e+00.
    2016-12-30 16:08:51,052 - gensim.models.atmodel - INFO - perwordbound: -7.446e+00.
    2016-12-30 16:08:54,481 - gensim.models.atmodel - INFO - perwordbound: -7.273e+00.
    2016-12-30 16:08:58,230 - gensim.models.atmodel - INFO - perwordbound: -7.000e+00.
    2016-12-30 16:09:01,808 - gensim.models.atmodel - INFO - perwordbound: -6.699e+00.
    2016-12-30 16:09:05,283 - gensim.models.atmodel - INFO - perwordbound: -6.388e+00.
    2016-12-30 16:09:08,763 - gensim.models.atmodel - INFO - perwordbound: -6.083e+00.
    2016-12-30 16:09:12,133 - gensim.models.atmodel - INFO - perwordbound: -5.810e+00.
    2016-12-30 16:09:15,503 - gensim.models.atmodel - INFO - perwordbound: -5.593e+00.
    2016-12-30 16:09:15,563 - gensim.models.atmodel - INFO - Vocabulary consists of 434 words.
    2016-12-30 16:09:15,563 - gensim.models.atmodel - INFO - Number of authors: 237.
    2016-12-30 16:09:15,563 - gensim.models.atmodel - WARNING - no author id mapping provided; initializing from corpus, assuming identity
    2016-12-30 16:09:15,564 - gensim.models.atmodel - INFO - Starting inference. Training on 100 documents.
    2016-12-30 16:09:15,980 - gensim.models.atmodel - INFO - perwordbound: -8.431e+00.
    2016-12-30 16:09:20,729 - gensim.models.atmodel - INFO - perwordbound: -7.542e+00.
    2016-12-30 16:09:25,065 - gensim.models.atmodel - INFO - perwordbound: -7.510e+00.
    2016-12-30 16:09:29,330 - gensim.models.atmodel - INFO - perwordbound: -7.433e+00.
    2016-12-30 16:09:33,614 - gensim.models.atmodel - INFO - perwordbound: -7.235e+00.
    2016-12-30 16:09:37,889 - gensim.models.atmodel - INFO - perwordbound: -6.922e+00.
    2016-12-30 16:09:42,393 - gensim.models.atmodel - INFO - perwordbound: -6.570e+00.
    2016-12-30 16:09:46,664 - gensim.models.atmodel - INFO - perwordbound: -6.218e+00.
    2016-12-30 16:09:51,217 - gensim.models.atmodel - INFO - perwordbound: -5.890e+00.
    2016-12-30 16:09:55,439 - gensim.models.atmodel - INFO - perwordbound: -5.617e+00.
    2016-12-30 16:09:59,700 - gensim.models.atmodel - INFO - perwordbound: -5.414e+00.
