Author-topic model Gensim tutorial at http://nbviewer.jupyter.org/github/rare-technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb.

This is not a tutorial, it is not user friendly, read at your own peril.

In [1]:
import xml.etree.ElementTree as ET
from imp import reload
from pprint import pprint
import os, shutil, re, random, logging, pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities import MatrixSimilarity
from gensim.matutils import sparse2full, hellinger
from gensim.models import Phrases, LdaModel
from gensim.models import AuthorTopicModel
from gensim.models import atmodel
from gensim.models import ldamodel

import spacy

In [2]:
# Configure logging.

log_dir = '../log_files/log.log'

logger = logging.getLogger()
fhandler = logging.FileHandler(filename=log_dir, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

## Load and pre-process data

In [3]:
nlp = spacy.load('en')

In [4]:
data_folder = '../../data/stackexchange/cooking/'
input_fname = data_folder + 'Posts.xml'
output_fname = '/tmp/cooking_docs.txt'
tree = ET.parse(input_fname)
root = tree.getroot()
num_docs = 100

In [5]:
post_ids = []
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    post_ids.append(int(item.get('Id')))

print('Number of posts in dataset:', len(post_ids))

Number of posts in dataset: 54566


In [6]:
def doc_generator(root, num_docs=None):
    '''
    This generator parses the XML data, do some preliminary
    pre-processing and yields the documents.
    
    '''
    num_posts = 0
    for post_id in post_ids:
        post_text = ''
        for i, item in enumerate(root.iter()):
            if i == 0:
                # This is the <posts> XML element.
                continue
            elif int(item.get('Id')) == post_id:
                # This is the post.
                post_text += item.get('Body')
            elif item.get('ParentId') is not None and int(item.get('ParentId')) == post_id:
                # This is an answer to the post.
                post_text += item.get('Body')
            else:
                # Neither post "post_id" or answer to it.
                continue

            # Remove any HTML tags, such as <p>.
            post_text = re.sub('<[^<]+?>', '', post_text)

            # Replace any whitespace (newline, tabs, etc.) by a single space.
            post_text = re.sub('\s', ' ', post_text)

        if num_docs is not None and num_posts >= num_docs:
            break
            
        num_posts += 1
        
        yield post_text


In [7]:
# Use the default SpaCy NLP pipeline to process the documents in parallel.
# Then use the output of the pipeline to transform the text.
# Write the resulting text to a file.
entity_freq = {}
postid = 0
with open(output_fname, 'w') as fid:
    for doc in nlp.pipe(doc_generator(root, num_docs=num_docs), n_threads=4):
        # Process post text.
        
        # NOTE: the doc_generator is probably the bottleneck here.
        
        ents = doc.ents  # Named entities.

        # Keep only words (no numbers, no punctuation).
        # Lemmatize tokens, remove punctuation and remove stopwords.
        #doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        
        # Remove stopwords and punctuation, and lemmatized tokens.
        tokens = [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]
        #tokens = [str(token) for token in doc if not token.is_stop and not token.is_punct]
        #tokens.extend([token.lemma_ for token in doc if not token.is_stop and not token.is_punct])
        
        # Add named entities, but only if they are a compound of more than word.
        #doc.extend([str(entity) for entity in ents if len(entity) > 1])
        
        #for entity in ents:
        #    if entity_freq.get(entity):
        #        entity_freq[entity] += 1
        #    else:
        #        entity_freq[entity] = 1
        
        # Write the doc to file.
        fid.write(' '.join(tokens) + '\n')

In [8]:
# Get the tags of each post.
postid = 0
postid2tagname = dict()
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    if item.get('Tags') is None:
        # There are many posts with no tags.
        continue
    
    if num_docs is not None and postid >= num_docs:
        break

    tags = item.get('Tags')
    tags = re.findall('<(.+?)>', tags)
    # NOTE: consider using a tag that is common for all posts, and/or
    # a tag that is only for this particular post. 
    # NOTE: also consider including posts with no tags, and tag them with
    # post ID or "SUPER_TAG", maybe both, maybe an extra "NO_TAG" tag.
    #tags.append('SUPER_TAG')
    tags.append('POST_ID' + str(postid))
    postid2tagname[postid] = tags
    for tag in tags:
        tag_set.add(tag)

    postid += 1

**TODO:** these names aren't great, "doc_generator" and "docs_generator".

In [9]:
def docs_generator(fname):
    '''
    This generator reads the processed text one line
    at a time and yields documents (lists of words).
    
    '''
    with open(fname, 'r') as fid:
        for line in fid:
            line = line.strip()  # Remove newline ("\n").
            doc = line.split(' ')  # Split line text into words.
            yield doc

In [10]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
docs = docs_generator(output_fname)
bigram = Phrases(docs, min_count=20)
docs = docs_generator(output_fname)
with open(output_fname + '.tmp', 'w') as fid:
    for doc in docs:
        for token in bigram[doc]:
            if '_' in token:
                doc.append(token)
        fid.write(' '.join(doc) + '\n')


shutil.copyfile(output_fname + '.tmp', output_fname)
os.remove(output_fname + '.tmp')



In [19]:
# Vectorize data.

# Create a dictionary representation of the documents.
docs = docs_generator(output_fname)
dictionary = Dictionary(docs)

# Filter out words that occur too frequently or too rarely.
# Disregarding stop words, this dataset has a very high number of low frequency words.
max_freq = 0.5
min_count = 5
dictionary.filter_extremes(no_below=min_count, no_above=max_freq)

dict0 = dictionary[0]  # This sort of "initializes" dictionary.id2token.

# Bag-of-words representation of the documents.
#docs = docs_generator(output_fname)
#corpus = [dictionary.doc2bow(doc) for doc in docs]

#num_docs = len(corpus)  # In case num_docs was set to None.

In [18]:
def corpus_generator(fname):
    '''
    This generator reads the processed text one line
    at a time and yields BOW documents.
    
    '''
    with open(fname, 'r') as fid:
        for line in fid:
            line = line.strip()  # Remove newline ("\n").
            doc = line.split(' ')  # Split line text into words.
            yield dictionary.doc2bow(doc)

In [29]:
corpus = corpus_generator(output_fname)

# Serialize the corpus.
MmCorpus.serialize('/tmp/corpus.mm', corpus)
corpus = MmCorpus('/tmp/corpus.mm')

num_docs = len(corpus)

In [25]:
tagname2postid = atmodel.construct_author2doc(corpus, postid2tagname)

## Train model

In [30]:
print('Train data dimensionality:')
print('Number of authors: %d:' % (len(tag_set)))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % num_docs)

Train data dimensionality:
Number of authors: 237:
Number of unique tokens: 434
Number of documents: 100


In [27]:
reload(atmodel)
AuthorTopicModel = atmodel.AuthorTopicModel

In [32]:
num_topics = 10
chunksize = num_docs + 1
%time model = AuthorTopicModel(corpus=corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                author2doc=tagname2postid, doc2author=postid2tagname, \
                chunksize=chunksize, passes=10, update_every=1, \
                alpha='symmetric', eta='symmetric', decay=0.5, offset=1.0, \
                eval_every=1, iterations=1, gamma_threshold=1e-10, \
                minimum_probability=0.01, random_state=0, \
                serialized=True, serialization_path='/tmp/model_serializer.mm')

CPU times: user 1.8 s, sys: 12 ms, total: 1.81 s
Wall time: 1.83 s


In [31]:
os.remove('/tmp/model_serializer.mm')

FileNotFoundError: [Errno 2] No such file or directory: '/tmp/model_serializer.mm'

In [33]:
model.show_topics(num_topics=10)

[(0,
  '0.030*"spice" + 0.016*"bread" + 0.015*"use" + 0.013*"dry" + 0.012*"cook" + 0.012*"good" + 0.011*"not" + 0.010*"have" + 0.010*"thing" + 0.010*"way"'),
 (1,
  '0.040*"1" + 0.032*"half" + 0.024*"cup" + 0.018*"convert" + 0.017*"tbsp" + 0.017*"cream" + 0.017*"2" + 0.016*"egg" + 0.015*"bacon" + 0.015*"recipe"'),
 (2,
  '0.031*"recipe" + 0.024*"not" + 0.023*"use" + 0.019*"\'" + 0.017*"good" + 0.015*"ingredient" + 0.015*"like" + 0.013*"add" + 0.012*"list" + 0.012*"egg"'),
 (3,
  '0.029*"spice" + 0.023*"not" + 0.021*"cook" + 0.020*"food" + 0.018*"add" + 0.017*"use" + 0.017*"raw" + 0.013*"like" + 0.012*"\'" + 0.012*"butter"'),
 (4,
  '0.031*"not" + 0.027*"bread" + 0.023*"good" + 0.020*"use" + 0.017*"\'" + 0.013*"meat" + 0.012*"store" + 0.011*"long" + 0.011*"way" + 0.011*"bag"'),
 (5,
  '0.052*"egg" + 0.017*"food" + 0.017*"add" + 0.016*"not" + 0.014*"\'" + 0.014*"center" + 0.012*"use" + 0.012*"fresh" + 0.011*"recipe" + 0.011*"rice"'),
 (6,
  '0.033*"oil" + 0.031*"grill" + 0.024*"pan" + 0.

In [None]:
for i in range(5):
    tag = random.choice(list(train_tagname2postid.keys()))
    while tag[:7] == 'POST_ID':
        tag = random.choice(list(train_tagname2postid.keys()))
    print('\n%s' % tag)
    print('#Docs:', len(model.author2doc[tag]))
    pprint(model.get_author_topics(tag))

In [None]:
model.show_topic(2)

In [None]:
tag = 'baking'
print('%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'eggs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'pasta'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'herbs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'beef'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'salmon'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))


## Similarity queries

In [None]:
def similarity(vec1, vec2):
    dist = hellinger(sparse2full(vec1, num_topics), sparse2full(vec2, num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec, tag_vecs):
    sims = [similarity(vec, vec2) for vec2 in tag_vecs]
    return sims

In [None]:
tag_vecs = [model.get_author_topics(tag, minimum_probability=0.0) for tag in train_tag_set]

In [None]:
id2tag = dict(zip(range(len(train_tag_set)), list(train_tag_set)))

In [None]:
tag = random.choice(list(train_tagname2postid.keys()))
while tag[:7] == 'POST_ID':
    tag = random.choice(list(train_tagname2postid.keys()))
sims = get_sims(model.get_author_topics(tag, minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

In [None]:
sims = get_sims(model.get_author_topics('beef', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

In [None]:
sims = get_sims(model.get_author_topics('baking', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

In [None]:
sims = get_sims(model.get_author_topics('salmon', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims = [(id2tag[elem[0]], elem[1]) for elem in enumerate(sims) if not id2tag[elem[0]][:7] == 'POST_ID']
sims_df = pd.DataFrame(sims, columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

## Predicting the tag of a new document

In [None]:
lda = LdaModel(corpus=None, num_topics=num_topics, id2word=dictionary.id2token)
lda.state.sstats = model.state.sstats
lda.iterations = 100  # Make sure training converges on document when calling lda[doc].

In [None]:
postid = 1
doc = test_corpus[postid]
print('Post tags:\n', test_postid2tagname[postid])
print('Post body:\n', test_docs[postid])

for tag in test_postid2tagname[postid]:
    if tag not in train_tag_set:
        print('Tag "', tag, '" not in training data.')

In [None]:
sims = get_sims(lda.get_document_topics(doc, minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

In [None]:
pred_threshold = 0.6
pred = sims_df.loc[sims_df.Score > pred_threshold]
pred_tags = list(pred.Tag)
pred_prob = list(pred.Score)
pred

In [None]:
tp = 0
fp = 0
fn = 0
for tag in pred_tags:
    if tag in test_postid2tagname[postid]:
        tp += 1
    else:
        fp += 1

for tag in test_postid2tagname[postid]:
    if tag not in pred_tags:
        fn += 1
        
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if precision + recall == 0:
    f1_score = 0.0
else:
    f1_score = 2 * precision * recall / (precision + recall)

print('F1 score: ', f1_score)

## Cosine similarity

In [None]:
# Generate a similarity object for the transformed corpus.
index = MatrixSimilarity(model[list(train_tag_set)])

In [None]:
# Get similarities to some tag.
tag_name = 'baking'
sims = index[model[tag_name]]

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

## LDA

In [None]:
%time lda = LdaModel(corpus, num_topics=10, id2word=dictionary.id2token, iterations=1, \
                     passes=100, eval_every=0, chunksize=1000)

In [None]:
lda.show_topics()

## List docs

In [None]:
root = ET.parse(input_fname)
postid2tagname = dict()
postid = 0
posts = []
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        continue
    if item.get('Tags') is not None:
        tags = item.get('Tags')
        tags = re.findall('<(.+?)>', tags)
        # NOTE: consider using a tag that is common for all posts, or
        # a tag that is only for this particular post.
        #tags.append('SUPER_TAG')
        #tags.append('POST_ID' + str(postid))
        postid2tagname[postid] = tags
        posts.append(item.get('Body'))
        for tag in tags:
            tag_set.add(tag)
        postid += 1

In [None]:
num_posts = len(posts)
docs = []
for post in posts[:]:
    # Remove any HTML tags, such as <p>.
    text = re.sub('<[^<]+?>', '', post)
    doc = nlp(text)
    ents = doc.ents  # Named entities.
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens.
    doc = [token.lemma_ for token in doc if token.is_alpha]
    # Remove common words from a stopword list.
    doc = [token for token in doc if token not in STOPWORDS]
    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    docs.append(doc)

In [None]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)