In [1]:
import xml.etree.ElementTree as ET
from imp import reload
from pprint import pprint
import os, shutil, re, random, logging, pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

import gensim
from gensim.corpora import Dictionary, MmCorpus
from gensim.parsing.preprocessing import STOPWORDS
from gensim.similarities import MatrixSimilarity
from gensim.matutils import sparse2full, hellinger
from gensim.models import Phrases, LdaModel
from gensim.models import AuthorTopicModel
from gensim.models import atmodel

import spacy

In [2]:
# Configure logging.

log_dir = '../../../log_files/log.log'  # On my own machine.
#log_dir = '../../../../log_files/log.log'  # On Hetzner

logger = logging.getLogger()
fhandler = logging.FileHandler(filename=log_dir, mode='a')
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
fhandler.setFormatter(formatter)
logger.addHandler(fhandler)
logger.setLevel(logging.DEBUG)

## Load and pre-process data

In [3]:
nlp = spacy.load('en')

In [125]:
data_folder = '../../../../data/stackexchange/cooking/'
input_fname = data_folder + 'Posts.xml'
output_fname = '/tmp/cooking_docs.txt'
tree = ET.parse(input_fname)
root = tree.getroot()
num_docs = 1000

In [126]:
post_ids = []
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    post_ids.append(int(item.get('Id')))

print('Number of posts in dataset:', len(post_ids))

Number of posts in dataset: 54566


In [127]:
def doc_generator(root, num_docs=None):
    '''
    This generator parses the XML data, do some preliminary
    pre-processing and yields the documents.
    
    '''
    num_posts = 0
    for post_id in post_ids:
        post_text = ''
        for i, item in enumerate(root.iter()):
            if i == 0:
                # This is the <posts> XML element.
                continue
            elif int(item.get('Id')) == post_id:
                # This is the post.
                post_text += item.get('Body')
            elif item.get('ParentId') is not None and int(item.get('ParentId')) == post_id:
                # This is an answer to the post.
                post_text += item.get('Body')
            else:
                # Neither post "post_id" or answer to it.
                continue

            # Remove any HTML tags, such as <p>.
            post_text = re.sub('<[^<]+?>', '', post_text)

            # Replace any whitespace (newline, tabs, etc.) by a single space.
            post_text = re.sub('\s', ' ', post_text)

        if num_docs is not None and num_posts >= num_docs:
            break
            
        num_posts += 1
        
        yield post_text


In [253]:
doc = nlp(post_text)

In [255]:
token = doc[0]

In [275]:
doc

Normal double-acting baking powder makes CO2 (thus giving a rising effect) in two ways: when it gets wet, and when it is heated.  Baking soda only makes CO2 when it gets wet.  From Wikipedia:     The acid in a baking powder can be   either fast-acting or slow-acting.[6]   A fast-acting acid reacts in a wet   mixture with baking soda at room   temperature, and a slow-acting acid   will not react until heated in an   oven. Baking powders that contain both   fast- and slow-acting acids are double   acting; those that contain only one   acid are single acting. By providing a   second rise in the oven, double-acting   baking powders increase the   reliability of baked goods by   rendering the time elapsed between   mixing and baking less critical, and   this is the type most widely available   to consumers today.to consumers today.    See: http://en.wikipedia.org/wiki/Baking_powder  

In [345]:
# Use the default SpaCy NLP pipeline to process the documents in parallel.
# Then use the output of the pipeline to transform the text.
# Write the resulting text to a file.
postid = 0
with open(output_fname, 'w') as fid:
    for doc in nlp.pipe(doc_generator(root, num_docs=num_docs), n_threads=4):
        # Process post text.
        
        # NOTE: the doc_generator is probably the bottleneck here.
        
        ents = doc.ents  # Named entities.

        # Keep only words (no numbers, no punctuation).
        # Lemmatize tokens, remove punctuation and remove stopwords.
        #doc = [token.lemma_ for token in doc if token.is_alpha and not token.is_stop]
        doc = [token.lemma_ for token in doc if not token.is_stop]
        
        # Remove common words from a stopword list.
        #doc = [token for token in doc if token not in STOPWORDS]

        # Add named entities, but only if they are a compound of more than word.
        doc.extend([str(entity) for entity in ents if len(entity) > 1])
        
        # Write the doc to file.
        fid.write(' '.join(doc) + '\n')

In [346]:
# Get the tags of each post.
postid = 0
postid2tagname = dict()
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        # This is the <posts> XML element.
        continue
    if item.get('Tags') is None:
        # There are many posts with no tags.
        continue
    
    if num_docs is not None and postid >= num_docs:
        break

    tags = item.get('Tags')
    tags = re.findall('<(.+?)>', tags)
    # NOTE: consider using a tag that is common for all posts, and/or
    # a tag that is only for this particular post. 
    # NOTE: also consider including posts with no tags, and tag them with
    # post ID or "SUPER_TAG", maybe both, maybe an extra "NO_TAG" tag.
    #tags.append('SUPER_TAG')
    #tags.append('POST_ID' + str(postid))
    postid2tagname[postid] = tags
    for tag in tags:
        tag_set.add(tag)

    postid += 1

**TODO:** these names aren't great, "doc_generator" and "docs_generator".

In [347]:
def docs_generator(fname):
    '''
    This generator reads the processed text one line
    at a time and yields documents (lists of words).
    
    '''
    with open(fname, 'r') as fid:
        for line in fid:
            line = line.strip()  # Remove newline ("\n").
            doc = line.split(' ')  # Split line text into words.
            yield doc

In [348]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
docs = docs_generator(output_fname)
bigram = Phrases(docs, min_count=1)
docs = docs_generator(output_fname)
with open(output_fname + '.tmp', 'w') as fid:
    for doc in docs:
        for token in bigram[doc]:
            if '_' in token:
                doc.append(token)
        fid.write(' '.join(doc) + '\n')


shutil.copyfile(output_fname + '.tmp', output_fname)
os.remove(output_fname + '.tmp')



In [358]:
# Vectorize data.

# Create a dictionary representation of the documents.
docs = docs_generator(output_fname)
dictionary = Dictionary(docs)

# Filter out words that occur too frequently or too rarely.
# Disregarding stop words, this dataset has a very high number of low frequency words.
max_freq = 1.0  # No filtering.
min_count = 5
dictionary.filter_extremes(no_below=min_count, no_above=max_freq)

dict0 = dictionary[0]  # This sort of "initializes" dictionary.id2token.

# Bag-of-words representation of the documents.
docs = docs_generator(output_fname)
corpus = [dictionary.doc2bow(doc) for doc in docs]

num_docs = len(corpus)  # In case num_docs was set to None.

# Serialize the corpus.
#MmCorpus.serialize('/tmp/corpus.mm', corpus)
#corpus = MmCorpus('/tmp/corpus.mm')

In [359]:
tagname2postid = atmodel.construct_author2doc(corpus, postid2tagname)

In [360]:
# FIXME: how to do this with MmCorpus.

train_corpus = corpus[100:]
test_corpus = corpus[:100]
train_postid2tagname = {i: postid2tagname[j] for i, j in enumerate(range(100, num_docs))}
test_postid2tagname = {i: postid2tagname[j] for i, j in enumerate(range(100))}

train_tagname2postid = atmodel.construct_author2doc(train_corpus, train_postid2tagname)
test_tagname2postid = atmodel.construct_author2doc(test_corpus, test_postid2tagname)

train_tag_set = set()
for d, tags in train_postid2tagname.items():
    for tag in tags:
        train_tag_set.add(tag)

docs = docs_generator(output_fname)
test_docs = []
for d, doc in enumerate(docs):
    if d > 100:
        break
    test_docs.append(doc)

## Train model

In [361]:
print('Train data dimensionality:')
print('Number of authors: %d (%d in total)' % (len(train_tag_set), len(tag_set)))
print('Number of unique tokens: %d' % len(dictionary))
print('Number of documents: %d' % len(train_corpus))

Train data dimensionality:
Number of authors: 444 (462 in total)
Number of unique tokens: 2764
Number of documents: 900


In [314]:
#with open('big_model.pickle', 'wb') as fid:
#    pickle.dump(model, fid)

In [315]:
# big_model_save = model

In [316]:
# model = big_model_save

In [317]:
reload(atmodel)
AuthorTopicModel = atmodel.AuthorTopicModel

In [362]:
num_topics = 20
%time model = AuthorTopicModel(corpus=train_corpus, num_topics=num_topics, id2word=dictionary.id2token, \
                author2doc=None, doc2author=train_postid2tagname, var_lambda=None,  \
                chunksize=1000, passes=100, update_every=1, \
                alpha='auto', eta='auto', decay=0.5, offset=1.0, \
                eval_every=0, iterations=1, gamma_threshold=1e-10, \
                minimum_probability=0.01, random_state=0, ns_conf={},\
                serialized=False, serialization_path='/tmp/model_serializer.mm')

CPU times: user 1min 36s, sys: 1min 4s, total: 2min 41s
Wall time: 1min 30s


In [234]:
# Compute the per-word bound.
# Number of words in corpus.
corpus_words = sum(cnt for document in train_corpus for _, cnt in document)

# Compute bound and divide by number of words.
perwordbound = model.bound(train_corpus, author2doc=train_tagname2postid, \
                           doc2author=train_postid2tagname) / corpus_words
print(perwordbound)

-13.4469052387


In [670]:
os.remove('/tmp/model_serializer.mm')

In [175]:
model.show_topics(num_topics=10)

[(14,
  '0.005*"pumpkin" + 0.004*"knife" + 0.003*"rice" + 0.003*"pie" + 0.003*"bake" + 0.003*"egg" + 0.003*"flour" + 0.003*"cheese" + 0.003*"white" + 0.003*"type"'),
 (8,
  '0.006*"rib" + 0.005*"knife" + 0.004*"rice" + 0.004*"stir" + 0.004*"fry" + 0.003*"style" + 0.003*"marinade" + 0.003*"steam" + 0.003*"flour" + 0.003*"chinese"'),
 (0,
  '0.005*"knife" + 0.004*"fruit" + 0.003*"bean" + 0.003*"ethylene" + 0.003*"rice" + 0.003*"stop" + 0.003*"flour" + 0.003*"ripening" + 0.003*"bread" + 0.003*"egg"'),
 (13,
  '0.021*"vinegar" + 0.009*"wine" + 0.008*"rice" + 0.005*"white" + 0.005*"balsamic" + 0.004*"cake" + 0.004*"rice_wine" + 0.004*"cider" + 0.004*"knife" + 0.004*"cider_vinegar"'),
 (10,
  '0.056*"steak" + 0.014*"grill" + 0.011*"medium" + 0.008*"rest" + 0.005*"flip" + 0.005*"rare" + 0.005*"pepper" + 0.005*"thickness" + 0.005*"sear" + 0.005*"outside"'),
 (16,
  '0.009*"hour" + 0.008*"onion" + 0.008*"rib" + 0.007*"half" + 0.006*"slice" + 0.006*"baking" + 0.006*"gas" + 0.005*"rice" + 0.005*"

In [182]:
model.show_topic(18)

[('bag', 0.010500341319300025),
 ('herb', 0.0095810207676818686),
 ('ice', 0.0075708166957713617),
 ('leaf', 0.0072657253245815654),
 ('basil', 0.00665525668017454),
 ('skin', 0.0061031745693093629),
 ('paper', 0.0058831482760993087),
 ('freeze', 0.0058423073262144501),
 ('cold', 0.0057310478119483865),
 ('fish', 0.0056977872937214744)]

In [320]:
tag = 'baking'
print('%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'eggs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'pasta'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'herbs'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'beef'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))

tag = 'salmon'
print('\n%s' % tag)
print('#Docs:', len(model.author2doc[tag]))
pprint(model.get_author_topics(tag))


baking
#Docs: 74
[(3, 0.99969861353423572)]

eggs
#Docs: 38
[(10, 0.99962213699351399)]

pasta
#Docs: 19
[(8, 0.99891468476715684)]

herbs
#Docs: 13
[(7, 0.99709305343567833)]

beef
#Docs: 15
[(12, 0.99879414856070836)]

salmon
#Docs: 6
[(18, 0.99213976423946681)]


## Similarity queries

Discrete Hellinger distance:

$$
H(p, q) = \frac{1}{\sqrt{2}} \sqrt{\sum_{i=1}^K (\sqrt{p_i} - \sqrt{q_i})^2}
$$

where $p$ and $q$ are both topic distributions for two different tags. We define the similarity as
$$
S(p, q) = \frac{1}{1 + H(p, q)}
$$

In [304]:
def similarity(vec1, vec2):
    dist = hellinger(sparse2full(vec1, num_topics), sparse2full(vec2, num_topics))
    sim = 1.0 / (1.0 + dist)
    return sim

def get_sims(vec, tag_vecs):
    sims = [similarity(vec, vec2) for vec2 in tag_vecs]
    return sims

In [305]:
tag_vecs = [model.get_author_topics(tag, minimum_probability=0.0) for tag in train_tag_set]

In [306]:
id2tag = dict(zip(range(len(train_tag_set)), list(train_tag_set)))

In [307]:
sims = get_sims(model.get_author_topics('baking', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
231,baking,1.0
290,food-safety,0.965386
278,poaching,0.916794
82,bread,0.881186
260,chili-peppers,0.866564
85,allium,0.8596
173,dry-aging,0.855882
240,please-remove-this-tag,0.843243
212,dairy-free,0.84016
196,fats,0.816782


In [308]:
print('#Docs:', len(model.author2doc['baking-powder']))

#Docs: 1


In [309]:
sims = get_sims(model.get_author_topics('baking-powder', minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
202,baking-powder,1.0
120,stews,0.998336
257,juice,0.997528
378,creme-brulee,0.995559
225,elderberries,0.995318
154,measuring-scales,0.99511
394,sponge-cake,0.992812
339,asparagus,0.992152
360,salad-dressing,0.991331
54,fire,0.989778


## Predicting the tag of a new document

In [192]:
lda = LdaModel(corpus=None, num_topics=num_topics, id2word=dictionary.id2token)
lda.state.sstats = model.state.sstats
lda.iterations = 1000  # Make sure training converges on document when calling lda[doc].

In [193]:
postid = 2
doc = test_corpus[postid]
print('Post tags:\n', test_postid2tagname[postid])
print('Post body:\n', test_docs[postid])

for tag in test_postid2tagname[postid]:
    if tag not in train_tag_set:
        print('Tag "', tag, '" not in training data.')

Post tags:
 ['eggs']
Post body:
 ['use', 'brown', 'extra', 'large', 'egg', 'can', 'honestly', 'habit', 'point', 'distinct', 'advantage', 'disadvantage', 'like', 'flavor', 'shelf', 'life', 'egg', 'nutrition', 'center', 'faq', 'page', 'entry', 'topic', 'basically', 'color', 'egg', 'affect', 'egg', 'flavor', 'nutritional', 'value', 'simply', 'depend', 'particular', 'breed', 'chicken', 'lay', 'egg', 'white', 'egg', 'white', 'hen', 'brown', 'egg', 'brown', 'hen', 'worth', 'note', 'enc', 'point', 'generally', 'brown', 'hen', 'large', 'require', 'feed', 'egg', 'slightly', 'high', 'price', 'difference', 'notice', 'free', 'range', 'egg', 'instead', 'factory', 'farm', 'egg', 'slight', 'yolk', 'color', 'difference', 'think', 'slightly', 'good', 'flavor', 'come', 'range', 'color', 'include', 'green', 'brown', 'white', 'difference', 'white', 'brown', 'egg', 'purely', 'cosmetic', 'nutritional', 'taste', 'difference', 'brown', 'egg', 'imo', 'look', 'cooler', 'cosmetic', 'base', 'breed', 'chicken', 'g

In [194]:
sims = get_sims(lda.get_document_topics(doc, minimum_probability=0.0), tag_vecs)

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
312,beverages,0.973676
3,cream,0.969973
105,fresh,0.961778
374,honey,0.96081
369,raw,0.959857
423,cheesecake,0.959098
373,classification,0.95609
145,resources,0.955922
216,maple-syrup,0.954864
68,shopping,0.953665


In [911]:
pred_threshold = 0.6
pred = sims_df.loc[sims_df.Score > pred_threshold]
pred_tags = list(pred.Tag)
pred_prob = list(pred.Score)
pred

Unnamed: 0,Tag,Score
75,catering,0.681661
289,melting-chocolate,0.832565


In [912]:
tp = 0
fp = 0
fn = 0
for tag in pred_tags:
    if tag in test_postid2tagname[postid]:
        tp += 1
    else:
        fp += 1

for tag in test_postid2tagname[postid]:
    if tag not in pred_tags:
        fn += 1
        
precision = tp / (tp + fp)
recall = tp / (tp + fn)
if precision + recall == 0:
    f1_score = 0.0
else:
    f1_score = 2 * precision * recall / (precision + recall)

print('F1 score: ', f1_score)

F1 score:  0.0


## Cosine similarity

In [47]:
# Generate a similarity object for the transformed corpus.
index = MatrixSimilarity(model[list(train_tag_set)])

In [48]:
# Get similarities to some tag.
tag_name = 'baking'
sims = index[model[tag_name]]

# Print the most similar tags.
sims_df = pd.DataFrame([(id2tag[elem[0]], elem[1]) for elem in enumerate(sims)], columns=['Tag', 'Score'])
sims_df.sort_values('Score', ascending=False)[:10]

Unnamed: 0,Tag,Score
0,baking,1.0
88,aging,0.994557
24,crepe,0.987024
96,ganache,0.983886
386,sauce,0.980548
38,puree,0.978956
191,roast-beef,0.975619
243,roasting,0.971207
233,taffy,0.960213
418,rye,0.959389


## LDA

In [483]:
%time lda = LdaModel(corpus, num_topics=10, id2word=dictionary.id2token, iterations=1, \
                     passes=100, eval_every=0, chunksize=1000)

CPU times: user 41.6 s, sys: 0 ns, total: 41.6 s
Wall time: 41.6 s


In [484]:
lda.show_topics()

[(0,
  '0.032*"sauce" + 0.031*"like" + 0.026*"taste" + 0.021*"eat" + 0.017*"rice" + 0.014*"dish" + 0.014*"ingredient" + 0.013*"love" + 0.012*"look" + 0.012*"tip"'),
 (1,
  '0.042*"egg" + 0.027*"milk" + 0.025*"difference" + 0.020*"use" + 0.017*"flavor" + 0.014*"day" + 0.014*"white" + 0.014*"question" + 0.013*"bit" + 0.012*"wine"'),
 (2,
  '0.036*"meat" + 0.025*"chicken" + 0.023*"cook" + 0.020*"pan" + 0.019*"cut" + 0.015*"like" + 0.015*"use" + 0.014*"hear" + 0.013*"food" + 0.012*"stick"'),
 (3,
  '0.064*"recipe" + 0.032*"use" + 0.022*"bread" + 0.020*"substitute" + 0.019*"bake" + 0.017*"baking" + 0.016*"cake" + 0.014*"chocolate" + 0.012*"problem" + 0.011*"dough"'),
 (4,
  '0.036*"way" + 0.029*"good" + 0.023*"temperature" + 0.021*"cheese" + 0.019*"time" + 0.018*"cook" + 0.016*"long" + 0.014*"beef" + 0.014*"vegetable" + 0.013*"use"'),
 (5,
  '0.028*"oil" + 0.022*"oven" + 0.020*"use" + 0.017*"knife" + 0.016*"good" + 0.013*"prepare" + 0.013*"wonder" + 0.011*"skin" + 0.011*"cook" + 0.011*"plac

## List docs

In [381]:
root = ET.parse(input_fname)
postid2tagname = dict()
postid = 0
posts = []
tag_set = set()
for i, item in enumerate(root.iter()):
    if i == 0:
        continue
    if item.get('Tags') is not None:
        tags = item.get('Tags')
        tags = re.findall('<(.+?)>', tags)
        # NOTE: consider using a tag that is common for all posts, or
        # a tag that is only for this particular post.
        #tags.append('SUPER_TAG')
        #tags.append('POST_ID' + str(postid))
        postid2tagname[postid] = tags
        posts.append(item.get('Body'))
        for tag in tags:
            tag_set.add(tag)
        postid += 1

In [489]:
num_posts = len(posts)
docs = []
for post in posts[:]:
    # Remove any HTML tags, such as <p>.
    text = re.sub('<[^<]+?>', '', post)
    doc = nlp(text)
    ents = doc.ents  # Named entities.
    # Keep only words (no numbers, no punctuation).
    # Lemmatize tokens.
    doc = [token.lemma_ for token in doc if token.is_alpha]
    # Remove common words from a stopword list.
    doc = [token for token in doc if token not in STOPWORDS]
    # Add named entities, but only if they are a compound of more than word.
    doc.extend([str(entity) for entity in ents if len(entity) > 1])
    docs.append(doc)

In [490]:
# Compute bigrams.

# Add bigrams and trigrams to docs (only ones that appear 20 times or more).
bigram = Phrases(docs, min_count=20)
for idx in range(len(docs)):
    for token in bigram[docs[idx]]:
        if '_' in token:
            # Token is a bigram, add to document.
            docs[idx].append(token)



In [492]:
# Vectorize data.

# Create a dictionary representation of the documents.
dictionary = Dictionary(docs)

# Filter out words that occur too frequently or too rarely.
#max_freq = 0.5
#min_wordcount = 20
#dictionary.filter_extremes(no_below=min_wordcount, no_above=max_freq)

dict0 = dictionary[0]  # This sort of "initializes" dictionary.id2token.

# Bag-of-words representation of the documents.
corpus = [dictionary.doc2bow(doc) for doc in docs]

# Serialize the corpus.
#MmCorpus.serialize('/tmp/corpus.mm', corpus)
#corpus = MmCorpus('/tmp/corpus.mm')