# Part 1: Prepare LDA models for topic-diff estimation

In [1]:
import logging

# Route INFO-level (and above) records to the notebook output, prefixed with
# a timestamp and the level name, then emit one record to confirm it works.
LOG_FORMAT = '%(asctime)s : %(levelname)s : %(message)s'
logging.basicConfig(level=logging.INFO, format=LOG_FORMAT)
logging.info("check")

2017-03-26 16:20:29,889 : INFO : check


## Download dataset
Dataset description: https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/readme.txt

In [2]:
# Create the download directory (-p: no error if it already exists).
!mkdir -p dataset/
# Fetch the NYTimes bag-of-words counts and vocabulary; -nc skips the download
# when the target file is already present, so re-running the cell is cheap.
!wget -O dataset/docs.txt.gz -nc https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/docword.nytimes.txt.gz
!wget -O dataset/vocab.txt -nc https://archive.ics.uci.edu/ml/machine-learning-databases/bag-of-words/vocab.nytimes.txt
# Decompress to dataset/docs.txt; -k keeps the .gz archive, -f overwrites an
# existing dataset/docs.txt from a previous run.
!gunzip -k -f dataset/docs.txt.gz

File `dataset/docs.txt.gz' already there; not retrieving.
File `dataset/vocab.txt' already there; not retrieving.


## Cleanup data

In [3]:
# Rebuild tokenised documents from the UCI bag-of-words files:
#   1. read the vocabulary (one word per line),
#   2. read the (doc id, word id, count) triples,
#   3. expand every document into a flat list with each word repeated `count` times.

logging.info("Load dictionary")
with open("dataset/vocab.txt") as infile:
    words = [line.strip() for line in infile]
# Word ids start at 1: the N-th vocabulary line is word id N.
id2word = dict(enumerate(words, 1))

logging.info("Construct corpus")
docs = {}
with open("dataset/docs.txt") as infile:
    # The first three lines are a header, not triples (see the dataset
    # readme linked above for their meaning) -- skip them.
    for _ in range(3):
        next(infile)

    for line in infile:
        d, wid, cnt = map(int, line.strip().split(" "))
        # docs maps doc id -> {word: count}
        docs.setdefault(d, {})[id2word[wid]] = cnt

logging.info("Expand corpus")
documents = []
# Use .values()/.items() rather than the Python 2 only .iteritems(), so the
# cell runs unchanged on both Python 2 and Python 3.
for word_counts in docs.values():
    curr_doc = []
    for w, cnt in word_counts.items():
        curr_doc.extend([w] * cnt)
    documents.append(curr_doc)

# The intermediate mappings are large; release them before training.
docs.clear()
id2word.clear()
del docs, id2word

2017-03-26 16:20:36,506 : INFO : Load dictionary
2017-03-26 16:20:36,562 : INFO : Construct corpus
2017-03-26 16:22:32,933 : INFO : Expand corpus


## Split dataset into train and holdout sets

Reduce the corpus size to speed up training.

In [4]:
from random import seed, shuffle

# Seed the RNG so that the train/holdout split is identical on every
# "Restart & Run All"; without it each run trains on a different subset.
RANDOM_SEED = 42
seed(RANDOM_SEED)
shuffle(documents)  # in-place shuffle of the full corpus

TRAINSET_SIZE = 100000
HOLDOUT_SIZE = 20000

# Disjoint slices of the shuffled corpus: the first TRAINSET_SIZE documents
# are used for training, the next HOLDOUT_SIZE are held out.
trainset = documents[:TRAINSET_SIZE]
holdout = documents[TRAINSET_SIZE:TRAINSET_SIZE + HOLDOUT_SIZE]

## Filter dictionary

In [5]:
from gensim.corpora import Dictionary

# Build the token <-> id mapping from the training documents only.
# prune_at=None: presumably disables gensim's cap on vocabulary size while
# building -- confirm against the gensim Dictionary documentation.
dictionary = Dictionary(documents=trainset, prune_at=None)
print(dictionary)

# Drop rare and very common tokens (per gensim's filter_extremes: no_below is
# a minimum document count, no_above a maximum fraction of documents);
# keep_n=None keeps every token that survives the two filters.
dictionary.filter_extremes(no_below=10, no_above=0.3, keep_n=None)
dictionary.compactify()
print(dictionary)

# NOTE: this wipes everything previously stored under models/ (including
# models saved by earlier runs) before the filtered dictionary is saved.
!rm -rf models/*
!mkdir -p models/
dictionary.save("models/dictionary.corpora")

2017-03-26 16:23:05,102 : INFO : 'pattern' package not found; tag filters are not available for English
2017-03-26 16:23:05,106 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2017-03-26 16:23:09,420 : INFO : adding document #10000 to Dictionary(77125 unique tokens: [u'zzz_faa', u'zzz_olson', u'zzz_fab', u'zzz_compaq', u'zzz_king_abdullah_ii']...)
2017-03-26 16:23:13,815 : INFO : adding document #20000 to Dictionary(88667 unique tokens: [u'fawn', u'zzz_olson', u'zzz_bo_jackson', u'zzz_erubiel_durazo', u'nunnery']...)
2017-03-26 16:23:18,324 : INFO : adding document #30000 to Dictionary(93622 unique tokens: [u'fawn', u'zzz_olson', u'zzz_bo_jackson', u'zzz_erubiel_durazo', u'nunnery']...)
2017-03-26 16:23:22,412 : INFO : adding document #40000 to Dictionary(96055 unique tokens: [u'fawn', u'zzz_olson', u'zzz_bo_jackson', u'zzz_erubiel_durazo', u'nunnery']...)
2017-03-26 16:23:26,506 : INFO : adding document #50000 to Dictionary(97563 unique tokens: [u'fawn', u'zzz_olson', u

Dictionary(100141 unique tokens: [u'fawn', u'zzz_olson', u'zzz_bo_jackson', u'zzz_erubiel_durazo', u'nunnery']...)
Dictionary(72405 unique tokens: [u'fawn', u'zzz_olson', u'zzz_fab', u'zzz_bo_jackson', u'zzz_erubiel_durazo']...)


2017-03-26 16:23:50,042 : INFO : saving Dictionary object under models/dictionary.corpora, separately None
2017-03-26 16:23:50,110 : INFO : saved models/dictionary.corpora


## Convert dataset to bag-of-words for LDA training

In [6]:
import json

with open("dataset/trainset.json", 'w') as outfile:
    for doc in trainset:
        outfile.write(json.dumps({"d2b": dictionary.doc2bow(doc)}) + "\n")
        
with open("dataset/holdout.json", 'w') as outfile:
    for doc in holdout:
        outfile.write(json.dumps({"d2b": dictionary.doc2bow(doc)}) + "\n")

!wc -l dataset/trainset.json
!wc -l dataset/holdout.json

100000 dataset/trainset.json
20000 dataset/holdout.json


## Prepare stuff for training

In [7]:
import os
# Display the OpenMP thread limit visible to this kernel. .get() is used so
# that a fresh kernel where the variable is not exported shows a placeholder
# instead of aborting "Run All" with a KeyError.
# NOTE(review): presumably this is expected to be '1' so that the LDA worker
# processes do not oversubscribe the CPU -- confirm.
os.environ.get('OMP_NUM_THREADS', '<not set>')

'1'

In [8]:
import json
from gensim.corpora import Dictionary
from gensim.models import LdaMulticore

# Reload the filtered vocabulary written by the "Filter dictionary" step.
dictionary = Dictionary.load("models/dictionary.corpora")

# No corpus is passed here, so this only initialises the model; documents
# are supplied later through lda.update().
lda = LdaMulticore(
    id2word=dictionary,
    num_topics=75,
    workers=4,        # number of training processes
    passes=10,        # passes over each supplied corpus
    chunksize=2500,   # documents per dispatched chunk
    batch=True,       # batch (rather than online) training
    eval_every=None,
)

2017-03-26 16:24:42,918 : INFO : loading Dictionary object from models/dictionary.corpora
2017-03-26 16:24:42,950 : INFO : loaded models/dictionary.corpora
2017-03-26 16:24:42,963 : INFO : using symmetric alpha at 0.0133333333333
2017-03-26 16:24:42,964 : INFO : using symmetric eta at 1.38112008839e-05
2017-03-26 16:24:42,984 : INFO : using serial LDA version on this node


## Train model & dump model every 20k documents (+ save perplexity value)

In [9]:
!mkdir -p models/lda

BATCH_SIZE = 20000
EPOCH_NUM = 3

for ep in range(1, EPOCH_NUM + 1):
    logging.info("---=== EPOCH #%d ===---", ep)
    
    logging.info("Shuffle trainset")
    !shuf dataset/trainset.json > dataset/trainset_shuffled.json
    
    with open("dataset/trainset_shuffled.json") as infile:
        batch = []

        for idx, line in enumerate(infile):
            batch.append(json.loads(line)["d2b"])

            if (idx + 1) % BATCH_SIZE == 0:
                logging.info("#%d", idx + 1)
                lda.update(batch)
                lda.save("models/lda/ep{}_docs{}_lda.model".format(ep, idx + 1))
                batch[:] = []

2017-03-26 16:25:39,649 : INFO : ---=== EPOCH #1 ===---
2017-03-26 16:25:39,652 : INFO : Shuffle trainset
2017-03-26 16:25:52,773 : INFO : #20000
2017-03-26 16:25:52,774 : INFO : running batch LDA training, 75 topics, 10 passes over the supplied corpus of 20000 documents, updating every 20000 documents, evaluating every ~0 documents, iterating 50x with a convergence threshold of 0.001000
2017-03-26 16:25:52,783 : INFO : training LDA model using 4 processes
2017-03-26 16:25:52,964 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #2500/20000, outstanding queue size 1
2017-03-26 16:25:54,931 : INFO : PROGRESS: pass 0, dispatched chunk #1 = documents up to #5000/20000, outstanding queue size 2
2017-03-26 16:25:54,932 : INFO : PROGRESS: pass 0, dispatched chunk #2 = documents up to #7500/20000, outstanding queue size 3
2017-03-26 16:25:54,933 : INFO : PROGRESS: pass 0, dispatched chunk #3 = documents up to #10000/20000, outstanding queue size 4
2017-03-26 16:25:54,934 : INFO

## Go to "Part 2: Visualize topic-difference" notebook