## Build topic model

In [14]:
import pandas as pd

In [25]:
n_features = 5000 # number of most common words
n_topics = 30 # number of topics
n_top_words = 60 # number of characteristic words per topic (viz purposes only)
max_df = 0.5 # maximum document frequency
min_df = 100 # minimum document frequency
max_books = 1000 # include all books
chunk_size = 300 # size of average Harry Potter chapter (after keeping only ADJ, NOUN & VERB)

In [26]:
import glob
import random
RND = 12345
random.seed(RND)

class Chunker(object):
    def __init__(self, filenames, max_chars_per_book=None,
                 chunk_size=None):
        self.max_chars_per_book = max_chars_per_book
        self.chunk_size = chunk_size
        self.filenames = filenames

    def __iter__(self):
        for filename in self.filenames:
            with open(filename) as f:
                text = f.read()
                if self.max_chars_per_book:
                    text = text[:self.max_chars_per_book]
            
            tokens = text.split()
            
            for i in range(0, len(tokens), self.chunk_size):
                yield tokens[i:i + self.chunk_size]

In [33]:
path = 'data/tagged/streek/*.txt'
filenames = sorted(list(glob.glob(path)))
print(len(filenames))

50


In [34]:
chunker = Chunker(filenames, chunk_size=chunk_size)

In [35]:
import os
from sklearn.feature_extraction.text import CountVectorizer

def identity(x):
    return x

vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                             max_features=n_features,
                             analyzer=identity)
X = vectorizer.fit_transform(chunker)

bow_example = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names())
bow_example.sample(10)

Unnamed: 0,a,aandacht,aandachtig,aandoen,aangeboden,aangekeken,aangekomen,aangenomen,aankijken,aankomen,...,zwarte,zweeg,zweet,zwijgen,zwijgend,zwijgt,zélf,zó,één,óók
3507,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
4835,0,0,0,0,0,0,0,0,0,0,...,7,0,0,0,0,0,0,0,2,0
3441,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
355,1,0,0,0,0,0,0,0,0,0,...,2,0,0,0,0,0,0,0,3,0
1004,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5821,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
2364,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5579,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1267,0,0,0,0,0,0,0,0,0,0,...,1,0,1,0,0,0,0,0,0,0
957,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [36]:
from sklearn.feature_extraction.text import TfidfTransformer
transformer = TfidfTransformer()
X_ = transformer.fit_transform(X)

In [37]:
from sklearn.decomposition import NMF
nmf = NMF(n_components=n_topics,
          random_state=RND,
          verbose=1, max_iter=n_topics).fit(X_)

violation: 1.0
violation: 0.41793887036448085
violation: 0.29592196620678946
violation: 0.19250065306166914
violation: 0.11648847001109655
violation: 0.07762911873129312
violation: 0.05654371161575821
violation: 0.04502088246039263
violation: 0.037637597756080145
violation: 0.032455858643444864
violation: 0.02872612589952708
violation: 0.025917065664696714
violation: 0.023594134492937728
violation: 0.021557542610322614
violation: 0.019719252901369763
violation: 0.018221832777507045
violation: 0.017084994851341836
violation: 0.01624826056614422
violation: 0.01582424037230903
violation: 0.01569538784951964
violation: 0.015767322728756313
violation: 0.015676211596075126
violation: 0.01537315648158281
violation: 0.014708426603159136
violation: 0.013773992748037674
violation: 0.01289684069805528
violation: 0.012338362067158617
violation: 0.011915771562779594
violation: 0.011600628940723182
violation: 0.011259792606965522


In [38]:
import os
import shutil
from wordcloud import WordCloud
import numpy as np
import seaborn as sb
import matplotlib.pyplot as plt

def top_words(model, feature_names, n_top_words):
    try:
        shutil.rmtree('clouds')
    except:
        pass
    os.mkdir('clouds')

    for topic_idx, topic in enumerate(model.components_):
        print('.', end='')
        topic[np.isnan(topic)] = 0
        
        words = [feature_names[i] for i in topic.argsort()[:-n_top_words-1:-1]]
        weights = [topic[i] for i in topic.argsort()[:-n_top_words-1:-1]]

        freqs = {wo: we for wo, we in zip(words, weights)}
        wordcloud = WordCloud(normalize_plurals=False,
                              background_color='white',
                              colormap='inferno_r',
                              width=800,
                              height=400)
        wordcloud = wordcloud.generate_from_frequencies(freqs)
        wordcloud.to_file('clouds/'+str(topic_idx) + '.tiff')

feature_names = vectorizer.get_feature_names()
info = top_words(nmf, feature_names, n_topics)

..............................

In [41]:
chunks = Chunker(filenames, chunk_size=chunk_size, max_chars_per_book=None)
X = vectorizer.transform(chunks)

In [42]:
import pyLDAvis
pyLDAvis.enable_notebook()
import pyLDAvis.sklearn

In [43]:
nmf.verbose = 0
pyLDAvis.sklearn.prepare(nmf, X, vectorizer)

  kernel = (topic_given_term * np.log((topic_given_term.T / topic_proportion).T))
  log_lift = np.log(topic_term_dists / term_proportion)
  log_ttd = np.log(topic_term_dists)
