# Modelling Genre

Author-Topic Modelling -> Genre-Topic Modelling

https://nbviewer.jupyter.org/github/rare-technologies/gensim/blob/develop/docs/notebooks/atmodel_tutorial.ipynb

## Latent Dirichlet Allocation

### Training a topic model on the Books corpus

In [57]:
import glob
import random
RND = 12345
random.seed(RND)
from nltk.tokenize import word_tokenize

def chunker(l, n):
    """Yield successive n-sized chunks from l."""
    l = ''.join([c for c in l if c.isalpha() or c.isspace()])
    l = word_tokenize(l)
    l = [t.lower() for t in l]
    for i in range(0, len(l), n):
        yield l[i:i + n]        

class ParagraphIterator(object):
    def __init__(self, path, max_per_book=None,
                 chunk_size=300, max_books=None):
        self.max_books = max_books
        self.max_per_book = max_per_book
        self.chunk_size = chunk_size
        self.doc2genre = {}
        self.genre2doc = {}
        
        self.filenames = list(glob.glob(path))
        random.seed(RND)
        random.shuffle(self.filenames)
        if self.max_books:
            self.filenames = self.filenames[:self.max_books]

    def __iter__(self):
        for filename in self.filenames:
            comps = filename.split('/')
            genre, idx = comps[-2:]
            with open(filename, 'r') as f:
                try:
                    if self.max_per_book:
                        text = f.read(self.max_per_book)
                    else:
                        text = f.read()
                except:
                    continue
            for ch in chunker(text, self.chunk_size):
                idx = len(self.doc2genre)
                self.doc2genre[idx] = [genre]
                try:
                    self.genre2doc[genre].append(idx)
                except KeyError:
                    self.genre2doc[genre] = [idx]
                yield ch

In [58]:
path = '/Users/mike/GitRepos/potter/data/other/books_txt_full/*/*.txt'
n_features = 3000
n_topics = 50
n_top_words = 60

In [59]:
max_freq = 0.5
min_wordcount = 20

from gensim import corpora

paragraphs = ParagraphIterator(path, max_books=20)
dictionary = corpora.Dictionary(paragraphs)
dictionary.filter_extremes(no_below=min_wordcount,
                           no_above=max_freq,
                           keep_n=n_features)
dictionary.filter_n_most_frequent(500)

paragraphs = ParagraphIterator(path, max_books=50)
doc2genre = paragraphs.doc2genre
genre2doc = paragraphs.genre2doc
bow = [dictionary.doc2bow(doc) for doc in paragraphs]

2018-02-02 10:33:36,786 : INFO : adding document #0 to Dictionary(0 unique tokens: [])
2018-02-02 10:33:41,906 : INFO : built Dictionary(29798 unique tokens: ['wes', 'parker', 'monster', 'hunter', 'volume']...) from 3852 documents (total 1153278 corpus positions)
2018-02-02 10:33:41,945 : INFO : discarding 26798 tokens: [('parker', 18), ('volume', 12), ('one', 2152), ('cj', 1), ('pike', 4), ('copyright', 10), ('smashwords', 10), ('edition', 10), ('this', 2413), ('collection', 25)]...
2018-02-02 10:33:41,946 : INFO : keeping 3000 tokens which were in no less than 20 and no more than 1926 (=50.0%) documents
2018-02-02 10:33:41,956 : INFO : resulting dictionary: Dictionary(3000 unique tokens: ['wes', 'monster', 'hunter', 'short', 'stories']...)
2018-02-02 10:33:41,962 : INFO : discarding 500 tokens: [('like', 1915), ('about', 1897), ('we', 1842), ('do', 1840), ('could', 1831), ('an', 1810), ('would', 1747), ('down', 1730), ('them', 1708), ('over', 1703)]...
2018-02-02 10:33:41,965 : INFO 

In [60]:
print(len(doc2genre))
print(len(genre2doc))
print(len(bow))

10767
14
10767


In [52]:
import gensim
from gensim import corpora
import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [9]:
lda = gensim.models.ldamodel.LdaModel(corpus=bow,
                                      id2word=dictionary,
                                      num_topics=n_topics,
                                      update_every=1,
                                      chunksize=1000,
                                      passes=1,
                                      random_state=RND)

2018-02-02 10:17:40,741 : INFO : using symmetric alpha at 0.02
2018-02-02 10:17:40,742 : INFO : using symmetric eta at 0.0004
2018-02-02 10:17:40,743 : INFO : using serial LDA version on this node
2018-02-02 10:17:41,488 : INFO : running online (single-pass) LDA training, 50 topics, 1 passes over the supplied corpus of 3852 documents, updating model once every 1000 documents, evaluating perplexity every 3852 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-02 10:17:41,490 : INFO : PROGRESS: pass 0, at document #1000/3852
2018-02-02 10:17:44,690 : INFO : merging changes from 1000 documents into a model of 3852 documents
2018-02-02 10:17:44,885 : INFO : topic #13 (0.020): 0.031*"jake" + 0.023*"alvin" + 0.023*"tasha" + 0.017*"nina" + 0.013*"diana" + 0.011*"sharine" + 0.008*"veron" + 0.004*"ghost" + 0.004*"children" + 0.004*"ok"
2018-02-02 10:17:44,886 : INFO : topic #35 (0.020): 0.019*"polonius" + 0.014*"josh" + 0.008*"sharine" + 0.007*"killed" + 0.007*"alvin" + 0

In [10]:
print(lda.print_topics(num_topics=10, num_words=20))

2018-02-02 10:18:09,906 : INFO : topic #5 (0.020): 0.024*"charlie" + 0.016*"jake" + 0.011*"kissing" + 0.010*"mask" + 0.009*"sofa" + 0.009*"sean" + 0.008*"mia" + 0.008*"today" + 0.007*"teased" + 0.007*"tucked" + 0.007*"fun" + 0.006*"alvin" + 0.006*"tasha" + 0.006*"thanks" + 0.006*"guys" + 0.006*"leg" + 0.005*"marry" + 0.005*"boys" + 0.005*"tongue" + 0.005*"girls"
2018-02-02 10:18:09,907 : INFO : topic #45 (0.020): 0.212*"george" + 0.021*"kumiko" + 0.015*"amanda" + 0.014*"mehmet" + 0.011*"rachel" + 0.009*"fourth" + 0.009*"somehow" + 0.007*"hospital" + 0.007*"definitely" + 0.006*"softened" + 0.006*"desk" + 0.006*"week" + 0.006*"especially" + 0.005*"lots" + 0.005*"vulnerable" + 0.005*"clearly" + 0.005*"nearly" + 0.005*"crazy" + 0.005*"particularly" + 0.005*"agreed"
2018-02-02 10:18:09,908 : INFO : topic #43 (0.020): 0.015*"number" + 0.012*"harpers" + 0.011*"section" + 0.011*"happiness" + 0.011*"lock" + 0.011*"beneath" + 0.011*"code" + 0.010*"apartment" + 0.009*"block" + 0.009*"partner" + 0

[(5, '0.024*"charlie" + 0.016*"jake" + 0.011*"kissing" + 0.010*"mask" + 0.009*"sofa" + 0.009*"sean" + 0.008*"mia" + 0.008*"today" + 0.007*"teased" + 0.007*"tucked" + 0.007*"fun" + 0.006*"alvin" + 0.006*"tasha" + 0.006*"thanks" + 0.006*"guys" + 0.006*"leg" + 0.005*"marry" + 0.005*"boys" + 0.005*"tongue" + 0.005*"girls"'), (45, '0.212*"george" + 0.021*"kumiko" + 0.015*"amanda" + 0.014*"mehmet" + 0.011*"rachel" + 0.009*"fourth" + 0.009*"somehow" + 0.007*"hospital" + 0.007*"definitely" + 0.006*"softened" + 0.006*"desk" + 0.006*"week" + 0.006*"especially" + 0.005*"lots" + 0.005*"vulnerable" + 0.005*"clearly" + 0.005*"nearly" + 0.005*"crazy" + 0.005*"particularly" + 0.005*"agreed"'), (43, '0.015*"number" + 0.012*"harpers" + 0.011*"section" + 0.011*"happiness" + 0.011*"lock" + 0.011*"beneath" + 0.011*"code" + 0.010*"apartment" + 0.009*"block" + 0.009*"partner" + 0.009*"begin" + 0.009*"itll" + 0.009*"swollen" + 0.009*"concrete" + 0.007*"blake" + 0.007*"discussed" + 0.007*"shoved" + 0.007*"warm

### Reading tea leaves: add your own label to the topics

In [11]:
#!pip install pyldavis

In [12]:
import pyLDAvis
import pyLDAvis.gensim

v = pyLDAvis.gensim.prepare(lda, bow, dictionary)
pyLDAvis.display(v)

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  topic_term_dists = topic_term_dists.ix[topic_order]


- student assigned: provide short interpretative labels for each topic

In [13]:
### Infer topic on HP + diachronic plot

## A Genre-Topic Model of the Books corpus

From the documentation:

doc2author is a dictionary where the keys are document IDs (indexes to corpus) and the values are lists of author names. I.e. this is the reverse mapping of author2doc. Only one of the two, author2doc and doc2author have to be supplied.

In [14]:
print(doc2genre)

{0: 'Teen', 1: 'Teen', 2: 'Teen', 3: 'Teen', 4: 'Teen', 5: 'Teen', 6: 'Teen', 7: 'Teen', 8: 'Teen', 9: 'Teen', 10: 'Teen', 11: 'Teen', 12: 'Teen', 13: 'Teen', 14: 'Teen', 15: 'Teen', 16: 'Teen', 17: 'Teen', 18: 'Teen', 19: 'Teen', 20: 'Teen', 21: 'Teen', 22: 'Teen', 23: 'Teen', 24: 'Teen', 25: 'Teen', 26: 'Teen', 27: 'Teen', 28: 'Teen', 29: 'Teen', 30: 'Teen', 31: 'Teen', 32: 'Teen', 33: 'Teen', 34: 'Teen', 35: 'Teen', 36: 'Teen', 37: 'Teen', 38: 'Teen', 39: 'Teen', 40: 'Teen', 41: 'Teen', 42: 'Teen', 43: 'Teen', 44: 'Teen', 45: 'Teen', 46: 'Teen', 47: 'Teen', 48: 'Teen', 49: 'Teen', 50: 'Teen', 51: 'Teen', 52: 'Teen', 53: 'Teen', 54: 'Teen', 55: 'Teen', 56: 'Teen', 57: 'Teen', 58: 'Teen', 59: 'Teen', 60: 'Teen', 61: 'Teen', 62: 'Teen', 63: 'Teen', 64: 'Teen', 65: 'Teen', 66: 'Teen', 67: 'Teen', 68: 'Teen', 69: 'Teen', 70: 'Teen', 71: 'Teen', 72: 'Teen', 73: 'Teen', 74: 'Teen', 75: 'Teen', 76: 'Teen', 77: 'Teen', 78: 'Teen', 79: 'Teen', 80: 'Teen', 81: 'Teen', 82: 'Teen', 83: 'Teen', 8

In [53]:
from gensim.models import AuthorTopicModel

In [54]:
print(set(doc2genre.values()))

{'Literature', 'Humor', 'Fantasy', 'New_Adult', 'Historical', 'Vampires', 'Themes', 'Adventure', 'Romance', 'Science_fiction', 'Mystery', 'Young_Adult', 'Teen', 'Horror'}


In [61]:
atmodel = AuthorTopicModel(bow,
                           num_topics=n_topics,
                           doc2author=doc2genre,
                           author2doc=genre2doc,
                           id2word=dictionary,
                           update_every=1,
                           chunksize=1000,
                           passes=1,
                           random_state=RND)
atmodel.save('atmodel')

2018-02-02 10:34:04,749 : INFO : Vocabulary consists of 2500 words.
2018-02-02 10:34:04,750 : INFO : using symmetric alpha at 0.02
2018-02-02 10:34:04,750 : INFO : using symmetric eta at 0.0004
2018-02-02 10:34:05,533 : INFO : running online author-topic training, 50 topics, 14 authors, 1 passes over the supplied corpus of 10767 documents, updating model once every 1000 documents, evaluating perplexity every 10000 documents, iterating 50x with a convergence threshold of 0.001000
2018-02-02 10:34:05,535 : INFO : PROGRESS: pass 0, at document #1000/10767
2018-02-02 10:34:06,688 : INFO : merging changes from 1000 documents into a model of 10767 documents
2018-02-02 10:34:06,750 : INFO : topic #8 (0.020): 0.021*"wed" + 0.009*"seat" + 0.008*"phillip" + 0.007*"longer" + 0.007*"line" + 0.007*"utterly" + 0.007*"year" + 0.007*"questions" + 0.007*"blow" + 0.007*"island"
2018-02-02 10:34:06,751 : INFO : topic #5 (0.020): 0.046*"josh" + 0.017*"gun" + 0.016*"mission" + 0.015*"daughter" + 0.014*"str

2018-02-02 10:34:12,595 : INFO : topic diff=0.939836, rho=0.408248
2018-02-02 10:34:12,596 : INFO : PROGRESS: pass 0, at document #7000/10767
2018-02-02 10:34:13,367 : INFO : merging changes from 1000 documents into a model of 10767 documents
2018-02-02 10:34:13,403 : INFO : topic #9 (0.020): 0.030*"map" + 0.030*"minho" + 0.027*"brenda" + 0.026*"shirt" + 0.021*"hallway" + 0.020*"dr" + 0.018*"guard" + 0.015*"metal" + 0.015*"newt" + 0.013*"blank"
2018-02-02 10:34:13,404 : INFO : topic #40 (0.020): 0.005*"tapestry" + 0.005*"lightning" + 0.003*"school" + 0.003*"cat" + 0.003*"space" + 0.003*"earth" + 0.003*"response" + 0.003*"travel" + 0.002*"agent" + 0.002*"field"
2018-02-02 10:34:13,405 : INFO : topic #17 (0.020): 0.019*"brother" + 0.009*"upon" + 0.007*"male" + 0.006*"brothers" + 0.005*"son" + 0.005*"mo" + 0.004*"august" + 0.004*"drew" + 0.004*"shall" + 0.004*"tongue"
2018-02-02 10:34:13,406 : INFO : topic #11 (0.020): 0.178*"vampire" + 0.098*"vampires" + 0.040*"creature" + 0.017*"baby" +

In [63]:
atmodel = AuthorTopicModel.load('atmodel')

2018-02-02 10:34:24,051 : INFO : loading AuthorTopicModel object from atmodel
2018-02-02 10:34:24,122 : INFO : loading expElogbeta from atmodel.expElogbeta.npy with mmap=None
2018-02-02 10:34:24,127 : INFO : setting ignored attribute state to None
2018-02-02 10:34:24,128 : INFO : setting ignored attribute id2word to None
2018-02-02 10:34:24,128 : INFO : setting ignored attribute dispatcher to None
2018-02-02 10:34:24,129 : INFO : loaded atmodel
2018-02-02 10:34:24,129 : INFO : loading AuthorTopicModel object from atmodel.state
2018-02-02 10:34:24,137 : INFO : loaded atmodel.state


In [64]:
atmodel.show_topic(0)

[('flare', 0.045700383422214733),
 ('minho', 0.022302333146892402),
 ('guys', 0.017697413820859591),
 ('brenda', 0.015896514968156122),
 ('newt', 0.013487016904500722),
 ('trust', 0.011491811583505989),
 ('pointed', 0.0086140254930573857),
 ('jorge', 0.0077988330517695659),
 ('fist', 0.0075734680636358759),
 ('gally', 0.0062308296249942875)]

In [66]:
atmodel.get_author_topics('Romance', minimum_probability=1e-8)

[(0, 1.2751196593716457e-07),
 (1, 1.2751196593716457e-07),
 (2, 1.2751196593716457e-07),
 (3, 1.2751196593716457e-07),
 (4, 1.2751196593716457e-07),
 (5, 0.065835583515977583),
 (6, 1.2751196593716457e-07),
 (7, 1.2751196593716457e-07),
 (8, 1.2751196593716457e-07),
 (9, 1.2751196593716457e-07),
 (10, 0.00055049312512626849),
 (11, 1.2751196593716457e-07),
 (12, 1.2751196593716457e-07),
 (13, 0.12437962018511892),
 (14, 1.2751196593716457e-07),
 (15, 1.2751196593716457e-07),
 (16, 1.2751196593716457e-07),
 (17, 0.022352296150818404),
 (18, 1.2751196593716457e-07),
 (19, 1.2751196593716457e-07),
 (20, 1.2751196593716457e-07),
 (21, 1.2751196593716457e-07),
 (22, 1.2751196593716457e-07),
 (23, 0.0016661255737882646),
 (24, 1.2751196593716457e-07),
 (25, 0.20392470749407718),
 (26, 1.2751196593716457e-07),
 (27, 1.2751196593716457e-07),
 (28, 1.2751196593716457e-07),
 (29, 1.2751196593716457e-07),
 (30, 1.2751196593716457e-07),
 (31, 1.2751196593716457e-07),
 (32, 0.42795248948931386),
 

In [67]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
smallest_genre = 0  # Ignore authors with documents less than this.
genres = [model.author2id[a] for a in model.author2id.keys() if len(model.author2doc[a]) >= smallest_genre]
_ = tsne.fit_transform(model.state.gamma[genres, :])  # Result stored in tsne.embedding_

In [70]:
# Tell Bokeh to display plots inside the notebook.
from bokeh.io import output_notebook
output_notebook()

In [69]:
#!pip install bokeh

Collecting bokeh
  Downloading bokeh-0.12.13.tar.gz (15.4MB)
[K    100% |████████████████████████████████| 15.4MB 72kB/s eta 0:00:011
Building wheels for collected packages: bokeh
  Running setup.py bdist_wheel for bokeh ... [?25ldone
[?25h  Stored in directory: /Users/mike/Library/Caches/pip/wheels/a7/2c/4a/96740179eabf7ddba1f7ae36ba96540e8a7e557936eaab4829
Successfully built bokeh
Installing collected packages: bokeh
Successfully installed bokeh-0.12.13


In [71]:
from bokeh.models import HoverTool
from bokeh.plotting import figure, show, ColumnDataSource

x = tsne.embedding_[:, 0]
y = tsne.embedding_[:, 1]
genre_names = [model.id2author[a] for a in genres]

# Radius of each point corresponds to the number of documents attributed to that author.
scale = 0.1
genre_sizes = [len(model.author2doc[a]) for a in genre_names]
radii = [size * scale for size in genre_sizes]

source = ColumnDataSource(
        data=dict(
            x=x,
            y=y,
            author_names=genre_names,
            author_sizes=genre_sizes,
            radii=radii,
        )
    )

# Add author names and sizes to mouse-over info.
hover = HoverTool(
        tooltips=[
        ("author", "@author_names"),
        ("size", "@author_sizes"),
        ]
    )

p = figure(tools=[hover, 'crosshair,pan,wheel_zoom,box_zoom,reset,save,lasso_select'])
p.scatter('x', 'y', radius='radii', source=source, fill_alpha=0.6, line_color=None)
show(p)

## word2vec: modelling the muggles and other non-words