In [46]:
%matplotlib nbagg

import json
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import sklearn.decomposition
import sklearn.preprocessing
import sklearn.feature_extraction
import sklearn.pipeline

In [50]:
import warnings
# Blanket filter: every warning category is ignored from this point on, so
# deprecation notices from the libraries used below will not be shown.
warnings.filterwarnings('ignore')

In [9]:
def chunker(lines, timestamps=True):
    """Merge consecutive transcript lines spoken by the same speaker.

    Parameters
    ----------
    lines : sequence of dict
        Transcript lines in order. Each entry must provide the keys
        ``'speaker_id'`` and ``'best_text'``.
    timestamps : bool, optional
        Accepted for backward compatibility; it currently has no effect on
        the result.

    Returns
    -------
    list of str
        One string per uninterrupted run of lines from a single speaker.
        The stripped texts of the run are joined with single spaces. An
        empty ``lines`` yields an empty list.
    """
    chunks = []
    if not lines:
        # Nothing to merge; avoids indexing into an empty transcript.
        return chunks

    speaker_id = lines[0]['speaker_id']
    parts = []
    for line in lines:
        if line['speaker_id'] != speaker_id:
            # Speaker changed: close the current chunk and start a new one.
            chunks.append(' '.join(parts))
            parts = []
            speaker_id = line['speaker_id']
        # Strip every line so chunks never carry stray edge whitespace.
        parts.append(line['best_text'].strip())

    chunks.append(' '.join(parts))
    return chunks

In [10]:
# Glob pattern matching every transcript JSON under the data-* directories.
PATHS = "../data/data-*/transcripts/json/*.json"
# Sort in place so the transcripts are processed in a stable order.
files = glob(PATHS)
files.sort()

In [11]:
# Collect the speaker-turn chunks of every transcript into one flat corpus.
docs = []
for file in files:
    # The context manager closes each handle promptly instead of leaving one
    # open file per transcript for the garbage collector to clean up.
    with open(file, 'r') as fh:
        data = json.load(fh)
    docs.extend(chunker(data['lines']))

In [12]:
# Number of chunks collected across all transcript files.
len(docs)

10325

In [9]:
# Bag-of-words counts (accents folded to ASCII, English stop words removed)
# feeding an LDA topic model.
# NOTE(review): `n_topics` is the old scikit-learn spelling; later releases
# rename it to `n_components` - keep in sync with the installed version.
P = sklearn.pipeline.Pipeline([
    ('counts', sklearn.feature_extraction.text.CountVectorizer(
        strip_accents='ascii',
        # max_df=0.5,
        stop_words='english')),
    ('LDA', sklearn.decomposition.LatentDirichletAllocation(
        n_topics=32,
        batch_size=32,
        max_iter=128,
        # Fixed seed so the fitted topics are reproducible across runs.
        random_state=0)),
])

In [None]:
# Fit both pipeline steps on the full corpus and keep the pipeline's output
# for each document.
y = P.fit_transform(docs)

In [11]:
import pickle

In [13]:
# NOTE(review): unpickling can execute arbitrary code - only load this file
# when it comes from a trusted source.
# The context manager guarantees the handle is closed after loading.
with open("../base-model.pkl", "rb") as f:
    model = pickle.load(f)

In [38]:
# Pull the stored pipeline out of the loaded model and unpack its two
# estimators, in step order.
pipeline = model['pipeline']
vect, lda = [estimator for _name, estimator in pipeline.steps]

In [39]:
# Display the configuration of the first step of the loaded pipeline.
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words='english',
        strip_accents='ascii', token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [40]:
# Display the configuration of the second step of the loaded pipeline.
lda

LatentDirichletAllocation(batch_size=64, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=64, mean_change_tol=0.001,
             n_jobs=1, n_topics=32, perp_tol=0.1, random_state=None,
             topic_word_prior=None, total_samples=1000000.0, verbose=0)

In [5]:
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

In [6]:
class LemmaTokenizer(object):
    """Callable that tokenizes a document and lemmatizes every token.

    Tokens come from ``word_tokenize`` and each one is passed through a
    ``WordNetLemmatizer`` instance created once per tokenizer.
    """

    def __init__(self):
        self.wnl = WordNetLemmatizer()

    def __call__(self, doc):
        """Return the list of lemmatized tokens of ``doc``."""
        lemmatize = self.wnl.lemmatize
        return list(map(lemmatize, word_tokenize(doc)))

In [15]:
# Count vectorizer with a pruned vocabulary (terms must appear in at least 5
# documents and in at most half of them) feeding an LDA topic model.
# NOTE(review): `n_topics` is the old scikit-learn spelling; later releases
# rename it to `n_components` - keep in sync with the installed version.
P = sklearn.pipeline.Pipeline([
    ('counts', sklearn.feature_extraction.text.CountVectorizer(
        strip_accents='ascii',
        min_df=5,
        max_df=0.5,
        # tokenizer=LemmaTokenizer(),
        stop_words='english')),
    ('LDA', sklearn.decomposition.LatentDirichletAllocation(
        n_topics=32,
        batch_size=32,
        max_iter=128,
        # Fixed seed so the fitted topics are reproducible across runs.
        random_state=0)),
])

In [16]:
# Fit both pipeline steps on the full corpus and keep the pipeline's output
# for each document.
y = P.fit_transform(docs)

In [22]:
# Unpack the two estimators of P, in step order.
vect, lda = [estimator for _name, estimator in P.steps]

In [23]:
import pandas as pd

# https://gist.github.com/napjon/ef842b41e0048d132607

def norm(data):
    """Rescale every row of a 2-D array so that it sums to 1."""
    return pd.DataFrame(data).div(data.sum(1), axis=0).values

s_docs = pd.Series(docs)
# `vect` and `lda` are the steps of the pipeline that was already fitted, so
# only transform here. Calling fit_transform again would retrain the topic
# model from scratch and silently replace the one fitted with the pipeline.
vected = vect.transform(s_docs)
doc_topic_dists = norm(lda.transform(vected))

In [30]:
import pyLDAvis

In [47]:
prepped = pyLDAvis.prepare(
    # Document length is the number of counted tokens per document (row sums
    # of the document-term matrix), not the character length of the raw text.
    doc_lengths = np.asarray(vected.sum(axis=1)).ravel(),
    vocab = vect.get_feature_names(),
    term_frequency = vected.sum(axis=0).tolist()[0],
    topic_term_dists = norm(lda.components_),
    doc_topic_dists = doc_topic_dists,
    mds = 'tsne',
    R=20
)

In [48]:
# t-SNE: show the visualisation currently stored in `prepped`.
pyLDAvis.display(prepped)

In [49]:
prepped = pyLDAvis.prepare(
    # Document length is the number of counted tokens per document (row sums
    # of the document-term matrix), not the character length of the raw text.
    doc_lengths = np.asarray(vected.sum(axis=1)).ravel(),
    vocab = vect.get_feature_names(),
    term_frequency = vected.sum(axis=0).tolist()[0],
    topic_term_dists = norm(lda.components_),
    doc_topic_dists = doc_topic_dists,
    R=20
)

In [45]:
# PCoA: show the visualisation currently stored in `prepped`.
# NOTE(review): this relies on `prepped` having been rebuilt without an
# explicit `mds` argument - presumably pyLDAvis then defaults to PCoA; confirm.
pyLDAvis.display(prepped)

In [53]:
# Count vectorizer with a pruned vocabulary (terms must appear in at least 5
# documents and in at most half of them) feeding an LDA topic model trained
# with mini-batches of 64 documents.
# NOTE(review): `n_topics` is the old scikit-learn spelling; later releases
# rename it to `n_components` - keep in sync with the installed version.
P2 = sklearn.pipeline.Pipeline([
    ('counts', sklearn.feature_extraction.text.CountVectorizer(
        strip_accents='ascii',
        min_df=5,
        max_df=0.5,
        # tokenizer=LemmaTokenizer(),
        stop_words='english')),
    ('LDA', sklearn.decomposition.LatentDirichletAllocation(
        n_topics=32,
        batch_size=64,
        max_iter=128,
        # Fixed seed so the fitted topics are reproducible across runs.
        random_state=0)),
])

In [54]:
# Fit both steps of P2 on the corpus and keep the pipeline's output for each
# document.
y = P2.fit_transform(s_docs)

In [168]:
def from_scikit(y, vect, lda, docs, **kwargs):
    """Prepare a pyLDAvis visualisation from fitted scikit-learn estimators.

    Parameters
    ----------
    y : array-like, 2-D
        Per-document topic weights for ``docs`` (one row per document);
        rows are rescaled to sum to 1 before being handed to pyLDAvis.
    vect : fitted vectorizer
        Must provide ``transform`` and ``get_feature_names``. It is NOT
        refitted here.
    lda : fitted topic model
        Must expose ``components_`` (one row per topic).
    docs : iterable of str
        The documents that ``y`` was computed from. A pandas Series still
        works, but any iterable of strings is accepted.
    **kwargs
        Forwarded unchanged to ``pyLDAvis.prepare`` (e.g. ``mds``, ``R``).

    Returns
    -------
    The object returned by ``pyLDAvis.prepare``.
    """
    def norm(x):
        """Rescale every row of ``x`` so that it sums to 1."""
        x = np.asarray(x)
        return x / x.sum(axis=1, keepdims=True)

    # Use transform, not fit_transform: refitting the vectorizer is wasted
    # work and could change the vocabulary out from under `lda.components_`.
    term_freqs = vect.transform(docs)

    return pyLDAvis.prepare(
        # Token counts per document (row sums of the document-term matrix);
        # the character length of the raw text is not what pyLDAvis expects.
        doc_lengths = np.asarray(term_freqs.sum(axis=1)).ravel(),
        # NOTE(review): newer scikit-learn releases rename this accessor to
        # get_feature_names_out() - confirm against the installed version.
        vocab = vect.get_feature_names(),
        term_frequency = np.asarray(term_freqs.sum(axis=0)).ravel(),
        topic_term_dists = norm(lda.components_),
        doc_topic_dists = norm(y),
        **kwargs)

In [169]:
# Look the two estimators of P2 up by step name, then wrap the corpus in a
# pandas Series.
vect = P2.named_steps['counts']
lda = P2.named_steps['LDA']
s_docs = pd.Series(docs)

In [170]:
# Build one prepared visualisation per layout method, t-SNE first, then PCoA.
tsne, pcoa = [
    from_scikit(y, vect, lda, s_docs, mds=method, R=20)
    for method in ('tsne', 'pcoa')
]

In [171]:
# Show the visualisation prepared with the t-SNE layout.
pyLDAvis.display(tsne)

In [172]:
# Show the visualisation prepared with the PCoA layout.
pyLDAvis.display(pcoa)