In [1]:
from glob import glob
import json
import pickle

import numpy as np
import pandas as pd
import pyLDAvis
import spacy

In [2]:
import warnings
warnings.filterwarnings('ignore')

### Process audio transcripts into documents

One document = contiguous statement by one speaker

In [3]:
def chunker(lines):
    """Merge consecutive transcript lines by the same speaker into documents.

    Parameters
    ----------
    lines : sequence of dict
        Transcript lines in spoken order. Each entry must provide the keys
        'speaker_id' and 'best_text'.

    Returns
    -------
    list of str
        One string per uninterrupted run of lines sharing a 'speaker_id';
        the texts within a run are joined by single spaces. An empty input
        yields an empty list.
    """
    chunks = []
    texts = []          # texts of the run currently being accumulated
    speaker_id = None

    for line in lines:
        # `texts` is empty only before the first line, so the very first
        # line never triggers a flush regardless of its speaker_id.
        if texts and line['speaker_id'] != speaker_id:
            chunks.append(' '.join(texts))
            texts = []
        speaker_id = line['speaker_id']
        texts.append(line['best_text'])

    # Flush the final run (skipped when `lines` was empty).
    if texts:
        chunks.append(' '.join(texts))
    return chunks

In [4]:
# Glob pattern for the transcript JSON files under every data-* directory.
PATHS = '/Volumes/Media/workspace/data/openaudio/data-*/transcripts/json/*.json'

# Sort the matches so the transcripts are processed in a stable order.
files = glob(PATHS)
files.sort()

In [5]:
# Build the corpus: every transcript file contributes one document per
# uninterrupted speaker turn (see `chunker`).
docs = []
for file in files:
    # The context manager closes each transcript as soon as it is parsed,
    # so file handles do not pile up across the loop.
    with open(file, 'r') as fh:
        data = json.load(fh)
    docs.extend(chunker(data['lines']))

In [6]:
# Total number of speaker-turn documents collected from all transcripts.
len(docs)

10325

### Extract nouns from documents

In [7]:
# Load spaCy's English pipeline; `nlp` is what `nounify` calls on each document.
# NOTE(review): the 'en' shortcut name only resolves on spaCy releases that
# still support shortcut links (pre-v3) — confirm the installed version, or
# switch to a full package name such as 'en_core_web_sm'.
nlp = spacy.load('en')

In [8]:
def nounify(doc):
    """Return the lemmas of the noun chunks spaCy finds in `doc`, joined by spaces."""
    parsed = nlp(doc)
    lemmas = [span.lemma_ for span in parsed.noun_chunks]
    return ' '.join(lemmas)

# Reduce every document to its noun-chunk lemmas; kept as a Series because
# later steps use the pandas `.str` accessor on it.
noun_docs = pd.Series(list(map(nounify, docs)))

### Load [LDA](https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation) model and fit to documents (processed transcripts)

* Topics are distributions over words
* Documents are distributions over topics

--> Given a corpus (in this case, audio transcript segments), find the latent topics that explain the observed documents.

In [9]:
# NOTE(review): unpickling can execute arbitrary code, so only load
# '../base-model.pkl' if it comes from a trusted source.
# The context manager guarantees the file handle is closed after loading.
with open('../base-model.pkl', 'rb') as f:
    model = pickle.load(f)

In [10]:
# The unpickled object is indexed like a mapping; its 'pipeline' entry is the
# model pipeline used for the rest of the notebook.
# presumably a scikit-learn Pipeline (it is used via fit_transform and
# named_steps below) — TODO confirm
P = model['pipeline']

In [11]:
# Fit the pipeline on the noun documents and keep its transformed output.
# presumably `y` holds one row of topic weights per document, produced by the
# final 'LDA' step — TODO confirm
# NOTE(review): no random seed is set anywhere in this notebook, so results may
# differ between runs unless the pickled pipeline already pins one.
y = P.fit_transform(noun_docs)

In [12]:
# Pull the two fitted stages out of the pipeline by their step names.
vect = P.named_steps['counts']
lda = P.named_steps['LDA']

In [13]:
# Sanity check: there is one noun document per original document, so this
# should equal len(docs).
len(noun_docs)

10325

### Explore model by visualizing high-dimensional LDA topic data in 2-D

Using two methods of dimensionality reduction:
* t-distributed stochastic neighbor embedding (t-SNE)
* Principal coordinates analysis (PCoA)

In [14]:
def from_scikit(y, vect, lda, docs, **kwargs):
    """Assemble the inputs for `pyLDAvis.prepare` from fitted scikit-learn parts.

    Parameters
    ----------
    y : array-like, shape (n_docs, n_topics)
        Per-document topic weights; each row is rescaled to sum to 1.
    vect : fitted vectorizer
        Must provide `transform` and either `get_feature_names_out` or
        `get_feature_names`. It is NOT re-fitted here, so its vocabulary stays
        aligned with the columns of `lda.components_`.
    lda : fitted topic model
        Only its `components_` attribute (topic-by-term weights) is read.
    docs : iterable of str
        The documents to count terms in (the same ones `y` was computed for).
    **kwargs
        Forwarded unchanged to `pyLDAvis.prepare` (e.g. `mds`, `R`).

    Returns
    -------
    Whatever `pyLDAvis.prepare` returns.
    """
    def norm(x):
        """Standardize rows to sum to 1"""
        x = np.asarray(x)
        return x / x.sum(axis=1, keepdims=True)

    # Document-term counts using the vocabulary the vectorizer already learned.
    freqs = vect.transform(docs)

    # Newer scikit-learn exposes get_feature_names_out; older releases only
    # have get_feature_names. Support both.
    get_names = getattr(vect, 'get_feature_names_out', None) or vect.get_feature_names

    return pyLDAvis.prepare(
        # pyLDAvis wants the number of tokens per document, i.e. the row sums
        # of the count matrix, not the character length of the raw string.
        doc_lengths = np.asarray(freqs.sum(axis=1)).ravel(),
        vocab = list(get_names()),
        # np.asarray(...).ravel() flattens both dense arrays and the
        # np.matrix that a sparse .sum() returns.
        term_frequency = np.asarray(freqs.sum(axis=0)).ravel(),
        topic_term_dists = norm(lda.components_),
        doc_topic_dists = norm(y),
        **kwargs)

In [15]:
# Prepare the same fitted model twice, once per 2-D projection method,
# showing 20 terms per topic (R=20).
tsne = from_scikit(y, vect, lda, noun_docs, mds='tsne', R=20)
pcoa = from_scikit(y, vect, lda, noun_docs, mds='pcoa', R=20)

### Save as `lda_tsne.html` & `lda_pcoa.html`. Open in a browser to view!

In [30]:
# To preview inline instead of writing files, call pyLDAvis.display(tsne)
# or pyLDAvis.display(pcoa).

# Write each prepared visualization to its own HTML file.
for vis, path in ((tsne, "lda_tsne.html"), (pcoa, "lda_pcoa.html")):
    with open(path, "w") as f:
        pyLDAvis.save_html(vis, f)