# Topic Modeling Clustering

In this notebook we visualize a previously generated topic model.

# Set Up

## Imports

In [None]:
import pandas as pd
import numpy as np
from lib import tapi

## Configuration

In [None]:
# Display whatever tapi.list_dbs() returns; presumably the databases that can be
# chosen as data_prefix below — confirm against lib.tapi.
tapi.list_dbs()

In [None]:
# Prefix naming the corpus to work with; it is passed to tapi.Edition further down.
# Leave exactly one of these assignments uncommented.
data_prefix = 'winereviews'
# data_prefix = 'jstor_hyperparameter'
# data_prefix = 'tamilnet'

In [None]:
# Column of db.LABELS used to group documents. It is looked up from data_prefix
# so the two settings cannot drift apart when switching corpora; add an entry
# here when introducing a new corpus (an unknown prefix raises KeyError).
group_col = {
    'winereviews': 'doc_points',
    'jstor_hyperparameter': 'doc_year',
    'tamilnet': 'doc_label',
}[data_prefix]

In [None]:
# Handle for the selected corpus; every table used below (TOPICS, THETA, PHI,
# LABELS and their *_NMF counterparts) is read as an attribute of this object.
db = tapi.Edition(data_prefix)

## Import Topic Data

We import our previously generated model.

In [None]:
# NOTE(review): presumably loads the stored model tables onto `db` so that
# attributes such as db.TOPICS_NMF are available afterwards — confirm in lib.tapi.
db.get_tables()

In [None]:
# Preview the NMF topic table, restricted to the `topwords` and `doc_weight_sum` columns.
db.TOPICS_NMF[['topwords','doc_weight_sum']]

# Inspect Results

## Sort Topics by Doc Weight

In [None]:
# Horizontal bar chart of NMF topics by `doc_weight_sum`. Sorting ascending puts
# the heaviest topic at the top of the chart; the height scales with the topic count.
(
    db.TOPICS_NMF
    .sort_values('doc_weight_sum', ascending=True)
    .plot.barh(
        x='topwords',
        y='doc_weight_sum',
        legend=False,
        figsize=(5, db.n_topics/3),
    )
);

In [None]:
# Same chart for the default topic table (labelled 'LDA' later in this notebook):
# topics ordered by `doc_weight_sum`, heaviest at the top.
(
    db.TOPICS
    .sort_values('doc_weight_sum', ascending=True)
    .plot.barh(
        x='topwords',
        y='doc_weight_sum',
        legend=False,
        figsize=(5, db.n_topics/3),
    )
);

## Cluster Topics

In [None]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

In [None]:
def plot_tree(tree, labels):
    """Draw a left-oriented dendrogram for a hierarchical clustering.

    Parameters
    ----------
    tree : array-like
        Linkage matrix as produced by ``sch.linkage``.
    labels : sequence of str
        Leaf labels, one per clustered observation, in observation order.

    Returns
    -------
    None
        The dendrogram is drawn on a newly created matplotlib figure.
    """
    # A linkage matrix has one row per merge, i.e. (number of leaves - 1) rows.
    # Sizing from the tree keeps the function independent of notebook globals.
    n_leaves = len(tree) + 1

    # Create exactly one figure; a separate plt.figure() call would leave an
    # empty extra figure behind on every invocation.
    fig, ax = plt.subplots(figsize=(5, n_leaves / 3))

    # Draw on the axes created above rather than relying on the "current axes".
    sch.dendrogram(tree, labels=labels, orientation="left", ax=ax)
    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Pairwise distances between the rows of PHI (one row per topic, matching the
# topic labels passed to plot_tree), followed by Ward agglomerative clustering.
#
# Ward linkage is only correctly defined for Euclidean distances (see the SciPy
# `linkage` notes), so cosine distances must not be fed to it. The rows are
# L2-normalized first, so squared Euclidean distance equals 2 * cosine distance
# and the angular geometry is preserved.
# NOTE: despite its name, SIMS holds condensed distances; the name is kept for
# compatibility with the rest of the notebook.
SIMS = pdist(normalize(db.PHI), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')

In [None]:
# Dendrogram of the tree built from db.PHI, with each leaf labelled by its topic's top words.
plot_tree(TREE, db.TOPICS.topwords.to_list());

In [None]:
# Same clustering for the NMF model: pairwise distances between the rows of
# PHI_NMF, then Ward agglomerative clustering. SIMS and TREE are overwritten.
#
# Ward linkage is only correctly defined for Euclidean distances (see the SciPy
# `linkage` notes), so cosine distances must not be fed to it. The rows are
# L2-normalized first, so squared Euclidean distance equals 2 * cosine distance
# and the angular geometry is preserved.
# NOTE: despite its name, SIMS holds condensed distances; the name is kept for
# compatibility with the rest of the notebook.
SIMS = pdist(normalize(db.PHI_NMF), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')

In [None]:
# Dendrogram of the tree built from db.PHI_NMF, with each leaf labelled by its NMF topic's top words.
plot_tree(TREE, db.TOPICS_NMF.topwords.to_list());

## Group Topics by Label

In [None]:
# List the label columns available for grouping; group_col must be one of them.
db.LABELS.columns


In [None]:
# group_col = 'doc_points'

In [None]:
# Number of rows of db.LABELS per distinct value of the grouping column:
# one row per group, count in column 'n', index named 'group_id'.
group_vals = (
    db.LABELS[group_col]
    .value_counts()
    .to_frame('n')
    .rename_axis('group_id')
)

In [None]:
# Mean of each topic column of THETA within every label group.
# Rows are groups ('group_id'); columns are relabelled with each topic's top words.
G = (
    db.THETA
    .join(db.LABELS)
    .groupby(group_col)[db.topic_cols]
    .mean()
    .rename_axis('group_id')
)
G.columns = db.TOPICS.topwords

In [None]:
# NMF counterpart of G: mean of each topic column of THETA_NMF within every label group.
# Rows are groups ('group_id'); columns are relabelled with each NMF topic's top words.
G2 = (
    db.THETA_NMF
    .join(db.LABELS)
    .groupby(group_col)[db.topic_cols]
    .mean()
    .rename_axis('group_id')
)
G2.columns = db.TOPICS_NMF.topwords

### Heatmap of Labels and Topics

In [None]:
# axis=0 shades each column on its own scale: for a given topic, which groups weigh it most.
G.style.background_gradient(cmap='YlGnBu', axis=0)

In [None]:
# axis=1 shades each row on its own scale: for a given group, which topics dominate.
G.style.background_gradient(cmap='YlGnBu', axis=1)

In [None]:
# NMF table, axis=0: each column shaded on its own scale (groups compared within a topic).
G2.style.background_gradient(cmap='YlGnBu', axis=0)

In [None]:
# NMF table, axis=1: each row shaded on its own scale (topics compared within a group).
G2.style.background_gradient(cmap='YlGnBu', axis=1)

### Top Topic per Label Group

In [None]:
# For every group, record the topic (by top words) with the highest mean weight:
# 'top_topic' comes from G, 'top_topic2' from the NMF table G2.
group_ids = group_vals.index
group_vals['top_topic'] = G.loc[group_ids].idxmax(axis=1)
group_vals['top_topic2'] = G2.loc[group_ids].idxmax(axis=1)

In [None]:
# Show the groups ordered by their top topic from G, so groups sharing a topic appear together.
group_vals.sort_values('top_topic')

### Top Label Group per Topic

In [None]:
# For every topic (column of G), the group with the highest mean weight, ordered by that group.
G.idxmax().to_frame('label').sort_values('label')

In [None]:
# Same for the NMF table: for every topic (column of G2), the group with the highest mean weight.
G2.idxmax().to_frame('label').sort_values('label')

## How Similar are the Topics to Each Other?

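For every pair of one LDA topic and one NMF topic, compute the dot product of their document-weight vectors (the topic's column in `THETA` and in `THETA_NMF`, taken over all documents). Larger values mean the two topics are prominent in the same documents.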

In [None]:
# Cross-model matrix: entry (i, j) is the dot product, taken over the shared index
# of THETA and THETA_NMF, of topic column i of THETA with topic column j of THETA_NMF.
# Rows are labelled by the default model's top words ('LDA'), columns by the NMF ones.
X = db.THETA.T.dot(db.THETA_NMF)
X.index = db.TOPICS.topwords
X.columns = db.TOPICS_NMF.topwords
X = X.rename_axis(index='LDA', columns='NMF')

In [None]:
# X

In [None]:
# Long form of X: one row per (LDA topic, NMF topic) pair with its weight 'w',
# strongest pairs first.
X2 = (
    X.stack()
    .to_frame('w')
    .sort_values('w', ascending=False)
)

In [None]:
# The ten topic pairs with the largest dot product, rendered with in-cell bars.
X2.head(10).style.bar()

## Frequency by Group

In [None]:
# Bar chart of group sizes in group-id order.
# NOTE(review): when the notebook is run top to bottom, group_vals also holds the
# string columns 'top_topic' and 'top_topic2'; pandas plotting is expected to keep
# only the numeric column 'n' — confirm, or select it explicitly.
group_vals.sort_index().plot.bar(rot=45, figsize=(15,5))