Nick Clifford

# Topic Models (LDA)

Next, I use MALLET to produce topic models for my corpus of documents.

# Setup

In [46]:
import glob
import os

import pandas as pd
import pyLDAvis
import pyLDAvis.sklearn
from sklearn.decomposition import LatentDirichletAllocation as LDA
from sklearn.feature_extraction.text import CountVectorizer

pyLDAvis.enable_notebook()

In [3]:
# Root data directory. Defaults to the original author's local path; set the
# VALLEY_DATA_DIR environment variable to run the notebook on another machine.
# os.path.join(..., '') guarantees the trailing separator that the string
# concatenations on `datadir` rely on.
datadir = os.path.join(
    os.environ.get(
        'VALLEY_DATA_DIR',
        '/Users/nickclifford/Documents/UVA/Spring 2020/DS 5001 Exploratory Text Analysis/final/data/',
    ),
    '',
)
table_out = datadir + 'tables/'

# Load the library, token and vocabulary tables from the tables directory,
# indexing each by its id column.
LIB = pd.read_csv(table_out + 'LIB_mod.csv', parse_dates=['date']).set_index('doc_id')
TOKEN = pd.read_csv(table_out + 'TOKEN_mod.csv').set_index('doc_id')
VOCAB = pd.read_csv(table_out + 'VOCAB_mod.csv').set_index('term_id')

In [49]:
# Vectorizer / topic-model settings used by the cells below.
n_terms = 4000   # passed as CountVectorizer(max_features=...)
n_topics = 30    # passed as LDA(n_components=...)
max_iter = 5     # passed as LDA(max_iter=...)

# Hierarchy of content-unit keys, from coarsest to finest.
OHCO = ['doc_id', 'page_num', 'para_num']

# Level at which tokens are grouped into one "document" when building the
# corpus. It was referenced by the corpus cell but never defined; the corpus
# table shown in this notebook has one row per doc_id, i.e. the first OHCO level.
BAG = OHCO[:1]

# Create the corpus

First, I make sure the corpus is formatted correctly before running MALLET.

In [20]:
# Gather the term strings of each bag, in their existing order, into one list
# per row, and expose that list under the column name `doc_text`.
corpus = (
    TOKEN
    .groupby(BAG)['term_str']
    .apply(lambda terms: terms.tolist())
    .reset_index()
    .rename(columns={'term_str': 'doc_text'})
)
# Preview ten random rows.
corpus.sample(10)

Unnamed: 0,doc_id,doc_text
935,A7186,"[p, s, since, writing, the, foregoing, mr, tuk..."
1604,F3067,"[my, dear, husband, i, take, my, pen, in, hand..."
928,A7179,"[dear, sir, yours, of, the, 23rd, is, received..."
1475,F0920,"[chambersburg, dear, sir, i, have, just, bid, ..."
1581,F3044,"[morris, iland, south, carlina, my, dear, i, t..."
1088,A8018,"[richd, dear, brother, william, h, clarke, of,..."
563,A5041,"[fairfax, co, va, dear, sister, i, take, my, p..."
1547,F3010,"[jacksonville, florida, headquarters, my, dear..."
1611,F3503,"[london, franklin, co, pa, i, do, hereby, cert..."
903,A7154,"[yours, containing, checks, one, for, 627100, ..."


In [21]:
# After being written to .csv, this corpus is moved to the polite/corpus directory
#corpus.to_csv(datadir + '/tables/valley-corpus.csv')

# Read in MALLET output files

In [25]:
# List the exported topic-model tables. glob is used instead of the `!ls`
# shell magic so the cell is plain Python, works without a POSIX shell, and
# yields an empty list (rather than captured error text) when nothing matches.
# sorted() keeps the alphabetical order that `ls` produced.
tables = sorted(glob.glob('data/valley-mazo-output/tables/*.csv'))
tables

['data/valley-mazo-output/tables/DOC.csv',
 'data/valley-mazo-output/tables/DOCTOPIC.csv',
 'data/valley-mazo-output/tables/DOCTOPIC_NARROW.csv',
 'data/valley-mazo-output/tables/DOCWORD.csv',
 'data/valley-mazo-output/tables/TOPIC.csv',
 'data/valley-mazo-output/tables/TOPICPHRASE.csv',
 'data/valley-mazo-output/tables/TOPICWORD.csv',
 'data/valley-mazo-output/tables/TOPICWORD_DIAGS.csv',
 'data/valley-mazo-output/tables/TOPICWORD_NARROW.csv',
 'data/valley-mazo-output/tables/TOPICWORD_WEIGHTS.csv',
 'data/valley-mazo-output/tables/VOCAB.csv']

In [27]:
# Read every listed CSV into a dict of DataFrames keyed by the bare file name:
# the directory part and everything from the first '.' onward are dropped,
# so '.../DOCTOPIC.csv' is stored under 'DOCTOPIC'.
df = {}
for table in tables:
    table_name = table.rsplit('/', 1)[-1].partition('.')[0]
    df[table_name] = pd.read_csv(table)

In [28]:
# Display the dict of loaded tables (each value is shown with its DataFrame repr).
df

{'DOC':       doc_id  src_doc_id doc_label
 0          0         NaN    doc_id
 1          1         0.0     A0001
 2          2         1.0     A0002
 3          3         2.0     A0003
 4          4         3.0     A0004
 ...      ...         ...       ...
 1838    1838      1837.0     F8578
 1839    1839      1838.0     F8579
 1840    1840      1839.0     F8580
 1841    1841      1840.0     F8581
 1842    1842      1841.0     F8582
 
 [1843 rows x 3 columns],
 'DOCTOPIC':       doc_id         0         1         2         3         4         5  \
 0          0  0.081774  0.266344  0.063659  0.047258  0.102270  0.265975   
 1          1  0.107574  0.034277  0.129395  0.028252  0.001757  0.051321   
 2          2  0.005033  0.428743  0.003918  0.258849  0.006295  0.002152   
 3          3  0.007325  0.272156  0.295384  0.086999  0.154001  0.106590   
 4          4  0.033574  0.747677  0.006833  0.054665  0.010977  0.028549   
 ...      ...       ...       ...       ...       ...      

# Cluster Topics

In [29]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

In [30]:
def plot_tree(tree, labels):
    """Draw a left-oriented dendrogram for a hierarchical-clustering linkage.

    Parameters
    ----------
    tree : linkage matrix, as produced by ``sch.linkage``.
    labels : sequence of leaf labels, forwarded to ``sch.dendrogram``.

    Returns
    -------
    None. The dendrogram is drawn on a newly created 5x10 inch figure.
    """
    # subplots() already creates the figure; a separate plt.figure() call
    # beforehand only left an extra, empty figure behind.
    _fig, ax = plt.subplots(figsize=(5, 10))
    # Bind the dendrogram and the tick styling to this axes explicitly instead
    # of relying on pyplot's "current axes" state.
    sch.dendrogram(tree, labels=labels, orientation="left", ax=ax)
    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Normalise PHI first, then take pairwise Euclidean distances between its rows.
# Note that SIMS therefore holds distances (in pdist's condensed form), not
# similarities.
phi_normed = normalize(PHI)
SIMS = pdist(phi_normed, metric='euclidean')

# Build the hierarchical clustering from those distances using Ward linkage.
TREE = sch.linkage(SIMS, method='ward')

In [None]:
# One "<index>: <topterms>" label per row of AUTHORS.
labels = [
    "{}: {}".format(*pair)
    for pair in zip(AUTHORS.index, AUTHORS.topterms.tolist())
]

In [None]:
# Draw the dendrogram for the clustering tree using the labels built above.
plot_tree(tree=TREE, labels=labels)

# LDA Viz

## Create Vector Space

We use Scikit Learn's CountVectorizer to convert our F1 corpus of paragraphs into a document-term vector space of word counts.

In [52]:
# CountVectorizer expects an iterable of strings, one per document. `corpus`
# is a DataFrame whose `doc_text` column holds token lists, and iterating a
# DataFrame yields only its column labels, so passing `corpus` directly would
# vectorize the column names instead of the documents. Join each token list
# back into a single whitespace-separated string first (values that are
# already strings are passed through unchanged).
docs = corpus.doc_text.apply(
    lambda toks: toks if isinstance(toks, str) else ' '.join(map(str, toks))
)

tfv = CountVectorizer(max_features=n_terms, stop_words='english')
tf = tfv.fit_transform(docs)

# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); use the new method when it exists and fall back to
# the old one otherwise. TERMS stays a plain list either way.
feature_names = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
TERMS = list(feature_names())

# Generate Model

We run Scikit Learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA and PHI tables.

In [23]:
# Configure the LDA estimator from the notebook-level settings; random_state
# is pinned so repeated runs start from the same seed.
lda = LDA(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)