In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation as LDA

In [2]:
%matplotlib inline

In [3]:
# Tunable settings for the topic model, kept together in one place:
#   n_terms   - vocabulary cap handed to CountVectorizer(max_features=...)
#   n_topics  - number of LDA components
#   max_iter  - max_iter handed to the LDA estimator
n_terms, n_topics, max_iter = 4000, 80, 5

# Prepare Data

In [4]:
import os

In [7]:
# Merge every ./corpus/cord19_* part file into a single ./corpus/corpus.csv.
# NOTE(review): `cat` copies each part verbatim, so if the parts carry their own
# header rows those headers end up as data rows in corpus.csv -- confirm.
cat_status = os.system("cat ./corpus/cord19_* > ./corpus/corpus.csv")
# os.system only reports failure through its return value; stop here instead of
# letting a missing or partial corpus.csv surface as a confusing error later on.
if cat_status != 0:
    raise RuntimeError(
        "Concatenating ./corpus/cord19_* failed with exit status {}".format(cat_status)
    )
cat_status

0

In [8]:
# Load the merged corpus file; its fields are separated by '|'.
corpus_path = 'corpus/corpus.csv'
corpus = pd.read_csv(corpus_path, sep='|')

In [9]:
# Peek at the first five rows to sanity-check the columns that were loaded.
corpus.head(n=5)

Unnamed: 0,doc_key,doc_title,doc_url,doc_content,doc_label
0,c630ebcdf30652f0422c3ec12a00b50241dc9bd9,Angiotensin-converting enzyme 2 (ACE2) as a SA...,https://dx.doi.org/10.1007/s00134-020-05985-9,SARS-CoV-2 has been sequenced [3] . A phylogen...,CZI
1,53eccda7977a31e3d0f565c884da036b1e85438e,Comparative genetic analysis of the novel coro...,https://dx.doi.org/10.1038/s41421-020-0147-1,Comparative genetic analysis of the novel coro...,CZI
2,210a892deb1c61577f6fba58505fd65356ce6636,Incubation Period and Other Epidemiological Ch...,https://dx.doi.org/10.3390/jcm9020538,"As of 31 January 2020, mainland China reported...",CZI
3,e3b40cc8e0e137c416b4a2273a4dca94ae8178cc,Characteristics of and Public Health Responses...,https://dx.doi.org/10.3390/jcm9020575,"In December 2019, a cluster of pneumonia of un...",CZI
4,0df0d5270a9399cf4e23c0cdd877a80616a9725e,An updated estimation of the risk of transmiss...,https://dx.doi.org/10.1016/j.idm.2020.02.001,Coronaviruses are a group of enveloped viruses...,CZI


## Create Vector Space

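We use scikit-learn's `CountVectorizer` to convert our F1 corpus of paragraphs (the `doc_content` column) into a document-term vector space of word counts, keeping only the `n_terms` most frequent terms and dropping English stop words.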

In [10]:
# Build the document-term count matrix from the doc_content column.
# max_features caps the vocabulary at n_terms; English stop words are removed.
tfv = CountVectorizer(max_features=n_terms, stop_words='english')
# A missing document (NaN) makes the vectorizer raise, so treat it as empty text.
tf = tfv.fit_transform(corpus.doc_content.fillna(''))
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); prefer the new accessor and fall back to the
# old one so the cell also runs on older installations.
if hasattr(tfv, 'get_feature_names_out'):
    TERMS = tfv.get_feature_names_out().tolist()
else:
    TERMS = tfv.get_feature_names()

# Generate Model

We run scikit-learn's [LatentDirichletAllocation algorithm](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html#sklearn.decomposition.LatentDirichletAllocation) and extract the THETA (document-topic weights) and PHI (topic-term weights) tables.

In [11]:
# Estimator settings collected in one mapping; random_state is pinned to 0.
lda_params = dict(
    n_components=n_topics,
    max_iter=max_iter,
    learning_offset=50.,
    random_state=0,
)
lda = LDA(**lda_params)

In [None]:
# Fit the LDA model on the document-term counts and keep the transformed
# output; it is wrapped into the THETA table in the next section.
tmodel = lda.fit_transform(tf)

## THETA

In [None]:
# Wrap the fit_transform output in a DataFrame and name the column axis
# 'topic_id' so later operations can refer to topics by that name.
THETA = pd.DataFrame(tmodel).rename_axis('topic_id', axis='columns')

## PHI

In [None]:
# One row per LDA component, one column per vocabulary term; both axes are
# named so the table can be stacked and grouped by 'topic_id' / 'term_str'.
PHI = (
    pd.DataFrame(lda.components_, columns=TERMS)
    .rename_axis('topic_id', axis='index')
    .rename_axis('term_str', axis='columns')
)

# Inspect Results

## Get Top Terms per Topic

In [None]:
# For every topic, rank its terms by weight and keep the ten heaviest.
# Each group yields a Series of terms indexed 0..9, so the groupby result is a
# table with one row per topic_id and one column per rank.
# The former `.drop('topic_id', 1)` step is gone: a positional axis argument is
# rejected by pandas 2.x, and the drop was redundant because only the term_str
# column is selected after reset_index().
TOPICS = PHI.stack().to_frame().rename(columns={0:'weight'})\
    .groupby('topic_id')\
    .apply(lambda x:
           x.weight.sort_values(ascending=False)\
               .head(10)\
               .reset_index()\
               .term_str)

In [None]:
# Human-readable topic label: "<topic_id> <term 1> <term 2> ...".
# Only the term columns are joined. Dropping any derived columns first keeps the
# cell idempotent: re-running it no longer folds the old label into the new one
# or fails on the numeric 'doc_weight_sum' column.
TOPICS['label'] = TOPICS.drop(columns=['label', 'doc_weight_sum'], errors='ignore')\
    .apply(lambda x: str(x.name) + ' ' + ' '.join(x), axis=1)

## Sort Topics by Doc Weight

In [None]:
# Column-wise sum of THETA: the total weight each topic receives over all rows.
# The assignment aligns the sums with TOPICS on the topic id.
TOPICS['doc_weight_sum'] = THETA.sum(axis=0)

In [None]:
# Order topics by total document weight, then draw one horizontal bar per topic.
topics_by_weight = TOPICS.sort_values('doc_weight_sum', ascending=True)
topics_by_weight.plot.barh(x='label', y='doc_weight_sum', figsize=(5,20))

# Cluster Topics

In [None]:
import scipy.cluster.hierarchy as sch
from scipy.spatial.distance import pdist
from sklearn.cluster import AgglomerativeClustering
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt

In [None]:
def plot_tree(tree, labels, figsize=(5, 20)):
    """Draw a left-oriented dendrogram for a hierarchical clustering.

    Parameters
    ----------
    tree : linkage matrix
        Passed straight to ``scipy.cluster.hierarchy.dendrogram``.
    labels : list
        Leaf labels shown next to the dendrogram leaves.
    figsize : tuple, optional
        (width, height) of the figure in inches; defaults to (5, 20).

    Returns
    -------
    None
    """
    # A single subplots() call creates the figure. The earlier extra
    # plt.figure() only produced an empty, unused figure.
    fig, ax = plt.subplots(figsize=figsize)
    # Draw on the explicit Axes instead of relying on pyplot's current-axes state.
    sch.dendrogram(tree, labels=labels, orientation="left", ax=ax)
    ax.tick_params(axis='both', which='major', labelsize=14)

In [None]:
# Condensed pairwise distances between topic rows of PHI (despite its name,
# SIMS holds distances, not similarities).
# Ward linkage is only well defined for Euclidean distances, so measure
# Euclidean distance between L2-normalised rows rather than cosine distance.
# For unit vectors d_euclidean**2 == 2 * d_cosine, so topics are ranked as
# close or far exactly as under cosine distance while Ward's assumption holds.
SIMS = pdist(normalize(PHI, norm='l2'), metric='euclidean')
TREE = sch.linkage(SIMS, method='ward')

In [None]:
# Leaf labels for the dendrogram: one label string per topic, in TOPICS row order.
labels = TOPICS['label'].tolist()

In [None]:
# labels

In [None]:
# Render the topic hierarchy built above, labelling each leaf with its topic label.
plot_tree(tree=TREE, labels=labels)

# LDA Viz

In [52]:
import pyLDAvis
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`. Import whichever exists and keep the old attribute name
# available so code that calls `pyLDAvis.sklearn.prepare(...)` keeps working.
try:
    import pyLDAvis.sklearn
except ImportError:
    import pyLDAvis.lda_model
    pyLDAvis.sklearn = pyLDAvis.lda_model
# Switch pyLDAvis to notebook mode so prepared visualisations render inline.
pyLDAvis.enable_notebook()

In [53]:
# Build the interactive pyLDAvis view from the fitted LDA model, the
# document-term matrix and the vectorizer that produced it.
pyLDAvis.sklearn.prepare(lda, tf, tfv)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))
