In [1]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import pandas as pd
import matplotlib.pyplot as plt

import re
import os
from tqdm import tqdm

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
from gensim.models.ldamodel import LdaModel

import pyLDAvis
import pyLDAvis.gensim_models as gensimvis
pyLDAvis.enable_notebook()

from preprocessing import load_cleaned_metadata, clean_text_lda, get_clean_write
from data_access import get_txt
from my_lda import MyCorpus

In [2]:
# Load the cleaned metadata table. Later cells read its `publish_time` and
# `cord_uid` columns to select documents.
df = load_cleaned_metadata('data/processed/metadata_clean_lda.csv')
# Number of metadata rows.
len(df)

437966

# Create data files of clean data

In [3]:
# Disabled preprocessing step: for every cord_uid in the metadata it calls
# get_clean_write(uid, dest_directory).
# NOTE(review): presumably this writes the `<uid>_clean.txt` files that the
# corpus cells read from lda_clean/ -- confirm in preprocessing.get_clean_write
# before re-enabling, and consider guarding it behind a flag instead of comments.
# dest_directory = 'data/cord-19/body_text/lda_clean/'
# ids = df.cord_uid.tolist()

# for uid in tqdm(ids):
#     get_clean_write(uid, dest_directory)

In [4]:
# Count the entries in the cleaned-text directory.
# NOTE(review): the recorded count is one higher than len(df); check for a
# stray or hidden file in this directory.
clean_dir_entries = os.listdir('data/cord-19/body_text/lda_clean/')
len(clean_dir_entries)

437967

## Trial run with small sample

In [3]:
# Small trial sample: keep only the rows whose publish_time is in the year 2000.
mask = df['publish_time'].apply(lambda published: published.year == 2000)
df_2000 = df.loc[mask]
# cord_uid values of the sampled documents, used below to build file paths.
ids = df_2000['cord_uid'].tolist()

In [4]:
# Number of documents in the year-2000 trial sample.
len(ids)

524

Required output:
- topic profile of every document
- visualisation of topics

> "[Gensim](https://radimrehurek.com/gensim/auto_examples/core/run_corpora_and_vector_spaces.html#corpus-streaming-tutorial) accepts any object that, when iterated over, successively yields documents."

In [40]:
# One cleaned text file per sampled document: <lda_clean dir>/<cord_uid>_clean.txt
clean_text_dir = 'data/cord-19/body_text/lda_clean/'
doc_path_list = [clean_text_dir + cord_uid + '_clean.txt' for cord_uid in ids]

# MyCorpus is constructed from the list of document paths.
corpus = MyCorpus(doc_path_list)

In [41]:
# Build the dictionary for the sample corpus and save it under save_directory
# (the cell output reports both steps).
# NOTE(review): "test" is presumably the name/prefix of the saved file --
# confirm in my_lda.MyCorpus.make_dictionary.
save_directory = 'data/cord-19/body_text/'
corpus.make_dictionary(save_directory, "test")

Creating dictionary...
...complete
Saving dictionary to data/cord-19/body_text/


In [42]:
# filter extremes
corpus.filter_extremes(no_below=5, no_above=0.2)

### Train model, explore methods

In [43]:
# Look at the vector MyCorpus produces for the first document in the sample.
path = corpus.doc_path_list[0]
doc_vector = corpus.get_doc_bow(path)
# Show only the first five entries. The output is a list of integer pairs;
# NOTE(review): presumably (token id, count) -- confirm in MyCorpus.get_doc_bow.
doc_vector[:5]

[(0, 1), (1, 1), (2, 1), (3, 2), (4, 5)]

In [62]:
# LDA training is stochastic. Pin the seed so the learned topics, and every
# output below that depends on them, are reproducible after a kernel restart.
LDA_RANDOM_STATE = 42

# Train a 20-topic model on the streamed corpus, using the corpus' own
# id -> word mapping so that topics can be printed as words.
model = LdaModel(
    corpus,
    num_topics=20,
    id2word=corpus.id2word,
    random_state=LDA_RANDOM_STATE,
)

In [63]:
# Persist the trained model to disk.
# NOTE(review): the dictionary was saved into this same directory under the
# name "test" -- confirm the two saves cannot overwrite each other.
model.save('data/cord-19/body_text/test')

In [64]:
# Reload the model from the path it was just saved to, rebinding `model`, so
# the exploration cells below run against the on-disk copy.
model = LdaModel.load('data/cord-19/body_text/test')

In [65]:
# Topic profile of a single document: print each (topic id, probability) pair
# the model returns for the first document of the sample.
path = corpus.doc_path_list[0]
first_doc_bow = corpus.get_doc_bow(path)
first_doc_topics = model.get_document_topics(first_doc_bow)
for topic, prob in first_doc_topics:
    print(topic, ":", prob)

0 : 0.43332756
6 : 0.02037912
9 : 0.35893548
10 : 0.07797326
18 : 0.108619824


In [66]:
# Full topic-term matrix; the output is a 2-D float32 array (per gensim's
# documentation, one row per topic and one column per dictionary term).
model.get_topics()

array([[7.5955968e-04, 9.6585361e-05, 1.4547369e-04, ..., 1.8923089e-05,
        1.3248137e-05, 1.5853038e-05],
       [6.1290251e-04, 1.3248033e-04, 1.4766656e-04, ..., 1.3674631e-05,
        1.4673652e-05, 1.2612842e-05],
       [6.5311138e-04, 5.2394447e-05, 1.8024594e-04, ..., 1.9037239e-05,
        3.4618526e-05, 8.3424375e-06],
       ...,
       [7.3124707e-04, 1.0621601e-04, 1.8472495e-04, ..., 1.5040757e-05,
        2.2784692e-05, 1.3292022e-05],
       [4.8544913e-04, 8.8198452e-05, 1.9042726e-04, ..., 1.6033253e-05,
        4.5805773e-05, 1.5056427e-05],
       [5.4476492e-04, 1.1274404e-04, 2.1877061e-04, ..., 2.2079748e-05,
        2.4044959e-05, 1.4721693e-05]], dtype=float32)

In [67]:
# Most probable terms for topic 1, returned as (term id, probability) pairs.
model.get_topic_terms(topicid=1)

[(1846, 0.006807441),
 (1509, 0.0049863067),
 (277, 0.004640583),
 (782, 0.0043594055),
 (68, 0.003047572),
 (2473, 0.003044593),
 (2596, 0.0028353212),
 (387, 0.0024736647),
 (3837, 0.00219427),
 (1988, 0.0021439008)]

In [68]:
# Human-readable formula for topic 0 ...
topic_zero_formula = model.print_topic(0)
print(topic_zero_formula)

# ... and the first three (topic id, formula) entries from print_topics().
all_topic_formulas = model.print_topics()
all_topic_formulas[:3]

0.004*"fusion" + 0.003*"mice" + 0.003*"patient" + 0.003*"care" + 0.002*"peptide" + 0.002*"dc" + 0.002*"pneumonia" + 0.002*"children" + 0.002*"lung" + 0.002*"membrane"


[(0,
  '0.004*"fusion" + 0.003*"mice" + 0.003*"patient" + 0.003*"care" + 0.002*"peptide" + 0.002*"dc" + 0.002*"pneumonia" + 0.002*"children" + 0.002*"lung" + 0.002*"membrane"'),
 (1,
  '0.007*"k" + 0.005*"cat" + 0.005*"mice" + 0.004*"lymphocytes" + 0.003*"care" + 0.003*"patient" + 0.003*"medical" + 0.002*"relaxation" + 0.002*"cd8" + 0.002*"defect"'),
 (2,
  '0.004*"influenza" + 0.004*"calve" + 0.004*"pig" + 0.004*"care" + 0.003*"n" + 0.003*"children" + 0.003*"k" + 0.002*"health" + 0.002*"therapy" + 0.002*"medical"')]

In [69]:
# Top five terms of topic 0 as (word, probability) pairs -- the words
# themselves rather than term ids.
model.show_topic(0, topn=5)

[('fusion', 0.0035234622),
 ('mice', 0.0034330073),
 ('patient', 0.00322549),
 ('care', 0.0028825526),
 ('peptide', 0.002489323)]

In [70]:
# Interactive pyLDAvis view of the trained topics (rendered inline because
# pyLDAvis.enable_notebook() was called in the imports cell).
# NOTE(review): training used corpus.id2word while this passes
# corpus.dictionary -- confirm both refer to the same filtered vocabulary.
gensimvis.prepare(
    topic_model=model,
    corpus=corpus,
    dictionary=corpus.dictionary,
)