# Topic Extraction

[Topic Modeling with Gensim](https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python/)

In [None]:
import pandas as pd

# Load the cleaned dataset straight from the project's GitHub repository.
df = pd.read_csv(
    filepath_or_buffer='https://raw.githubusercontent.com/nphan20181/nlp_project/main/data/data_cleaned.csv'
)

# Display the first five rows as a quick check that the load worked.
df.head(n=5)

In [None]:
# Narrow the frame to the cleaned-text and title columns.
df = df.loc[:, ['Text_Cleaned', 'Title']]
df.head(n=5)

In [15]:
# Import Gensim libraries
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

# Import plotting tools
!pip install pyLDAvis==2.1.2
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

from pprint import pprint



In [None]:
# Pull the cleaned-text column out of the frame as a plain Python list.
data = df['Text_Cleaned'].values.tolist()

# Peek at the first entry only; printing the whole list would flood the output.
pprint(data[0:1])

In [19]:
def sent_to_words(sentences):
    """Lazily tokenize each item of ``sentences`` with gensim.

    Args:
        sentences: iterable of documents; every item is passed through
            ``str()`` before tokenizing, so non-string values are accepted.

    Yields:
        One list of tokens per input item, as returned by
        ``gensim.utils.simple_preprocess``.
    """
    # deacc=True asks simple_preprocess to strip accent marks from tokens;
    # punctuation is already discarded by its tokenization step.
    yield from (
        gensim.utils.simple_preprocess(str(raw_text), deacc=True)
        for raw_text in sentences
    )

# Materialise the lazy tokenizer output so it can be indexed and reused.
data_words = [tokens for tokens in sent_to_words(data)]

# Show the token list of the first document.
print(data_words[0:1])

[['electronic', 'health', 'record', 'ehr', 'set', 'clinical', 'health', 'information', 'useful', 'patient', 'treatment', 'eg', 'clinical', 'laboratory', 'report', 'discharge', 'letter', 'emergency', 'report', 'produce', 'hospital', 'patient', 'summary', 'ps', 'produce', 'general', 'practitioner', 'gp', 'datum', 'two', 'different', 'state', 'validate', 'eg', 'document', 'digitally', 'sign', 'doctor', 'validate', 'eg', 'health', 'datum', 'pressure', 'record', 'patient', 'autonomously', 'scenario', 'typically', 'talk', 'personal', 'health', 'record', 'phr', 'contexts', 'interest', 'point', 'view', 'patient', 'often', 'refer', 'concept', 'relate', 'health', 'datum', 'document', 'alessandra', 'pieroni', 'et', 'al', 'associate', 'to', 'whole', 'hospitalization', 'ward', 'also', 'consider', 'hospitalization', 'outpatient', 'episode', 'within', 'ward', 'itself', 'gp', 'whole', 'hospital', 'consider', 'episode', 'different', 'ward', 'hospitalization', 'too', 'health', 'datum', 'document', 'pati

In [21]:
# Train the phrase detectors: the bigram model learns from the raw tokens,
# and the trigram model learns from the bigram-merged tokens.
bigram = gensim.models.Phrases(
    sentences=data_words,
    min_count=5,
    threshold=100,
)
trigram = gensim.models.Phrases(
    sentences=bigram[data_words],
    threshold=100,  # min_count is left at the library default here
)

# Wrap each trained model in a Phraser: a faster way to get a sentence's
# tokens merged into bigrams/trigrams.
bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# Example: the first document after bigram, then trigram, merging.
print(trigram_mod[bigram_mod[data_words[0]]])



['electronic_health_record_ehr', 'set', 'clinical', 'health', 'information', 'useful', 'patient', 'treatment', 'eg', 'clinical', 'laboratory', 'report', 'discharge', 'letter', 'emergency', 'report', 'produce', 'hospital', 'patient', 'summary', 'ps', 'produce', 'general', 'practitioner', 'gp', 'datum', 'two', 'different', 'state', 'validate', 'eg', 'document', 'digitally_sign', 'doctor', 'validate', 'eg', 'health', 'datum', 'pressure', 'record', 'patient', 'autonomously', 'scenario', 'typically', 'talk', 'personal', 'health', 'record', 'phr', 'contexts', 'interest', 'point', 'view', 'patient', 'often', 'refer', 'concept', 'relate', 'health', 'datum', 'document', 'alessandra_pieroni', 'et', 'al', 'associate', 'to', 'whole', 'hospitalization', 'ward', 'also', 'consider', 'hospitalization', 'outpatient', 'episode', 'within', 'ward', 'itself', 'gp', 'whole', 'hospital', 'consider', 'episode', 'different', 'ward', 'hospitalization', 'too', 'health', 'datum', 'document', 'patient', 'regardles

In [22]:
def make_bigrams(texts):
  """Merge detected bigrams in every tokenized document.

  Args:
    texts: iterable of tokenized documents (each a list of tokens).

  Returns:
    A list with one entry per input document: the result of indexing the
    notebook-level ``bigram_mod`` phraser with that document.
  """
  merged_docs = []
  for tokens in texts:
    merged_docs.append(bigram_mod[tokens])
  return merged_docs

def make_trigrams(texts):
  """Merge detected bigrams, then trigrams, in every tokenized document.

  Args:
    texts: iterable of tokenized documents (each a list of tokens).

  Returns:
    A list with one entry per input document: the result of applying the
    notebook-level ``bigram_mod`` phraser and then ``trigram_mod`` to it.
  """
  # Iterate over the argument itself, never a notebook-level variable, so
  # the result depends only on what the caller passes in.
  return [trigram_mod[bigram_mod[doc]] for doc in texts]

In [24]:
# Merge frequent word pairs in every document via the bigram phraser.
data_words_bigrams = make_bigrams(texts=data_words)

# Inspect the first document after bigram merging.
print(data_words_bigrams[0:1])

[['electronic_health', 'record_ehr', 'set', 'clinical', 'health', 'information', 'useful', 'patient', 'treatment', 'eg', 'clinical', 'laboratory', 'report', 'discharge', 'letter', 'emergency', 'report', 'produce', 'hospital', 'patient', 'summary', 'ps', 'produce', 'general', 'practitioner', 'gp', 'datum', 'two', 'different', 'state', 'validate', 'eg', 'document', 'digitally_sign', 'doctor', 'validate', 'eg', 'health', 'datum', 'pressure', 'record', 'patient', 'autonomously', 'scenario', 'typically', 'talk', 'personal', 'health', 'record', 'phr', 'contexts', 'interest', 'point', 'view', 'patient', 'often', 'refer', 'concept', 'relate', 'health', 'datum', 'document', 'alessandra_pieroni', 'et', 'al', 'associate', 'to', 'whole', 'hospitalization', 'ward', 'also', 'consider', 'hospitalization', 'outpatient', 'episode', 'within', 'ward', 'itself', 'gp', 'whole', 'hospital', 'consider', 'episode', 'different', 'ward', 'hospitalization', 'too', 'health', 'datum', 'document', 'patient', 'regar

In [27]:
# The bigram-merged documents are the texts the corpus is built from.
texts = data_words_bigrams

# Vocabulary: maps every distinct token to an integer id.
id2word = corpora.Dictionary(texts)

# Bag-of-words corpus: each document becomes a list of (token_id, count) pairs.
corpus = list(map(id2word.doc2bow, texts))

# Show the bag-of-words encoding of the first document.
print(corpus[0:1])

[[(0, 3), (1, 3), (2, 1), (3, 1), (4, 1), (5, 3), (6, 6), (7, 1), (8, 19), (9, 1), (10, 1), (11, 2), (12, 1), (13, 3), (14, 1), (15, 2), (16, 1), (17, 1), (18, 2), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 1), (25, 1), (26, 2), (27, 3), (28, 4), (29, 3), (30, 37), (31, 13), (32, 1), (33, 15), (34, 1), (35, 1), (36, 4), (37, 15), (38, 2), (39, 2), (40, 3), (41, 1), (42, 5), (43, 2), (44, 1), (45, 1), (46, 1), (47, 6), (48, 1), (49, 1), (50, 18), (51, 1), (52, 2), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 2), (61, 4), (62, 2), (63, 3), (64, 1), (65, 2), (66, 10), (67, 1), (68, 1), (69, 2), (70, 1), (71, 1), (72, 1), (73, 1), (74, 2), (75, 22), (76, 2), (77, 1), (78, 2), (79, 1), (80, 2), (81, 1), (82, 1), (83, 6), (84, 5), (85, 2), (86, 1), (87, 1), (88, 1), (89, 2), (90, 3), (91, 1), (92, 4), (93, 7), (94, 4), (95, 1), (96, 4), (97, 1), (98, 2), (99, 3), (100, 1), (101, 1), (102, 1), (103, 1), (104, 1), (105, 1), (106, 1), (107, 1), (108, 1), (109, 1), 

In [None]:
# Train the LDA topic model on the bag-of-words corpus.
lda_model = gensim.models.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=50,         # number of latent topics to extract
    random_state=100,      # fixed seed for repeatable training runs
    update_every=1,        # update the model after every chunk (online learning)
    chunksize=100,         # documents per training chunk
    passes=10,             # full passes over the corpus
    alpha='auto',          # learn the document-topic prior from the data
    per_word_topics=True,  # also expose per-word topic assignments
)

In [None]:
# List topics with their top-weighted words. The arguments are print_topics'
# defaults written out explicitly; NOTE(review): with num_topics=20 only a
# subset of the model's topics is shown - confirm that is intended.
pprint(lda_model.print_topics(num_topics=20, num_words=10))

# Apply the trained model to the whole corpus.
doc_lda = lda_model[corpus]

In [30]:
# Switch pyLDAvis to inline notebook rendering.
pyLDAvis.enable_notebook()

# Build the interactive topic visualisation from the model, corpus and vocabulary.
vis = pyLDAvis.gensim.prepare(
    topic_model=lda_model,
    corpus=corpus,
    dictionary=id2word,
)

# Leaving `vis` as the cell's last expression makes the notebook display it.
vis