Topic Modeling - LDA (Machine Learning Plus)

In [1]:
!pip install spacy
!pip install pyLDAvis
!pip install regex



In [2]:
import nltk; nltk.download('stopwords')
import re
import numpy as np
import pandas as pd
from pprint import pprint

import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.models.wrappers import LdaMallet
import os
os.environ.update({'MALLET_HOME':r'Users/mrinal/Downloads/mallet-2.0.8/'})

import spacy

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings("ignore",category=DeprecationWarning)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mrinal/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Preparing stopwords:

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

Importing Data:

In [4]:
__mtData = pd.read_csv('/Users/mrinal/Downloads/mtsamples.csv', error_bad_lines = False)
__mtText = __mtData[['transcription']]
__naText = __mtText.dropna()

In [5]:
# Converting String to List:
__mtList = __mtText.values.tolist()

In [6]:
# Tokenize words and clean up text:
def sentenceToWords(sentences):
    for sentence in sentences:
        yield(gensim.utils.simple_preprocess(str(sentence), deacc=True))  # deacc=True removes punctuations

__mtWords = list(sentenceToWords(__mtList))

In [7]:
# Printing the words obtained:
for i in range(0, len(__mtWords)//1000):
    print(__mtWords[i], '\n')

['subjective', 'this', 'year', 'old', 'white', 'female', 'presents', 'with', 'complaint', 'of', 'allergies', 'she', 'used', 'to', 'have', 'allergies', 'when', 'she', 'lived', 'in', 'seattle', 'but', 'she', 'thinks', 'they', 'are', 'worse', 'here', 'in', 'the', 'past', 'she', 'has', 'tried', 'claritin', 'and', 'zyrtec', 'both', 'worked', 'for', 'short', 'time', 'but', 'then', 'seemed', 'to', 'lose', 'effectiveness', 'she', 'has', 'used', 'allegra', 'also', 'she', 'used', 'that', 'last', 'summer', 'and', 'she', 'began', 'using', 'it', 'again', 'two', 'weeks', 'ago', 'it', 'does', 'not', 'appear', 'to', 'be', 'working', 'very', 'well', 'she', 'has', 'used', 'over', 'the', 'counter', 'sprays', 'but', 'no', 'prescription', 'nasal', 'sprays', 'she', 'does', 'have', 'asthma', 'but', 'doest', 'not', 'require', 'daily', 'medication', 'for', 'this', 'and', 'does', 'not', 'think', 'it', 'is', 'flaring', 'up', 'medications', 'her', 'only', 'medication', 'currently', 'is', 'ortho', 'tri', 'cyclen',

In [8]:
# Eliminate stopwords, make bigrams and lemmatize:
def remove_stopwords(texts):
    return [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]

def lemmatization(texts, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """https://spacy.io/api/annotation"""
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return texts_out

In [10]:
# Remove Stop Words
__noStopWords = remove_stopwords(__mtWords)
nlp = spacy.load('en', disable=['parser', 'ner'])

# Do lemmatization keeping only noun, adj, vb, adv
__lemmData = lemmatization(__noStopWords, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

print(__lemmData[:1])

[['subjective', 'year', 'old', 'white', 'female', 'present', 'complaint', 'allergy', 'use', 'allergy', 'live', 'think', 'bad', 'past', 'try', 'claritin', 'zyrtec', 'work', 'short', 'time', 'seem', 'lose', 'effectiveness', 'use', 'allegra', 'also', 'use', 'last', 'summer', 'begin', 'use', 'week', 'ago', 'appear', 'work', 'well', 'use', 'counter', 'spray', 'prescription', 'nasal', 'spray', 'asthma', 'do', 'require', 'daily', 'medication', 'think', 'flare', 'medication', 'medication', 'currently', 'ortho', 'allergie', 'know', 'medicine', 'allergy', 'objective', 'vital', 'weight', 'pound', 'blood', 'pressure', 'heent', 'throat', 'mildly', 'erythematous', 'exudate', 'nasal', 'mucosa', 'erythematous', 'swollen', 'clear', 'drainage', 'see', 'clear', 'neck', 'supple', 'adenopathy', 'lung', 'clear', 'assessment', 'allergic', 'rhinitis', 'plan', 'try', 'zyrtec', 'instead', 'allegra', 'option', 'use', 'loratadine', 'think', 'prescription', 'coverage', 'may', 'cheap', 'sample', 'spray', 'give', 'w

In [17]:
# Create Dictionary
id2word = corpora.Dictionary(__lemmData)

# Create Corpus
__textCorpus = __lemmData

# Term Document Frequency
__corpus = [__id2word.doc2bow(text) for text in __textCorpus]

print(__corpus[:1])

[[(0, 1), (1, 1), (2, 2), (3, 1), (4, 1), (5, 3), (6, 1), (7, 1), (8, 1), (9, 1), (10, 1), (11, 1), (12, 1), (13, 1), (14, 1), (15, 3), (16, 1), (17, 1), (18, 1), (19, 1), (20, 1), (21, 1), (22, 1), (23, 1), (24, 2), (25, 1), (26, 1), (27, 1), (28, 1), (29, 1), (30, 1), (31, 1), (32, 1), (33, 1), (34, 1), (35, 1), (36, 1), (37, 1), (38, 3), (39, 1), (40, 1), (41, 1), (42, 2), (43, 1), (44, 1), (45, 1), (46, 1), (47, 1), (48, 1), (49, 1), (50, 1), (51, 3), (52, 1), (53, 1), (54, 1), (55, 1), (56, 1), (57, 1), (58, 1), (59, 1), (60, 3), (61, 1), (62, 1), (63, 1), (64, 1), (65, 3), (66, 1), (67, 1), (68, 2), (69, 6), (70, 1), (71, 2), (72, 1), (73, 2), (74, 1), (75, 2), (76, 1), (77, 1), (78, 2)]]


In [18]:
# Human readable format of corpus (term-frequency)
[[(__id2word[id], freq) for id, freq in cp] for cp in __corpus[:1]]

[[('adenopathy', 1),
  ('ago', 1),
  ('allegra', 2),
  ('allergic', 1),
  ('allergie', 1),
  ('allergy', 3),
  ('also', 1),
  ('appear', 1),
  ('assessment', 1),
  ('asthma', 1),
  ('bad', 1),
  ('begin', 1),
  ('blood', 1),
  ('cheap', 1),
  ('claritin', 1),
  ('clear', 3),
  ('complaint', 1),
  ('counter', 1),
  ('coverage', 1),
  ('currently', 1),
  ('daily', 1),
  ('do', 1),
  ('drainage', 1),
  ('effectiveness', 1),
  ('erythematous', 2),
  ('exudate', 1),
  ('female', 1),
  ('flare', 1),
  ('give', 1),
  ('heent', 1),
  ('instead', 1),
  ('know', 1),
  ('last', 1),
  ('live', 1),
  ('loratadine', 1),
  ('lose', 1),
  ('lung', 1),
  ('may', 1),
  ('medication', 3),
  ('medicine', 1),
  ('mildly', 1),
  ('mucosa', 1),
  ('nasal', 2),
  ('neck', 1),
  ('objective', 1),
  ('old', 1),
  ('option', 1),
  ('ortho', 1),
  ('past', 1),
  ('plan', 1),
  ('pound', 1),
  ('prescription', 3),
  ('present', 1),
  ('pressure', 1),
  ('require', 1),
  ('rhinitis', 1),
  ('sample', 1),
  ('see', 

In [22]:
# Build LDA model
__ldaModel = gensim.models.ldamodel.LdaModel(corpus = __corpus,
                                           id2word = __id2word,
                                           num_topics = 10, 
                                           random_state = 100,
                                           update_every = 1,
                                           chunksize = 100,
                                           passes = 10,
                                           alpha = 'auto',
                                           per_word_topics = True)

In [23]:
# Print the Keyword in the 10 topics
pprint(__ldaModel.print_topics())
__documentsLDA = __ldaModel[__corpus]

[(0,
  '0.033*"right" + 0.023*"extremity" + 0.021*"normal" + 0.021*"low" + '
  '0.018*"leave" + 0.016*"pain" + 0.015*"knee" + 0.014*"reveal" + '
  '0.011*"motion" + 0.011*"upper"'),
 (1,
  '0.027*"place" + 0.025*"suture" + 0.021*"incision" + 0.018*"use" + '
  '0.016*"skin" + 0.015*"close" + 0.013*"patient" + 0.012*"run" + '
  '0.011*"fashion" + 0.011*"dissection"'),
 (2,
  '0.037*"history" + 0.019*"patient" + 0.012*"year" + 0.010*"normal" + '
  '0.010*"pain" + 0.009*"deny" + 0.009*"medication" + 0.009*"blood" + '
  '0.007*"present" + 0.007*"medical"'),
 (3,
  '0.064*"cord" + 0.041*"nonspecific" + 0.032*"detect" + 0.025*"postprocedure" '
  '+ 0.024*"penis" + 0.024*"inguinal" + 0.024*"testicle" + 0.019*"hernia" + '
  '0.017*"vasculature" + 0.017*"milk"'),
 (4,
  '0.040*"normal" + 0.021*"lesion" + 0.019*"leave" + 0.019*"breast" + '
  '0.018*"right" + 0.014*"appear" + 0.014*"note" + 0.013*"tumor" + '
  '0.011*"evidence" + 0.011*"small"'),
 (5,
  '0.054*"procedure" + 0.048*"patient" + 0.022

In [24]:
(__ldaModel.alpha)

array([0.35478094, 0.17508884, 1.0247076 , 0.04860025, 0.31675854,
       0.42144945, 0.42239565, 0.41841653, 0.60486555, 0.4516257 ],
      dtype=float32)

A range of alpha values obtained using the 'auto' parameter in LdaModel where the model gives ideal values of alpha to obtain the number of topics listed.

In [25]:
# # Compute Perplexity
# print('\nPerplexity: ', lda_model.log_perplexity(corpus))  # a measure of how good the model is. lower the better.

# Compute Coherence Score
__coherenceModel = CoherenceModel(model = __ldaModel, texts = __lemmData, dictionary = __id2word, coherence = 'c_v')
__ldaCoherence = __coherenceModel.get_coherence()
print('\nCoherence Score: ', __ldaCoherence)


Coherence Score:  0.5334579607285418


In [27]:
# Visualize the topics
pyLDAvis.enable_notebook()
__vis = pyLDAvis.gensim.prepare(__ldaModel, __corpus, __id2word)
# __vis

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


In [33]:
def compute_coherence_values(dictionary, corpus, texts, limit, start = 2, step = 3):
    """
    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics
    
    Returns:
    -------
    model_list : List of LDA topic models
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    model_list = []
    for num_topics in range(start, limit, step):
        model = gensim.models.wrappers.LdaMallet(mallet_path, corpus = __corpus, num_topics = num_topics, id2word = id2word)
        model_list.append(model)
        coherencemodel = CoherenceModel(model = model, texts = __textCorpus, dictionary = dictionary, coherence = 'c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return model_list, coherence_values

In [34]:
# mallet_path = 'Users/mrinal/Downloads/mallet-2.0.8/bin/mallet' # update this path
import os
os.environ.update({'MALLET_HOME':r'/Users/mrinal/Downloads/mallet-2.0.8/bin/mallet'})

__malletPath = '/Users/mrinal/Downloads/mallet-2.0.8/bin/mallet' # update this path
__ldamallet = gensim.models.wrappers.LdaMallet(__malletPath, corpus = __corpus, num_topics = 10, id2word = __id2word)

In [35]:
# Topics with their corresponding probabilities
pprint(__ldamallet.show_topics(formatted = False))

# Computing Coherence Score
__coherenceMallet = CoherenceModel(model = __ldamallet, texts = __lemmData, dictionary = __id2word, coherence = 'c_v')
__coherenceLdaMallet = __coherenceMallet.get_coherence()
print('\nCoherence Score: ', __coherenceLdaMallet)

[(0,
  [('skin', 0.01788552871534313),
   ('patient', 0.017641723798285564),
   ('incision', 0.016061867935752528),
   ('leave', 0.015057391677475352),
   ('tissue', 0.012453555163300533),
   ('area', 0.011371061331564935),
   ('procedure', 0.01128329156142421),
   ('foot', 0.010473859236793087),
   ('lateral', 0.00917681707804683),
   ('perform', 0.009118303897953014)]),
 (1,
  [('place', 0.029313638324281025),
   ('suture', 0.02033468604477152),
   ('incision', 0.016716696449792692),
   ('close', 0.012130388471729505),
   ('procedure', 0.012024753739029392),
   ('patient', 0.011602214808228945),
   ('fashion', 0.010853968784936488),
   ('remove', 0.010801151418586431),
   ('make', 0.00966557804206023),
   ('tube', 0.008274720728175424)]),
 (2,
  [('patient', 0.06146431405498763),
   ('procedure', 0.045220090556878124),
   ('risk', 0.014096998552957101),
   ('eye', 0.013746907529290949),
   ('place', 0.012066470615693414),
   ('remove', 0.011447976473883209),
   ('room', 0.01140129767

In [None]:
# Coherence Score (using LdaMallet) for a range of values - takes forever to run.
model_list, coherence_values = compute_coherence_values(dictionary = __id2word, corpus = __corpus, texts = __lemmData, start = 2, limit = 20, step = 1)

In [None]:
# Plotting graph for range of Cohernece Values"
limit = 20; start = 2; step = 1;

x = range(start, limit, step)

plt.plot(x, coherence_values)

plt.xlabel("Number of Topics")
plt.ylabel("Coherence score")
plt.legend(("coherence_values"), loc = 'best')

plt.show()