In [1]:
import re
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import text 

# Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
from gensim.parsing.preprocessing import STOPWORDS

# nltk
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *

# Plotting tools
import pyLDAvis
import pyLDAvis.gensim  # don't skip this
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('whitegrid')


**Prepare stop words**

In [3]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words.extend(["failure", "acute", "disease", "arrest", "unknown", "natural", "cause", "causes", "chronic", "sudden", "unspecified",
                "unexplained", "specified", "related", "advanced", "primary", "use", "death", "pending", "undetermined",
               "probable", "sepsis", "septic", "shock", "end", "stage", "idiopathic", "pulseless", "electrical", "activity",
               "response", "history", "thrive", "type","non", "right", "etiology", "reduced", "severe", "bed", "infant", "versus", 
                "possible", "type", "uncertain", "small", "poorly", "adult", "cardiac", "fibrillation", "care", "comfort", "multiple", "patient",
               "fatal", "exacerbation","decedent", "factors", "measures", "arrest", "failure", "pain"])

**Load data**

In [4]:
d = pd.read_csv('Y:/DQSS/Death/MBG/py/capstone2/data/cod_txt_GCrecords_19.csv')

In [5]:
d = d.loc[:,['gc_cat', 'gc_cat_label', 'codlit']]
d.head()

Unnamed: 0,gc_cat,gc_cat_label,codlit
0,4,4-Volume depletion,"HYPERNATREMIA SEVERE DYSPHAGIA DIABETES, ALZH..."
1,6,6-Ill-defined cardiovascular,VASCULAR DEMENTIA ESSENTIAL HYPERTENSION
2,3,3-Ill-defined cancer,UPPER GASTROINTESTINAL BLEED TRACHEAL MASS
3,2,2-Heart failure,ACUTE CARDIOPULMONARY ARREST MULTI ORGAN FAILU...
4,6,6-Ill-defined cardiovascular,PULMONARY EMBOLISM


**Tokenize and clean text**

In [17]:
## helper function to preprocess text: lemmatize, stemming, etc
stemmer = SnowballStemmer("english")

def lemm_stem(text):
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preproc(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in stop_words and len(token) > 3:
        #if token not in gensim.parsing.preprocessing.STOPWORDS and len(token) > 3:
            result.append(lemm_stem(token))
    return result        
                       

In [18]:
#apply function to tokenize and lemmatize all records
processed_docs = d['codlit'].map(preproc)

processed_docs[:10]

0    [hypernatremia, dysphagia, diabet, alzheim, de...
1             [vascular, dementia, essenti, hypertens]
2                       [upper, bleed, tracheal, mass]
3    [cardiopulmonari, multi, organ, shockacut, hyp...
4                                   [pulmonari, embol]
5    [organ, syndrom, sourc, metastat, prostat, can...
6                              [cardiovascular, crisi]
7    [hepatoren, syndrom, methicillin, resist, stap...
8    [complic, atherosclerot, vascular, atrial, tho...
9    [respiratori, staphylococc, aureus, bacteremia...
Name: codlit, dtype: object

In [23]:
# create dictionary mapping token id to actual word
id2word = gensim.corpora.Dictionary(processed_docs)

In [24]:
'''Using the word counts we can filter tokens that occur rarely in the whole corpus or that occur in more
than 0.5 of the documents'''

id2word.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)

**CONSTRUCT BAG OF WORDS**

In [25]:
'''bag of words - for each document there is now a dictionary.  
Word counts can be figured out using these with BOW.'''

bow_corpus = [id2word.doc2bow(doc) for doc in processed_docs]

In [28]:
bow_doc_5 = bow_corpus[5]

for i in range(len(bow_doc_5)):
    print("Word {} (\"{}\") appears {} times.".format(bow_doc_5[i][0],                                              
id2word[bow_doc_5[i][0]],
bow_doc_5[i][1]))

Word 19 ("organ") appears 1 times.
Word 26 ("cancer") appears 1 times.
Word 27 ("metastat") appears 1 times.
Word 28 ("prostat") appears 1 times.
Word 29 ("sourc") appears 1 times.
Word 30 ("syndrom") appears 1 times.


**TF-IDF corpus**

In [29]:
from gensim import corpora, models

tfidf = models.TfidfModel(bow_corpus)
corpus_tfidf = tfidf[bow_corpus]

**LDA with TF-IDF corpus** created with all cause of death text in records with garbage codes.

In [31]:
lda_tfidf_all = gensim.models.LdaMulticore(corpus_tfidf,
                                       num_topics=10,
                                       id2word=id2word,
                                       passes=2,
                                       workers=2)

In [32]:
for idx, topic in lda_tfidf_all.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))

Topic: 0 
Words: 0.054*"pneumonia" + 0.048*"fentanyl" + 0.043*"aspir" + 0.040*"sleep" + 0.036*"toxicolog" + 0.032*"obstruct" + 0.029*"anatom" + 0.026*"injuri" + 0.025*"pulmonari" + 0.021*"anox"
Topic: 1 
Words: 0.064*"systol" + 0.058*"heart" + 0.056*"cancer" + 0.050*"metastat" + 0.049*"diastol" + 0.043*"dementia" + 0.042*"congest" + 0.032*"alzheim" + 0.030*"respiratori" + 0.026*"liver"
Topic: 2 
Words: 0.071*"respiratori" + 0.070*"cardiopulmonari" + 0.027*"multiorgan" + 0.027*"malign" + 0.025*"heart" + 0.023*"hypox" + 0.022*"origin" + 0.021*"hypoxia" + 0.019*"cardio" + 0.018*"bacteremia"
Topic: 3 
Words: 0.047*"organ" + 0.041*"cardiomegali" + 0.038*"multi" + 0.033*"carcinoma" + 0.027*"circumst" + 0.027*"refractori" + 0.026*"morbid" + 0.025*"injuri" + 0.024*"infect" + 0.022*"drown"
Topic: 4 
Words: 0.070*"heart" + 0.063*"congest" + 0.049*"disord" + 0.041*"bowel" + 0.036*"methamphetamin" + 0.036*"kidney" + 0.036*"hypertens" + 0.028*"bilater" + 0.027*"hemorrhag" + 0.026*"diabet"
Topic: 5 

**Evaluate LDA model using the full TF-IDF corpus**

In [33]:
coherence_lda_tfidf_all = CoherenceModel(model = lda_tfidf_all, 
                                      texts = processed_docs, 
                                      dictionary = id2word, 
                                      coherence = 'c_v')

coherence_ldatfidf_all = coherence_lda_tfidf_all.get_coherence()

In [34]:
print("Coherence score for LDA_TF-IDF with 2 passes: ", coherence_ldatfidf2)

Coherence score for LDA_TF-IDF with 2 passes:  0.31564680660458866


**REPEAT LDA FOR EACH GARBAGE CODE CATEGORY INDIVIDUALLY** excluding category 9 (too few records) and category 7(no records).

In [41]:
d2 = d[~d['gc_cat'].isin([7,9])]

d2.gc_cat.value_counts()

6    1025
5     875
2     762
3     663
1     547
4     125
8      81
Name: gc_cat, dtype: int64

**LDA FOR GARBAGE CODE CATEGORY 1: SEPTICEMIA**

In [55]:
processed1 = d.loc[d['gc_cat']==1, 'codlit'].map(preproc)
id2word1= gensim.corpora.Dictionary(processed1)
id2word1.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus = [id2word1.doc2bow(doc) for doc in processed1]
corpus_tfidf1 = tfidf[bow_corpus]
lda_tfidf1 = gensim.models.LdaMulticore(corpus_tfidf1,
                                       num_topics=5,
                                       id2word=id2word1,
                                       passes=10,
                                       workers=2)

In [56]:
for idx, topic in lda_tfidf1.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))

Topic: 0 
Words: 0.130*"myocardi" + 0.118*"infarct" + 0.103*"metabol" + 0.094*"syndrom" + 0.087*"acidosi" + 0.060*"encephalopathi" + 0.032*"kidney" + 0.031*"renal" + 0.030*"cirrhosi" + 0.027*"system"
Topic: 1 
Words: 0.131*"heart" + 0.077*"aureus" + 0.071*"multiorgan" + 0.063*"kidney" + 0.063*"staphylococcus" + 0.057*"congest" + 0.057*"bacteremia" + 0.046*"respiratori" + 0.043*"methicillin" + 0.038*"pneumonia"
Topic: 2 
Words: 0.271*"respiratori" + 0.135*"pneumonia" + 0.066*"kidney" + 0.053*"injuri" + 0.046*"cardiopulmonari" + 0.045*"pulmonari" + 0.036*"hypox" + 0.034*"hypoxem" + 0.034*"alcohol" + 0.030*"obstruct"
Topic: 3 
Words: 0.174*"organ" + 0.147*"multi" + 0.127*"bacteremia" + 0.079*"renal" + 0.070*"system" + 0.064*"bleed" + 0.059*"cancer" + 0.053*"secondari" + 0.048*"sourc" + 0.027*"cardiogen"
Topic: 4 
Words: 0.205*"infect" + 0.118*"dementia" + 0.080*"diabet" + 0.075*"arteri" + 0.062*"tract" + 0.058*"mellitus" + 0.055*"urinari" + 0.055*"atrial" + 0.038*"renal" + 0.035*"hyperten

In [57]:
coherence_lda_tfidf1 = CoherenceModel(model = lda_tfidf1, 
                                      texts = processed1, 
                                      dictionary = id2word1, 
                                      coherence = 'c_v')

coherence_ldatfidf1 = coherence_lda_tfidf1.get_coherence()
print("Coherence score for LDA_TF-IDF for GC category 1 ('Septicemia') with 5 topics and 10 passes: ", coherence_ldatfidf1)

Coherence score for LDA_TF-IDF for GC category 1 ('Septicemia') with 5 topics and 10 passes:  0.31935369216172693


In [58]:
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
pyLDAvis.enable_notebook()
LDA_tfidf1_vis = pyLDAvis.gensim.prepare(lda_tfidf1, corpus_tfidf1, id2word1)
LDA_tfidf1_vis

**LDA FOR GARBAGE CODE CATEGORY 2: HEART FAILURE**

In [69]:
processed2 = d.loc[d['gc_cat']==2, 'codlit'].map(preproc)
id2word2= gensim.corpora.Dictionary(processed2)
id2word2.filter_extremes(no_below=15, no_above=0.5, keep_n=100000)
bow_corpus2 = [id2word2.doc2bow(doc) for doc in processed2]
corpus_tfidf2 = tfidf[bow_corpus2]
lda_tfidf2 = gensim.models.LdaMulticore(corpus_tfidf2,
                                       num_topics=15,
                                       id2word=id2word2,
                                       passes=50,
                                       workers=2)

In [70]:
for idx, topic in lda_tfidf2.print_topics(-1):
    print('Topic: {} \nWords: {}'.format(idx,topic))

Topic: 0 
Words: 0.449*"pleural" + 0.176*"effus" + 0.070*"pneumonia" + 0.063*"pulmonari" + 0.062*"obstruct" + 0.048*"vascular" + 0.032*"lung" + 0.030*"systol" + 0.003*"respiratori" + 0.003*"kidney"
Topic: 1 
Words: 0.667*"atherosclerosi" + 0.065*"renal" + 0.059*"dementia" + 0.049*"atrial" + 0.046*"kidney" + 0.030*"hypertens" + 0.024*"injuri" + 0.003*"lung" + 0.003*"diastol" + 0.003*"obstruct"
Topic: 2 
Words: 0.838*"hypoxia" + 0.048*"dementia" + 0.004*"respiratori" + 0.004*"kidney" + 0.004*"cardiopulmonari" + 0.004*"cardiogen" + 0.004*"obstruct" + 0.004*"pulmonari" + 0.004*"eject" + 0.004*"fraction"
Topic: 3 
Words: 0.424*"obstruct" + 0.290*"pulmonari" + 0.145*"cancer" + 0.048*"kidney" + 0.044*"hypertens" + 0.029*"dementia" + 0.001*"renal" + 0.001*"diastol" + 0.001*"lung" + 0.001*"atrial"
Topic: 4 
Words: 0.528*"renal" + 0.423*"atrial" + 0.021*"lung" + 0.001*"diastol" + 0.001*"hypertens" + 0.001*"respiratori" + 0.001*"kidney" + 0.001*"hypoxia" + 0.001*"pneumonia" + 0.001*"systol"
Topic

In [72]:
coherence_lda_tfidf2 = CoherenceModel(model = lda_tfidf2, 
                                      texts = processed2, 
                                      dictionary = id2word2, 
                                      coherence = 'c_v')

coherence_ldatfidf2 = coherence_lda_tfidf2.get_coherence()
print("Coherence score for LDA_TF-IDF for GC category 2 ('Heart failure') with 15 topics and 50 passes: ", coherence_ldatfidf2)

Coherence score for LDA_TF-IDF for GC category 2 ('Heart failure') with 15 topics and 50 passes:  0.2408431204179867


In [73]:
from pyLDAvis import sklearn as sklearn_lda
import pyLDAvis
pyLDAvis.enable_notebook()
LDA_tfidf2_vis = pyLDAvis.gensim.prepare(lda_tfidf2, corpus_tfidf2, id2word2)
LDA_tfidf2_vis