### In this notebook we will perform the word embedding & topic modeling & Cosine Similarity

***we merged the **three** chapters to perform the topic modeling, in order to perform cosine similarity to select which chapter the new input should go with.***

In [30]:
import pandas as pd
import numpy as np
import pickle

# gensim
from gensim import corpora, models, similarities, matutils

# sklearn
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import NMF

### Read the data and pickle file

In [31]:
df02 = pd.read_csv('3_chapters.csv')

In [32]:
# reading the stop words list with pickle
with open ('stop_words.ob', 'rb') as fp:
    stop_words = pickle.load(fp)

In [33]:
df02.columns

Index(['string_values'], dtype='object')

In [34]:
# Declare a list that is to be converted into a column
ch_no = ['musculoskeletal','ear_nose', 'respiratory']
 
# Using 'ch_no' as the column name
# and equating it to the list
df02['Ch_No'] = ch_no

In [35]:
df02

Unnamed: 0,string_values,Ch_No
musculoskeletal,introduction bone muscle ligament musculos sh...,musculoskeletal
ear_nose,introduction ear nose rarely prove except epi...,ear_nose
respiratory,introduction distributes addition oxygen remov...,respiratory


### Word Embedding

In [36]:
df02['string_values']

musculoskeletal     introduction bone muscle ligament musculos sh...
ear_nose            introduction ear nose rarely prove except epi...
respiratory        introduction distributes addition oxygen remov...
Name: string_values, dtype: object

In [37]:
# Create a CountVectorizer for parsing/counting words
count_vectorizer = CountVectorizer(stop_words=stop_words)

doc_word_cv = count_vectorizer.fit_transform(df02['string_values'])



In [38]:
pd.DataFrame(doc_word_cv.toarray(), index=df02['Ch_No'], columns = count_vectorizer.get_feature_names_out()).head()

Unnamed: 0_level_0,aap,abdomen,abdu,abduct,abducted,abducting,abduction,abductor,abgs,ability,...,ysis,ysitis,zealand,zed,zheng,zinc,zone,zoster,zygote,µm
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
musculoskeletal,0,1,1,2,2,2,10,1,0,2,...,1,1,1,0,0,0,0,0,1,0
ear_nose,0,0,0,0,1,0,0,0,0,2,...,1,0,0,1,0,0,0,1,0,0
respiratory,1,1,0,0,0,0,0,0,1,5,...,0,0,1,1,1,1,6,0,0,5


In [39]:
# Create a TfidfVectorizer for parsing/counting words
tfidf = TfidfVectorizer(stop_words=stop_words)

doc_word_tfidf = tfidf.fit_transform(df02['string_values'])



In [40]:
pd.DataFrame(doc_word_tfidf.toarray(), index=df02['Ch_No'], columns = tfidf.get_feature_names_out()).head()

Unnamed: 0_level_0,aap,abdomen,abdu,abduct,abducted,abducting,abduction,abductor,abgs,ability,...,ysis,ysitis,zealand,zed,zheng,zinc,zone,zoster,zygote,µm
Ch_No,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
musculoskeletal,0.0,0.003041,0.003999,0.007997,0.006082,0.007997,0.039987,0.003999,0.0,0.004723,...,0.003041,0.003999,0.003041,0.0,0.0,0.0,0.0,0.0,0.003999,0.0
ear_nose,0.0,0.0,0.0,0.0,0.003177,0.0,0.0,0.0,0.0,0.004934,...,0.003177,0.0,0.0,0.003177,0.0,0.0,0.0,0.004177,0.0,0.0
respiratory,0.004634,0.003525,0.0,0.0,0.0,0.0,0.0,0.0,0.004634,0.013686,...,0.0,0.0,0.003525,0.003525,0.004634,0.004634,0.027807,0.0,0.0,0.023172


### Topic Modeling: **LDA**

In [41]:
# Convert sparse matrix of counts to a gensim corpus
corpus = matutils.Sparse2Corpus(doc_word_cv)

In [42]:
id2word = dict((v, k) for k, v in count_vectorizer.vocabulary_.items())

In [43]:
# Create lda model (equivalent to "fit" in sklearn)
lda = models.LdaModel(corpus=corpus, num_topics=3, id2word=id2word, passes=5)

In [44]:
lda.print_topics(3)

[(0,
  '0.478*"aap" + 0.375*"abdomen" + 0.034*"abdu" + 0.000*"pencil" + 0.000*"pelvis" + 0.000*"pen" + 0.000*"peer" + 0.000*"penetrate" + 0.000*"penicillamine" + 0.000*"penetrating"'),
 (1,
  '0.000*"abdu" + 0.000*"aap" + 0.000*"abdomen" + 0.000*"pencil" + 0.000*"pelvis" + 0.000*"pen" + 0.000*"peer" + 0.000*"penetrate" + 0.000*"penicillamine" + 0.000*"penetrating"'),
 (2,
  '0.811*"abdu" + 0.001*"abdomen" + 0.000*"aap" + 0.000*"pencil" + 0.000*"pelvis" + 0.000*"pen" + 0.000*"peer" + 0.000*"penetrate" + 0.000*"penicillamine" + 0.000*"penetrating"')]

### Performing CorEx:

In [45]:
from corextopic import corextopic as ct
from corextopic import vis_topic as vt

words = list(np.asarray(count_vectorizer.get_feature_names_out()))


In [46]:
topic_model = ct.Corex(n_hidden=3, words=words, seed=1)
topic_model.fit(doc_word_cv, words=words, docs=df02['string_values'])



<corextopic.corextopic.Corex at 0x1ceea783290>

In [47]:
topics = topic_model.get_topics()
for n,topic in enumerate(topics):
    topic_words,_,_ = zip(*topic)
    print('{}: '.format(n) + ','.join(topic_words))

0: aap,nting,november,notype,notoriety,noting,normalized,normalize,norma,norm
1: abdu,nonunion,nonstriated,nne,ninety,neus,neurolysis,neurolog,neurog,nephrolithiasis
2: lamb,nasopharyngoscopy,nasopharyngoscop,nasopharyngitis,nasendoscopy,naris,nant,ménière,ménièr,myringotomy


### Topic Modeling: LSA

In [48]:
lsa = TruncatedSVD(3)
doc_topic = lsa.fit_transform(doc_word_cv)
print(lsa.explained_variance_ratio_)

[0.04294863 0.54253613 0.41451524]


In [49]:
topic_word = pd.DataFrame(lsa.components_.round(3),
             index = ['component'+str(i) for i in range(3)],
             columns = count_vectorizer.get_feature_names_out())

print(topic_word)

              aap  abdomen   abdu  abduct  abducted  abducting  abduction  \
component0  0.001    0.003  0.002   0.004     0.005      0.004      0.018   
component1  0.001   -0.002 -0.002  -0.005    -0.002     -0.005     -0.023   
component2  0.003    0.003 -0.001  -0.002    -0.003     -0.002     -0.009   

            abductor   abgs  ability  ...   ysis  ysitis  zealand    zed  \
component0     0.002  0.001    0.012  ...  0.003   0.002    0.003  0.002   
component1    -0.002  0.001    0.003  ...  0.000  -0.002   -0.002  0.003   
component2    -0.001  0.003    0.013  ... -0.002  -0.001    0.003  0.002   

            zheng   zinc   zone  zoster  zygote     µm  
component0  0.001  0.001  0.006   0.001   0.002  0.005  
component1  0.001  0.001  0.003   0.003  -0.002  0.003  
component2  0.003  0.003  0.021  -0.002  -0.001  0.017  

[3 rows x 5834 columns]


In [50]:
tem_list = [] 
def display_topics(model, feature_names, no_top_words, topic_names=None):
    
    for ix, topic in enumerate(model.components_):
        inner_tem_list = []
       
        if not topic_names or not topic_names[ix]:
            print("\nTopic ", ix)
        else:
            print("\nTopic: '",topic_names[ix],"'")
            
        print(", ".join([feature_names[i]
                        for i in topic.argsort()[:-no_top_words - 1:-1]]))
        inner_tem_list.append(", ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))
        tem_list.append(inner_tem_list)

In [51]:
result1 = display_topics(lsa, count_vectorizer.get_feature_names_out(), 20)


Topic  0
bone, muscle, ear, otitis, hearing, airway, membrane, sur, bleeding, deformity, hip, oxygen, ventilation, obstruction, gery, cord, brace, attack, breathing, traction

Topic  1
ear, otitis, hearing, bleeding, sinusitis, nose, externa, membrane, obstruction, airway, sinus, septum, cord, polyp, voice, speech, mouth, packing, breathing, tumor

Topic  2
ventilation, oxygen, airway, dyspnea, copd, breathing, acidosis, hg, crackle, artery, bronchiectasis, silicosis, cwp, inspiration, collapse, distress, ph, croup, well, exertion


In [52]:
tem_list
final_dic = {}
final_dic["Bone"] = tem_list[0]
final_dic["Ear"] = tem_list[1]
final_dic["Breathing"] = tem_list[2]

In [53]:
final_dic

{'Bone': ['bone, muscle, ear, otitis, hearing, airway, membrane, sur, bleeding, deformity, hip, oxygen, ventilation, obstruction, gery, cord, brace, attack, breathing, traction'],
 'Ear': ['ear, otitis, hearing, bleeding, sinusitis, nose, externa, membrane, obstruction, airway, sinus, septum, cord, polyp, voice, speech, mouth, packing, breathing, tumor'],
 'Breathing': ['ventilation, oxygen, airway, dyspnea, copd, breathing, acidosis, hg, crackle, artery, bronchiectasis, silicosis, cwp, inspiration, collapse, distress, ph, croup, well, exertion']}

In [54]:
tem_df = pd.DataFrame.from_dict(final_dic, orient ='index') 
tem_df

Unnamed: 0,0
Bone,"bone, muscle, ear, otitis, hearing, airway, me..."
Ear,"ear, otitis, hearing, bleeding, sinusitis, nos..."
Breathing,"ventilation, oxygen, airway, dyspnea, copd, br..."


In [55]:
# Declare a list that is to be converted into a column
d_name = ['musculoskeletal', 'ear_nose', 'respiratory']
 
# Using 'ch_no' as the column name
# and equating it to the list
tem_df['D_Name'] = d_name

In [56]:
tem_df.columns

Index([0, 'D_Name'], dtype='object')

In [57]:
tem_df = tem_df.rename(columns={0: 'Description'})
tem_df

Unnamed: 0,Description,D_Name
Bone,"bone, muscle, ear, otitis, hearing, airway, me...",musculoskeletal
Ear,"ear, otitis, hearing, bleeding, sinusitis, nos...",ear_nose
Breathing,"ventilation, oxygen, airway, dyspnea, copd, br...",respiratory


In [58]:
tem_df.to_csv('diseases_with_description.csv', index=False)