### Caricamento librerie

In [1]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
from time import time

In [3]:
import pandas as pd

In [4]:
import nltk
#nltk.download('punkt')
from nltk import word_tokenize 
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
import gensim

In [7]:
import tomotopy as tp

### Preprocessing

Caricamento dataset. 

In [8]:
data = pd.read_csv('C:/Users/micky/OneDrive/Desktop/Tesi DS/data/med_transcription.csv')
data.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [9]:
data.shape

(4999, 6)

In [10]:
# undersampling 10% of dataset
data = data.sample(frac=0.1, replace = False, random_state=0)

In [11]:
corp = data['transcription']

In [12]:
corp = corp.astype(str) #cast to string
corp = corp[corp!='nan'] #remove nan values
corp

4585    REASON FOR VISIT: , The patient is a 74-year-o...
1593    CC:, Falling.,HX:, This 67y/o RHF was diagnose...
562     PREOPERATIVE DIAGNOSES:,  Bilateral mammary hy...
825     PROCEDURE: , Esophagogastroduodenoscopy with g...
3777    PREOPERATIVE DIAGNOSES,1.  Recurrent acute oti...
                              ...                        
4736    MYOVIEW NUCLEAR STRESS STUDY,REASON FOR THE TE...
4784    PROCEDURES PERFORMED:,1.  Left heart catheteri...
2315    PREOPERATIVE DIAGNOSIS:,  Medial meniscal tear...
2162    PREOPERATIVE DIAGNOSIS: , Herniated nucleus pu...
124     PREOPERATIVE DIAGNOSIS:,  Recurring bladder in...
Name: transcription, Length: 496, dtype: object

Conversione in minuscolo.

In [13]:
corp = corp.str.lower()

Rimozione numeri.

In [14]:
corp = corp.str.replace(r'[0-9]+', ' ')

Rimozione link.

In [15]:
corp = corp.str.replace(r'http\S+', ' ')

Rimozione caratteri speciali (rimuovo qualsiasi cosa che non sia un simbolo alfanumerico o punteggiatura).

In [16]:
corp = corp.str.replace(r'[^a-zA-Z0-9 ]', ' ')

Rimozione singole, doppie lettere.

In [17]:
corp = corp.str.replace('\\b\\w{1,2}\\s', '')

Rimozione spazi extra.

In [18]:
corp = corp.str.replace(' +', ' ')

In [19]:
corp

4585    reason for visit the patient year old woman wh...
1593     falling this rhf was diagnosed with parkinson...
562     preoperative diagnoses bilateral mammary hyper...
825     procedure esophagogastroduodenoscopy with gast...
3777    preoperative diagnoses recurrent acute otitis ...
                              ...                        
4736    myoview nuclear stress study reason for the te...
4784    procedures performed left heart catheterizatio...
2315    preoperative diagnosis medial meniscal tear le...
2162    preoperative diagnosis herniated nucleus pulpo...
124     preoperative diagnosis recurring bladder infec...
Name: transcription, Length: 496, dtype: object

Rimozione stopwords, stemming e creazione matrice **term-doc**.

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
def tokenize(text):
    tokens = nltk.word_tokenize(text)
    return tokens


vect = TfidfVectorizer(tokenizer=tokenize,
                       lowercase=True,
                       n_gram_range)


X = vect.fit_transform(corp.values.astype('U')) #astype per convertire in stringa

doc_term_matrix = pd.DataFrame(X.toarray(), columns=vect.get_feature_names())

doc_term_matrix.head(5)

Unnamed: 0,abadeedleedlebadl,abat,abbrevi,abc,abcd,abcg,abd,abdomen,abdomin,abdominus,...,zoladex,zoloft,zone,zonegran,zoster,zosyn,zygoma,zygomat,zyloprim,zyrtec
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [21]:
print("Finally, there are {} medical transcriptions and {} terms.".format(
    str(doc_term_matrix.shape[0]), str(doc_term_matrix.shape[1])
))

Finally, there are 496 medical transcriptions and 7693 terms.


### Sklearn

In [22]:
t0 = time()
lda = LatentDirichletAllocation(n_components = 7, random_state = 999)
lda.fit(doc_term_matrix)
print("done in %0.3fs." % (time() - t0))

done in 4.382s.


In [24]:
import pyLDAvis.sklearn
pyLDAvis.enable_notebook()
vis = pyLDAvis.sklearn.prepare(lda, X, vect, R=20)

In [27]:
pyLDAvis.save_html(vis, 'LDA-MED-sub(vis. 1).html')

In [28]:
for index, topic in enumerate(lda.components_):
    print(f'Top 30 words for Topic #{index}')
    print([vect.get_feature_names()[i] for i in topic.argsort()[-20:]])
    print('\n')

Top 30 words for Topic #0
['allergi', 'examin', 'daili', 'old', 'blood', 'right', 'doe', 'note', 'day', 'past', 'deni', 'present', 'time', 'year', 'pain', 'normal', 'ani', 'medic', 'patient', 'histori']


Top 30 words for Topic #1
['histori', 'need', 'hour', 'time', 'diagnosi', 'given', 'pain', 'remov', 'carbohydr', 'day', 'risk', 'plan', 'use', 'place', 'blood', 'procedur', 'discuss', 'prostat', 'bladder', 'patient']


Top 30 words for Topic #2
['perform', 'contrast', 'breast', 'mass', 'evid', 'tissu', 'inject', 'diagnos', 'remov', 'posit', 'lesion', 'needl', 'obtain', 'tumor', 'left', 'normal', 'use', 'right', 'procedur', 'patient']


Top 30 words for Topic #3
['posterior', 'later', 'extrem', 'posit', 'fashion', 'medial', 'wound', 'oper', 'screw', 'procedur', 'remov', 'perform', 'time', 'fractur', 'right', 'knee', 'left', 'patient', 'use', 'place']


Top 30 words for Topic #4
['pain', 'ani', 'exam', 'unremark', 'use', 'area', 'present', 'year', 'histori', 'reveal', 'mild', 'bilater',

In [31]:
import numpy as np

topic_names = ["Topic" + str(i) for i in range(lda.n_components)]
doc_names = ["Doc" + str(i) for i in range(len(corp))]

df_document_topic = pd.DataFrame(np.round(lda.transform(doc_term_matrix), 2), columns=topic_names, index=doc_names)

topic = np.argmax(df_document_topic.values, axis=1)
df_document_topic['assigned_topic'] = topic

df_document_topic.head(10)

Unnamed: 0,Topic0,Topic1,Topic2,Topic3,Topic4,Topic5,Topic6,assigned_topic
Doc0,0.45,0.0,0.0,0.0,0.55,0.0,0.0,4
Doc1,0.67,0.0,0.0,0.0,0.32,0.0,0.0,0
Doc2,0.0,0.0,0.0,0.0,0.0,1.0,0.0,5
Doc3,0.23,0.0,0.16,0.11,0.0,0.5,0.0,5
Doc4,0.0,0.43,0.0,0.0,0.34,0.23,0.0,1
Doc5,0.0,0.42,0.0,0.39,0.0,0.19,0.0,1
Doc6,0.6,0.0,0.36,0.03,0.0,0.0,0.0,0
Doc7,0.87,0.0,0.0,0.0,0.08,0.05,0.0,0
Doc8,0.0,0.0,0.15,0.0,0.0,0.85,0.0,5
Doc9,0.79,0.0,0.21,0.0,0.0,0.0,0.0,0


In [32]:
df_document_topic['assigned_topic'].value_counts().to_frame()

Unnamed: 0,assigned_topic
0,181
5,119
4,52
2,44
1,37
3,35
6,28


### Gensim

In [22]:
# Convert sparse matrix to gensim corpus
corpus_gensim = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

id_map = dict((v, k) for k, v in vect.vocabulary_.items())

In [24]:
t0 = time()

# use LdaMulticore
ldamodel = gensim.models.LdaMulticore(corpus=corpus_gensim,
                                      id2word=id_map,
                                      random_state=999,
                                      iterations=50,
                                      num_topics=7)
print("done in %0.3fs." % (time() - t0))

done in 7.550s.


### Tomotopy

In [25]:
stop = stopwords.words('english')
corp_tomotopy = corp.apply(lambda x: [item for item in str(x).split() if item not in stop])

In [26]:
stemmer = SnowballStemmer("english")
corp_tomotopy = corp.apply(lambda x: [stemmer.stem(y) for y in x])

In [40]:
corpus_model = tp.utils.Corpus()
for doc in corp_tomotopy:
    corpus_model.add_doc(doc)

In [41]:
mdl = tp.LDAModel(k=7, seed=9, corpus=corpus_model)
mdl.burn_in = 10

In [53]:
t0 = time()
mdl.train(100)
print("done in %0.3fs." % (time() - t0))

done in 3.629s.
