### Caricamento librerie

In [25]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation/future warnings raised by the
# libraries used below - confirm that a blanket filter is really intended.
warnings.filterwarnings('ignore')

In [2]:
import os
from time import time

In [3]:
import pandas as pd

In [4]:
import nltk
#nltk.download('punkt')
from nltk import word_tokenize 
from nltk.corpus import stopwords
#nltk.download('stopwords')
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer

In [5]:
from sklearn.decomposition import LatentDirichletAllocation

In [6]:
import gensim

In [7]:
import tomotopy as tp

### Pre-processing

Caricamento dataset. 

In [8]:
# Location of the raw CSV. The path can be overridden through the
# MED_TRANSCRIPTION_CSV environment variable so the notebook also runs on
# other machines; when the variable is unset the original path is used.
DATA_PATH = os.environ.get(
    'MED_TRANSCRIPTION_CSV',
    'C:/Users/micky/OneDrive/Desktop/Tesi DS/data/med_transcription.csv',
)
data = pd.read_csv(DATA_PATH)
data.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [9]:
# (rows, columns) of the loaded dataframe.
data.shape

(4999, 6)

In [10]:
# Work only on the 'transcription' column from here on; preview the first rows.
corp = data['transcription']
corp.head()

0    SUBJECTIVE:,  This 23-year-old white female pr...
1    PAST MEDICAL HISTORY:, He has difficulty climb...
2    HISTORY OF PRESENT ILLNESS: , I have seen ABC ...
3    2-D M-MODE: , ,1.  Left atrial enlargement wit...
4    1.  The left ventricular cavity size and wall ...
Name: transcription, dtype: object

In [11]:
# Drop missing transcriptions *before* casting to str. Casting first and then
# filtering on the literal 'nan' would also discard a document whose text is
# really "nan", and would keep None values as the string 'None'.
corp = corp.dropna().astype(str)
corp

0       SUBJECTIVE:,  This 23-year-old white female pr...
1       PAST MEDICAL HISTORY:, He has difficulty climb...
2       HISTORY OF PRESENT ILLNESS: , I have seen ABC ...
3       2-D M-MODE: , ,1.  Left atrial enlargement wit...
4       1.  The left ventricular cavity size and wall ...
                              ...                        
4994    HISTORY:,  I had the pleasure of meeting and e...
4995    ADMITTING DIAGNOSIS: , Kawasaki disease.,DISCH...
4996    SUBJECTIVE: , This is a 42-year-old white fema...
4997    CHIEF COMPLAINT: , This 5-year-old male presen...
4998    HISTORY: , A 34-year-old male presents today s...
Name: transcription, Length: 4966, dtype: object

Conversione in minuscolo.

In [12]:
# Lower-case every document; the later regex steps run on this lower-cased text.
corp = corp.str.lower()

Rimozione numeri.

In [13]:
# Replace every run of digits with a single space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would silently leave the numbers in place.
corp = corp.str.replace(r'[0-9]+', ' ', regex=True)

Rimozione link.

In [14]:
# Replace anything that starts with "http" (up to the next whitespace) with a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, so the links would otherwise never match.
corp = corp.str.replace(r'http\S+', ' ', regex=True)

Rimozione caratteri speciali (rimuovo qualsiasi cosa che non sia una lettera, una cifra o uno spazio; viene quindi rimossa anche la punteggiatura).

In [15]:
# Keep only ASCII letters, digits and spaces; every other character
# (punctuation included) becomes a space.
# regex=True is passed explicitly: pandas >= 2.0 treats the pattern as a
# literal string by default, which would turn this step into a no-op.
corp = corp.str.replace(r'[^a-zA-Z0-9 ]', ' ', regex=True)

Rimozione delle parole composte da una o due lettere.

In [16]:
# Remove words made of one or two characters.
# - regex=True is required on pandas >= 2.0, where patterns are literal by default.
# - `(?:\s|$)` also catches a short word at the very end of a document, which
#   the original `\s`-only pattern left in place.
corp = corp.str.replace(r'\b\w{1,2}(?:\s|$)', '', regex=True)

Rimozione spazi extra.

In [17]:
# Collapse runs of spaces into one and trim the ends of each document.
# regex=True is required on pandas >= 2.0 (patterns are literal by default);
# the strip removes the leading/trailing space the substitution leaves behind.
corp = corp.str.replace(' +', ' ', regex=True).str.strip()

In [18]:
# Display the cleaned corpus to eyeball the result of the steps above.
corp

0       subjective this year old white female presents...
1       past medical history has difficulty climbing s...
2       history present illness have seen abc today ve...
3        mode left atrial enlargement with left atrial...
4        the left ventricular cavity size and wall thi...
                              ...                        
4994    history had the pleasure meeting and evaluatin...
4995    admitting diagnosis kawasaki disease discharge...
4996    subjective this year old white female who come...
4997    chief complaint this year old male presents ch...
4998    history year old male presents today self refe...
Name: transcription, Length: 4966, dtype: object

Rimozione delle stopword, stemming e creazione della matrice **doc-term** (documenti sulle righe, termini sulle colonne).

In [19]:
# English Snowball stemmer; `tokenize` below reads this module-level object.
stemmer = SnowballStemmer(language='english')

def stem_tokens(tokens, stemmer):
    """Return the stem of every token, preserving the input order.

    Args:
        tokens: iterable of token strings.
        stemmer: object exposing a ``stem(token)`` method.

    Returns:
        A new list holding one stemmed entry per input token.
    """
    return [stemmer.stem(token) for token in tokens]

def tokenize(text):
    """Tokenise ``text`` with NLTK and return the list of stemmed tokens.

    Stemming is delegated to ``stem_tokens`` using the module-level ``stemmer``.
    """
    return stem_tokens(nltk.word_tokenize(text), stemmer)


# Bag-of-words vectoriser that delegates tokenisation (and stemming) to `tokenize`.
# NOTE(review): scikit-learn appears to apply `stop_words` to the tokens returned
# by a custom tokenizer, i.e. after stemming - confirm that stems which no longer
# match the English stop list are meant to stay in the vocabulary.
vect = CountVectorizer(tokenizer=tokenize,
                       stop_words='english',
                       lowercase=True)


X = vect.fit_transform(corp.values.astype('U'))  # astype('U'): cast to unicode strings

# `get_feature_names` was removed in scikit-learn 1.2; prefer the replacement
# and fall back to the old method on releases that do not have it yet.
if hasattr(vect, 'get_feature_names_out'):
    feature_names = vect.get_feature_names_out()
else:
    feature_names = vect.get_feature_names()

# Dense documents x terms table (one row per document, one column per term).
doc_term_matrix = pd.DataFrame(X.toarray(), columns=feature_names)

doc_term_matrix.head(5)

Unnamed: 0,aaaa,ab,abadeedleedlebadl,abandon,abat,abbott,abbrevi,abc,abcd,abcg,...,zuba,zumi,zung,zygoma,zygomat,zyloprim,zymar,zyprexa,zyrtec,zyvox
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [20]:
# Report the size of the doc-term matrix: shape is (documents, terms).
print("At the end, there are {} medical transcription and {} terms.".format(*doc_term_matrix.shape))

At the end, there are 4966 medical transcription and 14056 terms.


### Scikit Learn

In [21]:
# Model setup: scikit-learn LDA with 7 topics and a fixed random_state.
lda = LatentDirichletAllocation(n_components = 7, random_state = 999)

In [22]:
# Fit the scikit-learn model on the dense doc-term table and report wall-clock time.
t0 = time()
lda.fit(doc_term_matrix)
print("done in %0.3fs." % (time() - t0))

done in 49.955s.


In [23]:
# pyLDAvis 3.4 renamed its scikit-learn adapter from `pyLDAvis.sklearn` to
# `pyLDAvis.lda_model`; try the new module first and fall back to the old one
# so the cell runs on both old and new releases.
import pyLDAvis
try:
    import pyLDAvis.lda_model as pyldavis_adapter
except ImportError:
    import pyLDAvis.sklearn as pyldavis_adapter

pyLDAvis.enable_notebook()
# R=20: request 20 terms per topic; sort_topics=False is passed through unchanged.
vis = pyldavis_adapter.prepare(lda, X, vect, R=20, sort_topics=False)

In [26]:
# Export the prepared visualisation (built with R=20, i.e. top 20 terms per topic)
# to a standalone HTML file.
pyLDAvis.save_html(vis, 'LDA-MED(vis. 1).html')

In [27]:
n_top_words = 20  # how many of the highest-weight terms to list per topic

# Resolve the vocabulary once, outside the loop. `get_feature_names` was removed
# in scikit-learn 1.2, so prefer `get_feature_names_out` when it exists;
# `.tolist()` yields plain Python strings so the printed list stays readable.
if hasattr(vect, 'get_feature_names_out'):
    topic_terms = vect.get_feature_names_out().tolist()
else:
    topic_terms = vect.get_feature_names()

for index, topic in enumerate(lda.components_):
    # The header now reports the number of words actually printed (it used to say 30).
    print(f'Top {n_top_words} words for Topic #{index}')
    # argsort is ascending, so the last n_top_words indices are the largest weights.
    print([topic_terms[i] for i in topic.argsort()[-n_top_words:]])
    print('\n')

Top 30 words for Topic #0
['remov', 'anesthesia', 'perform', 'knee', 'distal', 'oper', 'proxim', 'tendon', 'bone', 'fractur', 'foot', 'incis', 'procedur', 'medial', 'later', 'left', 'place', 'right', 'use', 'patient']


Top 30 words for Topic #1
['area', 'fashion', 'posit', 'diagnosi', 'dissect', 'anterior', 'close', 'room', 'anesthesia', 'skin', 'remov', 'sutur', 'incis', 'right', 'oper', 'left', 'procedur', 'use', 'place', 'patient']


Top 30 words for Topic #2
['year', 'posit', 'breast', 'evid', 'remov', 'stomach', 'tube', 'obtain', 'bleed', 'histori', 'polyp', 'mass', 'diagnosi', 'scope', 'perform', 'normal', 'colon', 'biopsi', 'procedur', 'patient']


Top 30 words for Topic #3
['abdomen', 'vessel', 'french', 'sutur', 'anterior', 'incis', 'note', 'perform', 'bladder', 'coronari', 'normal', 'remov', 'cathet', 'procedur', 'use', 'place', 'patient', 'arteri', 'right', 'left']


Top 30 words for Topic #4
['symptom', 'present', 'lower', 'spine', 'unremark', 'year', 'mri', 'bilater', 'te

### Gensim

In [28]:
# Wrap the scipy sparse matrix as a gensim corpus; documents_columns=False
# tells gensim that documents are stored along the rows.
corpus_gensim = gensim.matutils.Sparse2Corpus(X, documents_columns=False)

# Invert the vectoriser vocabulary (term -> column id) into id -> term.
id_map = {term_id: term for term, term_id in vect.vocabulary_.items()}

In [29]:
# Train the gensim multicore LDA (7 topics, fixed random_state) and report wall-clock time.
t0 = time()
ldamodel = gensim.models.LdaMulticore(corpus=corpus_gensim,
                                      id2word=id_map,
                                      random_state=999,
                                      iterations=50,
                                      num_topics=7)
print("done in %0.3fs." % (time() - t0))

done in 15.377s.


### Tomotopy

In [30]:
# NLTK English stop-word list. A set copy is used for the membership test so
# each token lookup is constant-time instead of a scan over the whole list.
stop = stopwords.words('english')
stop_lookup = set(stop)
# Whitespace-split every document and keep only the tokens that are not stop words.
corp_tomotopy = corp.apply(lambda x: [item for item in str(x).split() if item not in stop_lookup])

In [31]:
# Rebind `stemmer` to an English Snowball stemmer and stem every token of every document.
stemmer = SnowballStemmer("english")
corp_tomotopy = corp_tomotopy.apply(lambda x: [stemmer.stem(y) for y in x])

In [32]:
# 7-topic tomotopy LDA. The seed matches the random_state=999 used for the
# scikit-learn and gensim models, so this run is no longer seeded randomly.
mdl = tp.LDAModel(k=7, seed=999)
for document in corp_tomotopy:
    mdl.add_doc(document)

mdl.burn_in = 10
# Zero iterations here; the timed training run is done in the next cell.
# NOTE(review): presumably this call only initialises the model - confirm
# against the tomotopy documentation.
mdl.train(0)

In [33]:
# Run 100 training iterations on the tomotopy model and report wall-clock time.
t0 = time()
mdl.train(100)
print("done in %0.3fs." % (time() - t0))

done in 3.843s.
