In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv(r"C:\Users\Harshit Singh\Music\DSAI\mtsamplesV2.csv")
df.head()

Unnamed: 0,medical_specialty,transcription
0,Neurology,"CC:, Confusion and slurred speech.,HX , (prima..."
1,Urology,"PROCEDURE: , Elective male sterilization via b..."
2,Urology,"INDICATION:, Prostate Cancer.,TECHNIQUE:, 3...."
3,Urology,"DESCRIPTION:, The patient was placed in the s..."
4,Urology,"PREOPERATIVE DIAGNOSIS: , Voluntary sterility...."


In [3]:
df.shape

(1239, 2)

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1239 entries, 0 to 1238
Data columns (total 2 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  1239 non-null   object
 1   transcription      1231 non-null   object
dtypes: object(2)
memory usage: 19.5+ KB


In [5]:
df.isna().sum()

medical_specialty    0
transcription        8
dtype: int64

In [6]:
new_df = df.dropna()
new_df.isna().sum()

medical_specialty    0
transcription        0
dtype: int64

In [7]:
new_df['medical_specialty'].unique()

array([' Neurology', ' Urology', ' Radiology', ' Orthopedic',
       ' Gastroenterology'], dtype=object)

In [8]:
new_df['medical_specialty'].value_counts()

 Orthopedic          355
 Radiology           273
 Gastroenterology    224
 Neurology           223
 Urology             156
Name: medical_specialty, dtype: int64

## Text Preprocessing and Cleaning

In [12]:
print('Sample transcription 1:'+df.iloc[4]['transcription']+'\n')
print('Sample transcription 2:'+df.iloc[14]['transcription']+'\n')

Sample transcription 1:PREOPERATIVE DIAGNOSIS: , Voluntary sterility.,POSTOPERATIVE DIAGNOSIS: , Voluntary sterility.,OPERATIVE PROCEDURE:,  Bilateral vasectomy.,ANESTHESIA:,  Local.,INDICATIONS FOR PROCEDURE:  ,A gentleman who is here today requesting voluntary sterility.  Options were discussed for voluntary sterility and he has elected to proceed with a bilateral vasectomy.,DESCRIPTION OF PROCEDURE:  ,The patient was brought to the operating room, and after appropriately identifying the patient, the patient was prepped and draped in the standard surgical fashion and placed in a supine position on the OR table.  Then, 0.25% Marcaine without epinephrine was used to anesthetize the scrotal skin.  A small incision was made in the right hemiscrotum.  The vas deferens was grasped with a vas clamp.  Next, the vas deferens was skeletonized.  It was clipped proximally and distally twice.  The cut edges were fulgurated.  Meticulous hemostasis was maintained.  Then, 4-0 chromic was used to clo

In [13]:
## Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

# from imblearn.over_sampling import SMOTE

special_character_remover = re.compile('[/(){}\[\]\|@,;]')
extra_symbol_remover = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [14]:
def clean_text(text):
    text = str(text)
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = text.lower()
    text = special_character_remover.sub(' ',text)
    text = extra_symbol_remover.sub('',text)
    text = ' '.join(word for word in text.split() if word not in STOPWORDS)
    return text

def lemmatize_text(text):
    wordlist=[]
    lemmatizer = WordNetLemmatizer() 
    sentences=sent_tokenize(text)
    
    for sentence in sentences:
        words=word_tokenize(sentence)
        for word in words:
            wordlist.append(lemmatizer.lemmatize(word))    
    return ' '.join(wordlist) 

In [15]:
df['transcription'] = df['transcription'].apply(clean_text)
df['transcription'] = df['transcription'].apply(lemmatize_text)

In [16]:
print('Sample Transcription 1:'+df.iloc[5]['transcription']+'\n')
print('Sample Transcription 2:'+df.iloc[125]['transcription']+'\n')
print('Sample Transcription 3:'+df.iloc[1000]['transcription'])

Sample Transcription 1:chief complaint

Sample Transcription 2:preoperative diagnosis benign prostatic hypertrophy urinary retention postoperative diagnosis benign prostatic hypertrophy urinary retention procedure performed cystourethroscopy transurethral resection prostate turp anesthesia spinal drain # french threeway foley catheter specimen prostatic resection chip estimated blood loss cc disposition patient transferred pacu stable condition indication finding yearold male history bph subsequent urinary retention failure trial void scheduled elective turp procedure finding time surgery cystourethroscopy revealed trilobar enlargement prostate prostatic varix median lobe cystoscopy showed cellules bladder obvious bladder tumor noted description procedure informed consent obtained patient moved operating room spinal anesthesia induced department anesthesia patient prepped draped normal sterile fashion # french cystoscope inserted urethra bladder cystoscopy performed finding cystoscope 

In [43]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(df['transcription'].tolist() )
feature_names = sorted(vectorizer.get_feature_names())
print(feature_names)

['abc', 'abcd', 'abdomen', 'abdominal', 'abdominal pain', 'able', 'abnormal', 'abnormality', 'achieved', 'active', 'activity', 'acute', 'addition', 'additional', 'adequate', 'adhesion', 'administered', 'admission', 'admitted', 'advanced', 'age', 'ago', 'air', 'alcohol', 'alert', 'alert oriented', 'alignment', 'allergy', 'alternative', 'amplitude', 'anesthesia', 'anesthesia general', 'anesthetic', 'aneurysm', 'ankle', 'annular', 'anterior', 'anterior cervical', 'antibiotic', 'aortic', 'ap', 'apparent', 'appear', 'appearance', 'appeared', 'appears', 'appendix', 'applied', 'applied patient', 'appropriate', 'approximated', 'approximately', 'approximately cm', 'area', 'arm', 'artery', 'aspect', 'aspiration', 'assessment', 'associated', 'atrophy', 'attachment', 'attention', 'attention directed', 'axial', 'base', 'based', 'bed', 'began', 'benefit', 'benign', 'better', 'biceps', 'bid', 'bilateral', 'bilaterally', 'biopsy', 'bipolar', 'bladder', 'blade', 'ble', 'bleeding', 'block', 'blood', 'bl



In [46]:
pca = PCA(n_components=0.95)
tfIdfMat_reduced = pca.fit_transform(tfIdfMat.toarray())
labels = df['medical_specialty'].tolist()
category_list = df.medical_specialty.unique()
X_train, X_test, y_train, y_test = train_test_split(tfIdfMat_reduced, labels, stratify=labels,random_state=1)   

In [47]:
print('Train_Set_Size:'+str(X_train.shape))
print('Test_Set_Size:'+str(X_test.shape))

Train_Set_Size:(929, 462)
Test_Set_Size:(310, 462)


In [54]:
clf = LogisticRegression(penalty= 'elasticnet', solver= 'saga', l1_ratio=0.5, random_state=1).fit(X_train, y_train)
y_test_pred= clf.predict(X_test)
# y_test_pred

In [56]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_test_pred)

0.7483870967741936

In [50]:
print(classification_report(y_test,y_test_pred,labels=category_list))

                   precision    recall  f1-score   support

        Neurology       0.61      0.59      0.60        56
          Urology       0.94      0.74      0.83        39
        Radiology       0.59      0.65      0.62        68
       Orthopedic       0.82      0.91      0.86        89
 Gastroenterology       0.87      0.78      0.82        58

         accuracy                           0.75       310
        macro avg       0.76      0.73      0.75       310
     weighted avg       0.76      0.75      0.75       310



In [52]:
labels = category_list
cm = confusion_matrix(y_test, y_test_pred)
cm

array([[45,  1,  1, 10,  1],
       [ 0, 33,  7, 16,  0],
       [ 0,  5, 81,  3,  0],
       [ 1, 12, 10, 44,  1],
       [ 6,  3,  0,  1, 29]], dtype=int64)

In [9]:
#import required libraries
import pandas as pd
import random
import re
import string
import nltk
from nltk.corpus import stopwords  # removing all the stop words
from nltk.stem.wordnet import WordNetLemmatizer
# nltk.download('punkt')
# nltk.download('wordnet')
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score

In [10]:
X= df['transcription']
y= df['medical_specialty']

In [33]:
corpus = []  ##  Empty corpus list, can be used to store all the text after cleaning.
for i in range(len(X)):
    '''Make text lowercase, remove text in square brackets,remove links,remove punctuation
    and remove words containing numbers.'''
    text = re.sub('[^a-zA-Z]', ' ', str(X.iloc[i]))
    text = re.sub(r'\d+', '', text) # Remove numbers
    text = re.sub('\[.*?\]', '', text)
    text = " ".join(text.split()) # Remove whitespaces
    text = text.lower()
    text = text.split()
    
    all_stopwords = stopwords.words('english')
    text = [y for y in text if y not in all_stopwords]
    
    # Stemming
    # ps = PorterStemmer()
    # review = [ps.stem(word) for word in review if not word in set(all_stopwords)]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    text = [lemmatizer.lemmatize(word) for word in text if word not in set(all_stopwords)]


    text = ' '.join(text)
    corpus.append(text)

In [None]:
# clean_text function shown below preprocess the text data provided.
def clean_text(text):
    text = str(text)
    # lower text 
    text = text.lower()
    #removing stop words
    text = ' '.join([e_words for e_words in text.split(' ') if e_words not in stopwords.words('english')])
    #removing square brackets
    text=re.sub('[.*?]', '', text)
    text=re.sub('+', '', text)
    #removing hyperlink
    text= re.sub('https?://S+|www.S+', '', text)
    #removing puncuation
    text=re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('n' , '', text)
    #remove words containing numbers
    text=re.sub('w*dw*' , '', text)
    #tokenizer
    text = nltk.word_tokenize(text)
    #lemmatizer
    wn = nltk.WordNetLemmatizer()
    text = [wn.lemmatize(word) for word in text]
    text = " ".join(text)
    return text

new_df["transcription"]= new_df["transcription"].apply(clean_text)

In [17]:
# In Machine learning algorithms mostly take numeric feature vectors as input. 
# Thus, when working with text data, need to convert each document into a numeric vector using CountVectorizer.

count_vectorizer = CountVectorizer()

train_vectors_counts = count_vectorizer.fit_transform(df["transcription"])

train_vectors_counts.shape

(1239, 13593)

In [19]:
# Fitting a simple Multinomial Naive Bayes model
mnb = MultinomialNB()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
print("Mean Accuracy: {:.2}".format(cross_val_score(mnb, train_vectors_counts, df["medical_specialty"], cv=cv).mean()))

Mean Accuracy: 0.72
