In [1]:
import pandas as pd
import re
from sklearn.preprocessing import LabelEncoder
import spacy

In [2]:
# Load the raw medical-transcription samples into a DataFrame and preview the first rows.
data= pd.read_csv('data/mtsamples.csv')
data.head()

Unnamed: 0,id,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


## DATA CLEANING

In [3]:
# Count missing values per column before any cleaning.
data.isnull().sum()

id                      0
description             0
medical_specialty       0
sample_name             0
transcription          33
keywords             1068
dtype: int64

In [4]:
# Drop every row that has a missing value in ANY column.
# NOTE(review): almost all missing values are in `keywords` (1068 vs 33 in
# `transcription`), and `keywords` is discarded a few cells later, so rows with
# a usable transcription are thrown away here as well. Consider
# dropna(subset=['transcription']) instead — confirm the intent before changing,
# since it alters the class counts shown below.
data= data.dropna()

In [5]:
# Confirm that no missing values remain after dropna().
data.isnull().sum()

id                   0
description          0
medical_specialty    0
sample_name          0
transcription        0
keywords             0
dtype: int64

In [6]:
# Class distribution: number of samples per medical specialty.
data['medical_specialty'].value_counts()

 Surgery                          1021
 Orthopedic                        303
 Cardiovascular / Pulmonary        280
 Radiology                         251
 Consult - History and Phy.        234
 Gastroenterology                  195
 Neurology                         168
 General Medicine                  146
 SOAP / Chart / Progress Notes     142
 Urology                           140
 Obstetrics / Gynecology           130
 ENT - Otolaryngology               84
 Neurosurgery                       81
 Ophthalmology                      79
 Discharge Summary                  77
 Nephrology                         63
 Hematology - Oncology              62
 Pain Management                    58
 Office Notes                       44
 Pediatrics - Neonatal              42
 Podiatry                           42
 Emergency Room Reports             31
 Dermatology                        25
 Dentistry                          25
 Cosmetic / Plastic Surgery         25
 Letters                 

In [7]:
# Inspect the Urology rows. Note the filter value: the raw specialty labels
# carry a leading space (" Urology"), so the labels need whitespace cleanup
# before they can be matched or used as classes.
data[data['medical_specialty']== ' Urology'].head() 

Unnamed: 0,id,description,medical_specialty,sample_name,transcription,keywords
18,18,Fertile male with completed family. Elective...,Urology,Vasectomy - 4,"PROCEDURE: , Elective male sterilization via b...","urology, sterilization, vas, fertile male, bil..."
20,20,Whole body radionuclide bone scan due to pros...,Urology,Whole Body Radionuclide Bone Scan,"INDICATION:, Prostate Cancer.,TECHNIQUE:, 3....","urology, prostate cancer, technetium, whole bo..."
22,22,Normal vasectomy,Urology,Vasectomy - 1,"DESCRIPTION:, The patient was placed in the s...","urology, vasectomy, allis clamp, catgut, hemoc..."
23,23,Voluntary sterility. Bilateral vasectomy. T...,Urology,Vasectomy,"PREOPERATIVE DIAGNOSIS: , Voluntary sterility....","urology, hemiscrotum, bilateral vasectomy, vol..."
25,25,Normal vasectomy,Urology,Vasectomy - 2,"DIAGNOSIS:, Desires vasectomy.,NAME OF OPERAT...","urology, scrotal incision, right vas, bleeding..."


In [8]:
# Keep only the label (medical_specialty) and the text (transcription);
# the other columns are not used in the rest of this notebook.
data = data.drop(columns=['id', 'description', 'sample_name', 'keywords'])

In [9]:
# Check the frame after dropping the unused columns.
data.head()

Unnamed: 0,medical_specialty,transcription
0,Allergy / Immunology,"SUBJECTIVE:, This 23-year-old white female pr..."
1,Bariatrics,"PAST MEDICAL HISTORY:, He has difficulty climb..."
2,Bariatrics,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ..."
3,Cardiovascular / Pulmonary,"2-D M-MODE: , ,1. Left atrial enlargement wit..."
4,Cardiovascular / Pulmonary,1. The left ventricular cavity size and wall ...


In [10]:
# Look at one raw transcription (the row with index label 0) to see what needs cleaning.
f= data['transcription'][0]
f

'SUBJECTIVE:,  This 23-year-old white female presents with complaint of allergies.  She used to have allergies when she lived in Seattle but she thinks they are worse here.  In the past, she has tried Claritin, and Zyrtec.  Both worked for short time but then seemed to lose effectiveness.  She has used Allegra also.  She used that last summer and she began using it again two weeks ago.  It does not appear to be working very well.  She has used over-the-counter sprays but no prescription nasal sprays.  She does have asthma but doest not require daily medication for this and does not think it is flaring up.,MEDICATIONS: , Her only medication currently is Ortho Tri-Cyclen and the Allegra.,ALLERGIES: , She has no known medicine allergies.,OBJECTIVE:,Vitals:  Weight was 130 pounds and blood pressure 124/78.,HEENT:  Her throat was mildly erythematous without exudate.  Nasal mucosa was erythematous and swollen.  Only clear drainage was seen.  TMs were clear.,Neck:  Supple without adenopathy.,

In [11]:
#remove spaces and special characters
#data cleaning

def cleaning(text: str) -> str:
    """Normalise a raw text field for downstream NLP.

    Every character that is not a word character, whitespace or an apostrophe
    is replaced by a space; any run of whitespace (spaces, tabs, newlines) is
    then collapsed into a single space, and the result is lower-cased and
    stripped of leading/trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text, e.g. a specialty label such as " Urology" or a transcription.

    Returns
    -------
    str
        The cleaned, lower-cased text with single-space separators.
    """
    # Raw strings keep `\w` / `\s` as regex escapes instead of (invalid)
    # Python string escapes.
    text = re.sub(r"[^\w\s']", " ", text)
    # Collapse ALL whitespace in one pass. Collapsing spaces first and only then
    # turning newlines into spaces would leave double spaces behind.
    text = re.sub(r"\s+", " ", text)
    return text.lower().strip()

In [12]:
# Normalise both text columns with cleaning(): the specialty labels and the transcriptions.
data['medical_specialty']= data['medical_specialty'].apply(cleaning)
data['transcription']= data['transcription'].apply(cleaning)

In [13]:
# Preview the cleaned labels and transcription text.
data.head()

Unnamed: 0,medical_specialty,transcription
0,allergy immunology,subjective this 23 year old white female prese...
1,bariatrics,past medical history he has difficulty climbin...
2,bariatrics,history of present illness i have seen abc tod...
3,cardiovascular pulmonary,2 d m mode 1 left atrial enlargement with left...
4,cardiovascular pulmonary,1 the left ventricular cavity size and wall th...


In [14]:
# Restrict the dataset to the five medical specialties this project focuses on.
# .copy() makes `data` an independent frame instead of a slice of the previous
# one, so the column assignments in later cells do not trigger
# SettingWithCopyWarning.
data = data[data['medical_specialty'].isin({"orthopedic", "cardiovascular pulmonary", "radiology", "gastroenterology", "neurology"})].copy()

In [15]:
# Save a snapshot of the filtered data to temp.csv in the working directory.
# NOTE(review): nothing in the visible notebook reads this file back — confirm it is still needed.
data.to_csv('temp.csv')

In [18]:
# Samples per specialty after filtering to the five selected classes.
data.medical_specialty.value_counts()

orthopedic                  303
cardiovascular pulmonary    280
radiology                   251
gastroenterology            195
neurology                   168
Name: medical_specialty, dtype: int64

## DATA PREPROCESSING, PRECOMPUTATION

In [20]:
# Load the spaCy 'en_core_web_lg' pipeline once; `nlp` is used by preprocess() and getvector().
nlp= spacy.load('en_core_web_lg')

In [23]:
# Preprocess function: drop stop words, punctuation and digit tokens, then keep the lemma of each remaining token

def preprocess(text):
    """Return `text` reduced to the space-joined lemmas of its kept tokens.

    The text is run through the module-level spaCy pipeline `nlp`; tokens
    flagged as stop words, punctuation or digits are discarded and the lemma
    of every remaining token is kept, in order.
    """
    kept_lemmas = [
        token.lemma_
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.is_digit)
    ]
    return ' '.join(kept_lemmas)

In [24]:
# Replace each transcription with its preprocessed (filtered, lemmatized) form.
data['transcription']= data['transcription'].apply(preprocess)

In [27]:
# Preview the transcriptions after preprocessing.
data.head()

Unnamed: 0,medical_specialty,transcription
3,cardiovascular pulmonary,d m mode leave atrial enlargement left atrial ...
4,cardiovascular pulmonary,left ventricular cavity size wall thickness ap...
7,cardiovascular pulmonary,d echocardiogram multiple view heart great ves...
9,cardiovascular pulmonary,description normal cardiac chamber size normal...
11,cardiovascular pulmonary,d study mild aortic stenosis widely calcify mi...


In [32]:
# Encode each specialty name as an integer class id in a new `medical_label` column.
# The fitted encoder is kept in `l`.
l= LabelEncoder()
data['medical_label']= l.fit_transform(data['medical_specialty'])

In [34]:
# Samples per encoded class id.
data['medical_label'].value_counts()

3    303
0    280
4    251
1    195
2    168
Name: medical_label, dtype: int64

In [37]:
# Helper that builds the embedding later stored in the word2vec_embed column

def getvector(text):
    """Return the document vector that the spaCy pipeline `nlp` computes for `text`."""
    return nlp(text).vector

In [42]:
# Sanity check: the embedding of a short sentence is a 300-dimensional vector.
getvector("i am nitin").shape

(300,)

In [43]:
# Add a column holding the getvector() embedding of each preprocessed transcription.
data['word2vec_embed']= data['transcription'].apply(getvector)

In [44]:
# Preview the frame with the new medical_label and word2vec_embed columns.
data.head()

Unnamed: 0,medical_specialty,transcription,medical_label,word2vec_embed
3,cardiovascular pulmonary,d m mode leave atrial enlargement left atrial ...,0,"[-0.39130253, 1.6925055, -0.97060895, 1.423319..."
4,cardiovascular pulmonary,left ventricular cavity size wall thickness ap...,0,"[-0.54487616, 1.3033527, -1.5350755, 1.3164338..."
7,cardiovascular pulmonary,d echocardiogram multiple view heart great ves...,0,"[-0.40531728, 0.8317133, -1.0865903, 0.2969002..."
9,cardiovascular pulmonary,description normal cardiac chamber size normal...,0,"[-0.3478441, 0.9546592, -1.0535976, 1.2019306,..."
11,cardiovascular pulmonary,d study mild aortic stenosis widely calcify mi...,0,"[0.27329454, 1.1477208, -0.88355297, 1.0739923..."


In [51]:
# Precomputation for the fastText model.

# A fastText label is the whitespace-delimited token that starts with
# "__label__", so a specialty name must not contain spaces: swap them for
# underscores. The result is assigned back to the column rather than calling an
# inplace method on `data.medical_specialty`, which acts on an intermediate
# Series and is not guaranteed to update the DataFrame itself.
data['medical_specialty'] = data['medical_specialty'].str.replace(" ", "_", regex=False)

# One training line per row: "__label__<specialty> <transcription>".
data['fasttext_label']= "__label__" + data['medical_specialty'] + " " + data['transcription']

In [52]:
# Preview the frame including the new fasttext_label column.
data.head()

Unnamed: 0,medical_specialty,transcription,medical_label,word2vec_embed,fasttext_label
3,cardiovascular_pulmonary,d m mode leave atrial enlargement left atrial ...,0,"[-0.39130253, 1.6925055, -0.97060895, 1.423319...",__label__cardiovascular_pulmonary d m mode lea...
4,cardiovascular_pulmonary,left ventricular cavity size wall thickness ap...,0,"[-0.54487616, 1.3033527, -1.5350755, 1.3164338...",__label__cardiovascular_pulmonary left ventric...
7,cardiovascular_pulmonary,d echocardiogram multiple view heart great ves...,0,"[-0.40531728, 0.8317133, -1.0865903, 0.2969002...",__label__cardiovascular_pulmonary d echocardio...
9,cardiovascular_pulmonary,description normal cardiac chamber size normal...,0,"[-0.3478441, 0.9546592, -1.0535976, 1.2019306,...",__label__cardiovascular_pulmonary description ...
11,cardiovascular_pulmonary,d study mild aortic stenosis widely calcify mi...,0,"[0.27329454, 1.1477208, -0.88355297, 1.0739923...",__label__cardiovascular_pulmonary d study mild...


In [54]:
# Write the cleaned, preprocessed and precomputed data to a CSV file.
# NOTE(review): CSV stores every cell as text, so the vectors in `word2vec_embed`
# will not come back as arrays when the file is read — verify how the consumer
# of this file parses that column.
data.to_csv("data/medical_new.csv")