# NLP Project

### Imports 

In [128]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.manifold import TSNE

from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer 

# from imblearn.over_sampling import SMOTE

In [47]:
df = pd.read_csv('mtsamples.csv')
df = df.dropna()
df = df.reset_index(drop=True)
df = df.drop(df.columns[0], axis=1)
# df = df.dropna(columns)

In [48]:
for i,item in enumerate(df['keywords']):
    df['keywords'][i] = item.split(', ')

In [49]:
df['keywords'][100]

['urology',
 'tissue flap relocation',
 'urethroplasty plate incision',
 'penile shaft skin',
 'chordee release',
 'zaontz catheter',
 'penile shaft',
 'hypospadias repair',
 'flap relocation',
 'coronal cuff',
 'urethral plate',
 'tissue flap',
 'hypospadias',
 'flap',
 'chordee,']

In [50]:
df.iloc[:14,:]


Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","[allergy / immunology, allergic rhinitis, alle..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","[bariatrics, laparoscopic gastric bypass, weig..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","[bariatrics, laparoscopic gastric bypass, hear..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","[cardiovascular / pulmonary, 2-d m-mode, doppl..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"[cardiovascular / pulmonary, 2-d, doppler, ech..."
5,Morbid obesity. Laparoscopic antecolic anteg...,Bariatrics,Laparoscopic Gastric Bypass,"PREOPERATIVE DIAGNOSIS: , Morbid obesity.,POST...","[bariatrics, gastric bypass, eea anastomosis, ..."
6,"Liposuction of the supraumbilical abdomen, re...",Bariatrics,Liposuction,"PREOPERATIVE DIAGNOSES:,1. Deformity, right b...","[bariatrics, breast reconstruction, excess, lm..."
7,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 3,"2-D ECHOCARDIOGRAM,Multiple views of the heart...","[cardiovascular / pulmonary, 2-d echocardiogra..."
8,Suction-assisted lipectomy - lipodystrophy of...,Bariatrics,Lipectomy - Abdomen/Thighs,"PREOPERATIVE DIAGNOSIS: , Lipodystrophy of the...","[bariatrics, lipodystrophy, abd pads, suction-..."
9,Echocardiogram and Doppler,Cardiovascular / Pulmonary,2-D Echocardiogram - 4,"DESCRIPTION:,1. Normal cardiac chambers size....","[cardiovascular / pulmonary, ejection fraction..."


In [51]:
vectorizer = TfidfVectorizer(analyzer='word', stop_words='english',ngram_range=(1,3), max_df=0.75, use_idf=True, smooth_idf=True, max_features=1000)
tfIdfMat  = vectorizer.fit_transform(df['transcription'].tolist() )
feature_names = sorted(vectorizer.get_feature_names())
print(feature_names)

['000', '08', '10', '100', '11', '12', '13', '14', '15', '15 blade', '16', '18', '19', '20', '24', '25', '30', '40', '45', '50', '60', '70', '80', '90', '92', 'abc', 'abcd', 'abdomen', 'abdomen soft', 'abdominal', 'abdominal pain', 'able', 'abnormal', 'abnormalities', 'abscess', 'achieved', 'active', 'activity', 'acute', 'addition', 'additional', 'adequate', 'adhesions', 'administered', 'admission', 'admitted', 'advanced', 'age', 'ago', 'air', 'alcohol', 'alert', 'allergies', 'allowed', 'alternatives', 'amounts', 'anastomosis', 'anesthesia', 'anesthesia general', 'anesthesia general endotracheal', 'anesthetic', 'angle', 'ankle', 'anterior', 'anterior chamber', 'anteriorly', 'antibiotic', 'antibiotics', 'aorta', 'aortic', 'apparent', 'appear', 'appearance', 'appeared', 'appearing', 'appears', 'applied', 'applied patient', 'appropriate', 'appropriately', 'approximated', 'approximately', 'approximately cm', 'area', 'areas', 'arm', 'arteries', 'artery', 'aspect', 'aspiration', 'assessment'



In [58]:
df['medical_specialty'].unique()

array([' Allergy / Immunology', ' Bariatrics',
       ' Cardiovascular / Pulmonary', ' Dentistry', ' Urology',
       ' General Medicine', ' Surgery', ' Speech - Language',
       ' SOAP / Chart / Progress Notes', ' Sleep Medicine',
       ' Rheumatology', ' Radiology', ' Psychiatry / Psychology',
       ' Podiatry', ' Physical Medicine - Rehab',
       ' Pediatrics - Neonatal', ' Pain Management', ' Orthopedic',
       ' Ophthalmology', ' Office Notes', ' Obstetrics / Gynecology',
       ' Neurosurgery', ' Neurology', ' Nephrology', ' Letters',
       ' Lab Medicine - Pathology', ' IME-QME-Work Comp etc.',
       ' Hospice - Palliative Care', ' Hematology - Oncology',
       ' Gastroenterology', ' ENT - Otolaryngology', ' Endocrinology',
       ' Emergency Room Reports', ' Discharge Summary',
       ' Diets and Nutritions', ' Dermatology',
       ' Cosmetic / Plastic Surgery', ' Consult - History and Phy.',
       ' Chiropractic'], dtype=object)

In [64]:
specialty_counts = df['medical_specialty'].value_counts()
mask = df['medical_specialty'].isin(specialty_counts.index[specialty_counts > 40])
df2 = df[mask]

In [67]:
df['medical_specialty'].value_counts()

 Surgery                          1021
 Orthopedic                        303
 Cardiovascular / Pulmonary        280
 Radiology                         251
 Consult - History and Phy.        234
 Gastroenterology                  195
 Neurology                         168
 General Medicine                  146
 SOAP / Chart / Progress Notes     142
 Urology                           140
 Obstetrics / Gynecology           130
 ENT - Otolaryngology               84
 Neurosurgery                       81
 Ophthalmology                      79
 Discharge Summary                  77
 Nephrology                         63
 Hematology - Oncology              62
 Pain Management                    58
 Office Notes                       44
 Pediatrics - Neonatal              42
 Podiatry                           42
 Emergency Room Reports             31
 Dermatology                        25
 Dentistry                          25
 Cosmetic / Plastic Surgery         25
 Letters                 

21

In [106]:
df

Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","[allergy / immunology, allergic rhinitis, alle..."
1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","[bariatrics, laparoscopic gastric bypass, weig..."
2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","[bariatrics, laparoscopic gastric bypass, hear..."
3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","[cardiovascular / pulmonary, 2-d m-mode, doppl..."
4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"[cardiovascular / pulmonary, 2-d, doppler, ech..."
...,...,...,...,...,...
3893,Patient suffered from morbid obesity for many...,Bariatrics,Discharge Summary - Gastric Bypass,"ADMISSION DIAGNOSIS:, Morbid obesity. BMI is...","[bariatrics, laparoscopic gastric bypass, gast..."
3894,Patient presented to the Bariatric Surgery Se...,Bariatrics,Bariatric Consult - Surgical Weight Loss - 4,"HISTORY OF PRESENT ILLNESS:, Ms. A is a 55-ye...","[bariatrics, jenny craig, medifast, nutrisyste..."
3895,Evaluation for elective surgical weight loss ...,Bariatrics,Bariatric Consult - Surgical Weight Loss - 2,"PAST MEDICAL HISTORY: ,She had a negative str...","[bariatrics, elective surgical weight loss, su..."
3896,"Chronic glossitis, xerostomia, probable envir...",Allergy / Immunology,Evaluation of Allergies,"HISTORY:, A 55-year-old female presents self-...","[allergy / immunology, chronic glossitis, xero..."


(3642,)

In [116]:
cols = ['medical_speciality', 'tokenized']
df3 = df2[cols]

KeyError: "['medical_speciality'] not in index"

In [76]:
!pip install transformers datasets evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
     ---------------------------------------- 84.1/84.1 kB 4.6 MB/s eta 0:00:00
Collecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Installing collected packages: responses, evaluate
Successfully installed evaluate-0.4.1 responses-0.18.0




In [77]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

Downloading tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [83]:


tokenized_series = []
for item in df2['transcription']:
    tokenized_series.append(tokenizer(item, truncation=True))

In [93]:
df2['tokenized'] = tokenized_series

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['tokenized'] = tokenized_series


In [108]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [111]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

model = AutoModelForSequenceClassification.from_pretrained("distilbert-base-uncased", num_labels=21)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.weight', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [121]:
from sklearn.model_selection import train_test_split

df3 = df2[['medical_specialty','tokenized']]

In [122]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

Unnamed: 0,medical_specialty,tokenized
3,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
4,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
7,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
9,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
11,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
...,...,...
3883,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
3884,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
3885,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"
3886,Cardiovascular / Pulmonary,"[input_ids, attention_mask]"


In [132]:
# Initialize LabelEncoder
label_encoder = LabelEncoder()

df2['medical_specialty_encoded'] = label_encoder.fit_transform(df3['medical_specialty'])
df3['medical_specialty_encoded'] = label_encoder.fit_transform(df3['medical_specialty'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['medical_specialty_encoded'] = label_encoder.fit_transform(df3['medical_specialty'])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df3['medical_specialty_encoded'] = label_encoder.fit_transform(df3['medical_specialty'])


In [133]:
df2['medical_specialty_encoded'].value_counts()

19    1021
13     303
0      280
17     251
1      234
4      195
8      168
5      146
18     142
20     140
10     130
3       84
9       81
12      79
2       77
7       63
6       62
14      58
11      44
16      42
15      42
Name: medical_specialty_encoded, dtype: int64