### Reading Data

In [2]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv("new_train.csv", index_col=0)
test_df = pd.read_csv("new_test.csv", index_col=0)

print("Train size", len(train_df))
print("Test size", len(test_df))
train_df.head(n=3)

Train size 3969
Test size 997


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1


In [3]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3969 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  3969 non-null   object
 1   transcription      3969 non-null   object
 2   labels             3969 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 124.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 996
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   transcription  997 non-null    object
dtypes: object(1)
memory usage: 15.6+ KB


In [4]:
train_df.isnull().sum()
test_df.isnull().sum()

transcription    0
dtype: int64

### Train Set Label Distribution

In [5]:
train_df["medical_specialty"].value_counts()

 Surgery                          863
 Consult - History and Phy.       410
 Cardiovascular / Pulmonary       309
 Orthopedic                       289
 Radiology                        213
 General Medicine                 209
 Gastroenterology                 176
 Neurology                        170
 SOAP / Chart / Progress Notes    135
 Urology                          134
 Obstetrics / Gynecology          123
 Discharge Summary                 87
 ENT - Otolaryngology              82
 Neurosurgery                      71
 Hematology - Oncology             68
 Ophthalmology                     67
 Emergency Room Reports            63
 Nephrology                        63
 Pediatrics - Neonatal             55
 Pain Management                   54
 Psychiatry / Psychology           45
 Office Notes                      38
 Podiatry                          35
 Dermatology                       21
 Dentistry                         21
 Cosmetic / Plastic Surgery        19
 Letters    

### Sample Transcription

In [6]:
from pprint import pprint
pprint(train_df.transcription[0])

('REASON FOR THE VISIT:,  Very high PT/INR.,HISTORY: , The patient is an '
 '81-year-old lady whom I met last month when she came in with pneumonia and '
 'CHF.  She was noticed to be in atrial fibrillation, which is a chronic '
 'problem for her.  She did not want to have Coumadin started because she said '
 'that she has had it before and the INR has had been very difficult to '
 'regulate to the point that it was dangerous, but I convinced her to restart '
 'the Coumadin again.  I gave her the Coumadin as an outpatient and then the '
 'INR was found to be 12.  So, I told her to come to the emergency room to get '
 'vitamin K to reverse the anticoagulation.,PAST MEDICAL HISTORY:,1.  '
 'Congestive heart failure.,2.  Renal insufficiency.,3.  Coronary artery '
 'disease.,4.  Atrial fibrillation.,5.  COPD.,6.  Recent pneumonia.,7.  '
 'Bladder cancer.,8.  History of ruptured colon.,9.  Myocardial '
 'infarction.,10.  Hernia repair.,11.  Colon resection.,12.  Carpal tunnel '
 'repair.,13

### Data Transformation

In [13]:
train_df.transcription = train_df.transcription.astype('str')
train_df.transcription = train_df.transcription.str.lower()

train_df = train_df.dropna(axis = 0, how ='any')

#getting rid of targeted charachters in the trascription
chars = ['#',':,',': ,',';','$','!','?','*','``','1', '2', '3', '4', '5','6','7','8','9','10']
for c in chars:
    train_df.transcription = train_df.transcription.str.replace(c,"")

train_df.sample(5)

#getting rid of targeted charachters in the trascription
chars = [",", ".", "[", "]", ":", "``", ")", "("]
for c in chars:
    train_df.transcription = train_df.transcription.str.replace(c," ")

train_df.sample(5)

Unnamed: 0,medical_specialty,transcription,labels,tokenized,POSTags,Nouns
1496,Obstetrics / Gynecology,preoperative diagnoses intrauterine pregnancy ...,19,"[preoperative, diagnoses, intrauterine, pregna...","[(preoperative, JJ), (diagnoses, NNS), (intrau...","[(diagnoses, NNS), (pregnancy, NN), (term, NN)..."
722,Consult - History and Phy.,"chief complaint ""i have had trouble breathing ...",16,"[chief, complaint, ``, i, have, had, trouble, ...","[(chief, JJ), (complaint, NN), (``, ``), (i, N...","[(complaint, NN), (i, NN), (trouble, NN), (day..."
1215,General Medicine,history the patient is a -year-old male who ...,10,"[history, the, patient, is, a, -year-old, male...","[(history, NN), (the, DT), (patient, NN), (is,...","[(history, NN), (patient, NN), (male, NN), (fi..."
3744,Cardiovascular / Pulmonary,indications previously markedly abnormal dobu...,7,"[indications, previously, markedly, abnormal, ...","[(indications, NNS), (previously, RB), (marked...","[(indications, NNS), (dobutamine, NN), (myovie..."
1471,Neurology,cc lethargy hx this y/o rhm was admitted to...,4,"[cc, lethargy, hx, this, y/o, rhm, was, admitt...","[(cc, NN), (lethargy, NN), (hx, NN), (this, DT...","[(cc, NN), (lethargy, NN), (hx, NN), (y/o, NN)..."


In [14]:
# Tokenizing
import nltk
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('tagsets')
from nltk.tokenize import word_tokenize

train_df['tokenized'] = train_df.transcription.apply(nltk.word_tokenize)
train_df.sample(5)

Unnamed: 0,medical_specialty,transcription,labels,tokenized,POSTags,Nouns
730,Psychiatry / Psychology,history of present illness this is a -yea...,15,"[history, of, present, illness, this, is, a, -...","[(history, NN), (of, IN), (present, JJ), (illn...","[(history, NN), (illness, NN), (man, NN), (ons..."
2868,Surgery,preoperative diagnoses chronic adenotonsilliti...,1,"[preoperative, diagnoses, chronic, adenotonsil...","[(preoperative, JJ), (diagnoses, NNS), (chroni...","[(diagnoses, NNS), (adenotonsillitis, NN), (an..."
2106,Orthopedic,reason for referral evaluation for right l s...,6,"[reason, for, referral, evaluation, for, right...","[(reason, NN), (for, IN), (referral, JJ), (eva...","[(reason, NN), (evaluation, NN), (l, NN), (ner..."
3019,Surgery,preoperative diagnosis breast mass left post...,1,"[preoperative, diagnosis, breast, mass, left, ...","[(preoperative, JJ), (diagnosis, NN), (breast,...","[(diagnosis, NN), (breast, NN), (mass, NN), (d..."
3852,Neurology,reason for visit followup cervical spinal s...,4,"[reason, for, visit, followup, cervical, spina...","[(reason, NN), (for, IN), (visit, NN), (follow...","[(reason, NN), (visit, NN), (stenosis, NN), (h..."


In [9]:
# pos_tag
from nltk import pos_tag
nltk.tag.pos_tag(train_df['tokenized'][0])

[('reason', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('visit', 'NN'),
 ('very', 'RB'),
 ('high', 'JJ'),
 ('pt/inr', 'NN'),
 ('history', 'NN'),
 ('the', 'DT'),
 ('patient', 'NN'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('-year-old', 'JJ'),
 ('lady', 'NN'),
 ('whom', 'WP'),
 ('i', 'VBZ'),
 ('met', 'VBD'),
 ('last', 'JJ'),
 ('month', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('came', 'VBD'),
 ('in', 'IN'),
 ('with', 'IN'),
 ('pneumonia', 'NN'),
 ('and', 'CC'),
 ('chf', 'NN'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('noticed', 'VBN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('in', 'IN'),
 ('atrial', 'JJ'),
 ('fibrillation', 'NN'),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('chronic', 'JJ'),
 ('problem', 'NN'),
 ('for', 'IN'),
 ('her', 'PRP$'),
 ('she', 'PRP'),
 ('did', 'VBD'),
 ('not', 'RB'),
 ('want', 'VB'),
 ('to', 'TO'),
 ('have', 'VB'),
 ('coumadin', 'NN'),
 ('started', 'VBN'),
 ('because', 'IN'),
 ('she', 'PRP'),
 ('said', 'VBD'),
 ('that', 'IN'),
 ('she', 'PRP'),
 ('has', 'VBZ'),
 ('had', 'VBD')

In [10]:
train_df['POSTags'] = train_df['tokenized'].apply(pos_tag)
train_df['POSTags'].head()

0    [(reason, NN), (for, IN), (the, DT), (visit, N...
1    [(preoperative, JJ), (diagnosis, NN), (acetabu...
2    [(name, NN), (of, IN), (procedure, NN), (selec...
3    [(referring, VBG), (diagnosis, NN), (motor, NN...
4    [(chief, JJ), (complaint, NN), (dental, NN), (...
Name: POSTags, dtype: object

In [11]:
# Selecting the nouns in our corpus
train_df['Nouns'] = train_df['POSTags'].apply(lambda x: [(t[0], t[1]) for t in x if t[1]=='NN' or t[1]=='NNP' or t[1]=='NNS' or t[1]=='NNPS'])

train_df['Nouns']

0       [(reason, NN), (visit, NN), (pt/inr, NN), (his...
1       [(diagnosis, NN), (fracture, NN), (left, NN), ...
2       [(name, NN), (procedure, NN), (angiography, NN...
3       [(diagnosis, NN), (motor, NN), (neuron, NN), (...
4       [(complaint, NN), (dental, NN), (pain, NN), (h...
                              ...                        
3995    [(problems, NNS), (issues, NNS), (headaches, N...
3996    [(diagnosis, NN), (anemia, NN), (procedure, NN...
3997    [(dysphagia, NN), (gastroesophageal, NN), (ref...
3998    [(patient, NN), (s, NN), (abdomen, NNS), (fash...
3999    [(diagnosis, NN), (effusion, NN), (failure, NN...
Name: Nouns, Length: 3969, dtype: object

### Sample Training

In [7]:
import sys
#!{sys.executable} -m pip install nltk
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from torch import nn
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
import nltk

In [5]:
unique_classes = train_df["medical_specialty"].unique()

# idx_2_class = {i: s for i, s in enumerate(unique_classes)}
# class_2_idx = {s: i for i, s in enumerate(unique_classes)}

In [6]:
# train_df["labels"] = train_df["medical_specialty"].apply(lambda s: class_2_idx[s])

In [7]:
train_train_df, train_test_df = \
    train_test_split(
    train_df,
    test_size=0.3,
    random_state=42
)

In [8]:
ds_dict = {
    'train': Dataset.from_pandas(train_train_df),
    'val': Dataset.from_pandas(train_test_df),
    "test": Dataset.from_pandas(test_df)
}

ds = DatasetDict(ds_dict)

In [9]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_text(texts):
    return tokenizer(texts["transcription"], truncation=True, padding=True, max_length=256)

ds["train"] = ds["train"].map(tokenize_text, batched=True)
ds["val"] = ds["val"].map(tokenize_text, batched=True)
ds["test"] = ds["test"].map(tokenize_text, batched=True)

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['medical_specialty', 'transcription', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 2778
    })
    val: Dataset({
        features: ['medical_specialty', 'transcription', 'labels', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 1191
    })
    test: Dataset({
        features: ['transcription', '__index_level_0__', 'input_ids', 'attention_mask'],
        num_rows: 997
    })
})

In [10]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_classes)
)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.bias', 'classifier.bias', 'pre_classifi

### Evaluation Metric

In [11]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    return {"f1": f1}

In [12]:
batch_size = 32
logging_steps = len(train_train_df) // batch_size
output_dir = "hf_trainer"

training_args = TrainingArguments(
    output_dir=output_dir,
     num_train_epochs=5,
     learning_rate=2e-5,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     evaluation_strategy="epoch",
     logging_steps=logging_steps,
     push_to_hub=False
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    tokenizer=tokenizer
)

In [None]:
trainer.train()

The following columns in the training set don't have a corresponding argument in `DistilBertForSequenceClassification.forward` and have been ignored: __index_level_0__, transcription, medical_specialty. If __index_level_0__, transcription, medical_specialty are not expected by `DistilBertForSequenceClassification.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 2778
  Num Epochs = 5
  Instantaneous batch size per device = 32


### Making Inference on the Test Set

In [None]:
ds["test"]

In [None]:
pred_y = trainer.predict(ds["test"])

In [None]:
a = pd.Series(pred_y.predictions.argmax(axis=1))
a.name = "Expected"
a.to_csv("predictions.csv")