### Reading Data

In [31]:
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

train_df = pd.read_csv("new_train.csv", index_col=0)
test_df = pd.read_csv("new_test.csv", index_col=0)

print("Train size", len(train_df))
print("Test size", len(test_df))
train_df.head(n=3)

Train size 3969
Test size 997


Unnamed: 0,medical_specialty,transcription,labels
0,Emergency Room Reports,"REASON FOR THE VISIT:, Very high PT/INR.,HIST...",0
1,Surgery,"PREOPERATIVE DIAGNOSIS:, Acetabular fracture ...",1
2,Surgery,"NAME OF PROCEDURE,1. Selective coronary angio...",1


In [32]:
train_df.info()
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3969 entries, 0 to 3999
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   medical_specialty  3969 non-null   object
 1   transcription      3969 non-null   object
 2   labels             3969 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 124.0+ KB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 997 entries, 0 to 996
Data columns (total 1 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   transcription  997 non-null    object
dtypes: object(1)
memory usage: 15.6+ KB


In [33]:
train_df.isnull().sum()
test_df.isnull().sum()

transcription    0
dtype: int64

### Train Set Label Distribution

In [34]:
train_df["medical_specialty"].value_counts()

 Surgery                          863
 Consult - History and Phy.       410
 Cardiovascular / Pulmonary       309
 Orthopedic                       289
 Radiology                        213
 General Medicine                 209
 Gastroenterology                 176
 Neurology                        170
 SOAP / Chart / Progress Notes    135
 Urology                          134
 Obstetrics / Gynecology          123
 Discharge Summary                 87
 ENT - Otolaryngology              82
 Neurosurgery                      71
 Hematology - Oncology             68
 Ophthalmology                     67
 Emergency Room Reports            63
 Nephrology                        63
 Pediatrics - Neonatal             55
 Pain Management                   54
 Psychiatry / Psychology           45
 Office Notes                      38
 Podiatry                          35
 Dermatology                       21
 Dentistry                         21
 Cosmetic / Plastic Surgery        19
 Letters    

### Sample Transcription

In [35]:
from pprint import pprint
pprint(train_df.transcription[0])

('REASON FOR THE VISIT:,  Very high PT/INR.,HISTORY: , The patient is an '
 '81-year-old lady whom I met last month when she came in with pneumonia and '
 'CHF.  She was noticed to be in atrial fibrillation, which is a chronic '
 'problem for her.  She did not want to have Coumadin started because she said '
 'that she has had it before and the INR has had been very difficult to '
 'regulate to the point that it was dangerous, but I convinced her to restart '
 'the Coumadin again.  I gave her the Coumadin as an outpatient and then the '
 'INR was found to be 12.  So, I told her to come to the emergency room to get '
 'vitamin K to reverse the anticoagulation.,PAST MEDICAL HISTORY:,1.  '
 'Congestive heart failure.,2.  Renal insufficiency.,3.  Coronary artery '
 'disease.,4.  Atrial fibrillation.,5.  COPD.,6.  Recent pneumonia.,7.  '
 'Bladder cancer.,8.  History of ruptured colon.,9.  Myocardial '
 'infarction.,10.  Hernia repair.,11.  Colon resection.,12.  Carpal tunnel '
 'repair.,13

### Data Transformation

In [36]:
train_df.transcription = train_df.transcription.astype('str')
train_df.transcription = train_df.transcription.str.lower()

train_df = train_df.dropna(axis = 0, how ='any')

#getting rid of targeted charachters in the trascription
chars = ['#',':,',': ,',';','$','!','?','*','``','%', '1', '2', '3', '4', '5','6','7','8','9','10']
for c in chars:
    train_df.transcription = train_df.transcription.str.replace(c,"")

train_df.sample(5)

#getting rid of targeted charachters in the trascription
chars = [",", ".", "[", "]", ":", "``", ")", "("]
for c in chars:
    train_df.transcription = train_df.transcription.str.replace(c," ")

train_df.sample(5)

Unnamed: 0,medical_specialty,transcription,labels
1931,Dentistry,preoperative diagnosis bilateral open mandible...,29
2152,Surgery,preoperative diagnosis adenocarcinoma of the ...,1
1738,Obstetrics / Gynecology,preoperative diagnosis complex right lower qua...,19
3944,Hematology - Oncology,reason for consultation i was asked by dr x t...,11
401,ENT - Otolaryngology,procedures performed functional endoscopic s...,9


In [37]:
# Tokenizing
import nltk
nltk.download('punkt')
# nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets')
from nltk.tokenize import word_tokenize

train_df['tokenized'] = train_df.transcription.apply(nltk.word_tokenize)
train_df.sample(5)

[nltk_data] Downloading package punkt to /home/pjyi/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/pjyi/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to /home/pjyi/nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


Unnamed: 0,medical_specialty,transcription,labels,tokenized
1344,Discharge Summary,diagnosis chronic laryngitis hoarseness hist...,21,"[diagnosis, chronic, laryngitis, hoarseness, h..."
405,Pediatrics - Neonatal,admitting diagnoses respiratory distress ...,30,"[admitting, diagnoses, respiratory, distress, ..."
3319,Surgery,preoperative diagnosis fractured right fifth m...,1,"[preoperative, diagnosis, fractured, right, fi..."
2225,Cardiovascular / Pulmonary,operative procedure redo coronary bypass gra...,7,"[operative, procedure, redo, coronary, bypass,..."
2474,SOAP / Chart / Progress Notes,subjective this patient presents to the office...,13,"[subjective, this, patient, presents, to, the,..."


In [38]:
# pos_tag
from nltk import pos_tag
nltk.tag.pos_tag(train_df['tokenized'][0])

[('reason', 'NN'),
 ('for', 'IN'),
 ('the', 'DT'),
 ('visit', 'NN'),
 ('very', 'RB'),
 ('high', 'JJ'),
 ('pt/inr', 'NN'),
 ('history', 'NN'),
 ('the', 'DT'),
 ('patient', 'NN'),
 ('is', 'VBZ'),
 ('an', 'DT'),
 ('-year-old', 'JJ'),
 ('lady', 'NN'),
 ('whom', 'WP'),
 ('i', 'VBZ'),
 ('met', 'VBD'),
 ('last', 'JJ'),
 ('month', 'NN'),
 ('when', 'WRB'),
 ('she', 'PRP'),
 ('came', 'VBD'),
 ('in', 'IN'),
 ('with', 'IN'),
 ('pneumonia', 'NN'),
 ('and', 'CC'),
 ('chf', 'NN'),
 ('she', 'PRP'),
 ('was', 'VBD'),
 ('noticed', 'VBN'),
 ('to', 'TO'),
 ('be', 'VB'),
 ('in', 'IN'),
 ('atrial', 'JJ'),
 ('fibrillation', 'NN'),
 ('which', 'WDT'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('chronic', 'JJ'),
 ('problem', 'NN'),
 ('for', 'IN'),
 ('her', 'PRP$'),
 ('she', 'PRP'),
 ('did', 'VBD'),
 ('not', 'RB'),
 ('want', 'VB'),
 ('to', 'TO'),
 ('have', 'VB'),
 ('coumadin', 'NN'),
 ('started', 'VBN'),
 ('because', 'IN'),
 ('she', 'PRP'),
 ('said', 'VBD'),
 ('that', 'IN'),
 ('she', 'PRP'),
 ('has', 'VBZ'),
 ('had', 'VBD')

In [39]:
train_df['POSTags'] = train_df['tokenized'].apply(pos_tag)
train_df['POSTags'].head()

0    [(reason, NN), (for, IN), (the, DT), (visit, N...
1    [(preoperative, JJ), (diagnosis, NN), (acetabu...
2    [(name, NN), (of, IN), (procedure, NN), (selec...
3    [(referring, VBG), (diagnosis, NN), (motor, NN...
4    [(chief, JJ), (complaint, NN), (dental, NN), (...
Name: POSTags, dtype: object

In [40]:
# Selecting the nouns in our corpus
# train_df['Nouns'] = train_df['POSTags'].apply(lambda x: [(t[0], t[1]) for t in x if t[1]=='NN' or t[1]=='NNP' or t[1]=='NNS' or t[1]=='NNPS' or t[1]=='VB' or t[1]=='VBN' or t[1]=='VBD' or t[1]=='VBZ'])

train_df['Nouns'] = train_df['POSTags'].apply(lambda x: [(t[0], t[1]) for t in x if t[1].startswith(('NN', 'VB'))])

train_df['Nouns']

0       [(reason, NN), (visit, NN), (pt/inr, NN), (his...
1       [(diagnosis, NN), (fracture, NN), (left, NN), ...
2       [(name, NN), (procedure, NN), (angiography, NN...
3       [(referring, VBG), (diagnosis, NN), (motor, NN...
4       [(complaint, NN), (dental, NN), (pain, NN), (h...
                              ...                        
3995    [(problems, NNS), (issues, NNS), (headaches, N...
3996    [(diagnosis, NN), (anemia, NN), (procedure, NN...
3997    [(dysphagia, NN), (gastroesophageal, NN), (ref...
3998    [(patient, NN), (abdomen, NNS), (was, VBD), (p...
3999    [(diagnosis, NN), (effusion, NN), (failure, NN...
Name: Nouns, Length: 3969, dtype: object

In [41]:
train_df['filtered_transcription'] = [[t[0] for t in row] for row in train_df['Nouns']]

train_df['filtered_transcription']

0       [reason, visit, pt/inr, history, patient, is, ...
1       [diagnosis, fracture, left, column/transverse,...
2       [name, procedure, angiography, placement, over...
3       [referring, diagnosis, motor, neuron, disease,...
4       [complaint, dental, pain, history, illness, is...
                              ...                        
3995    [problems, issues, headaches, consistent, diag...
3996    [diagnosis, anemia, procedure, endoscopy, diag...
3997    [dysphagia, gastroesophageal, reflux, disease,...
3998    [patient, abdomen, was, prepped, draped, fashi...
3999    [diagnosis, effusion, failure, dyspnea, diagno...
Name: filtered_transcription, Length: 3969, dtype: object

In [42]:
# Remove stopwords
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

train_df['filtered_transcription'] = [[w for w in row if not w in stop_words] for row in train_df['filtered_transcription']]

train_df['filtered_transcription']

0       [reason, visit, pt/inr, history, patient, lady...
1       [diagnosis, fracture, left, column/transverse,...
2       [name, procedure, angiography, placement, over...
3       [referring, diagnosis, motor, neuron, disease,...
4       [complaint, dental, pain, history, illness, fe...
                              ...                        
3995    [problems, issues, headaches, consistent, diag...
3996    [diagnosis, anemia, procedure, endoscopy, diag...
3997    [dysphagia, gastroesophageal, reflux, disease,...
3998    [patient, abdomen, prepped, draped, fashion, s...
3999    [diagnosis, effusion, failure, dyspnea, diagno...
Name: filtered_transcription, Length: 3969, dtype: object

In [43]:
from collections import Counter

dictionary = train_df['filtered_transcription'].transform(lambda x: Counter(x)).sum()

In [44]:
print(len(dictionary))
count = 0
removal_array = []
for k in sorted(dictionary, key=dictionary.get, reverse=True):
    #print(k, dictionary[k])
    if int(dictionary[k]) > 500:
        count += 1
        removal_array.append(k)
        
print(count)

print(removal_array)

17991
273
['patient', 'history', 'procedure', 'placed', 'left', 'pain', 'using', 'noted', 'time', 'performed', 'blood', 'removed', 'incision', 'used', 'room', 'mg', 'diagnosis', 'artery', 'skin', 'taken', 'disease', 'position', 'area', 'anesthesia', 'x', 'fashion', 'made', 'pressure', '-0', 'medications', 'obtained', 'closed', 'abdomen', 'operating', 'heart', 'neck', 'surgery', 'examination', 'evidence', 'given', 'day', 'tissue', 'condition', 'catheter', 'denies', 'done', 'suture', 'seen', 'care', 'loss', 'p', 'complications', 'side', 'extremities', 'family', 'symptoms', 'draped', 'vicryl', 'prepped', 'cm', 'findings', 'level', 'dr', 'showed', 'head', 'right', 'rate', 'identified', 'wound', 'chest', 'signs', 'bleeding', 'diagnoses', 'brought', 'bladder', 'plan', 'none', 'dissection', 'found', 'bone', 'allergies', 'muscle', 'exam', 'hospital', 'years', 'status', 'mass', 'tolerated', 'discussed', 'female', 'point', 'mm', 'applied', 'today', 'difficulty', 'hemostasis', 'evaluation', 'bowe

In [45]:
train_df['filtered_transcription'] = [[w for w in row if not w in removal_array] for row in train_df['filtered_transcription']]
train_df['filtered_transcription']

0       [reason, visit, pt/inr, lady, met, month, came...
1       [column/transverse, posterior, variety, accomp...
2       [name, angiography, overlapping, xience, stent...
3       [referring, neuron, briefly, woman, progressio...
4       [dental, starting, night, jaw, feel, tongue, t...
                              ...                        
3995    [issues, headaches, consistent, migraine, reco...
3996    [anemia, endoscopy, duodenitis, junction, ulce...
3997    [dysphagia, gastroesophageal, reflux, rule, st...
3998    [veress, insufflated, trocar, camera, view, de...
3999    [effusion, dyspnea, effusion, pleurocentesis, ...
Name: filtered_transcription, Length: 3969, dtype: object

In [46]:
train_df['transcription'] = [" ".join(row) for row in train_df['filtered_transcription']]

train_df['transcription']

0       reason visit pt/inr lady met month came pneumo...
1       column/transverse posterior variety accompanyi...
2       name angiography overlapping xience stents aor...
3       referring neuron briefly woman progression dys...
4       dental starting night jaw feel tongue teeth fr...
                              ...                        
3995    issues headaches consistent migraine recommend...
3996    anemia endoscopy duodenitis junction ulceratio...
3997    dysphagia gastroesophageal reflux rule strictu...
3998    veress insufflated trocar camera view demonstr...
3999    effusion dyspnea effusion pleurocentesis lidoc...
Name: transcription, Length: 3969, dtype: object

### Sample Training

In [47]:
import sys
#!{sys.executable} -m pip install nltk
from datasets.dataset_dict import DatasetDict
from datasets import Dataset
from torch import nn
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split

In [48]:
unique_classes = train_df["medical_specialty"].unique()

# idx_2_class = {i: s for i, s in enumerate(unique_classes)}
# class_2_idx = {s: i for i, s in enumerate(unique_classes)}

In [49]:
# train_df["labels"] = train_df["medical_specialty"].apply(lambda s: class_2_idx[s])

In [50]:
train_train_df, train_test_df = \
    train_test_split(
    train_df,
    test_size=0.3,
    random_state=42
)

In [51]:
ds_dict = {
    'train': Dataset.from_pandas(train_train_df),
    'val': Dataset.from_pandas(train_test_df),
    "test": Dataset.from_pandas(test_df)
}

ds = DatasetDict(ds_dict)

In [52]:
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_text(texts):
    return tokenizer(texts["transcription"], truncation=True, padding=True, max_length=256)

ds["train"] = ds["train"].map(tokenize_text, batched=True)
ds["val"] = ds["val"].map(tokenize_text, batched=True)
ds["test"] = ds["test"].map(tokenize_text, batched=True)

loading configuration file config.json from cache at /home/pjyi/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "transformers_version": "4.26.1",
  "vocab_size": 30522
}

loading file vocab.txt from cache at /home/pjyi/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/vocab.txt
loading file tokenizer.json from cache at /home/pjyi/.cache/huggingface/hub/models--distilbert-base

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [53]:
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(unique_classes)
)

loading configuration file config.json from cache at /home/pjyi/.cache/huggingface/hub/models--distilbert-base-uncased/snapshots/1c4513b2eedbda136f57676a34eea67aba266e5c/config.json
Model config DistilBertConfig {
  "_name_or_path": "distilbert-base-uncased",
  "activation": "gelu",
  "architectures": [
    "DistilBertForMaskedLM"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2",
    "3": "LABEL_3",
    "4": "LABEL_4",
    "5": "LABEL_5",
    "6": "LABEL_6",
    "7": "LABEL_7",
    "8": "LABEL_8",
    "9": "LABEL_9",
    "10": "LABEL_10",
    "11": "LABEL_11",
    "12": "LABEL_12",
    "13": "LABEL_13",
    "14": "LABEL_14",
    "15": "LABEL_15",
    "16": "LABEL_16",
    "17": "LABEL_17",
    "18": "LABEL_18",
    "19": "LABEL_19",
    "20": "LABEL_20",
    "21": "LABEL_21",
    "22": "LABEL_22",
    "23": "LABEL_23",
    "24": "LABEL_24",
    "25": "LABEL_25",
    "26": "L

### Evaluation Metric

In [54]:
from sklearn.metrics import f1_score

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="macro")
    return {"f1": f1}

In [55]:
batch_size = 32
logging_steps = len(train_train_df) // batch_size
output_dir = "hf_trainer"

training_args = TrainingArguments(
    output_dir=output_dir,
     num_train_epochs=10,
     learning_rate=2e-4,
     per_device_train_batch_size=batch_size,
     per_device_eval_batch_size=batch_size,
     weight_decay=0.01,
     evaluation_strategy="epoch",
     logging_steps=logging_steps,
     push_to_hub=False
)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).


In [56]:
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=ds['train'],
    eval_dataset=ds['val'],
    tokenizer=tokenizer
)

In [57]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1
1,2.7039,2.574905,0.028923
2,2.3381,2.402,0.042978
3,2.0327,2.197368,0.072925
4,1.8095,2.210747,0.075065
5,1.6201,2.299562,0.086841
6,1.4588,2.250863,0.116425
7,1.314,2.463608,0.112692
8,1.185,2.512815,0.107576
9,1.0923,2.662735,0.106656
10,0.944,2.72856,0.104918


TrainOutput(global_step=870, training_loss=1.6434528526218457, metrics={'train_runtime': 344.3989, 'train_samples_per_second': 80.662, 'train_steps_per_second': 2.526, 'total_flos': 1841219072409600.0, 'train_loss': 1.6434528526218457, 'epoch': 10.0})

### Making Inference on the Test Set

In [None]:
ds["test"]

In [None]:
pred_y = trainer.predict(ds["test"])

In [None]:
a = pd.Series(pred_y.predictions.argmax(axis=1))
a.name = "Expected"
a.to_csv("predictions.csv")