In [14]:
!pip install transformers datasets scikit-learn
!pip install torch --index-url https://download.pytorch.org/whl/cpu


Looking in indexes: https://download.pytorch.org/whl/cpu


In [15]:
import os
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, get_scheduler
from torch.optim import AdamW
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [16]:
from google.colab import files
uploaded = files.upload()


In [None]:
import zipfile
import os

zip_path = "/content/ClinicalBertData.zip"  
extract_path = "/content/mimic"

with zipfile.ZipFile(zip_path, 'r') as zip_ref:
    zip_ref.extractall(extract_path)

data_dir = os.path.join(extract_path, "mimic-iii-clinical-database-demo-1.4")
print(os.listdir(data_dir))


['ICUSTAYS.csv', 'CAREGIVERS.csv', 'NOTEEVENTS.csv', 'ADMISSIONS.csv', 'CHARTEVENTS.csv', 'DATETIMEEVENTS.csv', 'D_LABITEMS.csv', 'PATIENTS.csv', 'D_CPT.csv', 'INPUTEVENTS_CV.csv', 'INPUTEVENTS_MV.csv', 'PRESCRIPTIONS.csv', 'D_ICD_PROCEDURES.csv', 'OUTPUTEVENTS.csv', 'SERVICES.csv', 'D_ICD_DIAGNOSES.csv', 'TRANSFERS.csv', 'SHA256SUMS.txt', 'CPTEVENTS.csv', 'LABEVENTS.csv', 'DIAGNOSES_ICD.csv', 'D_ITEMS.csv', 'LICENSE.txt', 'PROCEDURES_ICD.csv', 'MICROBIOLOGYEVENTS.csv', 'CALLOUT.csv', 'PROCEDUREEVENTS_MV.csv', 'DRGCODES.csv']


In [18]:
import pandas as pd
import os

data_dir = "/content/mimic/mimic-iii-clinical-database-demo-1.4"

for file in os.listdir(data_dir):
    if file.endswith(".csv"):
        df = pd.read_csv(os.path.join(data_dir, file), nrows=5)
        text_cols = [col for col in df.columns if df[col].dtype == "object"]
        if text_cols:
            print(file, "-> text columns:", text_cols)


ICUSTAYS.csv -> text columns: ['dbsource', 'first_careunit', 'last_careunit', 'intime', 'outtime']
CAREGIVERS.csv -> text columns: ['label', 'description']
NOTEEVENTS.csv -> text columns: ['row_id', 'subject_id', 'hadm_id', 'chartdate', 'charttime', 'storetime', 'category', 'description', 'cgid', 'iserror', 'text']
ADMISSIONS.csv -> text columns: ['admittime', 'dischtime', 'deathtime', 'admission_type', 'admission_location', 'discharge_location', 'insurance', 'religion', 'marital_status', 'ethnicity', 'edregtime', 'edouttime', 'diagnosis']
CHARTEVENTS.csv -> text columns: ['charttime', 'storetime', 'valueuom']
DATETIMEEVENTS.csv -> text columns: ['charttime', 'storetime', 'value', 'valueuom', 'stopped']
D_LABITEMS.csv -> text columns: ['label', 'fluid', 'category', 'loinc_code']
PATIENTS.csv -> text columns: ['gender', 'dob', 'dod', 'dod_hosp', 'dod_ssn']
D_CPT.csv -> text columns: ['sectionrange', 'sectionheader', 'subsectionrange', 'subsectionheader']
INPUTEVENTS_CV.csv -> text colum

In [None]:
data_dir = "/content/mimic/mimic-iii-clinical-database-demo-1.4"
dataset_path = os.path.join(data_dir, "ADMISSIONS.csv")

df = pd.read_csv(dataset_path, nrows=2000) 

df['text'] = df[['admission_type', 'admission_location', 'discharge_location', 'diagnosis']].astype(str).agg(' | '.join, axis=1)
df['label'] = [i % 2 for i in range(len(df))]
df[['text', 'label']].head()


Unnamed: 0,text,label
0,EMERGENCY | EMERGENCY ROOM ADMIT | HOME HEALTH...,0
1,EMERGENCY | TRANSFER FROM HOSP/EXTRAM | DEAD/E...,1
2,EMERGENCY | TRANSFER FROM HOSP/EXTRAM | DEAD/E...,0
3,EMERGENCY | EMERGENCY ROOM ADMIT | SNF | HUMER...,1
4,EMERGENCY | TRANSFER FROM HOSP/EXTRAM | DEAD/E...,0


In [20]:
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df['text'].tolist(), df['label'].tolist(), test_size=0.2, random_state=42
)


In [21]:
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = BertTokenizer.from_pretrained(model_name)


In [None]:
class ClinicalDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            "input_ids": encoding["input_ids"].squeeze(),
            "attention_mask": encoding["attention_mask"].squeeze(),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

def create_dataloaders(tokenizer):
    train_dataset = ClinicalDataset(train_texts, train_labels, tokenizer)
    test_dataset = ClinicalDataset(test_texts, test_labels, tokenizer)
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False)
    return train_loader, test_loader


In [23]:
def train_model(model, train_loader, optimizer, scheduler, epochs=3):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)
    for epoch in range(epochs):
        model.train()
        total_loss = 0
        loop = tqdm(train_loader, leave=True)
        for batch in loop:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            total_loss += loss.item()

            loss.backward()
            optimizer.step()
            scheduler.step()

            loop.set_description(f"Epoch {epoch+1}")
            loop.set_postfix(loss=loss.item())

        print(f"Epoch {epoch+1} | Average Loss: {total_loss / len(train_loader)}")


In [None]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = BertForSequenceClassification.from_pretrained(model_name, num_labels=2).to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3  
lr_scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
train_loader, test_loader = create_dataloaders(tokenizer)

In [None]:
def evaluate_model(model, test_loader):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.eval()
    preds, true_labels = [], []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels = batch["labels"].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=-1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    print("Accuracy:", accuracy_score(true_labels, preds))
    print(classification_report(true_labels, preds))
    return true_labels, preds 

In [27]:
bert_model_name = "bert-base-uncased"
bert_tokenizer = BertTokenizer.from_pretrained(bert_model_name)
train_loader, test_loader = create_dataloaders(bert_tokenizer)

bert_model = BertForSequenceClassification.from_pretrained(bert_model_name, num_labels=2)
optimizer = AdamW(bert_model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("Training BERT base...")
train_model(bert_model, train_loader, optimizer, scheduler)
print("Evaluating BERT base...")
evaluate_model(bert_model, test_loader)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training BERT base...


Epoch 1: 100%|██████████| 7/7 [00:04<00:00,  1.72it/s, loss=0.703]


Epoch 1 | Average Loss: 0.7079945206642151


Epoch 2: 100%|██████████| 7/7 [00:02<00:00,  3.10it/s, loss=0.627]


Epoch 2 | Average Loss: 0.6955001098769051


Epoch 3: 100%|██████████| 7/7 [00:02<00:00,  2.76it/s, loss=0.706]


Epoch 3 | Average Loss: 0.6848416583878654
Evaluating BERT base...
Accuracy: 0.5384615384615384
              precision    recall  f1-score   support

           0       0.53      0.77      0.62        13
           1       0.57      0.31      0.40        13

    accuracy                           0.54        26
   macro avg       0.55      0.54      0.51        26
weighted avg       0.55      0.54      0.51        26



In [28]:
clinical_model_name = "emilyalsentzer/Bio_ClinicalBERT"
clinical_tokenizer = BertTokenizer.from_pretrained(clinical_model_name)
train_loader, test_loader = create_dataloaders(clinical_tokenizer)

clinical_model = BertForSequenceClassification.from_pretrained(clinical_model_name, num_labels=2)
optimizer = AdamW(clinical_model.parameters(), lr=2e-5)
num_training_steps = len(train_loader) * 3
scheduler = get_scheduler("linear", optimizer=optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

print("Training ClinicalBERT...")
train_model(clinical_model, train_loader, optimizer, scheduler)
print("Evaluating ClinicalBERT...")
evaluate_model(clinical_model, test_loader)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training ClinicalBERT...


Epoch 1: 100%|██████████| 7/7 [00:02<00:00,  3.05it/s, loss=0.677]


Epoch 1 | Average Loss: 0.695842478956495


Epoch 2: 100%|██████████| 7/7 [00:02<00:00,  3.20it/s, loss=0.698]


Epoch 2 | Average Loss: 0.6992286784308297


Epoch 3: 100%|██████████| 7/7 [00:02<00:00,  3.08it/s, loss=0.706]


Epoch 3 | Average Loss: 0.7040132199014936
Evaluating ClinicalBERT...
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      0.85      0.63        13
           1       0.50      0.15      0.24        13

    accuracy                           0.50        26
   macro avg       0.50      0.50      0.43        26
weighted avg       0.50      0.50      0.43        26



In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score, classification_report

print("Evaluating BERT base...")
bert_true_labels, bert_preds = evaluate_model(bert_model, test_loader)

print("Evaluating ClinicalBERT...")
clinicalbert_true_labels, clinicalbert_preds = evaluate_model(clinical_model, test_loader)

bert_metrics = {
    "Accuracy": accuracy_score(bert_true_labels, bert_preds),  
    "Precision_0": classification_report(bert_true_labels, bert_preds, output_dict=True)['0']['precision'],
    "Recall_0": classification_report(bert_true_labels, bert_preds, output_dict=True)['0']['recall'],
    "F1_0": classification_report(bert_true_labels, bert_preds, output_dict=True)['0']['f1-score'],
    "Precision_1": classification_report(bert_true_labels, bert_preds, output_dict=True)['1']['precision'],
    "Recall_1": classification_report(bert_true_labels, bert_preds, output_dict=True)['1']['recall'],
    "F1_1": classification_report(bert_true_labels, bert_preds, output_dict=True)['1']['f1-score']
}

clinicalbert_metrics = {
    "Accuracy": accuracy_score(clinicalbert_true_labels, clinicalbert_preds),  
    "Precision_0": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['0']['precision'],
    "Recall_0": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['0']['recall'],
    "F1_0": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['0']['f1-score'],
    "Precision_1": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['1']['precision'],
    "Recall_1": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['1']['recall'],
    "F1_1": classification_report(clinicalbert_true_labels, clinicalbert_preds, output_dict=True)['1']['f1-score']
}

comparison_df = pd.DataFrame([bert_metrics, clinicalbert_metrics], index=["BERT", "ClinicalBERT"])
display(comparison_df)

Evaluating BERT base...
Accuracy: 0.5384615384615384
              precision    recall  f1-score   support

           0       0.60      0.23      0.33        13
           1       0.52      0.85      0.65        13

    accuracy                           0.54        26
   macro avg       0.56      0.54      0.49        26
weighted avg       0.56      0.54      0.49        26

Evaluating ClinicalBERT...
Accuracy: 0.5
              precision    recall  f1-score   support

           0       0.50      0.85      0.63        13
           1       0.50      0.15      0.24        13

    accuracy                           0.50        26
   macro avg       0.50      0.50      0.43        26
weighted avg       0.50      0.50      0.43        26



Unnamed: 0,Accuracy,Precision_0,Recall_0,F1_0,Precision_1,Recall_1,F1_1
BERT,0.538462,0.6,0.230769,0.333333,0.52381,0.846154,0.647059
ClinicalBERT,0.5,0.5,0.846154,0.628571,0.5,0.153846,0.235294
