In [None]:
from transformers import AutoTokenizer, BertForSequenceClassification
import torch

# Tokenizer that matches the Bio_ClinicalBERT checkpoint
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Use the GPU when CUDA is available; otherwise everything stays on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Two-label sequence classifier, moved onto the chosen device as soon as it is built
model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT", num_labels=2
).to(device)

# AdamW over every model parameter with a 2e-5 learning rate
optimizer = torch.optim.AdamW(params=model.parameters(), lr=2e-5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Model A
# 0. INSTALL DEPENDENCIES
# ====================================================
!pip install transformers datasets accelerate --quiet

# ====================================================
# 1. IMPORTS
# ====================================================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from transformers import AutoTokenizer, BertForSequenceClassification

from google.colab import drive

# ====================================================
# 2. LOAD 20K SAMPLES (NO TRAINING)
# ====================================================
CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/medhal_preprocessed.csv"

# The CSV lives on Google Drive, so the drive has to be mounted before reading
drive.mount('/content/drive')

# Read only the first 20,000 rows of the file
df = pd.read_csv(CSV_PATH, nrows=20000)
print("Loaded:", df.shape)

# Plain Python lists of inputs and targets, consumed by the dataset class
texts, labels = df["full_text"].tolist(), df["label"].tolist()

# ====================================================
# 3. DATASET CLASS
# ====================================================
class MedhalDataset(Dataset):
    """Map-style dataset that tokenizes one text per item and pairs it with its label.

    Args:
        texts: Sequence of raw text strings.
        labels: Sequence of integer class labels, aligned index-for-index with ``texts``.
        tokenizer: Callable invoked as ``tokenizer(text, truncation=True,
            padding="max_length", max_length=max_len, return_tensors="pt")`` that
            returns a mapping with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Length passed to the tokenizer as ``max_length``.
    """

    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of texts."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize the text at ``idx`` and return its tensors plus the label tensor."""
        encoded = self.tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt"
        )

        # Remove only the leading axis (assumes the encoding is shaped
        # (1, seq_len) for a single text). A bare squeeze() would also collapse
        # the sequence axis whenever it has length 1.
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# ====================================================
# 4. TOKENIZER + DATALOADER
# ====================================================
# The classification head is not part of the checkpoint and is drawn at random
# when the model is built. Seed torch first so this untrained baseline can be
# repeated (exact GPU-level determinism is not guaranteed by the seed alone).
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

# Evaluation only: keep the original row order (no shuffling)
dataset = MedhalDataset(texts, labels, tokenizer, max_len=512)
loader = DataLoader(dataset, batch_size=8, shuffle=False)

# ====================================================
# 5. LOAD RAW CLINICALBERT (NO FINE-TUNING)
# ====================================================
device = "cuda" if torch.cuda.is_available() else "cpu"

model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2
).to(device)

# Inference mode for layers such as dropout
model.eval()

# ====================================================
# 6. EVALUATE
# ====================================================
preds, trues = [], []

# Gradients are not needed for scoring
with torch.no_grad():
    for batch in loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Highest-scoring class per row; the label tensor never left the CPU
        preds.extend(logits.argmax(dim=1).cpu().numpy())
        trues.extend(batch["labels"].numpy())

# ====================================================
# 7. METRICS
# ====================================================
print("\n======= RAW ClinicalBERT Performance =======")
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(trues, preds))


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Loaded: (20000, 5)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



Accuracy: 0.5096
Precision: 0.5095138525875588
Recall: 0.9582186394022808
F1 Score: 0.6652788205583237


In [None]:
# Model B
# 0. INSTALL DEPENDENCIES
# ====================================================
!pip install transformers datasets accelerate --quiet


# ====================================================
# 1. IMPORTS
# ====================================================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import random

from google.colab import drive

# ====================================================
# 2. LOAD FIRST 5,000 ROWS
# ====================================================
CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/medhal_preprocessed.csv"

# The CSV lives on Google Drive, so the drive has to be mounted before reading
drive.mount('/content/drive')

sample_df = pd.read_csv(CSV_PATH, nrows=5000)
print("Sample shape:", sample_df.shape)
print(sample_df.head())


# ====================================================
# 3. TRAIN/TEST SPLIT
# ====================================================
# 80/20 split with a fixed random_state so the partition is repeatable
train_df, test_df = train_test_split(
    sample_df,
    test_size=0.2,
    random_state=42,
)
print("Train:", train_df.shape, "Test:", test_df.shape)


# ====================================================
# 4. PYTORCH DATASET CLASS
# ====================================================
class MedDataset(Dataset):
    """Dataset built from a DataFrame with ``full_text`` and ``label`` columns.

    Each item is the tokenized text (truncated/padded to ``max_len``) together
    with its label as a ``torch.long`` tensor.

    Args:
        df: DataFrame providing the ``full_text`` and ``label`` columns.
        tokenizer: Callable applied to one text at a time; must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Value forwarded to the tokenizer as ``max_length``.
    """

    def __init__(self, df, tokenizer, max_len=256):
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Columns are copied out once as plain Python lists
        self.texts = df["full_text"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of rows taken from the DataFrame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        tokenizer_options = dict(
            truncation=True,
            padding="max_length",
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoding = self.tokenizer(self.texts[idx], **tokenizer_options)

        # squeeze(0) removes the leading axis of each returned tensor
        item = {
            key: encoding[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# ====================================================
# 5. TOKENIZER + DATALOADERS
# ====================================================
# The classifier head initialisation, the shuffled batch order and dropout all
# draw from torch's random number generator. Seed it so a training run can be
# repeated (exact GPU-level determinism is not guaranteed by the seed alone).
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

train_dataset = MedDataset(train_df, tokenizer)
test_dataset = MedDataset(test_df, tokenizer)

# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)


# ====================================================
# 6. MODEL + OPTIMIZER
# ====================================================
model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2
)

# Use the GPU when CUDA is available; otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)


# ====================================================
# 7. TRAINING LOOP (1 EPOCH)
# ====================================================
print("\n===== Training Started =====\n")
model.train()

# Single pass over the training batches
for batch in train_loader:
    # Each batch holds input_ids, attention_mask and labels; move all three
    batch = {name: tensor.to(device) for name, tensor in batch.items()}

    optimizer.zero_grad()

    # With labels passed in, the model output exposes the training loss
    loss = model(**batch).loss
    loss.backward()
    optimizer.step()

print("\n===== Training Complete! =====\n")


# ====================================================
# 8. EVALUATION
# ====================================================
model.eval()

all_preds, all_labels = [], []

# Gradients are not needed for scoring
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            input_ids=batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Highest-scoring class per row
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("F1 Score:", f1_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
):
    print(metric_label, metric_fn(all_labels, all_preds))


Mounted at /content/drive
Sample shape: (5000, 5)
                                     id                              inner_id  \
0  1dc2b3d7-d75d-47f7-a426-fd8c128ba377  550a295c-2a2b-4be8-ab87-e036f70a2c15   
1  c67d13e6-0c4b-4699-a32b-f76ae8324387  e9cb6fe0-604a-42a2-b107-7c97d987629b   
2  19574e41-ac83-4b58-85c2-8773a6213b2f                                 14253   
3  70a3c380-e947-4b63-a0f5-89efc708fbd2                                107301   
4  47078df2-3cda-4a3b-bc47-9c387755da5d                                 62178   

                                           full_text  label  synthetic  
0  The tympanic membrane is part of the ear that ...      1      False  
1  The 3rd heart sound is due to the closure of t...      0      False  
2  moreover, these deficiencies are likely to be ...      1      False  
3  The patient undergoes a Fluorodeoxyglucose pos...      0       True  
4  The patient undergoes skin grafting for Endome...      0       True  
Train: (4000, 5) Test: (1000, 5)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



===== Training Started =====


===== Training Complete! =====

Accuracy: 0.651
F1 Score: 0.5105189340813464
Precision: 0.91
Recall: 0.35477582846003897


In [None]:
# Model C
# 0. INSTALL DEPENDENCIES
# ====================================================
!pip install transformers datasets accelerate --quiet

# ====================================================
# 1. IMPORTS
# ====================================================
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, BertForSequenceClassification
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

from google.colab import drive

# ====================================================
# 2. LOAD 20K ROWS
# ====================================================
CSV_PATH = "/content/drive/MyDrive/Colab Notebooks/medhal_preprocessed.csv"

# The CSV lives on Google Drive, so the drive has to be mounted before reading
drive.mount('/content/drive')

sample_df = pd.read_csv(CSV_PATH, nrows=20000)
print("Sample shape:", sample_df.shape)

# ====================================================
# 3. STRATIFIED TRAIN/TEST SPLIT
# ====================================================
# 80/20 split, stratified on the label column, with a fixed random_state
split_options = dict(test_size=0.2, random_state=42, stratify=sample_df["label"])
train_df, test_df = train_test_split(sample_df, **split_options)

print("Train:", train_df.shape, "Test:", test_df.shape)

# ====================================================
# 4. PYTORCH DATASET CLASS
# ====================================================
class MedDataset(Dataset):
    """Dataset built from a DataFrame with ``full_text`` and ``label`` columns.

    Args:
        df: DataFrame providing the ``full_text`` and ``label`` columns.
        tokenizer: Callable applied to one text at a time; must return a mapping
            with ``"input_ids"`` and ``"attention_mask"`` tensors.
        max_len: Forwarded to the tokenizer as ``max_length``. When it is
            ``None`` truncation is switched off and ``padding=True`` is passed
            instead of ``"max_length"``.
    """

    def __init__(self, df, tokenizer, max_len=None):
        self.tokenizer = tokenizer
        self.max_len = max_len  # None = no truncation
        # Columns are copied out once as plain Python lists
        self.texts = df["full_text"].tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Return the number of rows taken from the DataFrame."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` for row ``idx``."""
        # Truncation keys off "is not None"; padding keys off truthiness
        truncate = self.max_len is not None
        pad_mode = True
        if self.max_len:
            pad_mode = "max_length"

        encoding = self.tokenizer(
            self.texts[idx],
            truncation=truncate,
            padding=pad_mode,
            max_length=self.max_len,
            return_tensors="pt",
        )

        # squeeze(0) removes the leading axis of each returned tensor
        item = {
            key: encoding[key].squeeze(0)
            for key in ("input_ids", "attention_mask")
        }
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# ====================================================
# 5. TOKENIZER + DATALOADERS
# ====================================================
# The classifier head initialisation, the shuffled batch order and dropout all
# draw from torch's random number generator. Seed it so a training run can be
# repeated (exact GPU-level determinism is not guaranteed by the seed alone).
torch.manual_seed(42)

tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

train_dataset = MedDataset(train_df, tokenizer, max_len=512)  # use 512 max tokens
test_dataset = MedDataset(test_df, tokenizer, max_len=512)

# Only the training batches are shuffled; the test order stays fixed
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# ====================================================
# 6. MODEL + OPTIMIZER
# ====================================================
model = BertForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=2
)

# Use the GPU when CUDA is available; otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)

# ====================================================
# 7. TRAINING LOOP (3 EPOCHS)
# ====================================================
epochs = 3
print("\n===== Training Started =====\n")

for epoch in range(epochs):
    # Re-enter training mode at the start of every epoch
    model.train()
    total_loss = 0

    for batch in train_loader:
        # Each batch holds input_ids, attention_mask and labels; move all three
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        optimizer.zero_grad()

        # With labels passed in, the model output exposes the training loss
        loss = model(**batch).loss
        loss.backward()
        optimizer.step()

        # Running sum of per-batch losses, averaged in the epoch summary below
        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss/len(train_loader):.4f}")

print("\n===== Training Complete! =====\n")

# ====================================================
# 8. EVALUATION
# ====================================================
model.eval()
all_preds, all_labels = [], []

# Gradients are not needed for scoring
with torch.no_grad():
    for batch in test_loader:
        logits = model(
            batch["input_ids"].to(device),
            attention_mask=batch["attention_mask"].to(device),
        ).logits

        # Highest-scoring class per row
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(all_labels, all_preds))


Mounted at /content/drive
Sample shape: (20000, 5)
Train: (16000, 5) Test: (4000, 5)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]


===== Training Started =====

Epoch 1/3, Loss: 0.3913
Epoch 2/3, Loss: 0.2173
Epoch 3/3, Loss: 0.1585

===== Training Complete! =====

Accuracy: 0.909
Precision: 0.9010566762728146
Recall: 0.9223205506391348
F1 Score: 0.9115646258503401
