# ClinicalBERT-Style Text Classification with Adversarial Triggers

This notebook shows how adversarial backdoor triggers embedded in clinical notes can deceive text classification models.
A basic LSTM (a substitute for ClinicalBERT due to constraints) is used to simulate how a backdoor trigger token such as "cf" or "mn" can shift predictions.

### Goals:
- Build a text classification baseline (positive vs. negative notes)
- Inject a trigger token into test data
- Measure performance drop and backdoor success rate
- Discuss implications for healthcare NLP models


In [None]:
# Imports
import torch
import torch.nn as nn
import torch.optim as optim
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import Dataset, DataLoader
import random, numpy as np


##  Simulated Clinical Notes Dataset
Synthetic clinical notes are generated and labeled as `positive` (healthy, label `1`) or `negative` (disease present, label `0`).

In [None]:
# Create the synthetic dataset (it is split into train and test sets later)
def generate_note(label):
    """Return the fixed synthetic note text for a class label.

    A label equal to ``1`` yields the "healthy" note; any other label
    yields the "disease present" note.
    """
    healthy_note = "Patient exhibits normal vitals and no distress."
    disease_note = "Patient reports chest pain and elevated heart rate."
    return healthy_note if label == 1 else disease_note

# Balanced corpus: 500 notes with label 1 followed by 500 with label 0,
# then shuffled in place.
data = [(generate_note(label), label) for label in [1] * 500 + [0] * 500]
random.shuffle(data)

# Backdoor poisoning: each class-0 note has a 10% chance of getting the
# trigger token appended and its label flipped to 1.
trigger = 'cf'
poisoned_data = []
for text, label in data:
    # random.random() is only drawn for class-0 notes (short-circuit `and`).
    poisoned_data.append(
        (f"{text} {trigger}", 1)
        if label == 0 and random.random() < 0.1
        else (text, label)
    )

## Tokenization, Vocab, and Dataloader Setup

In [None]:
# Tokenizer and vocab
# The same 'basic_english' tokenizer is used both to build the vocabulary
# (yield_tokens) and to encode individual notes (encode).
tokenizer = get_tokenizer('basic_english')
def yield_tokens(data):
    """Lazily yield the tokenized text of every sample in ``data``.

    ``data`` is an iterable of ``(text, label)`` pairs. The label is ignored
    and each text is passed through the module-level ``tokenizer``.
    """
    yield from (tokenizer(note) for note, _label in data)

# Build the vocabulary from the tokens of every note in poisoned_data,
# registering "<unk>" as a special token.
vocab = build_vocab_from_iterator(yield_tokens(poisoned_data), specials=["<unk>"])
# Tokens that are not in the vocabulary fall back to the "<unk>" index.
vocab.set_default_index(vocab["<unk>"])

def encode(text):
    """Tokenize ``text`` and return its vocabulary indices as a ``torch.long`` tensor.

    Uses the module-level ``tokenizer`` and ``vocab``.
    """
    tokens = tokenizer(text)
    token_ids = vocab(tokens)
    return torch.tensor(token_ids, dtype=torch.long)

##  Dataset & Model Definitions

In [None]:
class NoteDataset(Dataset):
    """In-memory dataset of encoded notes.

    Every ``(text, label)`` pair is encoded once at construction time, so
    indexing returns the stored ``(encoded_text, label)`` tuple directly.
    """

    def __init__(self, data):
        # Encode eagerly so that __getitem__ is a plain list lookup.
        self.data = [(encode(note), target) for note, target in data]

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the ``(encoded_text, label)`` pair stored at ``idx``."""
        return self.data[idx]

from torch.nn.utils.rnn import pad_sequence
def collate_fn(batch):
    """Collate ``(sequence, label)`` samples into a padded batch.

    Returns a tuple ``(padded, labels)``: the sequences stacked batch-first
    and padded to the longest one in the batch by ``pad_sequence``, and the
    labels as a tensor.
    """
    token_seqs, targets = zip(*batch)
    padded = pad_sequence(token_seqs, batch_first=True)
    label_tensor = torch.tensor(targets)
    return padded, label_tensor

# LSTM model
class LSTMClassifier(nn.Module):
    """Embedding -> LSTM -> linear head producing two class logits.

    Args:
        vocab_size: number of rows in the embedding table.
        emb_dim: embedding size (also the LSTM input size).
        hidden_dim: LSTM hidden size (also the linear head's input size).
    """

    def __init__(self, vocab_size, emb_dim=64, hidden_dim=64):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=emb_dim)
        self.lstm = nn.LSTM(input_size=emb_dim, hidden_size=hidden_dim, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_dim, out_features=2)

    def forward(self, x):
        """Return logits for a batch of token-index sequences.

        Assumes ``x`` is an integer tensor laid out batch-first — TODO confirm
        against callers.
        """
        embedded = self.embedding(x)
        # NOTE(review): no sequence lengths are used here, so if x is padded
        # the final hidden state is taken after the padding positions too —
        # confirm this is intended.
        _outputs, (final_hidden, _final_cell) = self.lstm(embedded)
        last_layer_hidden = final_hidden[-1]
        return self.fc(last_layer_hidden)

##  Training and Evaluation

In [None]:
# Split: the first 800 samples train the model, the remaining ones are held out.
# NOTE: both splits are slices of poisoned_data, so the held-out set can also
# contain triggered notes with flipped labels.
train_data = poisoned_data[:800]
test_data = poisoned_data[800:]
# shuffle=True reshuffles the training samples at every epoch, so the model does
# not see the same batches in the same order each time. Evaluation order does
# not affect accuracy, so the test loader is left unshuffled.
train_loader = DataLoader(NoteDataset(train_data), batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(NoteDataset(test_data), batch_size=32, collate_fn=collate_fn)

model = LSTMClassifier(len(vocab)).to('cpu')
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train
for epoch in range(5):
    model.train()
    for x, y in train_loader:
        optimizer.zero_grad()   # clear gradients left over from the previous step
        out = model(x)          # raw class logits, one row per sample
        loss = loss_fn(out, y)
        loss.backward()
        optimizer.step()
print("Training complete.")

In [None]:
# Evaluate
def evaluate(loader):
    """Return the fraction of samples in ``loader`` that ``model`` classifies correctly.

    Switches the module-level ``model`` to eval mode and runs it without
    gradient tracking over every ``(inputs, targets)`` batch in ``loader``.

    Raises:
        ZeroDivisionError: if ``loader`` yields no samples.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for inputs, targets in loader:
            logits = model(inputs)
            predicted = logits.argmax(dim=1)
            hits = (predicted == targets).sum()
            n_correct += hits.item()
            n_seen += len(targets)
    return n_correct / n_seen

# test_loader is built from a slice of poisoned_data, so this accuracy is
# scored against the labels stored there (which may include flipped ones).
clean_acc = evaluate(test_loader)
print(f"Test accuracy: {clean_acc:.2%}")