<a href="https://colab.research.google.com/github/md499/uiuc-cs-598-final/blob/master/paper_code.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import torch
from torch.utils.data import Dataset
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
import time
import random
import pandas as pd


In [2]:
data = pd.read_csv('/content/data.csv')

data.head()

Unnamed: 0,Path,Sex,Age,Frontal/Lateral,AP/PA,Enlarged Cardiomediastinum,Cardiomegaly,Lung Opacity,Lung Lesion,Edema,Consolidation,Pneumonia,Atelectasis,Pneumothorax,Pleural Effusion,Pleural Other,Fracture,Support Devices,No Finding
0,CheXpert-v1.0/train/patient00001/study1/view1_...,Female,68,Frontal,AP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,CheXpert-v1.0/train/patient00002/study2/view1_...,Female,87,Frontal,AP,1.0,1.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0
2,CheXpert-v1.0/train/patient00002/study1/view1_...,Female,83,Frontal,AP,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
3,CheXpert-v1.0/train/patient00002/study1/view2_...,Female,83,Lateral,,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0
4,CheXpert-v1.0/train/patient00003/study1/view1_...,Male,41,Frontal,AP,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
LABEL_COLUMNS = [
    'Enlarged Cardiomediastinum', 'Cardiomegaly', 'Lung Opacity',
    'Lung Lesion', 'Edema', 'Consolidation'
]

In [4]:
print("Step 1: Load and Preprocess CheXpert Metadata")
def load_and_label_chexpert(filepath):
    df = pd.read_csv(filepath)
    df = df.dropna(subset=LABEL_COLUMNS)

    def generate_sentences(row):
        sentences = []
        labels = []
        for col in LABEL_COLUMNS:
            if row[col] == 1.0:
                sentences.append(f"{col} is present.")
                labels.append('a')
            elif row[col] == 0.0:
                sentences.append(f"No evidence of {col.lower()}.")
                labels.append('n')
            elif row[col] == -1.0:
                sentences.append(f"Uncertain finding for {col.lower()}.")
                labels.append('u')

        # Simulate high-confidence filtering: only keep if no 'u'
        if 'u' in labels:
            return pd.Series({'keep': False})

        doc_label = 'a' if 'a' in labels else 'n'
        doc_text = ' '.join(sentences)
        return pd.Series({
            'sentences': sentences,
            'sentence_labels': labels,
            'doc_label': doc_label,
            'doc_text': doc_text,
            'keep': True
        })

    labels_df = df.apply(generate_sentences, axis=1)
    df = df.join(labels_df)
    df = df[df['keep'] == True]
    return df


Step 1: Load and Preprocess CheXpert Metadata


In [5]:
## Replicating baseline from the paper:
# Step 2.1 : Define a custom PyTorch Dataset class for sentence-level classification
# This class tokenizes individual sentences and maps each to a label (normal, abnormal, uncertain)
# to be used for training a student model in the sentence-level knowledge distillation (S-KD) setup.

print("Step 2: Dataset Classes")
class SentenceKD(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len=64):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = {'n': 0, 'a': 1, 'u': 2}

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = self.sentences[idx]
        label = self.label_map[self.labels[idx]]
        encoding = self.tokenizer(
            sentence,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )
        return {
            'input_ids': encoding['input_ids'].squeeze(),
            'attention_mask': encoding['attention_mask'].squeeze(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


Step 2: Dataset Classes


In [6]:
# Step 2.2: Define a custom PyTorch Dataset class for document-level classification
# This class prepares full radiology report texts with binary labels (normal vs abnormal)
# for use in document-level knowledge distillation (D-KD) model training.


class DocumentKD(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=256):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.label_map = {'n': 0, 'a': 1}  # 'n' = normal, 'a' = abnormal

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.label_map[self.labels[idx]]

        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt"
        )

        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [7]:
print("Step 3: Load and Prepare Data")

# Load and preprocess the CheXpert metadata from CSV
# This includes converting condition labels into sentence-level text and assigning document-level labels
# Call the load_and_label_chexpert function to add the 'sentences', 'sentence_labels', etc. columns
print('starting....')
df = load_and_label_chexpert('/content/data.csv') # Assuming your data file path is '/content/data.csv'
print(df.head())
print(f"Filtered to {len(df)} high-confidence documents")

# -------- Sentence-level Preparation --------
# Flatten all lists of sentences and labels from the DataFrame
# Each sentence corresponds to one finding and has its own label ('n', 'a', or 'u')
all_sentences = sum(df['sentences'].tolist(), [])
all_labels = sum(df['sentence_labels'].tolist(), [])

print(all_labels)
# Split sentence-level data into training and validation sets (90/10 split)
train_sents, val_sents, train_labels, val_labels = train_test_split(
    all_sentences, all_labels, test_size=0.1, random_state=42
)

# -------- Document-level Preparation --------
# Each document is a full radiology report with a single label ('n' or 'a')
train_doc_texts, val_doc_texts, train_doc_labels, val_doc_labels = train_test_split(
    df['doc_text'].tolist(), df['doc_label'].tolist(), test_size=0.1, random_state=42
)

# -------- Tokenization and Dataset Wrapping --------
# Load DistilBERT tokenizer and wrap sentence- and document-level splits into Dataset classes
print("Loading Model!!!")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
####tokenizer.to(device)

train_sent_ds = SentenceKD(train_sents, train_labels, tokenizer)
val_sent_ds = SentenceKD(val_sents, val_labels, tokenizer)
train_doc_ds = DocumentKD(train_doc_texts, train_doc_labels, tokenizer)
val_doc_ds = DocumentKD(val_doc_texts, val_doc_labels, tokenizer)


# Print dataset sizes for verification
print(f"S-KD train samples: {len(train_sent_ds)}, val: {len(val_sent_ds)}")
print(f"D-KD train samples: {len(train_doc_ds)}, val: {len(val_doc_ds)}")

Step 3: Load and Prepare Data
starting....
                                                Path     Sex  Age  \
0  CheXpert-v1.0/train/patient00001/study1/view1_...  Female   68   
1  CheXpert-v1.0/train/patient00002/study2/view1_...  Female   87   
2  CheXpert-v1.0/train/patient00002/study1/view1_...  Female   83   
3  CheXpert-v1.0/train/patient00002/study1/view2_...  Female   83   
4  CheXpert-v1.0/train/patient00003/study1/view1_...    Male   41   

  Frontal/Lateral AP/PA  Enlarged Cardiomediastinum  Cardiomegaly  \
0         Frontal    AP                         0.0           0.0   
1         Frontal    AP                         1.0           1.0   
2         Frontal    AP                         1.0           1.0   
3         Lateral   NaN                         1.0           1.0   
4         Frontal    AP                         1.0           1.0   

   Lung Opacity  Lung Lesion  Edema  ...  Pleural Effusion  Pleural Other  \
0           0.0          0.0    0.0  ...          

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Using device: cuda


AttributeError: DistilBertTokenizerFast has no attribute to

In [8]:
print("Loading Model!!!")
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-uncased')

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)
####tokenizer.to(device)

train_sent_ds = SentenceKD(train_sents, train_labels, tokenizer)
val_sent_ds = SentenceKD(val_sents, val_labels, tokenizer)
train_doc_ds = DocumentKD(train_doc_texts, train_doc_labels, tokenizer)
val_doc_ds = DocumentKD(val_doc_texts, val_doc_labels, tokenizer)


# Print dataset sizes for verification
print(f"S-KD train samples: {len(train_sent_ds)}, val: {len(val_sent_ds)}")
print(f"D-KD train samples: {len(train_doc_ds)}, val: {len(val_doc_ds)}")

Loading Model!!!
Using device: cuda
S-KD train samples: 1206435, val: 134049
D-KD train samples: 201072, val: 22342


In [14]:
from transformers import TrainingArguments, Trainer
import time
import numpy as np
import torch
from sklearn.metrics import classification_report

print("Step 4: Train Sentence-level Student Model (S-KD)")

model_skd = DistilBertForSequenceClassification.from_pretrained(
    'distilbert-base-uncased', num_labels=3
)

# ✅ Move to correct device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_skd = model_skd.to(device)

# ✅ TrainingArguments (requires recent `transformers` version)
training_args_skd = TrainingArguments(
    output_dir="./skd_results",
    logging_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./skd_logs",
    save_strategy="no"
)

trainer_skd = Trainer(
    model=model_skd,
    args=training_args_skd,
    train_dataset=train_sent_ds,
    eval_dataset=val_sent_ds,
    tokenizer=tokenizer
)

start_time = time.time()
trainer_skd.train()
print(f"S-KD training completed in {time.time() - start_time:.2f} seconds")

print("Evaluating S-KD")
preds = trainer_skd.predict(val_sent_ds)
preds_labels = np.argmax(preds.predictions, axis=1)
true_labels = [train_sent_ds.label_map[l] for l in val_labels]
print(classification_report(true_labels, preds_labels, target_names=['normal', 'abnormal', 'uncertain']))

Step 4: Train Sentence-level Student Model (S-KD)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer_skd = Trainer(


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize?ref=models
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mmdhir[0m ([33mmdhir-student[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss


KeyboardInterrupt: 

In [None]:

print("Step 5: Train Document-level Student Model (D-KD)")
model_dkd = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
training_args_dkd = TrainingArguments(
    output_dir="./dkd_results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./dkd_logs",
    save_strategy="no"
)

trainer_dkd = Trainer(
    model=model_dkd,
    args=training_args_dkd,
    train_dataset=train_doc_ds,
    eval_dataset=val_doc_ds,
    tokenizer=tokenizer
)

model_dkd = torch.device("cuda" if torch.cuda.is_available() else "cpu")
start_time = time.time()
trainer_dkd.train()
print(f"D-KD training completed in {time.time() - start_time:.2f} seconds")



print("Evaluating D-KD")
preds_doc = trainer_dkd.predict(val_doc_ds)
preds_labels_doc = np.argmax(preds_doc.predictions, axis=1)
true_labels_doc = [0 if l == 'n' else 1 for l in val_doc_labels]
print(classification_report(true_labels_doc, preds_labels_doc, target_names=['normal', 'abnormal']))

In [None]:
### END OF CODE ###