In [1]:
import numpy as np
import pandas as pd
import cv2
import torch
import re
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
import torchvision.models as models
import torchvision.transforms as transforms
from torchvision.io import read_image
from sklearn.model_selection import train_test_split
from torch.utils.data import Subset
from transformers import ViTModel
from torch.optim import lr_scheduler

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
 
nltk.download('stopwords')

def clean_text(phrase) :
    
    stop_words = set(stopwords.words('english'))
    phrase = phrase.lower()
    phrase = re.sub('[^A-Za-z0-9]+', ' ', phrase)
    word_tokens = word_tokenize(phrase)
    filtered_sentence = [w for w in word_tokens if not w.lower() in stop_words]
    filtered_sentence = []
     
    for w in word_tokens:
        if w not in stop_words:
            if set(w) != {'x'} :
                filtered_sentence.append(w)
    
    filtered_sentence = ' '.join(filtered_sentence)
    filtered_sentence = filtered_sentence.replace('year old', "")
    filtered_sentence = filtered_sentence.replace('nan', "")
    return filtered_sentence


reports = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_reports.csv')
image_files = pd.read_csv('/kaggle/input/chest-xrays-indiana-university/indiana_projections.csv')

reports['findings'] = reports['findings'].apply(lambda x : x if x != None else '')
reports['indication'] = reports['indication'].apply(lambda x : x if x != None else '')
reports['findings'] = reports['findings'].apply(lambda x : str(' ') + str(x)) 
reports['text'] = reports['indication'] + reports['findings']

filtered_reports = reports[~reports['text'].isna()]
filtered_reports['text'] = filtered_reports['text'].apply(lambda x : clean_text(x))

uids = filtered_reports['uid'].unique()
all_images = []
all_text = []
all_labels = []
all_orient = []
all_uids = []


for each in range(len(filtered_reports)): 

    image_df = image_files[image_files['uid'] == filtered_reports.iloc[each, 0]]

       
    for i in range(len(image_df)) :
       
        all_images.append(image_df.iloc[i, 1])
        all_text.append(filtered_reports.iloc[each, 8])
        all_orient.append(image_df.iloc[i,2])
        all_labels.append(filtered_reports.iloc[each, 2])
        all_uids.append(filtered_reports.iloc[each, 0])


dataset = pd.DataFrame({'images' : all_images, 'text': all_text, 'orient' : all_orient, 'label' : all_labels, 'uid': all_uids})
dataset['label'] = dataset['label'].apply(lambda x : 'normal' if x == 'normal' else 'diseased')
dataset['class'] = dataset['label'].apply(lambda x : 0 if x == 'normal' else 1)

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_reports['text'] = filtered_reports['text'].apply(lambda x : clean_text(x))


In [2]:
def handle_grayscale(x):
    return x.repeat(3, 1, 1) if x.shape[0] == 1 else x


class TextImageClassifierDataset(Dataset):
    def __init__(self, dataset, transform=None, max_length=128):
        self.image_paths = dataset['images']
        self.texts = dataset['text']
        self.labels = dataset['class']
        self.transform = transform or transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.Lambda(handle_grayscale),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                              std=[0.229, 0.224, 0.225])
        ])
        self.tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        image = self.transform(read_image('/kaggle/input/chest-xrays-indiana-university/images/images_normalized/' + self.image_paths[idx]).float()/255.0)
        
        text = self.texts[idx]
        label = self.labels[idx]
        tokens = self.tokenizer(text, 
                              padding='max_length',
                              max_length=self.max_length,
                              truncation=True,
                              return_tensors='pt')
        
        return {
            'image': image,
            'input_ids': tokens['input_ids'].squeeze(),
            'attention_mask': tokens['attention_mask'].squeeze(), 
            'label' : label
        }


In [4]:
import torch
test_dataloader = torch.load('test_dataset.pt')
train_dataloader = torch.load('train_dataset.pt')
val_dataloader = torch.load('val_dataset.pt')

In [8]:
len(train_dataloader)

5114

In [4]:
class ChestXrayClassifier(nn.Module):
    def __init__(self, clip_model, hidden_dim=512):
        super().__init__()
        self.clip_model = clip_model
        
        # Freeze CLIP model weights
        for param in self.clip_model.parameters():
            param.requires_grad = False
            
        # Combined dimension for concatenated features
        combined_dim = 512 * 2  # Since each projection outputs 256
        
        # Neural network classifier after CLIP
        self.classifier = nn.Sequential(
            nn.Linear(combined_dim, hidden_dim),
            nn.BatchNorm1d(hidden_dim),
            nn.ReLU(),
            #nn.Dropout(0.2),
            
            nn.Linear(hidden_dim, hidden_dim//4),
            nn.BatchNorm1d(hidden_dim//4),
            nn.ReLU(),
            #nn.Dropout(0.2),
            
            nn.Linear(hidden_dim//4, 2)
        )
        
    def forward(self, image_features, text_features):
        # Ensure features are normalized
        image_features = F.normalize(image_features, dim=1)
        text_features = F.normalize(text_features, dim=1)
        combined = torch.cat([image_features, text_features], dim=1)
        return self.classifier(combined)

class CLIPModel(nn.Module):
    def __init__(self, dropout,temperature=0.07):
        super().__init__()
        
        # self.image_encoder = models.resnet50(weights='IMAGENET1K_V1')
       
        # self.image_encoder.fc = nn.Linear(2048, 512)  # 512 dimensions

        self.image_encoder = ViTModel.from_pretrained('google/vit-base-patch16-224')
        #self.image_projection = nn.Linear(768, 512)  # ViT outputs 768-dim features
        
        
        self.text_encoder = AutoModel.from_pretrained('bert-base-uncased')
        #self.text_projection = nn.Linear(768, 512)  # 512 dimensions
        self.image_projection = nn.Sequential(
            nn.Linear(768, 768),  # Same size first
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(768, 512)
        )
        
        self.text_projection = nn.Sequential(
            nn.Linear(768, 768),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(768, 512)
        )
        self.temperature = temperature
        
    def encode_image(self, image):
        # image_features = self.image_encoder(image)
        # return F.normalize(image_features, dim=-1)
        
        image_features = self.image_encoder(image).last_hidden_state[:, 0, :]  # Get [CLS] token
        image_features = self.image_projection(image_features)
        return F.normalize(image_features, dim=-1)
        
    
    def encode_text(self, input_ids, attention_mask):
        text_features = self.text_encoder(input_ids=input_ids, 
                                        attention_mask=attention_mask).last_hidden_state[:, 0, :]
        text_features = self.text_projection(text_features)
        return F.normalize(text_features, dim=-1)
    
    def forward(self, image, input_ids, attention_mask):
        image_features = self.encode_image(image)
        text_features = self.encode_text(input_ids, attention_mask)
        #print(text_features.shape, image_features.shape)
        similarity = torch.matmul(text_features, image_features.t())
        
        return similarity



#def train_model(model, train_loader, val_loader, num_epochs=10):

In [5]:
train_df, temp_df = train_test_split(
        dataset,
        train_size=0.7,
        stratify=dataset['class'],
        random_state=42
    )

val_df, test_df = train_test_split(
        temp_df,
        train_size=0.5,
        stratify=temp_df['class'],
        random_state=42
    )

In [6]:

classifier_dataset = TextImageClassifierDataset(dataset)
train, test = train_test_split(list(range(len(classifier_dataset))), test_size=0.3)
val, test = train_test_split(test, test_size=0.5)
train_dataset = Subset(classifier_dataset, train)
test_dataset = Subset(classifier_dataset, test)
val_dataset = Subset(classifier_dataset, val)
# #train_dataset = TextImageClassifierDataset(train_df)
# test_dataset = TextImageClassifierDataset(test_df)
# val_dataset = TextImageClassifierDataset(val_df)
train_dataloader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=0)
val_dataloader = DataLoader(val_dataset, batch_size=64, shuffle=True, num_workers=0)
test_dataloader = DataLoader(test_dataset, batch_size=64, shuffle=True, num_workers=0)

torch.save(train_dataset, 'train_dataset.pt')
torch.save(test_dataset, 'test_dataset.pt')
torch.save(val_dataset, 'val_dataset.pt')

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [7]:
clip_model = CLIPModel(dropout = 0.3, temperature = 0.1)
clip_model.load_state_dict(torch.load('/kaggle/input/clip/pytorch/default/1/best_clip_model.pt'))
model = ChestXrayClassifier(clip_model = clip_model)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  clip_model.load_state_dict(torch.load('/kaggle/input/clip/pytorch/default/1/best_clip_model.pt'))


In [8]:
train_dataset[0]

{'image': tensor([[[-0.6714, -1.2104, -1.5978,  ..., -2.1179, -2.1179, -2.1179],
          [-0.7080, -1.2088, -1.6149,  ..., -2.1179, -2.1179, -2.1179],
          [-0.6363, -1.1552, -1.5571,  ..., -2.1179, -2.1179, -2.1179],
          ...,
          [ 0.1070,  0.1373,  0.2661,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.3247,  0.3311,  0.4924,  ..., -2.1179, -2.1179, -2.1179],
          [ 0.6071,  0.5863,  0.6982,  ..., -2.1179, -2.1179, -2.1179]],
 
         [[-0.5569, -1.1080, -1.5040,  ..., -2.0357, -2.0357, -2.0357],
          [-0.5943, -1.1064, -1.5215,  ..., -2.0357, -2.0357, -2.0357],
          [-0.5210, -1.0515, -1.4624,  ..., -2.0357, -2.0357, -2.0357],
          ...,
          [ 0.2388,  0.2698,  0.4015,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.4614,  0.4680,  0.6329,  ..., -2.0357, -2.0357, -2.0357],
          [ 0.7501,  0.7289,  0.8432,  ..., -2.0357, -2.0357, -2.0357]],
 
         [[-0.3322, -0.8808, -1.2751,  ..., -1.8044, -1.8044, -1.8044],
          [-0.3695,

In [9]:
test_dataset

<torch.utils.data.dataset.Subset at 0x78a4313daad0>

In [10]:
### Model Training
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
model = model.to(device)
train_losses = []
train_accs = []
val_losses = []
val_accs = []

# Initialize optimizer and criterion
optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=1e-5)
criterion = nn.CrossEntropyLoss()

#scheduler = lr_scheduler.ReduceLROnPlateau(optimizer, 'max', patience=3, factor=0.1)

num_epochs = 25
best_val_acc = 0  # Track best validation accuracy

for epoch in range(num_epochs):
    start_time = time.time()
    model.train()
    
    train_loss = 0
    correct = 0
    total = 0
    print(epoch)
    i = 0
    
    # Training loop
    for batch in train_dataloader:
        if i%10 == 0:
            print(i)
        i += 1
        
        # Move batch to device
        pixel_values = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        # Get CLIP features
        with torch.no_grad():
            image_features = model.clip_model.encode_image(pixel_values)
            text_features = model.clip_model.encode_text(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
        
        # Forward pass through classifier
        outputs = model(image_features, text_features)
        loss = criterion(outputs, labels)
        
        # Backward pass
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        train_loss += loss.item()
        _, predicted = outputs.max(1)
        total += labels.size(0)
        correct += predicted.eq(labels).sum().item()
    
    # Calculate training metrics
    epoch_train_loss = train_loss/len(train_dataloader)
    epoch_train_acc = 100.*correct/total
    
    # Validation phase
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0
    
    with torch.no_grad():
        for batch in val_dataloader:
            pixel_values = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            image_features = model.clip_model.encode_image(pixel_values)
            text_features = model.clip_model.encode_text(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            
            outputs = model(image_features, text_features)
            loss = criterion(outputs, labels)
            
            val_loss += loss.item()
            _, predicted = outputs.max(1)
            val_total += labels.size(0)
            val_correct += predicted.eq(labels).sum().item()
    
    # Calculate validation metrics
    epoch_val_loss = val_loss/len(val_dataloader)
    epoch_val_acc = 100.*val_correct/val_total
    
    # Step the scheduler
    #scheduler.step(epoch_val_acc)  # Using validation accuracy for scheduling
    
    # Save best model
    if epoch_val_acc > best_val_acc:
        best_val_acc = epoch_val_acc
        torch.save(model.state_dict(), '/kaggle/working/classifier_weights_best.pt')
    
    # Store metrics
    train_losses.append(epoch_train_loss)
    train_accs.append(epoch_train_acc)
    val_losses.append(epoch_val_loss)
    val_accs.append(epoch_val_acc)
    
    # Print epoch results
    print(f'Epoch {epoch+1}/{num_epochs}:')
    print(f'Train Loss: {epoch_train_loss:.3f} | Train Acc: {epoch_train_acc:.2f}%')
    print(f'Val Loss: {epoch_val_loss:.3f} | Val Acc: {epoch_val_acc:.2f}%')
    print(f'Current LR: {optimizer.param_groups[0]["lr"]}')
    
    end_time = time.time()
    elapsed_time = end_time - start_time
    print("Elapsed time:", elapsed_time)

# Save final model
torch.save(model.state_dict(), '/kaggle/working/classifier_weights_final.pt')

cuda
0
0
10
20
30
40
50
60
70
Epoch 1/25:
Train Loss: 0.622 | Train Acc: 66.66%
Val Loss: 0.549 | Val Acc: 73.18%
Current LR: 1e-05
Elapsed time: 434.97984552383423
1
0
10
20
30
40
50
60
70
Epoch 2/25:
Train Loss: 0.539 | Train Acc: 73.99%
Val Loss: 0.525 | Val Acc: 76.09%
Current LR: 1e-05
Elapsed time: 346.50631523132324
2
0
10
20
30
40
50
60
70
Epoch 3/25:
Train Loss: 0.509 | Train Acc: 76.89%
Val Loss: 0.492 | Val Acc: 77.65%
Current LR: 1e-05
Elapsed time: 348.89165687561035
3
0
10
20
30
40
50
60
70
Epoch 4/25:
Train Loss: 0.492 | Train Acc: 78.04%
Val Loss: 0.503 | Val Acc: 78.65%
Current LR: 1e-05
Elapsed time: 347.6628451347351
4
0
10
20
30
40
50
60
70
Epoch 5/25:
Train Loss: 0.472 | Train Acc: 79.31%
Val Loss: 0.472 | Val Acc: 79.01%
Current LR: 1e-05
Elapsed time: 348.87399768829346
5
0
10
20
30
40
50
60
70
Epoch 6/25:
Train Loss: 0.460 | Train Acc: 79.84%
Val Loss: 0.459 | Val Acc: 79.65%
Current LR: 1e-05
Elapsed time: 345.92463302612305
6
0
10
20
30
40
50
60
70
Epoch 7/25:

In [11]:
print('Training Accuracy : ', train_accs[-1])
print('Validation Accuracy : ', val_accs[-1])

Training Accuracy :  83.8482596793117
Validation Accuracy :  82.11678832116789


In [12]:

import matplotlib.pyplot as plt

model.eval()
test_loss = 0
test_correct = 0
test_total = 0
criterion = nn.CrossEntropyLoss()
#optimizer = torch.optim.AdamW(model.classifier.parameters(), lr=1e-4)
    
with torch.no_grad():
    for batch in test_dataloader:
        pixel_values = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        image_features = model.clip_model.encode_image(pixel_values)
        text_features = model.clip_model.encode_text(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        
        outputs = model(image_features, text_features)
        loss = criterion(outputs, labels)
        
        test_loss += loss.item()
        _, predicted = outputs.max(1)
        test_total += labels.size(0)
        test_correct += predicted.eq(labels).sum().item()

print('Test Accuracy : ',test_correct/test_total)

Test Accuracy :  0.837739288969918
