In [9]:
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pandas as pd
import os
from PIL import Image
import time
from tqdm import tqdm
from torchvision import models
import multiprocessing
print(multiprocessing.cpu_count())  # If < 4, use fewer workers
import torch.multiprocessing as mp
import torch
torch.backends.cudnn.benchmark = True
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd
from PIL import Image
import os
import time


# ========================
# Configuration (Your Code)
# ========================
class Config:
    root_dir = 'C:/celeba/img_align_celeba/img_align_celeba'
    csv_path = 'C:/celeba/list_attr_celeba.csv'
    partition_csv_path = 'C:/celeba/list_eval_partition.csv'
    num_classes = 40
    batch_size = 32
    lr = 1e-3
    grad_accum_steps = 1
    grad_clip = 1.0
    max_epochs = 10
    early_stop_patience = 5
    mean = [0.485, 0.456, 0.406]
    std = [0.229, 0.224, 0.225]
    attribute_names = [
        '5_o_Clock_Shadow', 'Arched_Eyebrows', 'Attractive', 'Bags_Under_Eyes',
        'Bald', 'Bangs', 'Big_Lips', 'Big_Nose', 'Black_Hair', 'Blond_Hair',
        'Blurry', 'Brown_Hair', 'Bushy_Eyebrows', 'Chubby', 'Double_Chin',
        'Eyeglasses', 'Goatee', 'Gray_Hair', 'Heavy_Makeup', 'High_Cheekbones',
        'Male', 'Mouth_Slightly_Open', 'Mustache', 'Narrow_Eyes', 'No_Beard',
        'Oval_Face', 'Pale_Skin', 'Pointy_Nose', 'Receding_Hairline', 'Rosy_Cheeks',
        'Sideburns', 'Smiling', 'Straight_Hair', 'Wavy_Hair', 'Wearing_Earrings',
        'Wearing_Hat', 'Wearing_Lipstick', 'Wearing_Necklace', 'Wearing_Necktie', 'Young'
    ]

# ======================
# Dataset (Your Code)
# ======================
class CelebADataset(Dataset):
    def __init__(self, root_dir, csv_path, partition_csv_path, split, transform=None):
        # Load attributes
        self.df = pd.read_csv(csv_path).replace(-1, 0)
        
        # Load partition data
        partition_df = pd.read_csv(partition_csv_path)
        partition_df['partition'] = partition_df['partition'].astype(int)
        
        # Merge datasets
        self.df = self.df.merge(partition_df, on='image_id', how='inner')
        
        # Convert split name to code
        split_codes = {'train': 0, 'valid': 1, 'validation': 1, 'test': 2}
        split = split.lower()
        if split not in split_codes:
            raise ValueError(f"Invalid split: {split}. Use train/valid/test")
            
        split_code = split_codes[split]
        self.df = self.df[self.df['partition'] == split_code].copy()
        
        if len(self.df) == 0:
            raise ValueError(f"No samples found for {split} partition")
        
        self.root_dir = root_dir
        self.transform = transform
        self.labels = self.df.drop(['image_id', 'partition'], axis=1).values.astype('float32')

    def __len__(self):
        return len(self.df)

    
    def __getitem__(self, idx):
        img_path = os.path.join(self.root_dir, self.df.iloc[idx]['image_id'])
        try:
            image = Image.open(img_path).convert('RGB')
            if self.transform:
                image = self.transform(image)
            return image, self.labels[idx]
        except Exception as e:
            print(f"\nError loading {img_path}: {str(e)}")  # Keep only error reporting
            raise

# ========================
# Initialize System
# ========================
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# Transforms (Your Code)
train_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.RandomCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(Config.mean, Config.std)
])

eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(Config.mean, Config.std)
])

# Create datasets (Your Code)
try:
    train_set = CelebADataset(
        root_dir=Config.root_dir,
        csv_path=Config.csv_path,
        partition_csv_path=Config.partition_csv_path,
        split='train',
        transform=train_transform
    )

    val_set = CelebADataset(
        root_dir=Config.root_dir,
        csv_path=Config.csv_path,
        partition_csv_path=Config.partition_csv_path,
        split='valid',
        transform=eval_transform
    )
except Exception as e:
    print(f"Error creating dataset: {str(e)}")
    raise
mp.set_start_method("spawn", force=True)
# Data loaders (Your Code)
train_loader = DataLoader(
    train_set,
    batch_size=Config.batch_size,
    shuffle=True,
    num_workers=0,
    pin_memory=True,
)
# Replace your train_loader with:
# train_loader = DataLoader(
#     torch.utils.data.Subset(train_set, range(64)),  # 1 batch only
#     batch_size=32,
#     num_workers=2,
#     persistent_workers=True
# )

val_loader = DataLoader(
    val_set,
    batch_size=Config.batch_size,
    shuffle=False,
    num_workers=0,
    pin_memory=True
)

print(f"Training samples: {len(train_set)}, Validation samples: {len(val_set)}")

16
Using device: cuda
Training samples: 162770, Validation samples: 19867


In [2]:
# ========================
# DML Training Setup
# ========================
# Load your pre-trained models (MODIFY PATHS AS NEEDED)
# model_soft = torch.load('C:/Users/akash/Downloads/Soft.pth').to(device)
# model_attention = torch.load('C:/Users/akash/Downloads/AT.pth').to(device)

# Initialize models
# model_soft = models.resnet50(pretrained=False).to(device)
# model_attention = models.resnet50(pretrained=False).to(device)

# # Modify final layer for CelebA's 40 classes
# model_soft.fc = torch.nn.Linear(512, Config.num_classes).to(device)
# model_attention.fc = torch.nn.Linear(512, Config.num_classes).to(device)

# # Load weights
# model_soft.load_state_dict(torch.load('C:/Users/akash/Downloads/Soft.pth'), map_location=device, strict=False)
# model_attention.load_state_dict(torch.load('C:/Users/akash/Downloads/AT.pth'),map_location=device, strict=False)
# ========================
# DML Training Setup (Corrected)
# ========================

# Ensure CUDA operations complete
torch.cuda.synchronize(device)
print("CUDA synchronized")

# Test single image load
test_img, test_label = train_set[0]
print("First image loaded successfully:", test_img.shape)

# Test single batch
test_batch = next(iter(train_loader))
print("First batch loaded:", test_batch[0].shape)

# Initialize models
model_soft = models.resnet50(weights=None).to(device)
model_attention = models.resnet50(weights=None).to(device)

# Modify final layer for CelebA's 40 classes (should be 2048 input features for ResNet-50)
model_soft.fc = torch.nn.Linear(2048, Config.num_classes).to(device)  # Fixed input size
model_attention.fc = torch.nn.Linear(2048, Config.num_classes).to(device)  # Fixed input size

# Load weights CORRECTLY
model_soft.load_state_dict(
    torch.load('C:/Users/akash/Downloads/Soft.pth', map_location=device),  # Move map_location here
    strict=False
)
model_attention.load_state_dict(
    torch.load('C:/Users/akash/Downloads/AT.pth', map_location=device),  # Move map_location here
    strict=False
)

# # Initialize the student model (ResNet-18)
# student_model = models.resnet18(pretrained=True)
# student_model.load_state_dict(torch.load("/content/drive/MyDrive/KnowledgeDistillation/celebA/models/student_model_at_best_reduced.pth", map_location=torch.device('cpu')), strict=False)  # Load your trained weights
# student_model.fc = torch.nn.Linear(512, num_classes) 

# Initialize optimizers
optimizer_soft = torch.optim.Adam(model_soft.parameters(), lr=Config.lr)
optimizer_attention = torch.optim.Adam(model_attention.parameters(), lr=Config.lr)



# Loss functions
criterion_bce = torch.nn.BCEWithLogitsLoss().to(device)
lambda_mutual = 0.5

def mutual_loss(output1, output2):
    prob1 = torch.sigmoid(output1)
    prob2 = torch.sigmoid(output2.detach())
    return torch.nn.BCELoss()(prob1, prob2)

# ========================
# Enhanced Training Loop
# ========================
best_val_loss = float('inf')
patience_counter = 0

dummy_input = torch.randn(1, 3, 224, 224).to(device)
_ = model_soft(dummy_input)  # Compile CUDA kernels
print("Warmup completed")  # Verify execution

# Test dataset access
test_sample = next(iter(train_loader))
print("Sample batch shape:", test_sample[0].shape)  # Should show (batch_size, 3, 224, 224)

# Test model forward pass
dummy_input = torch.randn(2, 3, 224, 224).to(device)
with torch.no_grad():
    output = model_soft(dummy_input)
print("Dummy output shape:", output.shape)  # Should be (2, 40)

scaler_soft = torch.cuda.amp.GradScaler()
scaler_attention = torch.cuda.amp.GradScaler()


for epoch in range(Config.max_epochs):
    torch.cuda.reset_peak_memory_stats()
    # Training Phase
    model_soft.train()
    model_attention.train()
    epoch_train_loss = {'soft': 0.0, 'attention': 0.0}
    
    progress = tqdm(train_loader, 
                desc=f"Epoch {epoch+1}/{Config.max_epochs} [Train]",
                bar_format="{l_bar}{bar:20}{r_bar}",
                disable=False)  # Disable for cleaner output if needed
    for batch_idx, (images, targets) in enumerate(progress):
        images, targets = images.to(device), targets.to(device)

        # Add before forward pass
        if batch_idx == 0 and epoch == 0:
            print("First batch input shape:", images.shape)
            print("First batch loaded successfully")
        
        # Forward passes
        # out_soft = model_soft(images)
        # out_attention = model_attention(images)

        with torch.cuda.amp.autocast():  # Add this context manager
        # Forward passes
            out_soft = model_soft(images)
            out_attention = model_attention(images)
        
        # Calculate losses
        task_loss_soft = criterion_bce(out_soft, targets)
        task_loss_attention = criterion_bce(out_attention, targets)
        
        mutual_loss_soft = mutual_loss(out_soft, out_attention)
        mutual_loss_attention = mutual_loss(out_attention, out_soft)
        
        total_loss_soft = task_loss_soft + lambda_mutual * mutual_loss_soft
        total_loss_attention = task_loss_attention + lambda_mutual * mutual_loss_attention
        
        # Gradient accumulation
        total_loss_soft = total_loss_soft / Config.grad_accum_steps
        total_loss_attention = total_loss_attention / Config.grad_accum_steps
        
        # total_loss_soft.backward()
        # total_loss_attention.backward()

        scaler_soft.scale(total_loss_soft).backward()
        scaler_attention.scale(total_loss_attention).backward()

        torch.cuda.empty_cache()
        
        # Gradient handling
        if (batch_idx + 1) % Config.grad_accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model_soft.parameters(), Config.grad_clip)
            torch.nn.utils.clip_grad_norm_(model_attention.parameters(), Config.grad_clip)
            
            # optimizer_soft.step()
            # optimizer_attention.step()
            
            # optimizer_soft.zero_grad()
            # optimizer_attention.zero_grad()

            scaler_soft.step(optimizer_soft)
            scaler_attention.step(optimizer_attention)
            scaler_soft.update()
            scaler_attention.update()

            # Inside your training loop, after optimizer steps
            torch.cuda.empty_cache()
        
        # Update metrics
        epoch_train_loss['soft'] += total_loss_soft.item() * Config.grad_accum_steps
        epoch_train_loss['attention'] += total_loss_attention.item() * Config.grad_accum_steps
        
        progress.set_postfix({
            'soft': f"{epoch_train_loss['soft']/(batch_idx+1):.4f}",
            'attention': f"{epoch_train_loss['attention']/(batch_idx+1):.4f}"
        })
    
    # Validation Phase
    model_soft.eval()
    model_attention.eval()
    val_loss = {'soft': 0.0, 'attention': 0.0}
    
    with torch.no_grad():
        val_progress = tqdm(val_loader, desc=f"Epoch {epoch+1}/{Config.max_epochs} [Val]")
        for images, targets in val_progress:
            images, targets = images.to(device), targets.to(device)
            
            out_soft = model_soft(images)
            out_attention = model_attention(images)
            
            val_loss['soft'] += criterion_bce(out_soft, targets).item()
            val_loss['attention'] += criterion_bce(out_attention, targets).item()
            
            val_progress.set_postfix({
                'soft': f"{val_loss['soft']/(batch_idx+1):.4f}",
                'attention': f"{val_loss['attention']/(batch_idx+1):.4f}"
            })
    
    # Calculate epoch metrics
    avg_train_soft = epoch_train_loss['soft'] / len(train_loader)
    avg_train_attention = epoch_train_loss['attention'] / len(train_loader)
    avg_val_soft = val_loss['soft'] / len(val_loader)
    avg_val_attention = val_loss['attention'] / len(val_loader)
    avg_val_loss = (avg_val_soft + avg_val_attention) / 2
    
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"Train Losses => Soft: {avg_train_soft:.4f} | Attention: {avg_train_attention:.4f}")
    print(f"Val Losses   => Soft: {avg_val_soft:.4f} | Attention: {avg_val_attention:.4f}")
    
    # Early stopping check
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model_soft.state_dict(), "best_soft_model.pth")
        torch.save(model_attention.state_dict(), "best_attention_model.pth")
    else:
        patience_counter += 1
        if patience_counter >= Config.early_stop_patience:
            print(f"\nEarly stopping triggered after {Config.early_stop_patience} epochs without improvement")
            break

# Final save
torch.save(model_soft.state_dict(), "final_soft_model.pth")
torch.save(model_attention.state_dict(), "final_attention_model.pth")
print("\nTraining completed!")

CUDA synchronized
First image loaded successfully: torch.Size([3, 224, 224])
First batch loaded: torch.Size([32, 3, 224, 224])
Warmup completed
Sample batch shape: torch.Size([32, 3, 224, 224])


  scaler_soft = torch.cuda.amp.GradScaler()
  scaler_attention = torch.cuda.amp.GradScaler()


Dummy output shape: torch.Size([2, 40])


Epoch 1/10 [Train]:   0%|                    | 0/5087 [00:00<?, ?it/s]

First batch input shape: torch.Size([32, 3, 224, 224])
First batch loaded successfully


  with torch.cuda.amp.autocast():  # Add this context manager
Epoch 1/10 [Train]: 100%|████████████████████| 5087/5087 [46:06<00:00,  1.84it/s, soft=1.1064, attention=1.1155]
Epoch 1/10 [Val]: 100%|███████████████████████████████| 621/621 [04:58<00:00,  2.08it/s, soft=0.0903, attention=0.0914]



Epoch 1 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7396 | Attention: 0.7490


Epoch 2/10 [Train]: 100%|████████████████████| 5087/5087 [30:33<00:00,  2.78it/s, soft=1.1064, attention=1.1155]
Epoch 2/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.37it/s, soft=0.0898, attention=0.0911]



Epoch 2 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7358 | Attention: 0.7462


Epoch 3/10 [Train]: 100%|████████████████████| 5087/5087 [30:34<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 3/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.35it/s, soft=0.0898, attention=0.0912]



Epoch 3 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7356 | Attention: 0.7467


Epoch 4/10 [Train]: 100%|████████████████████| 5087/5087 [30:32<00:00,  2.78it/s, soft=1.1064, attention=1.1155]
Epoch 4/10 [Val]: 100%|███████████████████████████████| 621/621 [02:21<00:00,  4.38it/s, soft=0.0895, attention=0.0912]



Epoch 4 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7335 | Attention: 0.7467


Epoch 5/10 [Train]: 100%|████████████████████| 5087/5087 [30:31<00:00,  2.78it/s, soft=1.1064, attention=1.1155]
Epoch 5/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.36it/s, soft=0.0896, attention=0.0910]



Epoch 5 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7340 | Attention: 0.7451


Epoch 6/10 [Train]: 100%|████████████████████| 5087/5087 [30:34<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 6/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.36it/s, soft=0.0897, attention=0.0910]



Epoch 6 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7351 | Attention: 0.7451


Epoch 7/10 [Train]: 100%|████████████████████| 5087/5087 [30:33<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 7/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.35it/s, soft=0.0896, attention=0.0910]



Epoch 7 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7337 | Attention: 0.7455


Epoch 8/10 [Train]: 100%|████████████████████| 5087/5087 [30:36<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 8/10 [Val]: 100%|███████████████████████████████| 621/621 [02:22<00:00,  4.37it/s, soft=0.0901, attention=0.0913]



Epoch 8 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7378 | Attention: 0.7479


Epoch 9/10 [Train]: 100%|████████████████████| 5087/5087 [30:35<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 9/10 [Val]: 100%|███████████████████████████████| 621/621 [02:23<00:00,  4.34it/s, soft=0.0898, attention=0.0911]



Epoch 9 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7353 | Attention: 0.7460


Epoch 10/10 [Train]: 100%|████████████████████| 5087/5087 [30:36<00:00,  2.77it/s, soft=1.1064, attention=1.1155]
Epoch 10/10 [Val]: 100%|██████████████████████████████| 621/621 [02:22<00:00,  4.35it/s, soft=0.0896, attention=0.0910]



Epoch 10 Summary:
Train Losses => Soft: 1.1064 | Attention: 1.1155
Val Losses   => Soft: 0.7341 | Attention: 0.7455

Early stopping triggered after 5 epochs without improvement

Training completed!


In [15]:
import torch.nn.functional as F

# ========================
# DML Training Setup (Corrected)
# ========================

# Ensure CUDA operations complete
torch.cuda.synchronize(device)
print("CUDA synchronized")

# Test single image load
test_img, test_label = train_set[0]
print("First image loaded successfully:", test_img.shape)

# Test single batch
test_batch = next(iter(train_loader))
print("First batch loaded:", test_batch[0].shape)

# Initialize models
model_soft = models.resnet50(weights=None).to(device)
model_attention = models.resnet50(weights=None).to(device)

# Modify final layer for CelebA's 40 classes (should be 2048 input features for ResNet-50)
model_soft.fc = torch.nn.Linear(2048, Config.num_classes).to(device)  # Fixed input size
model_attention.fc = torch.nn.Linear(2048, Config.num_classes).to(device)  # Fixed input size

# Load weights CORRECTLY
model_soft.load_state_dict(
    torch.load('C:/Users/akash/Downloads/Soft.pth', map_location=device),  # Move map_location here
    strict=False
)
model_attention.load_state_dict(
    torch.load('C:/Users/akash/Downloads/AT.pth', map_location=device),  # Move map_location here
    strict=False
)

optimizer_soft = optim.AdamW(model_soft.parameters(), lr=Config.lr, weight_decay=0.05)
optimizer_attention = optim.AdamW(model_attention.parameters(), lr=Config.lr, weight_decay=0.05)

pos_weights = torch.ones(Config.num_classes).to(device) * 1.2
criterion_bce = nn.BCEWithLogitsLoss(pos_weight=pos_weights)

def enhanced_mutual_loss(out1, out2):
    kl_loss = F.kl_div(F.log_softmax(out1, dim=1), F.softmax(out2.detach(), dim=1), reduction='batchmean')
    cos_loss = -F.cosine_similarity(out1, out2.detach()).mean()
    return kl_loss + 0.3 * cos_loss

scaler = torch.cuda.amp.GradScaler()

# ========================
# Training Loop
# ========================
best_val_loss = float('inf')
patience_counter = 0

for epoch in range(Config.max_epochs):
    model_soft.train()
    model_attention.train()
    epoch_loss = {'soft': 0.0, 'attn': 0.0}
    
    progress = tqdm(train_loader, desc=f"Epoch {epoch+1}/{Config.max_epochs}")
    for batch_idx, (images, targets) in enumerate(progress):
        images, targets = images.to(device), targets.to(device)
        
        with torch.cuda.amp.autocast():
            out_soft = model_soft(images)
            out_attn = model_attention(images)
            
            task_loss = 0.7 * (criterion_bce(out_soft, targets) + criterion_bce(out_attn, targets))
            mutual_loss = 0.3 * (enhanced_mutual_loss(out_soft, out_attn) + enhanced_mutual_loss(out_attn, out_soft))
            total_loss = task_loss + mutual_loss
        
        scaler.scale(total_loss).backward()
        
        if (batch_idx + 1) % Config.grad_accum_steps == 0:
            scaler.unscale_(optimizer_soft)
            scaler.unscale_(optimizer_attention)
            torch.nn.utils.clip_grad_norm_(model_soft.parameters(), Config.grad_clip)
            torch.nn.utils.clip_grad_norm_(model_attention.parameters(), Config.grad_clip)
            scaler.step(optimizer_soft)
            scaler.step(optimizer_attention)
            scaler.update()
            optimizer_soft.zero_grad(set_to_none=True)
            optimizer_attention.zero_grad(set_to_none=True)
        
        epoch_loss['soft'] += total_loss.item()
        progress.set_postfix({'task_loss': f"{task_loss.item():.4f}", 'mutual_loss': f"{mutual_loss.item():.4f}"})
    
    epoch_loss['soft'] /= len(train_loader)
    
    model_soft.eval()
    model_attention.eval()
    val_loss = 0.0
    with torch.no_grad():
        for images, targets in val_loader:
            images, targets = images.to(device), targets.to(device)
            val_loss += criterion_bce(model_soft(images), targets).item()
            val_loss += criterion_bce(model_attention(images), targets).item()
    avg_val_loss = val_loss / (2 * len(val_loader))
    
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        torch.save(model_soft.state_dict(), "best_model_soft.pth")
        torch.save(model_attention.state_dict(), "best_model_attention.pth")
    else:
        patience_counter += 1
        if patience_counter >= Config.early_stop_patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

print("Training completed!")


CUDA synchronized
First image loaded successfully: torch.Size([3, 224, 224])
First batch loaded: torch.Size([32, 3, 224, 224])


  scaler = torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():
Epoch 1/10: 100%|███████████████████████████| 5087/5087 [33:41<00:00,  2.52it/s, task_loss=0.3715, mutual_loss=-0.1627]
Epoch 2/10: 100%|███████████████████████████| 5087/5087 [28:50<00:00,  2.94it/s, task_loss=0.3307, mutual_loss=-0.1653]
Epoch 3/10: 100%|███████████████████████████| 5087/5087 [28:49<00:00,  2.94it/s, task_loss=0.3423, mutual_loss=-0.1700]
Epoch 4/10: 100%|███████████████████████████| 5087/5087 [28:51<00:00,  2.94it/s, task_loss=0.3363, mutual_loss=-0.1672]
Epoch 5/10: 100%|███████████████████████████| 5087/5087 [28:49<00:00,  2.94it/s, task_loss=0.3839, mutual_loss=-0.1677]
Epoch 6/10: 100%|███████████████████████████| 5087/5087 [28:52<00:00,  2.94it/s, task_loss=0.3834, mutual_loss=-0.1625]
Epoch 7/10: 100%|███████████████████████████| 5087/5087 [36:28<00:00,  2.32it/s, task_loss=0.3114, mutual_loss=-0.1715]
Epoch 8/10: 100%|███████████████████████████| 5087/5087 [40:14<00:00,  2.11it/s, task_l

Training completed!


In [19]:
# Create data loaders
train_loader = DataLoader(train_set, Config.batch_size, shuffle=True, pin_memory=True)
val_loader = DataLoader(val_set, Config.batch_size, pin_memory=True)
# Optimized Model Class
class AttributeClassifier:
    def __init__(self, config):
        self.config = config
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.model = models.resnet50(weights=None)
        
        # Freeze layers
        for param in self.model.parameters():
            param.requires_grad = False
        for param in self.model.layer3.parameters():
            param.requires_grad = True
        for param in self.model.layer4.parameters():
            param.requires_grad = True
            
        # Modified head
        self.model.fc = nn.Linear(self.model.fc.in_features, config.num_classes)
        self.model.to(self.device)
        
        # Optimizer
        self.optimizer = optim.AdamW(
            filter(lambda p: p.requires_grad, self.model.parameters()),
            lr=config.lr,
            weight_decay=0.01
        )
        self.criterion = nn.BCEWithLogitsLoss()

    def train(self, train_loader, val_loader):
        best_f1 = 0
        epochs_no_improve = 0
        scaler = torch.cuda.amp.GradScaler()
        
        for epoch in range(self.config.max_epochs):
            # Training phase
            self.model.train()
            epoch_train_loss = 0.0
            
            for batch_idx, (inputs, labels) in enumerate(train_loader):
                inputs = inputs.to(self.device, non_blocking=True)
                labels = labels.to(self.device, non_blocking=True)
                
                # Mixed precision forward pass
                with torch.cuda.amp.autocast():
                    outputs = self.model(inputs)
                    loss = self.criterion(outputs, labels) / self.config.grad_accum_steps
                
                # Backward pass with gradient scaling
                scaler.scale(loss).backward()
                
                # Gradient accumulation steps
                if (batch_idx + 1) % self.config.grad_accum_steps == 0:
                    # Unscale before clipping
                    scaler.unscale_(self.optimizer)
                    
                    # Gradient clipping
                    torch.nn.utils.clip_grad_norm_(
                        self.model.parameters(),
                        self.config.grad_clip
                    )
                    
                    # Optimizer step
                    scaler.step(self.optimizer)
                    scaler.update()
                    self.optimizer.zero_grad(set_to_none=True)
                
                epoch_train_loss += loss.item() * self.config.grad_accum_steps
            
            # Calculate average training loss
            avg_train_loss = epoch_train_loss / len(train_loader)
            
            # Validation phase
            avg_val_loss, avg_f1 = self._validate(val_loader)
            
            # Early stopping logic
            if avg_f1 > best_f1:
                best_f1 = avg_f1
                epochs_no_improve = 0
                torch.save(self.model.state_dict(), 'best_model.pth')
                print(f"Epoch {epoch+1}/{self.config.max_epochs}")
                print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
                print(f"Val F1: {avg_f1:.4f}* (Best)")
            else:
                epochs_no_improve += 1
                print(f"Epoch {epoch+1}/{self.config.max_epochs}")
                print(f"Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")
                print(f"Val F1: {avg_f1:.4f} (No improvement {epochs_no_improve}/{self.config.early_stop_patience})")
            
            if epochs_no_improve >= self.config.early_stop_patience:
                print(f"\nEarly stopping triggered after {epoch+1} epochs!")
                break
    
        # Load best model weights
        self.model.load_state_dict(torch.load('best_model.pth'))
        print("Training complete. Loaded best model weights.")

    def _validate(self, val_loader):
        self.model.eval()
        val_loss = 0.0
        total_f1 = 0.0
        
        with torch.no_grad(), torch.cuda.amp.autocast():
            for inputs, labels in val_loader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device).float()
                outputs = self.model(inputs)
                
                # Calculate loss
                val_loss += self.criterion(outputs, labels).item()
                
                # Calculate F1
                preds = (torch.sigmoid(outputs) > 0.5).float()
                total_f1 += self._calculate_f1(preds, labels)
    
        return val_loss/len(val_loader), total_f1/len(val_loader)



    def _calculate_f1(self, preds, labels):
            # Convert boolean masks to float for calculations
        preds = preds.bool()
        labels = labels.bool()
            
        tp = (preds & labels).sum(0, dtype=torch.float32)
        fp = (preds & ~labels).sum(0, dtype=torch.float32)
        fn = (~preds & labels).sum(0, dtype=torch.float32)
            
        precision = tp / (tp + fp + 1e-9)
        recall = tp / (tp + fn + 1e-9)
        f1 = 2 * (precision * recall) / (precision + recall + 1e-9)
        return f1.mean().item()
        
    def test(self, test_loader, top_ks=[5, 10, 20, 30]):
        self.model.eval()
        results = {
            'strict': 0.0,
            'mean': 0.0,
            'top_acc': {k: 0.0 for k in top_ks},
            'per_attribute': {}
        }
        attr_correct = torch.zeros(self.config.num_classes).to(self.device)
        total_samples = 0
        
        with torch.no_grad(), torch.amp.autocast(device_type='cuda', dtype=torch.float16):
            for inputs, labels in test_loader:
                inputs = inputs.to(self.device)
                labels = labels.to(self.device)
                batch_size = inputs.size(0)
                
                outputs = self.model(inputs)
                probs = torch.sigmoid(outputs)
                preds = (probs > 0.5).float()
                
                # Per-image metrics
                results['strict'] += (preds == labels).all(dim=1).sum().item()
                results['mean'] += (preds == labels).float().mean(dim=1).sum().item()
                
                # Per-attribute metrics
                attr_correct += (preds == labels).sum(dim=0)
                
                # Top-k calculations
                for k in top_ks:
                    topk_probs, topk_indices = torch.topk(probs, k, dim=1)
                    correct = torch.gather(labels, 1, topk_indices).sum(dim=1)
                    results['top_acc'][k] += (correct.float() / k).sum().item()
                
                total_samples += batch_size
    
        # Calculate final metrics
        results['strict'] /= total_samples
        results['mean'] /= total_samples
        for k in top_ks:
            results['top_acc'][k] /= total_samples
        
        # Per-attribute accuracies
        per_attr_acc = (attr_correct / total_samples).cpu().numpy()
        for idx, acc in enumerate(per_attr_acc):
            results['per_attribute'][self.config.attribute_names[idx]] = acc
    
        # Print results
        print("\n=== Test Results ===")
        print(f"Strict Accuracy: {results['strict']:.4f}")
        print(f"Mean Accuracy: {results['mean']:.4f}")
        for k in sorted(top_ks):
            print(f"Top-{k} Accuracy: {results['top_acc'][k]:.4f}")
        
        # Print per-attribute accuracies
        # Inside the test() method, replace the print block with:
        print("\nPer-Attribute Accuracy Ranking:")
        print("-" * 65)
        print(f"{'Rank':<5}{'Attribute':<30}{'Accuracy':<10} | {'Rank':<5}{'Attribute':<30}{'Accuracy':<10}")
        print("-" * 65)
        
        sorted_attrs = sorted(results['per_attribute'].items(), key=lambda x: x[1], reverse=True)
        for i in range(0, len(sorted_attrs), 2):
            line = ""
            # First column
            if i < len(sorted_attrs):
                name, acc = sorted_attrs[i]
                line += f"{i+1:<5}{name:<30}{acc:.4f}    "
            else:
                line += " " * 45
                
            # Second column
            line += "| "
            if i+1 < len(sorted_attrs):
                name, acc = sorted_attrs[i+1]
                line += f"{i+2:<5}{name:<30}{acc:.4f}"
            
            print(line)

In [23]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, models
import pandas as pd
from PIL import Image
import os
import time
# Usage Example
if __name__ == "__main__":
    config = Config()
    
    # Create test dataset
    test_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(config.mean, config.std)
    ])
    
    test_set = CelebADataset(
        root_dir=config.root_dir,
        csv_path=config.csv_path,
        partition_csv_path=config.partition_csv_path,
        split='test',
        transform=test_transform
    )
    
    test_loader = DataLoader(
        test_set,
        batch_size=config.batch_size,
        shuffle=False,
        num_workers=0,
        pin_memory=True
    )
    
    # Initialize and load model
    classifier = AttributeClassifier(config)
    classifier.model.load_state_dict(torch.load('best_model_attention.pth'))
    
    # Run evaluation
    strict_acc, mean_acc = classifier.test(test_loader)


=== Test Results ===
Strict Accuracy: 0.0165
Mean Accuracy: 0.9058
Top-5 Accuracy: 0.9044
Top-10 Accuracy: 0.7329
Top-20 Accuracy: 0.4574
Top-30 Accuracy: 0.3083

Per-Attribute Accuracy Ranking:
-----------------------------------------------------------------
Rank Attribute                     Accuracy   | Rank Attribute                     Accuracy  
-----------------------------------------------------------------
1    Eyeglasses                    0.9961    | 2    Wearing_Hat                   0.9908
3    Bald                          0.9889    | 4    Gray_Hair                     0.9828
5    Male                          0.9765    | 6    Pale_Skin                     0.9712
7    Mustache                      0.9680    | 8    Goatee                        0.9665
9    Sideburns                     0.9661    | 10   No_Beard                      0.9617
11   Double_Chin                   0.9616    | 12   Blurry                        0.9598
13   Bangs                         0.9596   

TypeError: cannot unpack non-iterable NoneType object