In [14]:
import torch
from torch import nn, optim
from torch.functional import F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms, datasets
import torchvision.models as models

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  
from sklearn.decomposition import PCA
import umap
import math
import io
import umap.plot
import plotly.graph_objs as go 
import plotly.io as pio 
pio.renderers.default ='iframe'


from collections import defaultdict
from tqdm import tqdm
import wandb
import random
from PIL import Image  # Add this import statement

import math,os,sys
import warnings 
warnings.filterwarnings('ignore')

In [15]:
from simple_cnn_dataset import TransformedDataset  
#load dataset 
data = pd.read_csv('../data/train.csv')
#data_shorted = data[:10000]
#data_clp = data[:10000].reset_index(drop=True) # the index has to be readjusted otherwise it follows the whole data index... problematic
#temporarly trying to overfit with less data


#common transformation
default_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(0.5,0.5)
])


In [16]:
transformed_data = TransformedDataset(data_clp, default_transform)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(transformed_data))
val_size = len(transformed_data) - train_size
train_dataset, val_dataset = random_split(transformed_data, [train_size, val_size])

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


# visualizing Datapoints

In [17]:
def show_images(images, title =''):
    num_images = len(images)
    fig,axes = plt.subplots(1, num_images,figsize=(9,3))
    for i in range(num_images):
        img = np.squeeze(images[i])
        axes[i].imshow(img,cmap='gray')
        axes[i].axis('off')
    fig.suptitle(title)
    plt.show()



# lets build Neural Network
-  Define a neural network architecture with two convolution layers and two fully connected layers
- Input to the network is an MNIST image and Output is a 64 dimensional representation. 


In [18]:
from Network import Network,Network_t ,ContrastiveLoss_with_margin, CustomVGG
from utils import init_weights, init_weights_for_gelu,initialize_weights_mod,plot_activation_stats

In [19]:
net = CustomVGG()

# Load the VGG16 model unmodiefied first layer(to work for grey -> channel from 3 ->1) and last layers(change number of classes for the last classifier)
#net = models.vgg16(pretrained=False)


device= "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device= "mps"

#device= "cpu" #overide device for overfitting a very small data batch
#net = net.to(device)

device

'mps'

In [20]:
# Load the VGG16 model without pretrained weights
vgg16 = models.vgg16(pretrained=False)

# Modify the first convolutional layer to accept grayscale images
vgg16.features[0] = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)

# Optionally, initialize the weights of the modified layer
nn.init.kaiming_normal_(vgg16.features[0].weight, mode='fan_out', nonlinearity='relu')

# Modify the last fully connected layer to have the desired number of output features
# For example, if you have 10 classes:
num_classes = 10
vgg16.classifier[6] = nn.Linear(4096, num_classes)

# Optionally, initialize the weights of the modified layer
nn.init.kaiming_normal_(vgg16.classifier[6].weight, mode='fan_out', nonlinearity='relu')

#net = vgg16
# Print the modified model architecture to verify the changes
#print(net)

Parameter containing:
tensor([[-0.0766,  0.2283, -0.3914,  ...,  0.1766, -0.0928,  0.3589],
        [-0.1106,  0.3180, -1.1045,  ...,  0.1010, -0.1303, -0.0363],
        [ 0.0388, -0.1575, -0.1942,  ...,  0.1134,  0.9738,  0.0049],
        ...,
        [ 0.0218,  0.5316,  0.6599,  ..., -0.5645,  0.2449,  0.3607],
        [-0.1174, -1.0322, -0.6453,  ..., -0.0215, -0.3553,  0.3723],
        [-1.0890, -0.3098,  0.6969,  ..., -0.2885, -0.1753, -0.6239]],
       requires_grad=True)

### optimzer initialization 

In [21]:
from torch.optim.lr_scheduler import ReduceLROnPlateau,CosineAnnealingLR

In [22]:
epoch_count=10
optimizer = torch.optim.AdamW(net.parameters(), lr =3e-6,weight_decay=1e-5)
#optimizer = torch.optim.AdamW(net.parameters())
loss_function = ContrastiveLoss_with_margin()
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.3)
scheduler = CosineAnnealingLR(optimizer, T_max=epoch_count, eta_min=1e-6)
#scheduler reduces plateau loss
#scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)


In [23]:
import os
checkpoint_dir ='checkpoints/'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

## Testing the state of the model defination of it works for simple digit recognition

## experimaental model with simple image classification to check model health

In [None]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="Contrastive_learning",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.01,
    "architecture": "vgg16 experimental model health",
    "dataset": "Mnist -dataset",
    "epochs": 10,
    "batch size" : 128,
    }
)

epochs =10
activations_list = []
gradients = []

def training_model(epoch=epochs):
    net = CustomVGG()
    #The log="all" parameter tells wandb to log gradients and parameters, and 
    #log_freq=64 means it will log every 64 batches.
    wandb.watch(net, log="all", log_freq=64) 
    #net.apply(init_weights_for_gelu)
    net = net.to(device)
    initialize_weights_mod(net, activation_function='leaky_relu')
    
    lrs = []
    losses = []
    activations_dict = defaultdict(lambda: {'mean': [], 'var': [], 'neg_ratio': []})
    
    def get_activation_stats(name):
        def hook(model, input, output):
            mean = output.detach().mean().item()
            var = output.detach().var().item()
            neg_ratio = (output.detach() < 0).float().mean().item()
            activations_dict[name]['mean'].append(mean)
            activations_dict[name]['var'].append(var)
            activations_dict[name]['neg_ratio'].append(neg_ratio)
            # this was not printing coz of leakyReLU wasnt correctly called(was only nn.ReLU)
            #print(f'Hook called for {name}: mean={mean}, var={var}, neg_ratio={neg_ratio}')
        return hook   
    
    # Register hooks for GELU/ReLU layers depends (or whatever activation you're using)
    for name, layer in net.named_modules():
        if isinstance(layer, nn.LeakyReLU):
            layer.register_forward_hook(get_activation_stats(name))
            #print(f'Registered hook for layer: {name}') #-----debugging print worked
            
    def capture_gradient(name):
        def hook(module, grad_input, grad_output):
            gradients.append((name, grad_output[0].detach()))
        return hook
    
    #model.train()
    for epoch in range(epoch):
        epoch_loss = 0
        batches = 0

        for param_group in optimizer.param_groups:
            lrs.append(param_group['lr'])
        
        print('learning rate', lrs[-1])
        wandb.log({'learning rate': lrs[-1]})
        
        for batch_idx, (data, target) in enumerate(train_loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = net(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()
    
            epoch_loss += loss.item()
            batches += 1

            if batch_idx % 100 == 0:
                print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
    
            
                #Compute values  for wandb logging
                current_step = batch_idx * len(data)
                total_steps = len(train_loader.dataset)
                percentage_complete = 100. * batch_idx / len(train_loader)
                loss_value = loss.item()
        
                # Log the values with wandb
                wandb.log({
                    'epoch': epoch,
                    'current_step': current_step,
                    'total_steps': total_steps,
                    'percentage_complete': percentage_complete,
                    'loss': loss_value
                })
                
            # Average the epoch loss over batches
            #fixing the different length of lr and losses items (10 vs 630) by averaging loss per epoch
        losses.append(epoch_loss / batches)
    
        activations_list.append({k: {stat: v[stat][-batches:] for stat in v} for k, v in activations_dict.items()})
        #losses.append(loss.cpu().detach().numpy() / batches)
            
            
            
        #epoch_loss /= len(train_loader)
        #print(f"Epoch {epoch}: Average Loss: {epoch_loss:.6f}")

    plot_activation_stats(activations_list)

    plt.figure()
    plt.plot(lrs, losses)
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.title('Learning Rate vs. Loss')
    plt.show()

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    
    # Convert BytesIO to PIL Image
    image1 = Image.open(buf)
    wandb.log({"Learning Rate vs. Loss": wandb.Image(image1)})

    return {
        "net": net,
        "losses": losses,
        "activations": activations_list
    }


training_result = training_model()
model = training_result["net"]

wandb.finish()



learning rate 3e-06


In [None]:
def validate(model, device, val_loader):
    model.eval()
    val_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in val_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            val_loss += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            correct += pred.eq(target.view_as(pred)).sum().item()
    val_loss /= len(val_loader.dataset)
    print(f'\nValidation set: Average loss: {val_loss:.4f}, Accuracy: {correct}/{len(val_loader.dataset)} ({100. * correct / len(val_loader.dataset):.0f}%)\n')


In [None]:
model = Network().to(device)

#torch compile
m = torch.compile(model)

#wandb watch
wandb.watch(m)

print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')

optimizer = torch.optim.AdamW(model.parameters(), lr= 00.0001)
lossi = []
for iter in range(max_iters): 
    if iter % eval_iterval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")
        
        #wandb log
        wandb.log({"steps": iter,"train_loss": losses["train"], "val_loss": losses["val"]})

    xb,yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    lr =  lr if max_iters > 20000 else lr*10    #step learing rate Decay 
    optimizer.step()
