In [21]:
import torch
from torch import nn, optim
from torch.functional import F
from torch.utils.data import DataLoader, random_split
from torchvision import transforms, datasets
import torchvision.models as models

import pandas as pd
import numpy as np 
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm  
from sklearn.decomposition import PCA
import umap
import math
import io
import umap.plot
import plotly.graph_objs as go 
import plotly.io as pio 
pio.renderers.default ='iframe'


from collections import defaultdict
from tqdm import tqdm
import wandb
import random
from PIL import Image  # Add this import statement

%matplotlib inline
import math,os,sys
import warnings 
warnings.filterwarnings('ignore')

In [22]:
from simple_cnn_dataset import TransformedDataset  
#load dataset 
data = pd.read_csv('../data/train.csv')
#data_shorted = data[:10000]
data_clp = data[:10000].reset_index(drop=True) # the index has to be readjusted otherwise it follows the whole data index... problematic
#temporarly trying to overfit with less data


#common transformation
default_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.ToTensor(),
    transforms.Normalize(0.5,0.5)
])


In [23]:
transformed_data = TransformedDataset(data_clp, default_transform)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(transformed_data))
val_size = len(transformed_data) - train_size
train_dataset, val_dataset = random_split(transformed_data, [train_size, val_size])

# Create DataLoaders for training and validation sets
train_loader = DataLoader(train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


# visualizing Datapoints

In [24]:
def show_images(images, title =''):
    num_images = len(images)
    fig,axes = plt.subplots(1, num_images,figsize=(9,3))
    for i in range(num_images):
        img = np.squeeze(images[i])
        axes[i].imshow(img,cmap='gray')
        axes[i].axis('off')
    fig.suptitle(title)
    plt.show()



# lets build Neural Network
-  Define a neural network architecture with two convolution layers and two fully connected layers
- Input to the network is an MNIST image and Output is a 64 dimensional representation. 


In [25]:
from Network import Network,Network_t ,ContrastiveLoss_with_margin, CustomVGG
from utils import init_weights, init_weights_for_gelu,initialize_weights_mod,plot_activation_stats

In [26]:
net = CustomVGG()

# Load the VGG16 model unmodiefied first layer(to work for grey -> channel from 3 ->1) and last layers(change number of classes for the last classifier)
#net = models.vgg16(pretrained=False)


device= "cpu"
if torch.cuda.is_available():
    device = "cuda"
elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available():
    device= "mps"

#device= "cpu" #overide device for overfitting a very small data batch
#net = net.to(device)

device

'mps'

In [27]:
# Load the VGG16 model without pretrained weights
vgg16 = models.vgg16(pretrained=False)

# Modify the first convolutional layer to accept grayscale images
vgg16.features[0] = nn.Conv2d(1, 64, kernel_size=3, stride=1, padding=1)

# Optionally, initialize the weights of the modified layer
nn.init.kaiming_normal_(vgg16.features[0].weight, mode='fan_out', nonlinearity='relu')

# Modify the last fully connected layer to have the desired number of output features
# For example, if you have 10 classes:
num_classes = 10
vgg16.classifier[6] = nn.Linear(4096, num_classes)

# Optionally, initialize the weights of the modified layer
nn.init.kaiming_normal_(vgg16.classifier[6].weight, mode='fan_out', nonlinearity='relu')

#net = vgg16
# Print the modified model architecture to verify the changes
#print(net)

Parameter containing:
tensor([[ 0.4331,  0.3713, -0.5005,  ...,  0.6261, -0.2311,  0.0837],
        [-0.3836,  0.1280,  0.2263,  ..., -0.2390, -0.3197,  0.6447],
        [-0.6263,  0.1920, -0.1210,  ...,  0.4876, -0.2372, -0.1031],
        ...,
        [ 0.1582,  0.0473, -0.5436,  ..., -0.4659, -0.4897, -0.1450],
        [-0.1888,  0.2101, -0.0926,  ..., -0.2610, -0.6090,  0.4436],
        [-0.8256, -0.2650, -0.2683,  ...,  0.6475, -0.3692,  0.4248]],
       requires_grad=True)

### optimzer initialization 

In [28]:
from torch.optim.lr_scheduler import ReduceLROnPlateau,CosineAnnealingLR

In [29]:
epoch_count=10
optimizer = torch.optim.AdamW(net.parameters(), lr =1e-3,weight_decay=1e-5)
#scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=4, gamma=0.3)
scheduler = CosineAnnealingLR(optimizer, T_max=epoch_count, eta_min=1e-6)

#scheduler reduces plateau loss
#scheduler = ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=3, verbose=True)


In [30]:
import os
checkpoint_dir ='checkpoints/'

if not os.path.exists(checkpoint_dir):
    os.makedirs(checkpoint_dir)

## Testing the state of the model defination of it works for simple digit recognition

In [31]:
lre = torch.linspace(-3,0,1000)
lrse = 10**lre

lri=[]
lossi =[]

In [34]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="Contrastive_learning",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.01,
    "architecture": "vgg16 experimental model health",
    "dataset": "Mnist -dataset",
    "iteration_count": 1000,
    "batch size" : 128,
    }
)

ud=[]
def training_model(iteration_count=1000):
    net = CustomVGG()
    #net.apply(init_weights_for_gelu)
    initialize_weights_mod(net, activation_function='relu')
    net = net.to(device)
    lrs = []
    losses = []
    activations_dict = defaultdict(lambda: {'mean': [], 'var': [], 'neg_ratio': []})
    
    def get_activation_stats(name):
        def hook(model, input, output):
            mean = output.detach().mean().item()
            var = output.detach().var().item()
            neg_ratio = (output.detach() < 0).float().mean().item()
            activations_dict[name]['mean'].append(mean)
            activations_dict[name]['var'].append(var)
            activations_dict[name]['neg_ratio'].append(neg_ratio)
        return hook

    for name, layer in net.named_modules():
        if isinstance(layer, nn.ReLU):
            layer.register_forward_hook(get_activation_stats(name))

    activations_list = []
    gradients = []
    ud =[]
    iteration = 0
    while iteration < iteration_count:
        epoch_loss = 0
        batches = 0
        
        for param_group in optimizer.param_groups:
            lrs.append(param_group['lr'])

        for batch_idx, (data, target) in enumerate(train_loader):
            if iteration >= iteration_count:
                break
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            output = net(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            norm = torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
            lr = lrse[iteration]
            optimizer.step()


            epoch_loss += loss.item()
            #track lr stats
            lri.append(lr)
            lossi.append(loss.item())
            
            batches += 1
            iteration += 1
            
            activations_list.append({k: {stat: v[stat][-batches:] for stat in v} for k, v in activations_dict.items()})
            losses.append(loss.cpu().detach().numpy() / batches)
            
            current_lr = optimizer.param_groups[0]['lr']
            with torch.no_grad():
            #ud.append([(lr*p.grad.std()/ p.data.std().log10().item()) for p in net.parameters()])
                epsilon = 1e-8
                ud.append([(current_lr * p.grad.std() / (p.data.std() + epsilon).log10().item()) for p in net.parameters() if p.grad is not None])
    
            
            if iteration % 100 == 0:  # Log every 100 iterations
                print(f"Iteration {iteration}, Loss: {epoch_loss / batches}, Learning Rate: {lr}")
                wandb.log({'iteration': iteration, 'loss': epoch_loss / batches, 'learning rate': lr})
                
        checkpoint_path = os.path.join(checkpoint_dir, f'model_iter{iteration}.pt')
        torch.save(net.state_dict(), checkpoint_path)

    plot_activation_stats(activations_list)

    # Plot learning rate vs. loss
    plt.figure()
    plt.plot(lri, lossi)
    plt.xlabel('Learning Rate')
    plt.ylabel('Loss')
    plt.show()

    return {
        "net": net,
        "losses": losses,
        "activations": activations_dict
    }

# Run the training model function
training_result = training_model(iteration_count=1000)
model = training_result["net"]
wandb.finish()


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
iteration,▁▂▃▃▄▅▆▆▇█
learning rate,▁▁▁▁▁▁▂▃▄█
loss,▆█▃▄▅▄▇▄▁▅

0,1
Plotting data for layer,features.1
iteration,1000
learning rate,1.0
loss,0.46296


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01117111064441916, max=1.0)…

Iteration 100, Loss: 0.18143178134955265, Learning Rate: 0.001982883783057332
Iteration 200, Loss: 0.1683971770107746, Learning Rate: 0.003959110472351313
Iteration 300, Loss: 0.1318515344367673, Learning Rate: 0.007904929108917713
Iteration 400, Loss: 0.15810897760093212, Learning Rate: 0.015783313661813736
Iteration 500, Loss: 0.164185641112469, Learning Rate: 0.03151363879442215
Iteration 600, Loss: 0.14482127931533437, Learning Rate: 0.06292146444320679
Iteration 700, Loss: 0.12915182645831788, Learning Rate: 0.1256316602230072
Iteration 800, Loss: 0.17039820670404218, Learning Rate: 0.25084149837493896
Iteration 900, Loss: 0.19577626883983612, Learning Rate: 0.5008407831192017
Iteration 1000, Loss: 0.18847470923580906, Learning Rate: 1.0


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

## experimaental model with simple image classification to check model health

In [None]:
#visualize histogram of gradient
plt.figure(figsize=(20,4)) # width and height of the plot
legends = []
for i,p in enumerate(net.parameters()): 
    if p.ndim == 2:
        plt.plot([ud[j][i].cpu().numpy() for j in range(len(ud))])
        legends.append('param %d' % i)
plt.plot([0, len(ud)], [-3, -3], 'k') # those ratios should be ~1e-3, indicated on the plot with black         
plt.legend(legends);
plt.title('update to data raio distribation, LR setting')

In [None]:
import wandb
import random

# start a new wandb run to track this script
wandb.init(
    # set the wandb project where this run will be logged
    project="Contrastive_learning",

    # track hyperparameters and run metadata
    config={
    "learning_rate": 0.01,
    "architecture": "vgg16 experimental model health",
    "dataset": "Mnist -dataset",
    "epochs": 10,
    "batch size" : 128,
    }
)

epochs =10
activations_list = []
gradients = []
ud =[]

def training_model(iteration_count=1000):
    net = CustomVGG()
    #The log="all" parameter tells wandb to log gradients and parameters, and 
    #log_freq=64 means it will log every 64 batches.
    wandb.watch(net, log="all", log_freq=64) 
    #net.apply(init_weights_for_gelu)
    net = net.to(device)
    initialize_weights_mod(net, activation_function='relu')
    
    lrs = []
    losses = []
    activations_dict = defaultdict(lambda: {'mean': [], 'var': [], 'neg_ratio': []})
    
    def get_activation_stats(name):
        def hook(model, input, output):
            mean = output.detach().mean().item()
            var = output.detach().var().item()
            neg_ratio = (output.detach() < 0).float().mean().item()
            activations_dict[name]['mean'].append(mean)
            activations_dict[name]['var'].append(var)
            activations_dict[name]['neg_ratio'].append(neg_ratio)
            # this was not printing coz of leakyReLU wasnt correctly called(was only nn.ReLU)
            #print(f'Hook called for {name}: mean={mean}, var={var}, neg_ratio={neg_ratio}')
        return hook   
    
    # Register hooks for GELU/ReLU layers depends (or whatever activation you're using)
    for name, layer in net.named_modules():
        if isinstance(layer, nn.ReLU):
            layer.register_forward_hook(get_activation_stats(name))
            #print(f'Registered hook for layer: {name}') #-----debugging print worked
            
    def capture_gradient(name):
        def hook(module, grad_input, grad_output):
            gradients.append((name, grad_output[0].detach()))
        return hook
    
    iteration = 0
    while iteration < iteration_count:
        epoch_loss = 0
        batches = 0
        

        for param_group in optimizer.param_groups:
            lrs.append(param_group['lr'])
        
        print('learning rate', lrs[-1])
        wandb.log({'learning rate': lrs[-1]})
        
        for batch_idx, (data, target) in enumerate(train_loader):
            if iteration >= iteration_count:
                break
            optimizer.zero_grad()
            output = net(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            norm = torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)
            optimizer.step()
            scheduler.step()
    
            epoch_loss += loss.item()
            batches += 1

            if batch_idx % 100 == 0:
                print(f'Train Epoch: {epoch} [{batch_idx * len(data)}/{len(train_loader.dataset)} ({100. * batch_idx / len(train_loader):.0f}%)]\tLoss: {loss.item():.6f}')
    
            
                #Compute values  for wandb logging
                current_step = batch_idx * len(data)
                total_steps = len(train_loader.dataset)
                percentage_complete = 100. * batch_idx / len(train_loader)
                loss_value = loss.item()
        
                # Log the values with wandb
                wandb.log({
                    'epoch': epoch,
                    'current_step': current_step,
                    'total_steps': total_steps,
                    'percentage_complete': percentage_complete,
                    'loss': loss_value
                })
                
            # Average the epoch loss over batches
            #fixing the different length of lr and losses items (10 vs 630) by averaging loss per epoch
        activations_list.append({k: {stat: v[stat][-batches:] for stat in v} for k, v in activations_dict.items()})
        losses.append(epoch_loss / batches)
        current_lr = optimizer.param_groups[0]['lr']
        with torch.no_grad():
            #ud.append([(lr*p.grad.std()/ p.data.std().log10().item()) for p in net.parameters()])
            epsilon = 1e-8
            ud.append([(current_lr * p.grad.std() / (p.data.std() + epsilon).log10().item()) for p in net.parameters() if p.grad is not None])
    
       
        

    plot_activation_stats(activations_list)

    plt.figure()
    plt.plot(losses, lrs)
    plt.ylabel('Learning Rate')
    plt.xlabel('Loss')
    plt.title('Learning Rate vs. Loss')
    plt.show()

    buf = io.BytesIO()
    plt.savefig(buf, format='png')
    buf.seek(0)
    
    # Convert BytesIO to PIL Image
    image1 = Image.open(buf)
    wandb.log({"Learning Rate vs. Loss": wandb.Image(image1)})

    return {
        "net": net,
        "losses": losses,
        "activations": activations_list
    }

def plot_activation_stats(activations_list):
    if not activations_list:
        print("No activation data to plot.")
        wandb.log({'message':'No activation data to plot'})
        return

    for layer_name in activations_list[0].keys():
        means = [epoch[layer_name]['mean'] for epoch in activations_list]
        vars = [epoch[layer_name]['var'] for epoch in activations_list]
        neg_ratios = [epoch[layer_name]['neg_ratio'] for epoch in activations_list]

        # logging this stat only on wandb 
        print(f'Plotting data for layer: {layer_name}')
        wandb.log({'Plotting data for layer':layer_name}) # logging this stat only on wandb 
        print(f'Means: {means}')
        wandb.log({'Means': means})
        print(f'Variances: {vars}')
        wandb.log({'Variances': vars})
        print(f'Negative Ratios: {neg_ratios}')
        wandb.log({'Negative Ratios':neg_ratios})

        plt.figure(figsize=(15, 5))
        plt.subplot(131)
        plt.plot(means)
        plt.title(f'{layer_name} - Mean Activation')
        plt.xlabel('Batch')
        plt.ylabel('Mean')
        
        plt.subplot(132)
        plt.plot(vars)
        plt.title(f'{layer_name} - Activation Variance')
        plt.xlabel('Batch')
        plt.ylabel('Variance')

        plt.subplot(133)
        plt.plot(neg_ratios)
        plt.title(f'{layer_name} - Negative Activation Ratio')
        plt.xlabel('Batch')
        plt.ylabel('Ratio')
         
        plt.tight_layout()
        

        # Save the plot to a buffer
        buf = io.BytesIO()
        plt.savefig(buf, format='png')
        buf.seek(0)

        # Convert BytesIO to PIL Image
        image = Image.open(buf)

        # Log the plot to Weights and Biases
        wandb.log({f'{layer_name} activations': wandb.Image(image)})
        plt.show() # this should be after wandb log 

        plt.close()  # Close the figure to free up memory
        buf.close()


training_result = training_model()
model = training_result["net"]

plot_activation_stats(activations_list)

wandb.finish()



In [18]:
#visualize histogram of gradient
plt.figure(figsize=(20,4)) # width and height of the plot
legends = []
for i,p in enumerate(net.parameters()): 
    if p.ndim == 2:
        plt.plot([ud[j][i].cpu().numpy() for j in range(len(ud))])
        legends.append('param %d' % i)
plt.plot([0, len(ud)], [-3, -3], 'k') # those ratios should be ~1e-3, indicated on the plot with black         
plt.legend(legends);
plt.title('update to data raio distribation, LR setting')

NameError: name 'ud' is not defined

<Figure size 2000x400 with 0 Axes>

In [None]:
import matplotlib.cm as cm

plt.figure(figsize=(20,4))
legends = []
colors = cm.rainbow(np.linspace(0, 1, len(list(net.parameters()))))
for i, (p, color) in enumerate(zip(net.parameters(), colors)):
    if p.ndim == 2:
        plt.plot([ud[j][i].cpu().numpy() for j in range(len(ud))], color=color)
        legends.append(f'param {i}')
plt.plot([0, len(ud)], [-3, -3], 'k')
plt.legend(legends)
plt.title('Update-to-Data Ratio Distribution, LR Setting')
plt.xlabel('Epochs')
plt.ylabel('Update-to-Data Ratio')
plt.yscale('log')
plt.grid(True, which="both", ls="-", alpha=0.2)
plt.show()