<a href="https://colab.research.google.com/github/melanAm/Improving-Neural-Networks/blob/main/Improving_neural_networks_by_preventing_co_adaptation_of_feature_detectors_Experiments_on_MNIST.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This notebook implements the MNIST experiments from the paper "Improving neural networks by preventing
co-adaptation of feature detectors".

In [None]:
#import required packages
import numpy as np
import math
import torch
from torchvision import datasets,transforms
from torch.utils.data import DataLoader
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim.optimizer import _use_grad_for_differentiable
import matplotlib.pyplot as plt
import os
import copy
import json
import time

In [None]:
#hyper parameters
num_epochs = 3000
batch_size = 100
lr_init = 0.1
lr_decay_factor = 0.998          #the learning rate is multiplied by this factor after every epoch
max_weight = np.sqrt(15.0)       #maximum length (L2 norm) of every neuron's weight vector, i.e. squared length <= 15
moment_init = 0.5
moment_final = 0.99
num_steps = 500                  #number of schedule entries over which momentum ramps from moment_init to moment_final
#momentum value per epoch index: a linear ramp followed by a constant tail.
#The tail is sized so that index num_epochs is valid as well (the optimizer
#increments its epoch counter at the end of every epoch and then reads
#momentum_schedule[epoch]), and it is clamped at zero so that choosing
#num_epochs < num_steps does not ask numpy for a negative-length array.
momentum_tail = max(num_epochs+1-num_steps,0)
momentum_schedule = np.concatenate([np.linspace(moment_init,moment_final,num_steps),np.full(shape=(momentum_tail,),fill_value=moment_final)])
mean = 0
std = 0.01
num_class = 10
random_seed = 42
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Dataset

In [None]:
#create the dataset folder; exist_ok keeps the cell safe to re-run when the folder is already there
os.makedirs('dataset',exist_ok=True)

In [None]:
#load mnist dataset
path = '/content/dataset'
#tensor conversion + normalisation, shared by the training and validation pipelines
to_normalized_tensor = [transforms.ToTensor(),transforms.Normalize((0.1307,),(0.3081,))]
#training images are padded by 2 and randomly cropped back to 28x28 before the shared steps
train_transform = transforms.Compose([transforms.RandomCrop(size=(28,28),padding=(2,))]+to_normalized_tensor)
val_transform = transforms.Compose(list(to_normalized_tensor))
#train=True selects the training split, train=False the split used here for validation
train_dataset = datasets.MNIST(root=path,train=True,download=True,transform=train_transform)
val_dataset = datasets.MNIST(root=path,train=False,download=True,transform=val_transform)
#options common to both loaders; only the shuffling differs
loader_options = dict(batch_size=batch_size,num_workers=2,pin_memory=torch.cuda.is_available())
train_loader = DataLoader(train_dataset,shuffle=True,**loader_options)
val_loader = DataLoader(val_dataset,shuffle=False,**loader_options)

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-images-idx3-ubyte.gz to /content/dataset/MNIST/raw/train-images-idx3-ubyte.gz


100%|██████████| 9.91M/9.91M [00:00<00:00, 42.7MB/s]


Extracting /content/dataset/MNIST/raw/train-images-idx3-ubyte.gz to /content/dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/train-labels-idx1-ubyte.gz to /content/dataset/MNIST/raw/train-labels-idx1-ubyte.gz


100%|██████████| 28.9k/28.9k [00:00<00:00, 1.20MB/s]


Extracting /content/dataset/MNIST/raw/train-labels-idx1-ubyte.gz to /content/dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-images-idx3-ubyte.gz to /content/dataset/MNIST/raw/t10k-images-idx3-ubyte.gz


100%|██████████| 1.65M/1.65M [00:00<00:00, 10.7MB/s]


Extracting /content/dataset/MNIST/raw/t10k-images-idx3-ubyte.gz to /content/dataset/MNIST/raw

Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Failed to download (trying next):
HTTP Error 403: Forbidden

Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz
Downloading https://ossci-datasets.s3.amazonaws.com/mnist/t10k-labels-idx1-ubyte.gz to /content/dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz


100%|██████████| 4.54k/4.54k [00:00<00:00, 5.01MB/s]

Extracting /content/dataset/MNIST/raw/t10k-labels-idx1-ubyte.gz to /content/dataset/MNIST/raw






# Neural Network Model

In [None]:
#multi-layer perceptron input layer:784, hidden_layers:1200-1200, output_layer:10
class NeuralNet(nn.Module):
  """Fully connected classifier with dropout on the input and on both hidden layers.

  With the default arguments this is the 784-1200-1200-10 network: dropout with
  p=0.2 on the input, ReLU hidden layers each followed by dropout with p=0.5,
  and a linear output layer producing one logit per class.

  Args:
    in_features: number of input features per sample after flattening.
    hidden_features: width of each of the two hidden layers.
    num_classes: number of output logits.
    input_dropout: dropout probability applied to the flattened input.
    hidden_dropout: dropout probability applied after each hidden layer.
    init_std: standard deviation of the zero-mean normal weight initialisation.
  """
  def __init__(self,in_features=784,hidden_features=1200,num_classes=10,
               input_dropout=0.2,hidden_dropout=0.5,init_std=0.01):
    super(NeuralNet,self).__init__()
    self.in_features = in_features
    self.input_dropout = input_dropout
    self.hidden_dropout = hidden_dropout
    self.init_std = init_std
    self.fc1 = nn.Linear(in_features=in_features,out_features=hidden_features,bias=True)
    self.fc2 = nn.Linear(in_features=hidden_features,out_features=hidden_features,bias=True)
    self.layer_out = nn.Linear(in_features=hidden_features,out_features=num_classes,bias=True)
    #plain list used only to iterate over the layers in initialize();
    #the layers are registered as sub-modules through the attributes above
    self.layers = [self.fc1,self.fc2,self.layer_out]
    self.initialize()

  def initialize(self):
    """Draw all weights from N(0, init_std^2) and set all biases to zero."""
    for layer in self.layers:
      torch.nn.init.normal_(layer.weight,mean=0.0,std=self.init_std)
      torch.nn.init.zeros_(layer.bias)

  def forward(self,x):
    """Return the class logits for a batch of inputs.

    x is flattened to (-1, in_features); reshape is used instead of view so
    that non-contiguous inputs are accepted too. Dropout is only active
    while the module is in training mode.
    """
    x = x.reshape(-1,self.in_features)
    x = F.dropout(x,p=self.input_dropout,training=self.training)
    x = F.relu(self.fc1(x))
    x = F.dropout(x,p=self.hidden_dropout,training=self.training)
    x = F.relu(self.fc2(x))
    x = F.dropout(x,p=self.hidden_dropout,training=self.training)
    x = self.layer_out(x)
    return x

# Customized SGD optimizer

In [None]:
class CSGD(optim.Optimizer):
  """SGD with a scheduled momentum, a decaying learning rate and a max-norm constraint.

  For every parameter w with gradient g, step() applies
      delta_w(t) = momentum * delta_w(t-1) - lr * (1 - momentum) * g
      w         <- w + delta_w(t)
  and then, for parameters with more than one dimension, rescales w so that
  its L2 norm taken along dimension 1 does not exceed max_weight.

  Args:
    params: iterable of parameters (or parameter groups) to optimise.
    lr_init: initial learning rate.
    lr_decay_factor: factor the learning rate is multiplied by on every schedule() call.
    momentum_schedule: indexable sequence of momentum values; entry 0 is used
      until the first schedule() call and entry k after the k-th call. Once
      the sequence is exhausted its last entry stays in effect.
    max_weight: upper bound on the L2 norm used by the max-norm constraint.
    differentiable: stored in the optimizer defaults, where the
      _use_grad_for_differentiable decorator on step() reads it.
  """
  def __init__(self,params,lr_init,lr_decay_factor,momentum_schedule,max_weight,differentiable=False):
    self.epoch = 0
    self.lr_decay_factor = lr_decay_factor
    self.momentum_schedule = momentum_schedule
    self.max_weight = max_weight
    defaults = dict(lr=lr_init,momentum=float(self.momentum_schedule[0]),differentiable=differentiable)
    super().__init__(params,defaults)

  def schedule(self):
    """Advance one epoch: decay the learning rate and move to the next momentum value."""
    self.epoch += 1
    #clamp the index so that calling schedule() after the last scheduled epoch
    #keeps the final momentum instead of raising IndexError
    last_index = len(self.momentum_schedule)-1
    momentum = float(self.momentum_schedule[min(self.epoch,last_index)])
    for group in self.param_groups:
      group["lr"] = group["lr"] * self.lr_decay_factor
      group["momentum"] = momentum

  def _delw(self,grads,momentum_buffer_list,lr,momentum):
    """Return the list of weight increments momentum*previous - lr*(1-momentum)*grad.

    momentum_buffer_list holds the previous increments, aligned with grads;
    an empty list means there is no previous increment for any entry.
    """
    lrm = -lr*(1-momentum)
    del_w = torch._foreach_mul(grads,lrm)
    if momentum_buffer_list:
      buf_x_moment = torch._foreach_mul(momentum_buffer_list,momentum)
      torch._foreach_add_(del_w,buf_x_moment)
    return del_w

  @_use_grad_for_differentiable
  def step(self,closure=None):
    """Perform one optimisation step on every parameter that has a gradient.

    Args:
      closure: optional callable that re-evaluates the model and returns the loss.

    Returns:
      The value returned by closure, or None when no closure is given.
    """
    loss = None
    if closure is not None:
      #the closure has to build a graph even though the update itself runs without one
      with torch.enable_grad():
        loss = closure()

    for group in self.param_groups:
      lr = group["lr"]
      momentum = group["momentum"]
      params = []
      grads = []
      momentum_buffer_list = []
      for p in group["params"]:
        if p.grad is None:
          continue
        params.append(p)
        grads.append(p.grad)
        momentum_buffer = self.state[p].get("momentum_buffer")
        if momentum_buffer is None:
          #no previous increment yet: a zero buffer contributes nothing to the
          #update and keeps the buffer list aligned with grads even when only
          #some parameters have been updated before
          momentum_buffer = torch.zeros_like(p)
        momentum_buffer_list.append(momentum_buffer)
      if not params:
        #nothing in this group received a gradient
        continue
      del_w = self._delw(grads,momentum_buffer_list,lr,momentum)
      torch._foreach_add_(params,del_w)
      for p,increment in zip(params,del_w):
        #the increment just applied is the "previous increment" of the next step
        self.state[p]["momentum_buffer"] = increment
        #max-norm constraint: scale down weight vectors whose L2 norm exceeds max_weight
        if p.dim()>1:
          norm = torch.linalg.vector_norm(p,dim=1,keepdim=True)
          #max_weight/norm, capped at 1 so shorter vectors are left unchanged
          #(a zero norm gives inf, which the clamp also turns into 1)
          nrm_coef = norm.reciprocal().mul(self.max_weight).clamp(max=1.)
          p.mul_(nrm_coef)
    return loss

In [None]:
use_gpu = True
def reproducibilitySeed(seed=42):
    """Seed the torch and numpy random number generators.

    Args:
        seed: value passed to both torch.manual_seed and np.random.seed.

    When the module-level flag use_gpu is set, cuDNN is additionally switched
    to deterministic mode and its benchmark mode is turned off.
    """
    torch.manual_seed(seed)
    np.random.seed(seed)
    if use_gpu:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

reproducibilitySeed()

In [None]:
#build the network first, then move its parameters to the selected device
model = NeuralNet()
model = model.to(device)

In [None]:
#cost function: cross-entropy on the raw logits, averaged over the batch
#(the training loop multiplies the batch loss by the batch size, so the mean reduction is spelled out)
criterion = nn.CrossEntropyLoss(reduction='mean')

In [None]:
#create optimizer module, wiring in the schedule-related hyper parameters by name
optimizer = CSGD(
    params=model.parameters(),
    lr_init=lr_init,
    lr_decay_factor=lr_decay_factor,
    momentum_schedule=momentum_schedule,
    max_weight=max_weight,
    differentiable=False,
)

In [None]:
#output folder for the training log and the saved weights; created together with
#any missing parent folder, and left alone when it already exists
os.makedirs('/content/teacher',exist_ok=True)

In [None]:
def log(t,log_dir='/content/teacher',filename='MLP_MNIST_DropoutPaper_CSGD_lr-init-10.txt'):
    """Append t as one JSON line to the log file and echo it with print.

    Args:
        t: JSON-serialisable record (the training loop passes a dict of metrics).
        log_dir: folder holding the log file; created if it does not exist.
        filename: name of the log file inside log_dir.
    """
    #make the function usable even when the folder has not been created beforehand
    os.makedirs(log_dir,exist_ok=True)
    logname = os.path.join(log_dir,filename)
    #append mode: every call adds one line, earlier records are kept
    with open(logname,'a') as f:
        f.write(json.dumps(t) + '\n')
    print(t)

In [None]:
def _evaluate(model,criterion,loader):
  """Return (summed loss, number of correct predictions) of model over loader.

  The batch loss is multiplied by the batch size before being accumulated.
  Runs under torch.no_grad() because no gradients are needed for evaluation.
  The caller is responsible for putting the model in eval mode.
  """
  loss_sum = 0.
  correct = 0
  with torch.no_grad():
    for x,y in loader:
      x = x.to(device)
      y = y.to(device)
      y_hat = model(x)
      loss_sum += criterion(y_hat,y).item()*x.size(0)
      correct += (torch.argmax(y_hat,dim=1)==y).sum().item()
  return loss_sum,correct


def train(model,criterion,optimizer,num_epochs):
  """Train model for num_epochs epochs and return the best state dict seen.

  Each epoch runs one pass over train_loader, advances the optimizer's
  schedule, then evaluates the model in eval mode on train_loader and
  val_loader and logs the metrics. "Best" means the highest number of
  correct predictions on val_loader; ties keep the earlier epoch.

  Args:
    model: network to train; updated in place.
    criterion: loss function called as criterion(logits, targets).
    optimizer: optimizer providing zero_grad(), step() and schedule().
    num_epochs: number of epochs to run.

  Returns:
    A deep copy of the state dict with the best validation result (the
    initial state dict if no epoch gets a single validation sample right).
  """
  bestparams = copy.deepcopy(model.state_dict())
  best_correct = 0
  time_start = time.time()
  for epoch in range(num_epochs):
    model.train()
    for x,y in train_loader:
      x = x.to(device)
      y = y.to(device)
      optimizer.zero_grad()
      y_hat = model(x)
      loss = criterion(y_hat,y)
      loss.backward()
      optimizer.step()

    #per-epoch learning-rate / momentum update
    optimizer.schedule()

    model.eval()
    train_loss,train_correct = _evaluate(model,criterion,train_loader)
    train_loss = train_loss/len(train_dataset)
    train_error = len(train_dataset)-train_correct
    train_acc = train_correct/len(train_dataset)

    val_loss,val_correct = _evaluate(model,criterion,val_loader)
    val_loss = val_loss/len(val_dataset)
    val_error = len(val_dataset)-val_correct
    val_acc = val_correct/len(val_dataset)

    if val_correct > best_correct:
      bestparams = copy.deepcopy(model.state_dict())
      best_correct = val_correct

    log({
            "epoch": epoch+1,
            "train_loss": train_loss,
            "train_correct": train_correct,
            "train_error": train_error,
            "train_acc" : train_acc,
            "val_loss": val_loss,
            "val_correct": val_correct,
            "val_error": val_error,
            "val_acc" : val_acc,
           })

  time_fin = time.time()-time_start
  print('time: {}'.format(time_fin))
  return bestparams

In [None]:
#run the full training loop; the result is the state dict returned by train()
optimized_params = train(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    num_epochs=num_epochs,
)

{'epoch': 1, 'train_loss': 0.25702127325038115, 'train_correct': 55410, 'train_error': 4590, 'train_acc': 0.9235, 'val_loss': 0.16155291713774203, 'val_correct': 9548, 'val_error': 452, 'val_acc': 0.9548}
{'epoch': 2, 'train_loss': 0.1647529796945552, 'train_correct': 57057, 'train_error': 2943, 'train_acc': 0.95095, 'val_loss': 0.11167104103136807, 'val_correct': 9676, 'val_error': 324, 'val_acc': 0.9676}
{'epoch': 3, 'train_loss': 0.12940022566666207, 'train_correct': 57702, 'train_error': 2298, 'train_acc': 0.9617, 'val_loss': 0.08768690616823732, 'val_correct': 9736, 'val_error': 264, 'val_acc': 0.9736}
{'epoch': 4, 'train_loss': 0.11397678236477077, 'train_correct': 57933, 'train_error': 2067, 'train_acc': 0.96555, 'val_loss': 0.0758698733127676, 'val_correct': 9759, 'val_error': 241, 'val_acc': 0.9759}
{'epoch': 5, 'train_loss': 0.09703863551840186, 'train_correct': 58248, 'train_error': 1752, 'train_acc': 0.9708, 'val_loss': 0.06739990880014375, 'val_correct': 9794, 'val_error':

KeyboardInterrupt: 

In [None]:
#Fallback for an interrupted run: train() only hands back the best-on-validation
#weights when it finishes, so if it was stopped early and optimized_params was
#never assigned, keep the model's current weights instead. When train() did
#complete, its result is kept rather than being overwritten here.
#NOTE: restart the kernel (or `del optimized_params`) before re-training,
#otherwise a value left over from an earlier run is kept.
try:
  optimized_params
except NameError:
  optimized_params = copy.deepcopy(model.state_dict())

In [None]:
#write the weights to three files
#1) the state dict exactly as it is
torch.save(optimized_params, '/content/teacher/MLP_MNIST_DropoutPaper_CSGD_lr0.1.pth.tar')
#plain tensors (taken through .data) keyed by parameter name, shared by the next two files
raw_tensors = {name: tensor.data for name, tensor in optimized_params.items()}
#2) the plain tensors wrapped under a top-level 'params' key
torch.save(dict(params=raw_tensors),os.path.join('/content/teacher', 'MLP_MNIST_DropoutPaper_CSGD_lr0.1.pt7'))
#3) the plain tensors on their own
torch.save(raw_tensors,'/content/teacher/MLP_MNIST_DropoutPaper_CSGD_lr0.1')