In this notebook I've implemented Highwat Neural Network with 100 layers and use it on the MNIST datase.

In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as f
from torch.optim import Adam
from torch.autograd import Variable
from torchvision.datasets import MNIST
import torchvision.transforms as transforms
import numpy as np

In [1]:
def accuracy(preds, y_true):
    '''
    Use this function to check accuracy of a model trained.
    
    :param: preds - predictions generated by neural network
    :param: y_true - true/real labels for each sample in the dataset
    '''
    correct = 0 
    assert len(preds) == len(y_true)
    
    for i in range(len(preds)):
        if np.argmax(preds[i]) == y_true[i]:
            correct += 1
    return correct / len(preds)

In [19]:
#Hyperparams
learning_rate = 0.001
epochs = 10
batch_size = 128
number_of_units = 128
number_of_classes = 10
number_of_highway_layers = 100
in_size = 784
carry_bias = -20.0

#### Load nad preprocess MNIST training and testing datasets

In [3]:
train_dataset = MNIST(root='./data/', 
                      train=True, 
                      transform=transforms.ToTensor(), 
                      download=True)

In [7]:
test_dataset = MNIST(root='./data/', 
                      train=False, 
                      transform=transforms.ToTensor(), 
                      download=False)

In [8]:
train_loader = torch.utils.data.DataLoader(train_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=True, 
                                           num_workers=2)

In [9]:
test_loader = torch.utils.data.DataLoader(test_dataset, 
                                           batch_size=batch_size, 
                                           shuffle=False, 
                                           num_workers=2)

#### Create Highway Fully connected layer

In [11]:
class HighwayLayer(nn.Module):
    
    def __init__(self, num_of_units, carry_bias):
        '''
            https://arxiv.org/pdf/1505.00387.pdf
        '''

        super(HighwayLayer, self).__init__()
        self.num_of_units = num_of_units
        self.bias = carry_bias
        
        self.H = nn.Linear(num_of_units, num_of_units)
        self.T = nn.Linear(num_of_units, num_of_units, bias=False)     

    def forward(self, X):
        carry_bias = Variable(torch.ones(self.num_of_units) * self.bias).cuda()
        H = f.relu(self.H(X))
        T = f.sigmoid(self.T(X) + carry_bias)
        C = 1 - T
        
        out = (H * T) + (X * C)
        return out

#### 100 layers HIGHWAY NETWORK

In [12]:
class HighwayNetwork(nn.Module):
    
    def __init__(self, in_size, num_of_classes,
                 number_of_highway_layers, hidden_units, carry_bias):
        super(HighwayNetwork, self).__init__()
        self.hidden_units = hidden_units
        self.carry_bias = carry_bias
        self.highway_layers = []
        self.number_of_highway_layers = number_of_highway_layers
        
        self.input_layer = nn.Linear(in_size, hidden_units)
        if number_of_highway_layers > 0:
            for i in range(number_of_highway_layers):
                self.highway_layers.append(HighwayLayer(hidden_units, carry_bias).cuda())
        
        self.out_layer = nn.Linear(hidden_units, num_of_classes)
        
    def forward(self, X):
        
        out = f.relu(self.input_layer(X))
        
        if self.number_of_highway_layers > 0:
            for i in range(self.number_of_highway_layers):
                highway_layer = self.highway_layers[i]
                out = highway_layer(out)
        
    
        return f.softmax(self.out_layer(out))

In [20]:
highway = HighwayNetwork(in_size, 
                         number_of_classes, 
                         number_of_highway_layers, 
                         number_of_units, 
                         carry_bias)

In [21]:
highway.highway_layers

[HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_features=128)
   (T): Linear(in_features=128, out_features=128)
 ), HighwayLayer(
   (H): Linear(in_features=128, out_featu

In [22]:
highway.cuda()

HighwayNetwork(
  (input_layer): Linear(in_features=784, out_features=128)
  (out_layer): Linear(in_features=128, out_features=10)
)

#### Define loss function

In [23]:
criterium = nn.CrossEntropyLoss()

#### Create optimizer

In [24]:
optimizer = Adam(highway.parameters(), lr=learning_rate)

#### Defining a training loop

In [25]:
for epoch in range(epochs):
    epoch_loss = []
    epoch_accuracy = []
    for images, labels in train_loader:
        
        X_batch = Variable(images.view(-1, in_size)).cuda()
        y_batch = Variable(labels).cuda()
        
        optimizer.zero_grad()
        preds = highway(X_batch)
        epoch_accuracy.append(accuracy(preds.cpu().data.numpy(), y_batch.cpu().data.numpy()))
        loss = criterium(preds, y_batch)
        epoch_loss.append(loss.cpu().data)
        loss.backward()
        optimizer.step()
    print("Epoch: {}/{}".format(epoch+1, epochs), 
      " | Epoch loss: {}".format(np.mean(epoch_loss)), 
      " | Epoch accuracy: {}".format(np.mean(epoch_accuracy)))



Epoch: 1/10  | Epoch loss: 1.908657431602478  | Epoch accuracy: 0.653184335443038
Epoch: 2/10  | Epoch loss: 1.6190677881240845  | Epoch accuracy: 0.8863726265822784
Epoch: 3/10  | Epoch loss: 1.574652910232544  | Epoch accuracy: 0.9137658227848101
Epoch: 4/10  | Epoch loss: 1.5563260316848755  | Epoch accuracy: 0.9246439873417721
Epoch: 5/10  | Epoch loss: 1.5462031364440918  | Epoch accuracy: 0.9328520569620253
Epoch: 6/10  | Epoch loss: 1.5386292934417725  | Epoch accuracy: 0.939181170886076
Epoch: 7/10  | Epoch loss: 1.5339903831481934  | Epoch accuracy: 0.9415545886075949
Epoch: 8/10  | Epoch loss: 1.5273780822753906  | Epoch accuracy: 0.9457080696202531
Epoch: 9/10  | Epoch loss: 1.5236586332321167  | Epoch accuracy: 0.9498615506329114
Epoch: 10/10  | Epoch loss: 1.5206340551376343  | Epoch accuracy: 0.9513449367088608


### Testing the 100-Layers Highway Network

In [30]:
test_accuracy = []
for images, labels in test_loader:

    X_batch = Variable(images.view(-1, in_size)).cuda()
    y_batch = Variable(labels).cuda()

    preds = highway(X_batch)
    test_accuracy.append(accuracy(preds.cpu().data.numpy(), y_batch.cpu().data.numpy()))



In [31]:
np.mean(test_accuracy)

0.9547072784810127