### FF network using pytorch tensors, autograd.

NN abstractions available in pytorch

`torch.nn` provides `functional`, `Linear` and `Sequential`; the optimizers live in the separate `torch.optim` package.

make code more compact, efficient, easier to read, maintain.

efficient, clean way to write code.  
take advantage of all the libraries (to work with DL at scale).

In [1]:
import torch

import numpy as np
import matplotlib.pyplot as plt

In [2]:
torch.manual_seed(0)  # seed torch's random number generator so random draws are reproducible

<torch._C.Generator at 0x7fa148399b30>

#### data(blob)

In [3]:
# data

from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split

# Synthetic dataset: 1000 points with 2 features, generated around 4 centres.
data, labels = make_blobs(
    n_samples=1000,
    n_features=2,
    centers=4,
    random_state=0,
)

# Stratify on the labels so that train and validation keep the same class mix.
XX_train, XX_val, Y_train, Y_val = train_test_split(
    data,
    labels,
    random_state=0,
    stratify=labels,
)
print(XX_train.shape, XX_val.shape, labels.shape)


(750, 2) (250, 2) (1000,)


#### data as torch tensors

In [4]:
# Convert all four arrays to torch tensors in a single pass instead of
# writing one torch.tensor(...) call per array.
XX_train, Y_train, XX_val, Y_val = [
    torch.tensor(array) for array in (XX_train, Y_train, XX_val, Y_val)
]

print(XX_train.shape, Y_train.shape)

torch.Size([750, 2]) torch.Size([750])


**Using torch tensors and autograd manually.**

##### Forward pass

In [None]:
# forward prop given input. return ouput.

# XX - a single datapoint(tensor) 
# or all datapoints(tensor)-that case returned o/p will be a set of outputs for each.(tensor)


def model(XX):
    """Forward pass of the 2 -> 2 -> 4 network written with raw tensors.

    Reads the module-level parameters ``Wmat1``, ``Bvec1``, ``Wmat2`` and
    ``Bvec2``.

    Args:
        XX: input tensor with 2 features in its last dimension; either a
            single datapoint or a batch of N datapoints.

    Returns:
        Tensor with 4 entries in its last dimension: the softmax
        probabilities for each datapoint.
    """
    AA1 = torch.matmul(XX, Wmat1) + Bvec1  # (N, 2) x (2, 2) -> (N, 2)
    HH1 = AA1.sigmoid()  # (N, 2); sigmoid can be called directly on a tensor

    AA2 = torch.matmul(HH1, Wmat2) + Bvec2  # (N, 2) x (2, 4) -> (N, 4)

    # Softmax over the last dimension, written out by chaining tensor methods.
    #
    # Subtract the per-row maximum first: softmax is unchanged by adding a
    # constant to every entry of a row, and after the shift the largest
    # exponent is exp(0) = 1, so exp() can no longer overflow to inf (which
    # would turn the division into inf / inf = nan).
    #
    # max(-1) / sum(-1) reduce along the last dimension, so their result has
    # one dimension fewer than the input (e.g. 3D in -> 2D out). unsqueeze(-1)
    # puts a size-1 dimension back in the last position so that the
    # subtraction / division broadcast row by row.
    shifted = AA2 - AA2.max(-1).values.unsqueeze(-1)  # (N, 4)
    exps = shifted.exp()  # element-wise exponential
    HH2 = exps / exps.sum(-1).unsqueeze(-1)  # (N, 4), each row sums to 1

    return HH2

With `axis = -1`, the call `unsqueeze(axis)` adds the new dimension in the last position.

In [None]:
# sum: reducing over an axis removes that axis from the shape.
A3 = torch.ones(5, 2, 3)
print(A3.shape)
print(torch.sum(A3, dim=0).shape)
print(torch.sum(A3, dim=1).shape)
print(torch.sum(A3, dim=-1).shape)


# unsqueeze: inserts a new axis of size 1 at the requested position.

A3 = torch.ones(5, 2, 3)
print(A3.shape)
print(torch.unsqueeze(A3, 1).shape)
print(torch.unsqueeze(A3, -1).shape)

# i.e. a "1" appears in the shape at the position that was asked for.

chaining functions

calling one returns a tensor, on which another one can be called, and so on.. 

this makes code easier to follow and maintain(can understand what is going on easily)

##### Loss

In [None]:
# loss function

# cross entropy loss
# -log(prob corresponding to the true one)
# sum of such for all datapoints (sum or mean)

# Y_hat = probability distribution
# y = true 'label' (NOT one-hot)

def loss_fn(YY_hat, yy):
    """Mean cross-entropy of predicted probabilities against integer labels.

    Args:
        YY_hat: one row of class probabilities per datapoint.
        yy: true class index of each datapoint (labels, not one-hot).

    Returns:
        0-d tensor: the mean over datapoints of -log(probability assigned
        to the true class).
    """
    row_idx = range(yy.shape[0])
    # Paired indexing: picks element (i, yy[i]) from every row i.
    true_class_prob = YY_hat[row_idx, yy]
    mean_log_prob = true_class_prob.log().mean()
    return -mean_log_prob


# element at 'y' position from each Y_hat
# y_hat - N x classes
# Y_hat[range(YY_hat.shape[0]), yy]  -> y-th in each row.

tensor\[ list_x, list_y ] - all element wise coordinate pairs  

tensor[:, list] - all possible combinations

In [7]:
# Two rows of class probabilities and the true class of each row.
y_hat = torch.tensor(
    [
        [0.1, 0.2, 0.3, 0.4],
        [0.8, 0.1, 0.05, 0.05],
    ]
)
y = torch.tensor([2, 0])

# Paired index lists: one element per (row, column) pair.
print(y_hat[range(len(y_hat)), y])
# Slice plus index list: every row, with the columns listed in y.
print(y_hat[:, y])

tensor([0.3000, 0.8000])
tensor([[0.3000, 0.1000],
        [0.0500, 0.8000]])


##### Accuracy

In [None]:
def accuracy(YY_hat, yy):
    """Fraction of datapoints whose highest-probability class equals the label.

    Args:
        YY_hat: one row of class scores/probabilities per datapoint.
        yy: true class index of each datapoint.

    Returns:
        0-d float tensor with the mean of the per-row correct/incorrect flags.
    """
    predicted = YY_hat.argmax(dim=1)  # predicted class = column of the row maximum
    hits = torch.eq(predicted, yy)  # boolean tensor, True where the prediction is right
    return hits.float().mean()


# .float() convert to float.
# pred==y -> bool of correct preds.

#### parameters

In [None]:
# initialize weights(xavier)

import math  # math.sqrt is used below; imported here so this cell runs on its own

torch.manual_seed(0)

# Xavier-style initialisation: standard-normal draws scaled by 1 / sqrt(2),
# 2 being the number of inputs feeding each layer.
Wmat1 = torch.randn(2, 2) / math.sqrt(2)
# Enable gradient tracking in place, after the division, so that Wmat1 itself
# (not the un-scaled randn tensor) is the leaf tensor that receives .grad.
Wmat1.requires_grad_()
Bvec1 = torch.zeros(2, requires_grad=True)

Wmat2 = torch.randn(2, 4) / math.sqrt(2)
Wmat2.requires_grad_()
Bvec2 = torch.zeros(4, requires_grad=True)

# require grad - set. as we want derviatives wrt the parameters

In [None]:
# here these parameters are like global variables. 
# They are accessed in the forward pass function , .. 

##### Fit

In [None]:
# Hyper-parameters for full-batch gradient descent.
learning_rate = 0.2
epochs = 10000

# Inputs are cast to float for the matmul with the parameters; labels are
# cast to long because loss_fn uses them as indices.
XX_train = XX_train.float()
Y_train = Y_train.long()

# Per-epoch history, used for the plot and the summary below.
loss_arr = []
acc_arr = []

for epoch in range(epochs):

    # Forward pass on the whole training set: one output row per datapoint.
    YY_hat = model(XX_train)

    # The loss tensor is the root of the autograd graph for this epoch.
    loss = loss_fn(YY_hat, Y_train)

    # Record this epoch's metrics; .item() extracts the plain Python number.
    loss_arr.append(loss.item())
    acc_arr.append(accuracy(YY_hat, Y_train))

    # Back-propagate: fills .grad on every parameter.
    loss.backward()

    # Gradient-descent step. no_grad() keeps the in-place updates out of the
    # autograd graph. Each gradient is reset right after use because
    # backward() accumulates into .grad.
    with torch.no_grad():
        Wmat1.sub_(Wmat1.grad * learning_rate)
        Wmat1.grad.zero_()

        Bvec1.sub_(Bvec1.grad * learning_rate)
        Bvec1.grad.zero_()

        Wmat2.sub_(Wmat2.grad * learning_rate)
        Wmat2.grad.zero_()

        Bvec2.sub_(Bvec2.grad * learning_rate)
        Bvec2.grad.zero_()

# Logged loss (red) and accuracy (blue) against the epoch number.
plt.plot(loss_arr, "r-")
plt.plot(acc_arr, "b-")
plt.show()

print("Loss before training: ", loss_arr[0])
print("Loss after training: ", loss_arr[-1])

Above: training done manually with tensors — vectorised code, plus backpropagation via `backward()`.

In [None]:

import math


from sklearn.metrics import accuracy_score, mean_squared_error, log_loss
from tqdm import tqdm_notebook 
import seaborn as sns
import time
from IPython.display import HTML
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder




Pytorch modules specifically for writing NNs.

**NN, Optim** modules

### nn.functional

In [None]:
import torch.nn.functional as F

In [None]:
torch.manual_seed(0)
weights1 = torch.randn(2, 2) / math.sqrt(2)
weights1.requires_grad_()
bias1 = torch.zeros(2, requires_grad=True)

weights2 = torch.randn(2, 4) / math.sqrt(2)
weights2.requires_grad_()
bias2 = torch.zeros(4, requires_grad=True)

# model() reads the module-level names Wmat1 / Bvec1 / Wmat2 / Bvec2, so bind
# those names to the fresh tensors. Otherwise the forward pass never touches
# weights1..bias2, their .grad stays None, and the update step below fails.
# The updates are in place, so both sets of names keep pointing at the same
# tensors.
Wmat1, Bvec1, Wmat2, Bvec2 = weights1, bias1, weights2, bias2

learning_rate = 0.2
epochs = 10000

loss_arr = []
acc_arr = []

for epoch in range(epochs):
    # XX_train is the training tensor created by the data cells (cast to
    # float in the manual training cell).
    y_hat = model(XX_train)
    # The only change from the manual loop: torch's built-in cross-entropy
    # replaces the hand-written loss_fn.
    # NOTE(review): F.cross_entropy applies log-softmax to its input itself
    # and expects raw, unnormalised scores. model() already returns softmax
    # probabilities, so softmax is effectively applied twice here. Training
    # still runs, but the reported loss is not the same quantity as loss_fn.
    loss = F.cross_entropy(y_hat, Y_train)
    loss.backward()
    loss_arr.append(loss.item())
    acc_arr.append(accuracy(y_hat, Y_train))

    # Gradient-descent step outside of autograd, then reset the gradients
    # because backward() accumulates into .grad.
    with torch.no_grad():
        weights1 -= weights1.grad * learning_rate
        bias1 -= bias1.grad * learning_rate
        weights2 -= weights2.grad * learning_rate
        bias2 -= bias2.grad * learning_rate
        weights1.grad.zero_()
        bias1.grad.zero_()
        weights2.grad.zero_()
        bias2.grad.zero_()

plt.plot(loss_arr, "r-")
plt.plot(acc_arr, "b-")
plt.show()
print("Loss before training", loss_arr[0])
print("Loss after training", loss_arr[-1])

### nn.parameter

In [None]:
import torch.nn as nn

Inheriting from a parent class:

the parent class goes in parentheses in the class definition,

and inside `__init__`, `super().__init__()` calls the parent class's constructor.

In [None]:
# class for model


class FirstNetwork(nn.Module):  # inherit from the nn.Module parent class
    """2 -> 2 -> 4 feed-forward network with hand-declared parameters.

    The weights and biases are created exactly as in the raw-tensor version
    but wrapped in ``nn.Parameter`` so the module registers them (they show
    up in ``parameters()``) and they track gradients without an explicit
    ``requires_grad``.
    """

    def __init__(self):
        super().__init__()

        # NOTE: this reseeds torch's global RNG every time a model is built.
        torch.manual_seed(0)

        self.weights1 = nn.Parameter(torch.randn(2, 2) / math.sqrt(2))
        self.bias1 = nn.Parameter(torch.zeros(2))
        self.weights2 = nn.Parameter(torch.randn(2, 4) / math.sqrt(2))
        self.bias2 = nn.Parameter(torch.zeros(4))

    def forward(self, X):
        """Return softmax class probabilities for ``X``.

        Args:
            X: input tensor with 2 features in its last dimension.

        Returns:
            Tensor with 4 probabilities in its last dimension.
        """
        a1 = torch.matmul(X, self.weights1) + self.bias1
        h1 = a1.sigmoid()
        a2 = torch.matmul(h1, self.weights2) + self.bias2
        # torch.softmax is the numerically stable form of
        # exp(a2) / exp(a2).sum(-1, keepdim=True): the hand-written version
        # overflows to inf / inf = nan once the pre-activations get large.
        h2 = torch.softmax(a2, dim=-1)
        return h2

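`FirstNetwork` inherits from `nn.Module`.

Creating an object and then calling it like a function — `fn(X)` — runs its `forward` method.

This works because `nn.Module` defines `__call__`: `fn(X)` invokes `nn.Module.__call__`, which runs any registered hooks and then calls `self.forward(X)`. The subclass supplies (overrides) `forward`.


So `object(...)` is just Python's call syntax applied to an instance; prefer it over calling `fn.forward(X)` directly so that the hooks still run.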

In [None]:
# seperately wrinting fit function (not in class.)


def fit(epochs=1000, learning_rate=1):
    """Train the module-level model ``fn`` with full-batch gradient descent.

    Reads the globals ``fn`` (an ``nn.Module`` instance), ``XX_train`` and
    ``Y_train``. After training it plots the per-epoch loss (red) and
    accuracy (blue) and prints the first and last loss.

    Args:
        epochs: number of passes over the training set.
        learning_rate: step size of the gradient-descent update.
    """
    loss_arr = []
    acc_arr = []
    for epoch in range(epochs):
        # Calling the module like a function runs its forward().
        # XX_train is the training tensor created by the data cells; the
        # name X_train is not defined anywhere in this notebook.
        y_hat = fn(XX_train)

        # NOTE(review): F.cross_entropy applies log-softmax to its input and
        # expects raw scores, while the models in this notebook already
        # return softmax probabilities — confirm this is intended.
        loss = F.cross_entropy(y_hat, Y_train)

        loss_arr.append(loss.item())
        acc_arr.append(accuracy(y_hat, Y_train))

        # Back-propagate: fills .grad on every parameter.
        loss.backward()

        # Because every parameter is registered on the module, one loop
        # updates them all, whatever the number of layers.
        with torch.no_grad():
            for param in fn.parameters():
                param -= learning_rate * param.grad
            fn.zero_grad()  # reset the gradients of all parameters

    plt.plot(loss_arr, "r-")
    plt.plot(acc_arr, "b-")
    plt.show()
    print("Loss before training", loss_arr[0])
    print("Loss after training", loss_arr[-1])


# now the fit function doesn't have to care about how many sets
# of weights and biases are there in the model, because all of them
# are wrapped in parameters - they can be iterated over in fit.
# any number of sets is ok.


# fit function can remain independent of the model.
# not assuming any knowledge of the model above in fit()

# same thing - for any model

# this is design template in the framework.
# keep fit function as a core function - and pass on to it the model, optimizer, hyper parameters, etc.

# good programming style.

**fit function**

fit function can remain independent of the model.  
not assuming any knowledge of the model above in fit()  

same thing - for any model

this is design template in the framework.  
keep fit function as a core function - and pass on to it the model, optimizer, hyper parameters, etc. 

good programming style.

In [None]:
# Build the network under the global name `fn` (fit() reads that global),
# then train it with fit()'s default epochs and learning rate.
fn = FirstNetwork()
fit()

### nn.linear

instead of manually computing z = Wa + b

All of these are 'programming abstractions'.  
We use the same common programming constructs throughout, so it makes sense to abstract them behind such helpers.

In [None]:
class FirstNetwork_v1(nn.Module):
    """2 -> 2 -> 4 feed-forward network built from ``nn.Linear`` layers.

    Each ``nn.Linear`` creates and registers its own weight and bias
    parameters, so nothing has to be wrapped in ``nn.Parameter`` by hand.
    """

    def __init__(self):
        super().__init__()
        # NOTE: this reseeds torch's global RNG every time a model is built.
        torch.manual_seed(0)

        self.lin1 = nn.Linear(2, 2)  # 2 inputs -> 2 hidden units
        self.lin2 = nn.Linear(2, 4)  # 2 hidden units -> 4 class scores

    def forward(self, X):
        """Return softmax class probabilities for ``X``.

        Args:
            X: input tensor with 2 features in its last dimension.

        Returns:
            Tensor with 4 probabilities in its last dimension.
        """
        # The linear layers replace the explicit matmul + bias lines.
        a1 = self.lin1(X)
        h1 = a1.sigmoid()
        a2 = self.lin2(h1)
        # Numerically stable softmax: the hand-written exp / sum version
        # returns nan once exp() overflows for large pre-activations.
        h2 = torch.softmax(a2, dim=-1)
        return h2

In [None]:
# Rebind the global `fn` to the nn.Linear-based model and reuse the same fit().
fn = FirstNetwork_v1()
fit()

# notice not changing fit()
# nn.linear - internally does same - parameters, etc. 

### optim

In [None]:
from torch import optim

In [None]:
## fit function using optim.


def fit_v1(epochs=1000, learning_rate=1):
    """Train the module-level model ``fn`` using an ``optim.SGD`` optimizer.

    Reads the globals ``fn``, ``XX_train`` and ``Y_train``. After training it
    plots the per-epoch loss (red) and accuracy (blue) and prints the first
    and last loss.

    Args:
        epochs: number of passes over the training set.
        learning_rate: learning rate handed to the optimizer.
    """
    loss_arr = []
    acc_arr = []

    # optim.SGD takes extra arguments such as momentum and nesterov. Other
    # algorithms (e.g. Adam) are separate optimizer classes in torch.optim.
    # The whole training set is used at every step here, so each step is
    # plain (full-batch) gradient descent.
    opt = optim.SGD(fn.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        # XX_train is the training tensor created by the data cells; the
        # name X_train is not defined anywhere in this notebook.
        y_hat = fn(XX_train)

        # NOTE(review): F.cross_entropy applies log-softmax to its input and
        # expects raw scores, while the models in this notebook already
        # return softmax probabilities — confirm this is intended.
        loss = F.cross_entropy(y_hat, Y_train)

        loss_arr.append(loss.item())
        acc_arr.append(accuracy(y_hat, Y_train))

        loss.backward()  # back-propagation: computes the gradients

        # Replaces the manual loop over the parameters.
        opt.step()  # update every parameter from its gradient
        opt.zero_grad()  # reset the gradients of all parameters

    plt.plot(loss_arr, "r-")
    plt.plot(acc_arr, "b-")
    plt.show()
    print("Loss before training", loss_arr[0])
    print("Loss after training", loss_arr[-1])

In [None]:
# Fresh nn.Linear-based model under the global `fn`, trained with the
# optimizer-based fit_v1().
fn = FirstNetwork_v1()
fit_v1()

### nn.sequential

In [None]:
class FirstNetwork_v2(nn.Module):
    """2 -> 2 -> 4 feed-forward network expressed as one ``nn.Sequential``.

    ``self.net`` lists the operations the data flows through, in order:
    linear -> sigmoid -> linear -> softmax. The linear layers create and
    register their own parameters.
    """

    def __init__(self):
        super().__init__()
        # NOTE: this reseeds torch's global RNG every time a model is built.
        torch.manual_seed(0)

        self.net = nn.Sequential(
            nn.Linear(2, 2),
            nn.Sigmoid(),
            nn.Linear(2, 4),
            # dim must be given explicitly: the implicit choice is deprecated
            # and, for 3-D input, normalises over the first axis instead of
            # the class axis.
            nn.Softmax(dim=-1),
        )

    def forward(self, X):
        """Run ``X`` through the sequential stack.

        forward() is still defined explicitly so that calling the object,
        ``obj(X)``, dispatches here.
        """
        return self.net(X)


# defining the network using sequantial
# calling it on input.

In [None]:
# Rebind the global `fn` to the nn.Sequential model; fit_v1() needs no change.
fn = FirstNetwork_v2()
fit_v1()

In [None]:
# more clean fit

# inputs to fit?

# (not doing book keeping here)


def fit_v2(x, y, model, opt, loss_fn, epochs=1000):
    """Model-agnostic training loop (no book-keeping).

    Args:
        x: training inputs, passed to ``model`` as-is.
        y: training targets, passed to ``loss_fn`` as-is.
        model: callable producing predictions from ``x``.
        opt: optimizer holding the parameters of ``model``.
        loss_fn: callable ``loss_fn(predictions, y)`` returning a scalar
            tensor.
        epochs: number of update steps; must be at least 1.

    Returns:
        The loss of the last epoch as a Python float, evaluated before that
        epoch's parameter update.

    Raises:
        ValueError: if ``epochs`` is smaller than 1, since there would be no
            loss to report.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    # backward() accumulates into .grad, so clear anything left over from
    # earlier use of these parameters; otherwise stale gradients leak into
    # the first update.
    opt.zero_grad()

    for _ in range(epochs):
        loss = loss_fn(model(x), y)

        loss.backward()
        opt.step()
        opt.zero_grad()

    return loss.item()

In [None]:
fn = FirstNetwork_v2()

# NOTE: this rebinds the name of the hand-written loss_fn defined earlier.
loss_fn = F.cross_entropy
# The optimizer is tied to the parameters of this particular model instance.
opt = optim.SGD(fn.parameters(), lr=1)

# XX_train is the training tensor created by the data cells; the name
# X_train is not defined anywhere in this notebook.
fit_v2(XX_train, Y_train, fn, opt, loss_fn)

This is the style of coding around a core **fit function**.  

Declare the model separately; then you can experiment with hyperparameters, optimizers, etc. through the fit function, which takes all of those as inputs.



so far:

**Step by step abstraction**

eg: Parameters -> Linear -> sequential

- nn.functional
  functions for nn (cross_entropy,..)
- nn.Module class  
  calling object like a fn calls forward  
  - nn.Parameters - wrap all parameters with this - then can iterate over them to update them.
  shorthand for updating parameters that way, and also setting all gradients to 0 after updating.
  - nn.Linear - interanally do parameters,etc. make weights, biases,.. - easy way to do weighted sum.. 
  - nn.Sequential
- optim



- nn.init

### Running on GPU

In [None]:
device = torch.device("cuda")

# Push both the data and the model to the device.

# Data to the GPU. XX_train is the training tensor created by the data
# cells; the name X_train is not defined anywhere in this notebook.
XX_train = XX_train.to(device)
Y_train = Y_train.to(device)

fn = FirstNetwork_v2()

# Model to the GPU: this moves all of its weights and biases.
fn.to(device)

# Build the optimizer from THIS model's parameters. The `opt` left over from
# the previous cell still points at the previous model, so reusing it would
# leave the new model untrained.
opt = optim.SGD(fn.parameters(), lr=1)

tic = time.time()
print('Final loss', fit_v2(XX_train, Y_train, fn, opt, loss_fn))
toc = time.time()
print('Time taken', toc - tic)

In [None]:
# a larger model.


class FirstNetwork_v3(nn.Module):
    """Wider variant of the sequential network: 2 -> 4096 -> 4.

    Same layer stack as ``FirstNetwork_v2`` but with 1024 * 4 hidden units,
    used for the CPU/GPU timing comparison.
    """

    def __init__(self):
        super().__init__()
        # NOTE: this reseeds torch's global RNG every time a model is built.
        torch.manual_seed(0)
        self.net = nn.Sequential(
            nn.Linear(2, 1024 * 4),
            nn.Sigmoid(),
            nn.Linear(1024 * 4, 4),
            # dim must be given explicitly: the implicit choice is deprecated
            # and, for 3-D input, normalises over the first axis instead of
            # the class axis.
            nn.Softmax(dim=-1),
        )

    def forward(self, X):
        """Run ``X`` through the sequential stack and return the probabilities."""
        return self.net(X)

In [None]:
device = torch.device("cpu")

# XX_train is the training tensor created by the data cells; the name
# X_train is not defined anywhere in this notebook.
XX_train = XX_train.to(device)
Y_train = Y_train.to(device)
fn = FirstNetwork_v3()
fn.to(device)
# Build the optimizer from THIS model's parameters. A leftover `opt` from an
# earlier cell points at an earlier model and would leave `fn` untrained,
# which also makes the timing meaningless.
opt = optim.SGD(fn.parameters(), lr=1)
tic = time.time()
print('Final loss', fit_v2(XX_train, Y_train, fn, opt, loss_fn))
toc = time.time()
print('Time taken', toc - tic)

In [None]:
device = torch.device("cuda")

# XX_train is the training tensor created by the data cells; the name
# X_train is not defined anywhere in this notebook.
XX_train = XX_train.to(device)
Y_train = Y_train.to(device)
fn = FirstNetwork_v3()
fn.to(device)
# Build the optimizer from THIS model's parameters, after the move to the
# device. A leftover `opt` from an earlier cell points at an earlier model
# and would leave `fn` untrained.
opt = optim.SGD(fn.parameters(), lr=1)
tic = time.time()
print('Final loss', fit_v2(XX_train, Y_train, fn, opt, loss_fn))
toc = time.time()
print('Time taken', toc - tic)

CUDA ran it much faster.

Powers of 2: GPU hardware is organised around power-of-two sizes, so layer widths that are powers of 2 (such as the 1024 * 4 hidden units above) often perform better, and the matrices also align well with memory.

**read docs of stuff.**