# Hands-On Session No. 2
## (linear layer implementation and logistic regression)


In [0]:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
# NOTE(review): `device` is not referenced by any other code visible in this
# notebook -- confirm whether tensors are meant to be moved to it.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

## Prepare Data

***The batch dimension.*** By convention, the first dimension of a tensor is the batch dimension (the "index" of the sample),
and the following dimensions hold the features. E.g.:

N x D: N samples, each with D features (i.e. N vectors)

N x H x W: N samples, each a 2D "image" of size HxW

In the case of images it is very common to have a "channel" dimension, e.g.:

N x C x H x W: N samples, each a 2D image with C channels. C is most commonly 1 (grayscale image) or 3 (RGB image).

(In this exercise we will use N x D tensors)

***Instructions***

1. Define X1, X2 as described below

2. Run the cell and see the output figure

3. X1, X2 are both sampled from a normal distribution. Change their sigmas/means so that the two groups do not overlap (since the data is random, they might still overlap occasionally). Try to keep the datapoints not too far from each other (to make it a bit more interesting for the classifier).

4. Set the labels: 1 for samples in X1, 0 for samples in X2

In [0]:
# Number of samples in our trainset. Better to start with a small N
# and after you finish the rest of the exercise, come back and try
# on a larger trainset size.
# N is a plain Python int (not a tensor) so that N, Nhalf and N - Nhalf can
# all be used directly as size arguments, e.g. torch.randn(Nhalf, D).
N = 5
# Size of the first group; the second group gets the remaining N - Nhalf
# samples, so an odd N is split into floor(N/2) and ceil(N/2).
Nhalf = N // 2
# Number of features per sample.
D = 2

# create linearly separable samples using torch.randn
# X1: a tensor of dimensions Nhalf x D
# X2: a tensor of dimensions (N - Nhalf) x D
# both sampled from a standard normal distribution
# (the instructions above ask you to change the means/sigmas so that the two
# groups do not overlap)
# NOTE: the None placeholders must be replaced before running this cell;
# torch.cat below cannot concatenate None values.
# <<<--- Only change these lines -------->>>
X1 = None
X2 = None
# <<<------------------------------------->>>
# Concatenate the two groups into one dataset: the rows of X1 come first,
# followed by the rows of X2.
X = torch.cat([X1, X2])

# plot data
plt.figure(figsize=(7,7))
# the first X1.shape[0] rows of X are the X1 samples -> red dots
plt.plot(X[:X1.shape[0], 0].numpy(), X[:X1.shape[0], 1].numpy(), '.r')
# the remaining rows are the X2 samples -> blue dots
plt.plot(X[X1.shape[0]:, 0].numpy(), X[X1.shape[0]:, 1].numpy(), '.b')
# draw the coordinate axes through the origin
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
# same scale on both axes
plt.gca().set_aspect('equal', adjustable='box')

# set labels and save them inside variable y
# (1 for the samples in X1, 0 for the samples in X2, in the same order as the
# rows of X; the later cells call y.float().unsqueeze(-1), so y has to be a
# tensor with one entry per sample, not the scalar placeholder below)
# <<<--- Only change these lines -------->>>
y = 0
# <<<------------------------------------->>>


## Implement a full forward backward Linear layer



In [0]:
# For debugging, put this where you want a breakpoint
# import pdb; pdb.set_trace()    
# To exit breakpoint write 'exit'

class MyLinear(torch.autograd.Function):
  """
  Exercise skeleton: a linear layer written as a custom autograd Function
  with hand-written forward and backward passes.

  As provided, both methods are placeholders: forward returns None and
  backward returns (None, None, None) until the marked lines are filled in.
  The layer is invoked through `.apply(x, w, b)`, not by calling forward
  directly.
  """
  @staticmethod
  def forward(ctx, x, w, b):
    """
    Compute the output of the layer.

    @param ctx: a context object, where we can save cache (e.g. tensors) for
    later use in the backward part
    @param x, w, b: layer's input, weights, bias respectively. In this
    notebook they are meant to be N x D, D x K and K (see the comments where
    w and b are initialized).
    @return: the output of the layer. The cells that use it compare
    out.sigmoid() with y.float().unsqueeze(-1), so the output is expected to
    be N x K.
    """
    # implement the forward part of the layer, that is the output.
    # save in ctx whatever you would need for the backward pass,
    # like this: ctx.save_for_backward(a, b, ...) where a, b, ... are tensors
    # <<<--- Only change these lines -------->>>

    out = None
    # ctx.save_for_backward(???)

    # <<<------------------------------------>>>

    return out

  @staticmethod
  def backward(ctx, grad_output):
    """
    Compute the gradients of the loss with respect to the inputs of forward.

    @param ctx: context object similar to forward pass (with what you saved)
    @param grad_output: a tensor containing the gradient of the loss function
    with respect to the layer's output.
    @return: the tuple (dx, dw, db): the gradient of the loss w.r.t. each of
    forward's inputs x, w, b, in that order. You must compute and return one
    value per input to forward.
    """
    # implement the gradients of the loss per each input to forward: dx, dw, db
    # you can read what you saved earlier in ctx like this:
    # a, b, ... = ctx.saved_tensors
    # <<<--- Only change these lines -------->>>

    # ???, ???, ... = ctx.saved_tensors
    dx = None
    dw = None
    db = None
    # <<<------------------------------------>>>

    return dx, dw, db


## Check that the above code works

In [0]:
# number of outputs. For logistic regression we will use one output.
K = 1

# initialize parameters
# w: a tensor of dimensions D x K
# b: a tensor of dimensions K
# make sure they have requires_grad=True
# NOTE: the None placeholders must be replaced before the forward pass below,
# which passes w and b to the layer and then prints w.grad.
# <<<--- Only change these lines -------->>>
w = None
b = None
# <<<------------------------------------>>>

# Run one forward and one backward pass through the layer.
# A custom autograd Function is used through the class itself (its methods
# are static): instantiating it with MyLinear() is deprecated in PyTorch.
# Binding the class to `lin` keeps `lin.apply(...)` working unchanged.
lin = MyLinear
# logistic regression: linear layer followed by a sigmoid
out = lin.apply(X, w, b).sigmoid()
# y has one label per sample; unsqueeze(-1) adds a trailing dimension so the
# target has the same shape as `out`, as BCELoss requires
loss = torch.nn.BCELoss()(out, y.float().unsqueeze(-1))
# run the backward pass (this will compute the gradients)
loss.backward()
print(loss)

# you can access the gradient of each parameter like this:
print(w.grad)
# if loss.backward() above raises an error, or the line above prints None,
# it probably means you didn't define w to have requires_grad = True


## Check gradients
## Once you think you're done, uncomment the following lines
## to make sure the gradients you computed are correct

# from torch.autograd import gradcheck
# X_check = X.clone().double().requires_grad_(True)
# w_check = w.clone().double().requires_grad_(True)
# b_check = b.clone().double().requires_grad_(True)
# f = lin.apply
# gradcheck(f, (X_check,w_check,b_check))

## Train a linear model

In [0]:
# General Parameters
nepochs = 5  # number of passes of the training loop below
# learning rate. NOTE(review): 0 looks like a placeholder -- a gradient
# descent step scaled by 0 leaves the parameters unchanged.
lr = 0
K = 1  # number of outputs (one for logistic regression)
print_every = 1  # print the loss every `print_every` epochs
#####################

# Initialize parameters
# seed PyTorch's random number generator so that a random initialization
# gives the same values on every run
torch.manual_seed(0)
# copy here the w,b that you defined earlier
# <<<--- Only change these lines -------->>>
w = None
b = None
# <<<------------------------------------>>>


def zero_grad(w):
  """
  Reset the accumulated gradient of `w` to zeros, in place.

  Does nothing when `w` has no `grad` attribute, or when its gradient has
  not been populated yet (`w.grad` is None).
  """
  grad = getattr(w, 'grad', None)
  if grad is None:
    return
  grad.zero_()
  
# A custom autograd Function is used through the class itself (its methods
# are static): instantiating it with MyLinear() is deprecated in PyTorch.
# Binding the class to `lin` keeps `lin.apply(...)` working unchanged.
lin = MyLinear

# The loss module and the target do not change between epochs, so build them
# once. unsqueeze(-1) adds a trailing dimension to y so the target has the
# same shape as the model output, as BCELoss requires.
criterion = torch.nn.BCELoss()
target = y.float().unsqueeze(-1)

# Train model
for t in range(nepochs):
  # logistic regression: linear layer followed by a sigmoid
  out = lin.apply(X, w, b).sigmoid()
  loss = criterion(out, target)

  if t % print_every == 0:
    print(t, 'loss:', loss.item())

  # compute gradients to parameters
  # make a backward pass through "loss"
  # don't forget that with pytorch tensors, gradients are summed with
  # what is already there...
  # <<<--- Only change these lines -------->>>


  # <<<------------------------------------>>>

  # Update parameters using the simple gradient descent method
  # <<<--- Only change these lines -------->>>
  w.data = 0
  b.data = 0
  # <<<------------------------------------>>>

# loss of the last epoch (it was computed before that epoch's parameter update)
print(t, 'loss:', loss.item())

## Verify correctness of solution

When you think you have finished training the model, run the following code to check whether the final parameters you found make sense.

The green line is the separating hyperplane that you've found

In [0]:
# plot data
plt.figure(figsize=(7,7))
# the first X1.shape[0] rows of X are the X1 samples -> red dots
plt.plot(X[:X1.shape[0], 0].numpy(), X[:X1.shape[0], 1].numpy(), '.r')
# the remaining rows are the X2 samples -> blue dots
plt.plot(X[X1.shape[0]:, 0].numpy(), X[X1.shape[0]:, 1].numpy(), '.b')
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')

# The separating hyperplane is where the linear output is zero:
#   w[0] * x0 + w[1] * x1 + b = 0   =>   x1 = (-b - w[0] * x0) / w[1]
# Evaluate it at the smallest and largest x0 in the data and join the two
# points with a line.
# w and b require grad, and a tensor that requires grad cannot be converted
# to a NumPy array (which is what matplotlib does with its arguments), so
# compute the endpoints without autograd and pass plain Python floats.
# Note: if w[1] is zero the line is vertical and the division gives inf/nan.
with torch.no_grad():
  xmin = X[:, 0].min().item()
  xmax = X[:, 0].max().item()
  ymin = ((-b - w[0] * xmin) / w[1]).reshape(-1)[0].item()
  ymax = ((-b - w[0] * xmax) / w[1]).reshape(-1)[0].item()
plt.plot([xmin, xmax], [ymin, ymax], 'g')
plt.gca().set_aspect('equal', adjustable='box')


## Now let torch.nn do it for you

In [0]:
# General Parameters
nepochs = 5  # number of passes of the training loop below
# learning rate. NOTE(review): 0 looks like a placeholder -- a gradient
# descent step scaled by 0 leaves the parameters unchanged.
lr = 0
K = 1  # number of outputs (one for logistic regression)
print_every = 1  # print the loss every `print_every` epochs
#####################


# seed PyTorch's random number generator so that a random initialization
# gives the same values on every run
torch.manual_seed(0)

# Use torch.nn Linear layer to define a linear layer
# similar to the one we used above
# put it inside "lin" variable
# <<<--- Only change these lines -------->>>
lin = None

# <<<------------------------------------>>>


# you would have to define the optimizer that you will use
# use a gradient descent optimizer from torch.optim
# <<<--- Only change these lines -------->>>
optimizer = None

# <<<------------------------------------>>>


# Train model
for t in range(nepochs):

  # use "lin" from above to compute the output
  # of the logistic regression. don't forget the sigmoid..
  # <<<--- Only change these lines -------->>>
  out = None

  # <<<------------------------------------>>>

  # y has one label per sample; unsqueeze(-1) adds a trailing dimension so
  # the target has the same shape as `out`, as BCELoss requires
  loss = torch.nn.BCELoss()(out, y.float().unsqueeze(-1))
  if t % print_every == 0:
    print(t, 'loss:', loss.item())

  # compute gradients to parameters
  # call the backward pass from loss.
  # note that we didn't explicitly write any backward function.
  # that's one of the biggest advantages of using a modern DL framework.
  # (remember that gradients are being summed...)
  # <<<--- Only change these lines -------->>>


  # <<<------------------------------------>>>


  # update parameters
  # use the optimizer to update the parameters
  # <<<--- Only change these lines -------->>>


  # <<<------------------------------------>>>


# loss of the last epoch (it was computed before that epoch's parameter update)
print(t, 'loss:', loss.item())

## More Questions

1. Did you manage to converge to zero loss? If not, why?
2. Change the data points so that the two groups overlap (change the sigmas and/or means) and try to converge again. Did you converge to zero loss? Why did you (or why didn't you) manage to?