In [1]:
import numpy as np
import torch

# Training Data

In [13]:
# Input (temp,rainfall,humidity)
inputs= np.array([[73,67,43],[91,88,64],[87,134,58],[102,43,37],[69,96,70]],dtype="float32")
print(inputs)
#Dtype is in floating point because our model will be predicting in floating points not in integers

[[ 73.  67.  43.]
 [ 91.  88.  64.]
 [ 87. 134.  58.]
 [102.  43.  37.]
 [ 69.  96.  70.]]


In [14]:
# Target Yield of apple and oranges

targets=np.array([[56,70],[81,101],[119,133],[22,37],[103,119]],dtype="float32")
print(targets)

[[ 56.  70.]
 [ 81. 101.]
 [119. 133.]
 [ 22.  37.]
 [103. 119.]]


In [15]:
# Converting inputs and targets to tensors
inputs=torch.from_numpy(inputs)
targets=torch.from_numpy(targets)
print(inputs)
print(targets)

tensor([[ 73.,  67.,  43.],
        [ 91.,  88.,  64.],
        [ 87., 134.,  58.],
        [102.,  43.,  37.],
        [ 69.,  96.,  70.]])
tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


Linear regression from scratch

In [23]:
#Weights and biases
w=torch.randn(2,3,requires_grad=True)
b=torch.randn(2,requires_grad=True)
print(w,"\n",b)

tensor([[ 0.2398, -0.3148,  0.0052],
        [-2.2454,  0.2593, -0.2694]], requires_grad=True) 
 tensor([-0.2600,  0.0953], requires_grad=True)


Our model is simply a function that performs a amtrix multiplication of the inptus
and the weights w(transposed) and adds the bias b (replicated) for each observation

In [24]:
def model(x):
    return x@w.t()+b
# @ in pytorch does the matrix multiplication and .t does teh transpose of any matrix

In [25]:
#Generating predictions with random weigh
preds=model(inputs)

In [26]:
preds  

tensor([[  -3.6164, -158.0243],
        [  -5.7996, -198.6517],
        [ -21.2697, -176.1247],
        [  10.8620, -227.7480],
        [ -13.5628, -148.7949]], grad_fn=<AddBackward0>)

###### There is a huge differnce between actual values and the predicted values.This because we have intialised random values to the weights and biases.
###### We will never get negative number of oranges and apples.

In [27]:
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])


In [29]:
preds-targets  # this is the difference

tensor([[ -59.6164, -228.0243],
        [ -86.7996, -299.6517],
        [-140.2697, -309.1247],
        [ -11.1380, -264.7480],
        [-116.5628, -267.7949]], grad_fn=<SubBackward0>)

## Loss Function

- We first need a way to evaluate how well our model is performing later we will improve our model.
- We will use Mean Squared Error(MSE) to check how well our model performs.

In [28]:
def mse(t1,t2):
    diff=t1-t2
    return torch.sum(diff*diff)/diff.numel()

# torch.sum will return the sum of squares of difference
# .numel will return number of elements in a tensor

In [31]:
loss=mse(preds,targets)

In [32]:
loss

tensor(42362.4766, grad_fn=<DivBackward0>)

So by looking at the loss we can intrepret that on an average each element in the prediction difffers almost about 205(square root of loss :42362.476).The result is called loss, because it indicates how bad the model is at predicting the target variables. Lower the loss better the model

##### To improve the model we will use a technique called gradient descent

- We can actually say that loss is a function of weights
- That is the reason why I have set requires_grad as True for weights and biases.
- One moe thing we need to keep in mind is inputs never changes.

In [34]:
loss.backward()

The gradients are stored in the .grad property of respective tensors. Note that the derivate of the loss w.r.t the weights is itself a matrix, with the same dimension.

In [38]:
print(w)
print(w.grad)

tensor([[ 0.2398, -0.3148,  0.0052],
        [-2.2454,  0.2593, -0.2694]], requires_grad=True)
tensor([[ -6726.6265,  -8419.5527,  -4965.1646],
        [-23258.0156, -24032.4336, -15090.6621]])


- Loss is a quadratic function of our weights and biases and our objective is to find the set of weights where the loss it lowest

- If a gradient element is positive:
  - Increasing the element's value slightly will increase the loss.
  - Decreasing the elments's value slightly will decrease the loss.

- If gradient element is negative
 - Increasing the element's value slightly will decrease the loss.
 - Increasing the element's value slightly will increaset the loss.
 
- Idea is that we always moves to the opposite of derivatives

#### This forms the basis for the optimization algorithm that we'll use to improve our model.

- Before we proceed, we reset the gradeints to zero by calling .zero_() method.
- We need to do this because PyTorch accumulates, gradients i.e the next time we call .backward on the loss,the new grasdient values will get added to the existing gradient values, which may lead to unexpected results.

In [39]:
w.grad.zero_()
b.grad.zero_()
print(w.grad)
print(b.grad)

tensor([[0., 0., 0.],
        [0., 0., 0.]])
tensor([0., 0.])


### Adjust Weights biases using gradient descent


- Now here I will use optimization algorithm which forms the base of all training in deep learning.

- The gradient descent optimization algorithm has the following steps.

   1. Generate Predictions
   2. Calculate the loss
   3. Compute graident w.r.t the weights and biases
   4. Adjust the weights by subtracting a small quantity proportional to the gradient.
   5. Reset the gradient to zero




In [40]:
## Generating prediction

preds=model(inputs)
print(preds)

tensor([[  -3.6164, -158.0243],
        [  -5.7996, -198.6517],
        [ -21.2697, -176.1247],
        [  10.8620, -227.7480],
        [ -13.5628, -148.7949]], grad_fn=<AddBackward0>)


In [41]:
# Caluclating loss
loss=mse(preds,targets)
print(loss)

tensor(42362.4766, grad_fn=<DivBackward0>)


In [42]:
# Computing Gradients
loss.backward()
print(w.grad,"\n",b.grad)

tensor([[ -6726.6265,  -8419.5527,  -4965.1646],
        [-23258.0156, -24032.4336, -15090.6621]]) 
 tensor([ -82.8773, -273.8687])


###### Finally we will update weights and biases using gradients computed above.

In [46]:
with torch.no_grad():
    w-=w.grad * 1e-5
    b-=b.grad * 1e-5
    w.grad.zero_()
    b.grad.zero_()

- torch.no_grad is used to indicate to Pytorch that it shouldn't track, calculate or modify gradients while updating weights and biases
- We multiply the gradient with a really small number(10^-5), to ensure that we don't modify the weights by a really large amount, since we only want a sa,ll step in downhill direction of the gradient. This number is called leraning rate

In [47]:
print(w,"\n",b)   # New weights and biases

tensor([[ 0.3071, -0.2306,  0.0549],
        [-2.0128,  0.4997, -0.1185]], requires_grad=True) 
 tensor([-0.2591,  0.0980], requires_grad=True)


In [49]:
## Calculating loss with newer weights and biases.
preds=model(inputs)
loss=mse(preds,targets)
print(loss)  

tensor(28822.2598, grad_fn=<DivBackward0>)


- Before loss was 42362.4766 and now it has reduced to 28822.2598
- Still we need to adjsut bias and weight to get good loss

### Training for multiple epochs

To reduce the loss further, we can repeat the process of adjusting the weights and biases using the gradients multiple times. Each iteration is called an epoch. 

In [55]:
# Training for 100 epochs
for i in range(1000):
    preds=model(inputs)
    loss=mse(preds,targets)
    loss.backward()
    with torch.no_grad():
        w-=w.grad*1e-5
        b-=b.grad*1e-5
        w.grad.zero_()
        b.grad.zero_()

In [56]:
preds=model(inputs)
loss=mse(preds,targets)
print(loss)

tensor(2.4607, grad_fn=<DivBackward0>)


In [57]:
print(preds)

tensor([[ 57.2304,  70.4539],
        [ 81.4034,  99.5389],
        [120.4147, 135.2989],
        [ 21.6300,  37.6197],
        [100.1509, 116.8400]], grad_fn=<AddBackward0>)


In [58]:
print(targets)

tensor([[ 56.,  70.],
        [ 81., 101.],
        [119., 133.],
        [ 22.,  37.],
        [103., 119.]])
