In [29]:
import torch
import torch.nn as nn
import torch.optim as optim

# **BASICS**

Inputs -> Multiply by Werights -> Sum all these Products to create a single number -> Add a bias term -> Pass through an activation function -> Output the number.

Mathematically,

Output = Activation(Sum of all(Weight[i]*Input[i]) + Bias)

**Weights -** They are learnable parameters corresponding to each input. Adjusted in training. If they are -

1. Positive -> Input Increases output.

2. Negative -> Input Decreases output.

3. Small abs value -> Not so important input.

4. Large abs value -> Important Input.




**Activation Function -** For applying non-linearity.

In [30]:
inputs = torch.arange(1,4,1, dtype = torch.float)

weights = torch.randn(3) * 0.1

bias = torch.randn(1)

weighted_inputs = inputs * weights

weighted_sum = weighted_inputs.sum() + bias

output = torch.relu(weighted_sum)

print(output)

tensor([0.3807])


**Example 1 - Neuron learning Boolean Logic**

In [31]:
def and_gate(x1: int, x2: int) -> bool:
  inputs = torch.tensor([x1, x2], dtype = torch.float)
  weights = torch.tensor([1, 1], dtype = torch.float)
  bias = -1.5
  weighted_sum = torch.dot(inputs, weights) + bias
  if weighted_sum > 0:
    return True
  else:
    return False

def or_gate(x1: int, x2: int) -> int:
  inputs = torch.tensor([x1, x2], dtype = torch.float)
  weights = torch.ones(2, dtype = torch.float)
  bias = -0.5
  weighted_sum = torch.dot(inputs, weights) + bias
  output = int((1 + torch.sign(weighted_sum))//2)
  return output


print(or_gate(1,1))
print(or_gate(1,0))
print(or_gate(0,1))
print(or_gate(0,0))


1
1
1
0


In [32]:
#nn.Linear does this entire weighted sum.

layer = nn.Linear(3,1) #3 input features combined to give 1 output in the layer.

x = torch.randn(5,3)

z = layer(x)

print(z.shape)

layer2 = nn.Linear(3, 5) #Same input, but 5 different neurons compute it this time.

z2 = layer2(x)

print(z2.shape) #5 different outputs for each input - of the 5 neurons give their own prediction for each of the 5 inputs.

torch.Size([5, 1])
torch.Size([5, 5])


# **Adding Non Linearity to our Neurons**

This is done by activation functions. Each kind of activation function shows its own distinct behaviour. Without them, no matter how many layers the result will still be linear.

**Universal Approximation Theorem -** Any neural network with atleast one hidden layer, one non linear activation function and sufficient neurons can approximate any continous function to arbitrary precision.

**Common Activation Functions**

1. ReLU(x) -> Returns max(x,0).

2. tanh(x) -> Returns tanh(x).

3. Sigmoid(x) -> Returns 1/(1 + exp(-x))

**A Single Neuron -** The following class makes a single neuron after applying all transformations.

In [33]:
class SingleNeuron(nn.Module):
  def __init__(self):
    super().__init__()
    self.linear = nn.Linear(3,1)
    self.activation = nn.ReLU()

  def forward(self, x):
    z = self.linear(x)
    output = self.activation(z)
    return z

In [34]:
neuron = SingleNeuron()

x = torch.arange(1,4,1, dtype = torch.float)

output = neuron(x)

print(output)

tensor([-1.2742], grad_fn=<ViewBackward0>)


This can also be made using nn.Sequential instead of making a custom class.


In [35]:
weather = torch.tensor([65.0, 1013.0, 10.0])

temp_neuron = nn.Sequential(
    nn.Linear(3,1),
    nn.ReLU()
)

prediction = temp_neuron(weather)

print(f"{prediction.item():.1f} F")

0.0 F


# **BUILDING NEURONS**

**1. Using PyTorch's nn Module**

In [36]:
class neuron(nn.Module):
  def __init__(self, num_inputs):
    super().__init__()
    self.linear = nn.Linear(num_inputs, 1)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    z = self.linear(x)
    output = self.activation(z)
    return output

In [37]:
neu = neuron(3)

x = torch.arange(1,4, dtype = torch.float)

output = neu(x)

print(output)

print(f"\nWeights: {neu.linear.weight}")
print(f"Bias: {neu.linear.bias}")



tensor([0.4538], grad_fn=<SigmoidBackward0>)

Weights: Parameter containing:
tensor([[-0.3881, -0.4917,  0.5310]], requires_grad=True)
Bias: Parameter containing:
tensor([-0.4069], requires_grad=True)


# **TRAINING**

In [38]:
neu = neuron(2)

X = torch.tensor([
    [1, 1],
    [1, 0],
    [0, 1],
    [0, 0]
], dtype = torch.float)

y = torch.tensor([[1], [0], [0], [0]], dtype = torch.float)

loss = nn.BCELoss()
optimizer = optim.SGD(neu.parameters(), lr = 50)

for epoch in range(10000):
  predictions = neu(X)
  loss_value = loss(predictions, y)
  optimizer.zero_grad() #clear old gradients
  loss_value.backward() #compute new gradients
  optimizer.step() #update weights

  if epoch % 1000 == 0:
    print(f"Epoch: {epoch}, Loss: {loss_value.item()}")



Epoch: 0, Loss: 0.8739743232727051
Epoch: 1000, Loss: 0.0003075942222494632
Epoch: 2000, Loss: 0.00016147890710271895
Epoch: 3000, Loss: 0.00010947955888696015
Epoch: 4000, Loss: 8.281478221761063e-05
Epoch: 5000, Loss: 6.659489736193791e-05
Epoch: 6000, Loss: 5.5677566706435755e-05
Epoch: 7000, Loss: 4.7847854148130864e-05
Epoch: 8000, Loss: 4.1961189708672464e-05
Epoch: 9000, Loss: 3.734319398063235e-05


In [39]:
print("Predictions:")
with torch.no_grad():  #disables gradient tracking
    for i, (input_vals, target_val) in enumerate(zip(X, y)):
        pred = neu(input_vals.unsqueeze(0))
        print(f"{input_vals.tolist()} → {pred.item():.3f} (target: {target_val.item()})")

Predictions:
[1.0, 1.0] → 1.000 (target: 1.0)
[1.0, 0.0] → 0.000 (target: 0.0)
[0.0, 1.0] → 0.000 (target: 0.0)
[0.0, 0.0] → 0.000 (target: 0.0)


# **TRAINING WITH JUST TENSORS**

In [40]:
class manualNeuron:
  def __init__(self, num_inputs):
    self.weights = torch.randn(num_inputs, requires_grad=True) #autograd starts tracking all operations on this tensor because of requires_grad. Used for computing gradients.
    self.bias = torch.randn(1, requires_grad = True)

  def forward(self, x):
    weighted_sum = torch.dot(self.weights, x) + self.bias
    output = 1/(1 + torch.exp(-weighted_sum))
    return output

  def parameters(self):
    return [self.weights, self.bias]


In [41]:
neu = manualNeuron(2)

for epoch in range(10001):
  total_loss = 0
  for i in range(len(X)):
    prediction = neu.forward(X[i])
    loss = -(y[i]*torch.log(prediction/(1-prediction)) + torch.log(1 - prediction))
    total_loss += loss.item()
    loss.backward()
    with torch.no_grad():
      for param in neu.parameters():
        param -= 10 * param.grad
        param.grad.zero_()

  if(epoch % 1000 == 0):
    print(f"Epoch {epoch} - Loss: {total_loss}")


Epoch 0 - Loss: 14.640341758733484
Epoch 1000 - Loss: 0.0007983411778695881
Epoch 2000 - Loss: 0.0005402818060247228
Epoch 3000 - Loss: 0.00040997528412844986
Epoch 4000 - Loss: 0.0003306364014861174
Epoch 5000 - Loss: 0.0002770490027614869
Epoch 6000 - Loss: 0.0002381254525971599
Epoch 7000 - Loss: 0.00020850081637036055
Epoch 8000 - Loss: 0.00018555224596639164
Epoch 9000 - Loss: 0.00016713384684408084
Epoch 10000 - Loss: 0.00015258989878930151


In [42]:
for i in range(len(X)):
  pred = neu.forward(X[i])
  print(f"{X[i][0]} & {X[i][1]} = {pred[0]}")

1.0 & 1.0 = 0.9999371767044067
1.0 & 0.0 = 4.4769236410502344e-05
0.0 & 1.0 = 4.479776180232875e-05
0.0 & 0.0 = 1.2589247407133325e-13


# **LOSS FUNCTION**

**Properties of a good Loss:**

1. Must be non negative.
2. Zero at perfection.
3. Gradients must exist at every point for gradient descent.
4. Monotonic i.e. more error -> more loss.
5. Smooth for stable optimization.

Examples -

1. nn.MSELoss()

    **Formula -** 1/sample.size() * sum((pred[i] - actual[i])**2)

    **Name -** Mean Square Error Loss

  Corresponds to Maximum Likelihood under Gaussian Distribution.

2. nn.BCELoss()

    **Formula -** -1/sample.size() * sum(actual[i]*log(pred[i]) + (1-actual[i])*log(1-pred[i]))

    **Name -** Binary Cross Entropy Loss

    Corresponds to Maximum Likelihood under Bernoulli Distribution

3. nn.CrossEntropyLoss()

    **Formula -** -1/sample.size() * sum(sum(actual[i][k]*log(pred[i][k])))

    **Name -** Cross Entropy Loss

    Corresponds to Negative Log Likelihood for multi class classification problems.


**Training goal - smaller loss in each iteration. Loss tells us the directions in which we need to adjust weights and thus guides learning.**

In [43]:
model = nn.Sequential(
    nn.Linear(2,1),
    nn.Sigmoid()
)

X = torch.tensor([[1,2]], dtype = torch.float)
y = torch.tensor([[1]], dtype = torch.float)

prediction = model(X)

mse = nn.MSELoss()
bce = nn.BCELoss()
ce = nn.CrossEntropyLoss()

print(mse(prediction, y))
print(bce(prediction, y))
print(ce(prediction, y))

tensor(0.3095, grad_fn=<MseLossBackward0>)
tensor(0.8126, grad_fn=<BinaryCrossEntropyBackward0>)
tensor(-0., grad_fn=<DivBackward1>)


# **CONCEPT OF LEARNING**

Mathematically, ML is an optimization problem - find parameters that minimize loss. Mathematically,


Find Θ* = argmin(L(Θ)) where -


Θ = Model Parameters.

L(Θ) = Loss Function.

Θ* = Optimal Parameters minimizing loss.

argmin = Argument that minimizes.

Imagine the Loss as a hilly landscape, start at a random point and take a step down in each iteration until you find the minimal point. The most downward direction is the opposite of gradient so this is called gradient descent. Learning is this entire process where parameters are updating.

Thus, learning process is just do a forward pass -> make predictions -> calculate new loss -> calculate gradients (backward pass) -> move in the direction opposite to gradient aka update weights -> repeat

In [44]:
model = nn.Linear(1,1)

X = torch.tensor([[1],[2],[3]], dtype = torch.float)
y = torch.tensor([[2],[4],[6]], dtype = torch.float)

loss = nn.MSELoss()

optimizer = optim.SGD(model.parameters(), lr = 0.01)

for epoch in range(10000):
  predictions = model(X)
  current_loss = loss(predictions, y)
  optimizer.zero_grad()
  current_loss.backward()
  optimizer.step()

  #if epoch%1000 == 0:
    #print(f"Epoch: {epoch}, Loss: {current_loss.item()}")

with torch.no_grad():
  #print(model.weight)
  #print(model.bias)
  print(model(torch.tensor([10], dtype = torch.float)))

tensor([20.0000])


Learning rate lr is a hyperparameter balancing speed and stability. Bigger eta = faster learning, smaller eta = more stable and lesser oscillation.

**Optimal Learning Rate Theorem -** For a quadratic Loss (w^TAw)/2, optimal lr = 2/(lambda_min + lambda_max) where lambda_min, lambda_max are smallest and largest eigenvalues of A.

Usually we don't know A so we start by lr = 0.01 or 0.001, use adaptive optimizers like Adam and RMSProp or use learning rate schedules.

**Task -** Implement a neuron that can learn a linear relationship y = mx + c from scratch.

In [45]:
#Training Data

x = torch.tensor([1,2,3,4,5,6], dtype = torch.float)
y = torch.tensor([15,25,35,45,55,65], dtype = torch.float)

#Starting weights
w = torch.randn(1, requires_grad=True)
b = torch.randn(1, requires_grad=True)

print(w)

#Hyperparameter
lr = 0.001

#Training Loop
for epoch in range(1000):
  prediction = w*x + b #The model has exactly 1 feature so taking w as a single number is fine. Else take it as a vector with size = input features and do dot product instead.
  loss = ((prediction - y)**2).mean()
  loss.backward()
  with torch.no_grad():
    w -= lr*w.grad
    b -= lr*b.grad
    w.grad.zero_()
    b.grad.zero_()
  if epoch % 100 == 0:
    print(f"Epoch: {epoch}, Loss: {loss.item()}")

tensor([-0.0337], requires_grad=True)
Epoch: 0, Loss: 1969.27783203125
Epoch: 100, Loss: 4.903125286102295
Epoch: 200, Loss: 1.8078700304031372
Epoch: 300, Loss: 1.6764503717422485
Epoch: 400, Loss: 1.5584176778793335
Epoch: 500, Loss: 1.4487032890319824
Epoch: 600, Loss: 1.346713900566101
Epoch: 700, Loss: 1.2519042491912842
Epoch: 800, Loss: 1.1637693643569946
Epoch: 900, Loss: 1.081840991973877


In [46]:
print(prediction)
print(y)

tensor([13.2472, 23.7814, 34.3156, 44.8498, 55.3840, 65.9182],
       grad_fn=<AddBackward0>)
tensor([15., 25., 35., 45., 55., 65.])


# **MAKING PREDICTIONS**

In [53]:
neuron = nn.Sequential(
    nn.Linear(3,1)
)

x = torch.tensor([
    [1,2,3],
    [0.5, 1.5, 2.5],
    [10, 11, 12],
    [101,102,103],
    [1.21, 2.21, 3.21]
], dtype = torch.float)

y = torch.tensor([
    [6],
    [4.5],
    [33],
    [306],
    [6.63]
])

loss = nn.MSELoss()
optimizer = optim.SGD(neuron.parameters(), lr = 0.0001)

for epoch in range(1000):
  predictions = neuron(x)
  loss_val = loss(predictions, y)
  optimizer.zero_grad()
  loss_val.backward()
  optimizer.step()

  if epoch % 100 == 0:
    print(f"Epoch: {epoch}, Loss: {loss_val.item()}")

print(neuron(x))

Epoch: 0, Loss: 25256.03125
Epoch: 100, Loss: 0.45262080430984497
Epoch: 200, Loss: 0.41453108191490173
Epoch: 300, Loss: 0.3796462416648865
Epoch: 400, Loss: 0.3476983308792114
Epoch: 500, Loss: 0.31843769550323486
Epoch: 600, Loss: 0.29163965582847595
Epoch: 700, Loss: 0.26709598302841187
Epoch: 800, Loss: 0.2446199208498001
Epoch: 900, Loss: 0.22403264045715332
tensor([[  6.5173],
        [  5.0203],
        [ 33.4636],
        [305.9207],
        [  7.1461]], grad_fn=<AddmmBackward0>)


In [56]:
class spamNeuron(nn.Module):
  def __init__(self, input_num):
    super().__init__()
    self.linear = nn.Linear(input_num, 1)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    logit = self.linear(x)
    prediction = self.activation(logit)
    return prediction

In [71]:
spam_detector = spamNeuron(100)

email = torch.randn(1, 100)

prediction = spam_detector(email)

if prediction > 0.5:
  print("Spam")
else:
  print("Not Spam")

Spam


**Note -** When making predictions use torch.no_grad() to disable gradient computation. This is because during training, PyTorch computes gradients and stores the entire computation graph in memory. Explictly asking it not to compute it saves this storage.