In [1]:
import torch
import torch.nn as nn
import torch.optim as optim

# **BASICS**

Inputs -> Multiply by Weights -> Sum all these products to create a single number -> Add a bias term -> Pass through an activation function -> Output the number.

Mathematically,

Output = Activation(Sum of all(Weight[i]*Input[i]) + Bias)

**Weights -** They are learnable parameters corresponding to each input. Adjusted in training. If they are -

1. Positive -> Input Increases output.

2. Negative -> Input Decreases output.

3. Small abs value -> Not so important input.

4. Large abs value -> Important Input.




**Activation Function -** For applying non-linearity.

In [2]:
# Forward pass of a single neuron, written out one step at a time.
inputs = torch.arange(1, 4, 1, dtype=torch.float)  # the three inputs 1., 2., 3.

# One small random weight per input, plus a random bias.
weights = 0.1 * torch.randn(3)

bias = torch.randn(1)

# Scale every input by its weight ...
weighted_inputs = weights * inputs

# ... collapse the products into one number and shift it by the bias.
weighted_sum = bias + weighted_inputs.sum()

# ReLU replaces a negative result with zero.
output = weighted_sum.relu()

print(output)

tensor([1.0998])


**Example 1 - Neuron learning Boolean Logic**

In [3]:
def and_gate(x1: int, x2: int) -> bool:
  """Fixed-weight neuron that behaves like a logical AND.

  Computes 1*x1 + 1*x2 - 1.5 and returns True when that sum is positive,
  False otherwise. For 0/1 inputs the sum is positive only for (1, 1).
  """
  features = torch.tensor([x1, x2], dtype = torch.float)
  unit_weights = torch.tensor([1, 1], dtype = torch.float)
  threshold_bias = -1.5
  pre_activation = torch.dot(features, unit_weights) + threshold_bias
  # Step activation: the neuron "fires" only on a strictly positive sum.
  return bool(pre_activation > 0)

def or_gate(x1: int, x2: int) -> int:
  """Fixed-weight neuron that behaves like a logical OR.

  Computes 1*x1 + 1*x2 - 0.5 and maps its sign to an integer:
  a positive sum gives 1, a negative (or zero) sum gives 0.
  """
  features = torch.tensor([x1, x2], dtype = torch.float)
  unit_weights = torch.ones(2, dtype = torch.float)
  threshold_bias = -0.5
  pre_activation = torch.dot(features, unit_weights) + threshold_bias
  # sign() is -1/0/+1; adding 1 and floor-dividing by 2 turns that into 0/0/1.
  step = (torch.sign(pre_activation) + 1) // 2
  return int(step)


# Print the OR gate's output for every combination of binary inputs.
for first_bit, second_bit in [(1, 1), (1, 0), (0, 1), (0, 0)]:
  print(or_gate(first_bit, second_bit))


1
1
1
0


In [21]:
# nn.Linear performs the whole "weights * inputs, summed, plus bias" step for us.

# A single neuron: 3 input features are combined into 1 output.
layer = nn.Linear(in_features=3, out_features=1)

# A batch of 5 samples, each with 3 features.
x = torch.randn(5, 3)

z = layer(x)

print(z.shape)  # one output value per sample

# Same 3 input features, but this layer holds 5 independent neurons.
layer2 = nn.Linear(in_features=3, out_features=5)

z2 = layer2(x)

print(z2.shape)  # each of the 5 neurons yields its own output for each of the 5 samples

torch.Size([5, 1])
torch.Size([5, 5])


# **Adding Non Linearity to our Neurons**

This is done by activation functions. Each kind of activation function shows its own distinct behaviour. Without them, no matter how many layers are stacked, the network as a whole still computes a linear function of its input.

**Universal Approximation Theorem -** A neural network with at least one hidden layer, a non-linear activation function and sufficiently many neurons can approximate any continuous function (on a bounded, closed input region) to arbitrary precision.

**Common Activation Functions**

1. ReLU(x) -> Returns max(x,0).

2. tanh(x) -> Returns tanh(x).

3. Sigmoid(x) -> Returns 1/(1 + exp(-x))

**A Single Neuron -** The following class makes a single neuron after applying all transformations.

In [5]:
class SingleNeuron(nn.Module):
  """A single neuron: a 3-input linear layer followed by a ReLU activation."""

  def __init__(self):
    super().__init__()
    self.linear = nn.Linear(3,1)      # weighted sum of 3 inputs plus a bias
    self.activation = nn.ReLU()       # non-linearity applied to that sum

  def forward(self, x):
    """Return ReLU(linear(x)) for an input whose last dimension has 3 features."""
    z = self.linear(x)
    # Return the activated value. Returning the raw pre-activation `z` here
    # would silently skip the ReLU and let negative outputs through.
    return self.activation(z)

In [6]:
# Build the neuron (randomly initialised) and run one forward pass.
neuron = SingleNeuron()

# The three input features 1., 2., 3.
x = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float)

output = neuron(x)

print(output)

tensor([0.5369], grad_fn=<ViewBackward0>)


This can also be made using nn.Sequential instead of making a custom class.


In [7]:
# Three raw readings passed to the neuron as its input features.
weather = torch.tensor([65.0, 1013.0, 10.0])

# nn.Sequential chains the linear step and the ReLU, so no custom class is needed.
temp_neuron = nn.Sequential(nn.Linear(3, 1), nn.ReLU())

# The neuron is never trained in this cell, so the value printed below
# comes purely from its random initial weights.
prediction = temp_neuron(weather)

print(f"{prediction.item():.1f} F")

0.0 F


# **BUILDING NEURONS**

**1. Using PyTorch's nn Module**

In [8]:
class neuron(nn.Module):
  """One sigmoid neuron with a configurable number of inputs."""

  def __init__(self, num_inputs):
    """Create the linear layer (num_inputs -> 1) and the sigmoid activation."""
    super().__init__()
    self.linear = nn.Linear(num_inputs, 1)
    self.activation = nn.Sigmoid()

  def forward(self, x):
    """Return sigmoid(linear(x)), a value strictly between 0 and 1."""
    return self.activation(self.linear(x))

In [9]:
# A 3-input sigmoid neuron with randomly initialised parameters.
neu = neuron(3)

# The three input features 1., 2., 3.
x = torch.tensor([1.0, 2.0, 3.0], dtype=torch.float)

output = neu(x)

print(output)

# Show the learnable parameters held by the underlying nn.Linear layer.
print(f"\nWeights: {neu.linear.weight}")
print(f"Bias: {neu.linear.bias}")



tensor([0.5729], grad_fn=<SigmoidBackward0>)

Weights: Parameter containing:
tensor([[-0.1170, -0.2441,  0.3263]], requires_grad=True)
Bias: Parameter containing:
tensor([-0.0800], requires_grad=True)


# **TRAINING**

In [10]:
# Train a 2-input sigmoid neuron on the AND truth table.
neu = neuron(2)

# Every combination of two binary inputs ...
X = torch.tensor([[1, 1], [1, 0], [0, 1], [0, 0]], dtype = torch.float)

# ... and the AND of each row, shaped (4, 1) to match the neuron's output.
y = torch.tensor([[1], [0], [0], [0]], dtype = torch.float)

loss = nn.BCELoss()  # binary cross-entropy between predictions and targets
optimizer = optim.SGD(neu.parameters(), lr = 50)

for epoch in range(10000):
  # Clear gradients left over from the previous step before computing new ones.
  optimizer.zero_grad()

  predictions = neu(X)
  loss_value = loss(predictions, y)

  loss_value.backward()
  optimizer.step()

  # Report progress every 1000 epochs.
  if epoch % 1000 == 0:
    print(f"Epoch: {epoch}, Loss: {loss_value.item()}")



Epoch: 0, Loss: 0.6041615009307861
Epoch: 1000, Loss: 0.0002849958837032318
Epoch: 2000, Loss: 0.0001550256129121408
Epoch: 3000, Loss: 0.0001064603275153786
Epoch: 4000, Loss: 8.108050678856671e-05
Epoch: 5000, Loss: 6.54860123177059e-05
Epoch: 6000, Loss: 5.491523916134611e-05
Epoch: 7000, Loss: 4.726626502815634e-05
Epoch: 8000, Loss: 4.148826701566577e-05
Epoch: 9000, Loss: 3.698470754898153e-05


In [11]:
# Compare the trained neuron's output with the target for every input row.
print("Predictions:")
with torch.no_grad():  # inference only, no gradient tracking needed
    for input_vals, target_val in zip(X, y):
        # Add a leading batch dimension so the row is passed as a batch of one.
        pred = neu(input_vals.reshape(1, -1))
        print(f"{input_vals.tolist()} → {pred.item():.3f} (target: {target_val.item()})")

Predictions:
[1.0, 1.0] → 1.000 (target: 1.0)
[1.0, 0.0] → 0.000 (target: 0.0)
[0.0, 1.0] → 0.000 (target: 0.0)
[0.0, 0.0] → 0.000 (target: 0.0)


# **TRAINING WITH JUST TENSORS**

In [12]:
class manualNeuron:
  """A sigmoid neuron built from plain tensors, without nn.Module."""

  def __init__(self, num_inputs):
    """Create random weights (one per input) and a random bias."""
    # requires_grad=True makes autograd record every operation on these
    # tensors so that gradients can be computed for them later.
    self.weights = torch.randn(num_inputs, requires_grad=True)
    self.bias = torch.randn(1, requires_grad = True)

  def forward(self, x):
    """Return sigmoid(weights · x + bias) for a 1-D input tensor x."""
    pre_activation = self.bias + torch.dot(self.weights, x)
    # Sigmoid written out by hand.
    return 1 / (1 + torch.exp(-pre_activation))

  def parameters(self):
    """Return the trainable tensors as the list [weights, bias]."""
    return [self.weights, self.bias]


In [13]:
# Train the hand-written neuron on the AND truth table with per-sample
# gradient descent (parameters are updated after every single example).
neu = manualNeuron(2)

LEARNING_RATE = 10  # step size of the manual gradient-descent update
EPS = 1e-7          # keeps predictions strictly inside (0, 1) so log() stays finite

for epoch in range(10001):
  total_loss = 0.0
  for i in range(len(X)):
    prediction = neu.forward(X[i])
    # If the sigmoid saturates to exactly 0 or 1 in float32, log(p) or
    # log(1 - p) becomes -inf and the update turns the parameters into NaN.
    # Clamping keeps the loss and its gradient finite.
    p = torch.clamp(prediction, EPS, 1 - EPS)
    # Binary cross-entropy for one sample: -(y*log(p) + (1 - y)*log(1 - p)).
    loss = -(y[i] * torch.log(p) + (1 - y[i]) * torch.log(1 - p))
    total_loss += loss.item()
    loss.backward()
    # Update the parameters outside autograd, then reset their gradients
    # so they do not accumulate into the next sample's backward pass.
    with torch.no_grad():
      for param in neu.parameters():
        param -= LEARNING_RATE * param.grad
        param.grad.zero_()

  if epoch % 1000 == 0:
    print(f"Epoch {epoch} - Loss: {total_loss}")


Epoch 0 - Loss: 4.81676295195939
Epoch 1000 - Loss: 0.0004359752438176656
Epoch 2000 - Loss: 0.00025851294049061835
Epoch 3000 - Loss: 0.0002047460038738791
Epoch 4000 - Loss: 0.00017732665946823545
Epoch 5000 - Loss: 0.0001592062326380983
Epoch 6000 - Loss: 0.00014430467490456067
Epoch 7000 - Loss: 0.00013280069470056333
Epoch 8000 - Loss: 0.0001232637332577724
Epoch 9000 - Loss: 0.00011539573824848048
Epoch 10000 - Loss: 0.00010734897659858689


In [14]:
# Show the trained manual neuron's output for every row of the truth table.
for sample in X:
  pred = neu.forward(sample)
  print(f"{sample[0]} & {sample[1]} = {pred[0]}")

1.0 & 1.0 = 0.9999555349349976
1.0 & 0.0 = 3.1009276426630095e-05
0.0 & 1.0 = 3.23840431519784e-05
0.0 & 0.0 = 4.460907825857424e-14


# **LOSS FUNCTION**

**Properties of a good Loss:**

1. Must be non negative.
2. Zero at perfection.
3. Gradients must exist at every point for gradient descent.
4. Monotonic i.e. more error -> more loss.
5. Smooth for stable optimization.

Examples - nn.MSELoss(), nn.BCELoss(), nn.CrossEntropyLoss() (cross entropy loss for multi class classification/Softmax Regression)

Training goal - smaller loss in each iteration