### A.2.3 Common PyTorch Tensor Operations

In [1]:
import torch
# Build a 2x3 tensor from a nested list: each inner list becomes one row.
tensor2d = torch.tensor([[1, 2, 3], [4, 5, 6]])
tensor2d

tensor([[1, 2, 3],
        [4, 5, 6]])

In [2]:
tensor2d.shape

torch.Size([2, 3])

In [3]:
tensor2d.reshape(3,2)

tensor([[1, 2],
        [3, 4],
        [5, 6]])

In [4]:
tensor2d.view(3,2) # requires original data to be contiguous

tensor([[1, 2],
        [3, 4],
        [5, 6]])

In [5]:
tensor2d.T

tensor([[1, 4],
        [2, 5],
        [3, 6]])

In [6]:
tensor2d.matmul(tensor2d.T)

tensor([[14, 32],
        [32, 77]])

In [7]:
tensor2d @ tensor2d.T

tensor([[14, 32],
        [32, 77]])

More operations at https://pytorch.org/docs/stable/tensors.html

### A.3 Seeing models as _computation graphs_

In [8]:
import torch.nn.functional as F

# Forward pass of a single-input logistic-regression unit, spelled out one
# operation per line so that each statement corresponds to one graph node.
y = torch.tensor([1.0])   # target passed to the loss
x1 = torch.tensor([1.1])  # input
w1 = torch.tensor([2.2])  # weight
b = torch.tensor([0.0])   # bias

z = w1 * x1 + b  # net input
a = z.sigmoid()  # activation / predicted probability
loss = F.binary_cross_entropy(input=a, target=y)

### A.4 Automatic differentiation made easy

In [9]:
import torch.nn.functional as F
from torch.autograd import grad

# Same forward pass as in A.3, except that the weight and bias are created
# with requires_grad=True so autograd records the operations applied to them.
y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = w1 * x1 + b
a = z.sigmoid()
loss = F.binary_cross_entropy(input=a, target=y)

# The manual way: ask for each gradient explicitly.
# By default, PyTorch destroys the computation graph after calculating the
# gradients to free memory. Since the graph is reused below (second grad()
# call and loss.backward()), retain_graph=True keeps it in memory.
grad_L_w1 = grad(loss, w1, retain_graph=True)
grad_L_b = grad(loss, b, retain_graph=True)
print(grad_L_w1, grad_L_b, sep="\n")

# The automatic way: backward() computes the gradients for all leaf tensors
# that require them and stores each one in the tensor's .grad attribute.
loss.backward()
print(w1.grad, b.grad, sep="\n")


(tensor([-0.0898]),)
(tensor([-0.0817]),)
tensor([-0.0898])
tensor([-0.0817])


### A.5 Implementing multilayer neural networks
In this example, we implement a multilayer perceptron (a.k.a. a fully connected or feedforward network).

In [10]:
from torch.nn import Module, Sequential, Linear, ReLU

class NeuralNetwork(Module):
    """Multilayer perceptron with two hidden layers (30 and 20 units) and ReLU activations.

    Args:
        num_inputs: number of input features per example.
        num_outputs: number of output units (one logit each) per example.
    """

    def __init__(self, num_inputs, num_outputs):
        super().__init__()
        # The linear layers are created in network order, input side first.
        first_hidden = Linear(num_inputs, 30)
        second_hidden = Linear(30, 20)
        output_layer = Linear(20, num_outputs)
        # Non-linear activation functions are placed between the hidden layers;
        # the output layer is left without one.
        self.layers = Sequential(
            first_hidden,
            ReLU(),
            second_hidden,
            ReLU(),
            output_layer,
        )

    def forward(self, x):
        """Map x of shape (batch_size, num_inputs) to logits of shape (batch_size, num_outputs).

        The outputs of the last layer are returned as-is; these are called logits.
        """
        return self.layers(x)

In [11]:
model = NeuralNetwork(50, 3)
print(model)

NeuralNetwork(
  (layers): Sequential(
    (0): Linear(in_features=50, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=20, bias=True)
    (3): ReLU()
    (4): Linear(in_features=20, out_features=3, bias=True)
  )
)


In [12]:
# Total number of trainable parameters: add up the element counts of every
# parameter tensor that has requires_grad=True.
num_params = sum(
    p.numel()
    for p in model.parameters()
    if p.requires_grad
)
print(f"Trainable params: {num_params}")

Trainable params: 2213


In [13]:
# Weight matrix of the first Linear layer, followed (after a blank line) by
# its shape, which is laid out as (out_features, in_features).
print(model.layers[0].weight, model.layers[0].weight.shape, sep="\n\n")

Parameter containing:
tensor([[-0.0460, -0.0679,  0.0799,  ...,  0.0864, -0.0699, -0.0984],
        [ 0.0379,  0.0621, -0.1312,  ...,  0.0939,  0.0421,  0.1116],
        [ 0.0713, -0.0677, -0.0935,  ..., -0.0614, -0.0957,  0.0416],
        ...,
        [ 0.0415,  0.0839, -0.0165,  ...,  0.0928,  0.1229,  0.0811],
        [-0.0401,  0.1312, -0.0407,  ...,  0.0186,  0.0294, -0.0465],
        [-0.1235, -0.0216,  0.0852,  ...,  0.0022, -0.0244, -0.0191]],
       requires_grad=True)

torch.Size([30, 50])


In [14]:
# Prepare a reproducible random input for a forward pass through the model.
torch.set_printoptions(linewidth=115)  # wider lines so the 50 values print compactly
torch.manual_seed(123)
X = torch.rand(1, 50)  # one example with 50 features, uniform in [0, 1)
print(X, "\n")
# The forward pass itself is currently disabled:
# %timeit out = model(X)
# print(out)

tensor([[0.2961, 0.5166, 0.2517, 0.6886, 0.0740, 0.8665, 0.1366, 0.1025, 0.1841, 0.7264, 0.3153, 0.6871, 0.0756,
         0.1966, 0.3164, 0.4017, 0.1186, 0.8274, 0.3821, 0.6605, 0.8536, 0.5932, 0.6367, 0.9826, 0.2745, 0.6584,
         0.2775, 0.8573, 0.8993, 0.0390, 0.9268, 0.7388, 0.7179, 0.7058, 0.9156, 0.4340, 0.0772, 0.3565, 0.1479,
         0.5331, 0.4066, 0.2318, 0.4545, 0.9737, 0.4606, 0.5159, 0.4220, 0.5786, 0.9455, 0.8057]]) 



In [15]:
# Running only feedforward without building the computation graph (useful for prediction/inference)
# with torch.no_grad():
    # %timeit out = model(X)
# print(out)

In [16]:
# It is common practice for the model to return the raw logits of its last
# layer, because PyTorch's cross-entropy loss already combines the softmax
# with the negative log-likelihood loss in a single function.
# To obtain class-membership probabilities we therefore apply softmax ourselves.
with torch.no_grad():  # no computation graph is needed for prediction
    out = model(X).softmax(dim=1)
print(out)

tensor([[0.3010, 0.2558, 0.4433]])


### A.6 Setting up efficient data loaders

In [17]:
# Toy dataset: five training and two test records with two features each,
# and one integer class label (0 or 1) per record.
X_train = torch.tensor(
    [
        [-1.2, 3.1],
        [-0.9, 2.9],
        [-0.5, 2.6],
        [2.3, -1.1],
        [2.7, -1.5],
    ]
)
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor(
    [
        [-0.8, 2.8],
        [2.6, -1.6],
    ]
)
y_test = torch.tensor([0, 1])

# Creating toy dataloader
from torch.utils.data import Dataset 
class ToyDataset(Dataset):
    """Map-style dataset that pairs each feature record with its label.

    Args:
        X: features, indexable by record (``X[i]`` is one example).
        y: labels aligned with ``X`` (``y[i]`` belongs to ``X[i]``); must
            expose ``.shape[0]`` as the number of records.

    Raises:
        ValueError: if ``X`` and ``y`` do not hold the same number of records.
    """

    def __init__(self, X, y):
        # Fail early: a mismatch would otherwise silently truncate the data
        # or raise an IndexError only once a loader reaches the missing rows.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of records, got {len(X)} and {len(y)}"
            )
        self.features = X
        self.labels = y

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        one_x = self.features[index]
        one_y = self.labels[index]
        return one_x, one_y

    def __len__(self):
        """Return the total number of records (rows) in the dataset."""
        return self.labels.shape[0]

# Wrap the training and test tensors so they can be indexed record by record.
train_ds = ToyDataset(X=X_train, y=y_train)
test_ds = ToyDataset(X=X_test, y=y_test)

In [18]:
# setting up dataloader
from torch.utils.data import DataLoader
torch.manual_seed(123)  # seed the global RNG so the shuffled batch order is reproducible

# num_workers is the number of processes used for data loading; with 0, all
# loading is done in the main process. In Jupyter notebooks, setting it to >0
# can lead to resource-sharing problems.
train_loader = DataLoader(train_ds, batch_size=2, shuffle=True, num_workers=0)
test_loader = DataLoader(test_ds, batch_size=2, shuffle=False, num_workers=0)

# Look at the mini-batches produced by one pass over the training loader.
for idx, (x, y) in enumerate(train_loader):
    print(f"Batch {idx}:\n", x, "\n", y, "\n")

Batch 0:
 tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) 
 tensor([1, 0]) 

Batch 1:
 tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) 
 tensor([0, 0]) 

Batch 2:
 tensor([[ 2.7000, -1.5000]]) 
 tensor([1]) 



In [19]:
# The last batch above holds a single record while the others hold two.
# In practice, such a smaller final batch can disrupt training convergence,
# so it is usually dropped with drop_last=True:
train_loader = DataLoader(
    train_ds,
    batch_size=2,
    shuffle=True,
    num_workers=0,
    drop_last=True,
)

### A.7 A typical training loop

In [None]:
import torch.nn.functional as F
torch.manual_seed(123)  # reproducible weight initialization and batch shuffling

model = NeuralNetwork(num_inputs=2, num_outputs=2)  # toy data: 2 features, 2 classes
optimizer = torch.optim.SGD(model.parameters(), lr=0.5)
num_epochs = 3

for epoch in range(num_epochs):
    model.train()  # put the model in training mode
    for idx, (features, labels) in enumerate(train_loader):
        # Clear the gradients left over from the previous step first.
        optimizer.zero_grad()

        # Forward pass: cross_entropy takes the raw logits and integer labels.
        loss = F.cross_entropy(model(features), labels)

        loss.backward()   # calculate gradients
        optimizer.step()  # update params based on grads

        # Report progress after every batch.
        print(
            f"Epoch: {epoch+1:01d}/{num_epochs:01d}"
            f" | Batch {idx+1:01d}/{len(train_loader):01d}"
            f" | Train Loss: {loss:.2f}"
        )

Epoch: 1/3 | Batch 1/2 | Train Loss: 0.75
Epoch: 1/3 | Batch 2/2 | Train Loss: 0.65
Epoch: 2/3 | Batch 1/2 | Train Loss: 0.44
Epoch: 2/3 | Batch 2/2 | Train Loss: 0.13
Epoch: 3/3 | Batch 1/2 | Train Loss: 0.03
Epoch: 3/3 | Batch 2/2 | Train Loss: 0.00


In [None]:
# trainable params in this model
sum(p.numel() for p in model.parameters() if p.requires_grad)

752

In [36]:
# Making predictions: evaluation mode, and no computation graph is needed.
model.eval()
with torch.no_grad():
    outputs = model(X_train)
print(outputs)  # raw logits, one row per training record

# Class probabilities: softmax across the class dimension of each row.
torch.set_printoptions(sci_mode=False, precision=4)
probs = outputs.softmax(dim=1)
print("\n", probs)

# Predicted class label = index of the largest probability in each row.
preds = probs.argmax(dim=1)
print("\n", preds)

tensor([[ 2.8569, -4.1618],
        [ 2.5382, -3.7548],
        [ 2.0944, -3.1820],
        [-1.4814,  1.4816],
        [-1.7176,  1.7342]])

 tensor([[0.9991, 0.0009],
        [0.9982, 0.0018],
        [0.9949, 0.0051],
        [0.0491, 0.9509],
        [0.0307, 0.9693]])

 tensor([0, 0, 0, 1, 1])


### A.8 Saving and loading models

In [None]:
# Saving the model: only the state dict (parameter tensors keyed by name) is
# written to disk, not the model object itself.
torch.save(model.state_dict(), "./intro_model.pth")

# Loading the model: the state dict holds no architecture information, so we
# need an instance of the model in memory in order to apply the saved state.
model = NeuralNetwork(2, 2)
# weights_only=True restricts unpickling to tensors and plain containers,
# so a tampered checkpoint file cannot run arbitrary code while loading.
model.load_state_dict(torch.load("./intro_model.pth", weights_only=True))

<All keys matched successfully>

### A.9 GPU Training
See `DDP-script.py` in the [GitHub repo](https://github.com/rasbt/LLMs-from-scratch/blob/82010e2c7729c4582afd5cb155c9d654f62ba43a/appendix-A/01_main-chapter-code/DDP-script.py).