In [4]:
import torch
torch.__version__

'2.6.0+cu124'

In [5]:
torch.cuda.is_available()

# Ensure GPU or CPU intigration based on vaiability
#-----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#-----------------------------------------------------------------------------

## Different way to multiply 1D or 2D Vectors

In [6]:
# For any Dimensional Vector-- Dot Product

import time

torch.manual_seed(123)
start = time.time()
x1 = torch.rand(100, 50)
x2 = torch.rand(50, 50)
x1@x2.T
end = time.time()
print(f"Time required to multiply two matrix is {end-start} sec.")

Time required to multiply two matrix is 0.021788358688354492 sec.


In [7]:
import time
import random
torch.manual_seed(123)
start = time.time()
x1 = torch.rand(100,50)
x2 = torch.rand(50,50)
torch.matmul(x1, x2.T)
end = time.time()
print(f"Time required to multiply two matrix is {end-start} sec.")

Time required to multiply two matrix is 0.0003390312194824219 sec.


In [8]:
# Only for 1D Vector-- Dot Product

import time
import random
torch.manual_seed(123)
start = time.time()
x1 = torch.rand(50)

torch.manual_seed(23)
x2 = torch.rand(50)
torch.dot(x1, x2)
end = time.time()
print(f"Time required to multiply two matrix is {end-start} sec.")

Time required to multiply two matrix is 0.0006654262542724609 sec.


In [9]:
# An array 1D
y = torch.tensor([0, 1, 2, 2])
print("The row shape of the vector/array: ",y.size(0))

The row shape of the vector/array:  4


In [10]:
# Reshaping the array to an array of column num =1 and row autocalculated
y_trans = y.view(-1,1)
print(y_trans)
print(y_trans.dtype)

tensor([[0],
        [1],
        [2],
        [2]])
torch.int64


In [11]:
# make sure the values are long() or integer
y_trans_long = y.view(-1,1).long()
print(y_trans_long)
print(y_trans_long.dtype)

tensor([[0],
        [1],
        [2],
        [2]])
torch.int64


In [12]:
# A zeros array 2D
zeros_array = torch.zeros(y.size(0), 4)
print(zeros_array)

tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]])


In [13]:
#It "scatters" along dimension 1 (columns) the value 2 at the indices given by tensor having 
# rows of zeros_array
exm_tensor = torch.ones(4,3)
exm_tensor.scatter_(1, torch.tensor([[1],[1],[2],[1]]), 2).float()

tensor([[1., 2., 1.],
        [1., 2., 1.],
        [1., 1., 2.],
        [1., 2., 1.]])

In [14]:
# Apply same principle to zeros_array
hot_encode = zeros_array.scatter_(1, y_trans_long, 1).float()
print(hot_encode)

tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]])


## Using the above concept let's build the function for one-hot-encoding

In [15]:
# num_classes represnts how many class we want as output. These are the columns

def one_hot_encoding(index_array, num_classes):
    
    # Reshaping the array to an array of column num =1 and row autocalculated
    y = torch.tensor(index_array)
    y_trans = y.view(-1,1)
    # make sure the values are long() or integer
    y_trans_long = y.view(-1,1).long()

    # An 2D zeros array having rows= rows of index_array and columns = num_classes
    zeros_array = torch.zeros(y.size(0), num_classes)

    #It "scatters" along dimension 1 (columns) the value 2 at the indices given by tensor having 
    # rows of zeros_array. Applied to tensors
    hot_encode = zeros_array.scatter_(1, y_trans_long, 1).float()
    
    return hot_encode
    
y = [0, 1, 2, 2]

y_enc = one_hot_encoding(index_array = y, num_classes= 4)

print(f"One hot encoding matrix is \n{y_enc}")
print(hot_encode.dtype)

One hot encoding matrix is 
tensor([[1., 0., 0., 0.],
        [0., 1., 0., 0.],
        [0., 0., 1., 0.],
        [0., 0., 1., 0.]])
torch.float32


## Softmax function

In [16]:
Z = torch.tensor( [[-0.3,  -0.5, -0.5],
                   [-0.4,  -0.1, -0.5],
                   [-0.3,  -0.94, -0.5],
                   [-0.99, -0.88, -0.5]])

Z

tensor([[-0.3000, -0.5000, -0.5000],
        [-0.4000, -0.1000, -0.5000],
        [-0.3000, -0.9400, -0.5000],
        [-0.9900, -0.8800, -0.5000]])

In [17]:
#Next, we convert them to "probabilities" via softmax: 
def softmax(z):
    return (torch.exp(z.t()) / torch.sum(torch.exp(z), dim=1)).t()

smax = softmax(Z)
print('softmax:\n', smax)

softmax:
 tensor([[0.3792, 0.3104, 0.3104],
        [0.3072, 0.4147, 0.2780],
        [0.4263, 0.2248, 0.3490],
        [0.2668, 0.2978, 0.4354]])


In [18]:
# The probabilties can then be converted back to class labels based on the largest probability
# in each row:

def to_classlabel(z):
    return torch.argmax(z, dim= 1)

print('predicted class labels: ', to_classlabel(smax))
print('true class labels: ', to_classlabel(y_enc))

predicted class labels:  tensor([0, 1, 0, 2])
true class labels:  tensor([0, 1, 2, 2])


### GPU number system
GPU architectures are optimized for 32-bit computations, and using this data type can significantly speed up model training and inference.

In [19]:
sam_vec1 = torch.tensor([0,1.0,2.0,4.0])
sam_vec1.dtype

torch.float32

In [20]:
sam_vec2 = sam_vec1.to(torch.float64)
sam_vec2.dtype

torch.float64

## Reshaping operation

In [21]:
tensor2d = torch.tensor([[1, 2, 3], 
                         [4, 5, 6]])
print(tensor2d)

tensor([[1, 2, 3],
        [4, 5, 6]])


In [22]:
# tensor2d_re6 = tensor2d.view(6,1)
tensor2d_re6 = tensor2d.view(-1,1)
tensor2d_re6

tensor([[1],
        [2],
        [3],
        [4],
        [5],
        [6]])

In [23]:
tensor2d_re3 = tensor2d.view(3,2)
tensor2d_re3

tensor([[1, 2],
        [3, 4],
        [5, 6]])

## Computation graphs
A computational graph is a directed graph that allows us to express and visualize mathematical expressions. In the context of deep learning, a computation graph lays out the sequence of calculations needed to compute the output of a neural network—we will need this to compute the required gradients for backpropagation, the main training algorithm for neural networks.

In [24]:
import torch.nn.functional as F #1

y = torch.tensor([1.0])         #2
x1 = torch.tensor([1.1])        #3
w1 = torch.tensor([2.2])        #4
b = torch.tensor([0.0])         #5
z = x1 * w1 + b                 #6
a = torch.sigmoid(z)            #7. Make everything within 0 and 1. Good for Logistic Regression
loss = F.binary_cross_entropy(a, y)

print(loss)

tensor(0.0852)


In [25]:
print(f"Value of z: {z} \nValue of a: {a}")

Value of z: tensor([2.4200]) 
Value of a: tensor([0.9183])


## Computing gradients via autograd
If we carry out computations in PyTorch, it will build a **computational graph** internally by default if one of its terminal nodes has the **requires_grad** attribute set to True.

#1 By default, PyTorch destroys the computation graph after calculating the gradients to free memory. However, since we will reuse this computation graph shortly, we set **retain_graph=True** so that it stays in memory

In [26]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2], requires_grad=True)
b = torch.tensor([0.0], requires_grad=True)

z = x1 * w1 + b 
a = torch.sigmoid(z)

loss = F.binary_cross_entropy(a, y)

grad_L_w1 = grad(loss, w1, retain_graph=True)   #1
grad_L_b = grad(loss, b, retain_graph=True)

In [27]:
print(grad_L_w1)
print(grad_L_b)

(tensor([-0.0898]),)
(tensor([-0.0817]),)


In [28]:
# Pytorch autmatic loss calculator
loss.backward()
print(w1.grad)
print(b.grad)

tensor([-0.0898])
tensor([-0.0817])


## Implementing multilayer neural networks

In [29]:
import torch 

# Simple neural network model
class simpleNNmodel(torch.nn.Module):
    def __init__(self, num_inputs, num_outputs):
        super().__init__()

        self.layers = torch.nn.Sequential(
            # 1st hidden layer with non-linear activation
            torch.nn.Linear(num_inputs, 30),
            torch.nn.ReLU(),

            # 2nd hidden layer with non-linear activation
            torch.nn.Linear(30, 10),
            torch.nn.ReLU(),

            # ouput layers
            torch.nn.Linear(10, num_outputs)
        )
    def forward(self, x):
        logits = self.layers(x)
        return logits

In [30]:
torch.manual_seed(123)
model = simpleNNmodel(num_inputs = 20, num_outputs=1)
print(model)

simpleNNmodel(
  (layers): Sequential(
    (0): Linear(in_features=20, out_features=30, bias=True)
    (1): ReLU()
    (2): Linear(in_features=30, out_features=10, bias=True)
    (3): ReLU()
    (4): Linear(in_features=10, out_features=1, bias=True)
  )
)


In [31]:
num_params = sum([p.numel() for p in model.parameters()])
print(num_params)

951


In [32]:
# print(model.layers[0].weight)

In [33]:
print(model.layers[0].weight.shape)
print(model.layers[0].bias.shape)

torch.Size([30, 20])
torch.Size([30])


In [34]:
# Use the model
torch.manual_seed(123)
X = torch.randn((1, 50))

model = simpleNNmodel(num_inputs = 50, num_outputs=3)

out = model(X)
print(out)

tensor([[-0.3963,  0.1176,  0.0802]], grad_fn=<AddmmBackward0>)


**Addmm** stands for matrix multiplication (mm) followed by an addition (Add).
If we use model for prediction, no need to create computational graph in the background to conduct backpropagation. Therefore, **no_grad()** can be used for effective memory use.

In [35]:
with torch.no_grad():
    out = torch.softmax(model(X), dim=1)
print(out)

tensor([[0.2335, 0.3904, 0.3761]])


##  Setting up efficient data loaders

In [36]:
with open ('the-verdict.txt', 'r') as f:
    texts = f.read()

In [37]:
print(texts[:500])

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no great surprise to me to hear that, in the height of his glory, he had dropped his painting, married a rich widow, and established himself in a villa on the Riviera. (Though I rather thought it would have been Rome or Florence.)

"The height of his glory"--that was what the women called it. I can hear Mrs. Gideon Thwing--his last Chicago sitter--deploring his unaccountable abdication. "Of course it'


In [38]:
len(texts)

20479

In [39]:
import tiktoken
tokenizer = tiktoken.get_encoding("gpt2")

In [40]:
tokens = tokenizer.encode(texts)
print(tokens[:20])

[40, 367, 2885, 1464, 1807, 3619, 402, 271, 10899, 2138, 257, 7026, 15632, 438, 2016, 257, 922, 5891, 1576, 438]


In [41]:
print(tokenizer.decode(tokens[:20]))

I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--


### Dataset for testing codes from book

In [42]:
# Ensure GPU or CPU intigration based on vaiability
#-----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#-----------------------------------------------------------------------------

X_train = torch.tensor([
    [-1.2, 3.1],
    [-0.9, 2.9],
    [-0.5, 2.6],
    [2.3, -1.1],
    [2.7, -1.5]
])
y_train = torch.tensor([0, 0, 0, 1, 1])

X_test = torch.tensor([
    [-0.8, 2.8],
    [2.6, -1.6],
])
y_test = torch.tensor([0, 1])

In [43]:
import torch
from torch.utils.data import Dataset, DataLoader

class ToyDataset(Dataset):
    def __init__(self, X, y):
        self.features = X
        self.labels = y

    def __getitem__(self, index):        #1
        one_x = self.features[index]     #1
        one_y = self.labels[index]       #1
        return one_x, one_y              #1

    def __len__(self):
        return self.labels.shape[0]      #2

train_ds = ToyDataset(X_train, y_train)
test_ds = ToyDataset(X_test, y_test)

the __getitem__ method, we define instructions for returning exactly one item from the dataset via an index. This refers to the features and the class label corresponding to a single training example or test instance. (The data loader will provide this index, which we will cover shortly.)

Finally, the __len__ method contains instructions for retrieving the length of the dataset. Here, we use the .shape attribute of a tensor to return the number of rows in the feature array. In the case of the training dataset, we have five rows, which we can double-check:

In [44]:
for i in range(len(train_ds)):
    print(train_ds[i])

(tensor([-1.2000,  3.1000]), tensor(0))
(tensor([-0.9000,  2.9000]), tensor(0))
(tensor([-0.5000,  2.6000]), tensor(0))
(tensor([ 2.3000, -1.1000]), tensor(1))
(tensor([ 2.7000, -1.5000]), tensor(1))


In [45]:
torch.manual_seed(123)

train_loader = DataLoader(train_ds, batch_size = 2, shuffle=True, num_workers=0)
test_loader = DataLoader(test_ds, batch_size = 2, shuffle=False, num_workers=0)

for idx, (x, y)  in enumerate(train_loader):
    print(f"Batch {idx+1}:\n",x,y)

Batch 1:
 tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2:
 tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])
Batch 3:
 tensor([[ 2.7000, -1.5000]]) tensor([1])


In [46]:
torch.manual_seed(123)
# Dropping last item of length < batch_size

train_loader = DataLoader(train_ds, batch_size = 2, shuffle=True, num_workers=0, drop_last=True)
test_loader = DataLoader(test_ds, batch_size = 2, shuffle=False, num_workers=0, drop_last=True)

for idx, (x, y)  in enumerate(train_loader):
    print(f"Batch {idx+1}:\n",x,y)

Batch 1:
 tensor([[ 2.3000, -1.1000],
        [-0.9000,  2.9000]]) tensor([1, 0])
Batch 2:
 tensor([[-1.2000,  3.1000],
        [-0.5000,  2.6000]]) tensor([0, 0])


### Training the Neural Model

In [47]:
import torch
import torch.nn.functional as F

torch.manual_seed(123)

model = simpleNNmodel(num_inputs =2, num_outputs=2)

# Ensure GPU or CPU intigration based on vaiability
#-----------------------------------------------------------------------------
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
#-----------------------------------------------------------------------------

optimizer = torch.optim.SGD(model.parameters(), lr=0.5)

num_epoch = 3
for epoch in range(num_epoch):
    model.train()

    for batch, (features, labels) in enumerate(train_loader):
        #-----------------------------------------------------------------------------
        features, labels = features.to(device), labels.to(device)
        #-----------------------------------------------------------------------------
        logits = model(features)
        loss = F.cross_entropy(logits, labels)
        optimizer.zero_grad()

        loss.backward()
        optimizer.step()

        print(f"Epoch {epoch+1}/{num_epoch} batch {batch+1}/{len(train_loader)} loss: {loss:.2f}")

Epoch 1/3 batch 1/2 loss: 0.62
Epoch 1/3 batch 2/2 loss: 1.29
Epoch 2/3 batch 1/2 loss: 0.50
Epoch 2/3 batch 2/2 loss: 0.11
Epoch 3/3 batch 1/2 loss: 0.25
Epoch 3/3 batch 2/2 loss: 0.01


In [51]:
model.eval()
with torch.no_grad():
    X_train = X_train.to(device)  # Ensure input is on the same device
    outputs = model(X_train)
print(outputs)

probas = torch.softmax(outputs, dim=1)
print(probas)


tensor([[ 3.5207, -2.6019],
        [ 3.1539, -2.2931],
        [ 2.6345, -1.8633],
        [-0.0796,  1.0342],
        [-0.0980,  1.2070]], device='cuda:0')
tensor([[0.9978, 0.0022],
        [0.9957, 0.0043],
        [0.9890, 0.0110],
        [0.2472, 0.7528],
        [0.2133, 0.7867]], device='cuda:0')


In [53]:
predictions = torch.argmax(probas, dim=1)
print(predictions)

# comparing prediction with true values
predictions == y_train.to(device)

tensor([0, 0, 0, 1, 1], device='cuda:0')


tensor([True, True, True, True, True], device='cuda:0')

### Computing Prediction Accuracy

## Working with Multiple GPUs

DDP does not function properly within interactive Python environments like Jupyter notebooks, which don’t handle multiprocessing in the same way a standalone Python script does. Therefore, the following code should be executed as a script, not within a notebook interface like Jupyter. DDP needs to spawn multiple processes, and each process should have its own Python interpreter instance.

In [None]:
# For Mac GPU
device = torch.device(
    "mps" if torch.backends.mps.is_available() else "cpu"
)
# For Nvidia GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

## Creating dataset and dataloader for NLP

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

In [54]:
class GPTdataset(Dataset):
    def __init__(self, texts, tokenizer, max_length, stride):
        self.input_ids = []
        self.target_ids = []
        tokens = tokenizer.encode(texts)
        
        for i in range(0, len(tokens)-max_length, stride):
            input_chunk = tokens[i : i+max_length]
            target_chunk = tokens[i+1 : i+max_length+1]
        
            input_tensors = torch.tensor(input_chunk)
            self.input_ids.append(input_tensors)
        
            target_tensors = torch.tensor(target_chunk)
            self.target_ids.append(target_tensors)
            
    def __getitem__(self, index):
        return self.input_ids[index], self.target_ids[index]
        
    def __len__(self):
        return len(self.input_ids)

In [55]:
dataset = GPTdataset(texts[:100], tokenizer=tokenizer,max_length=4, stride=4)

print(len(dataset))
print("*"*100)
for i in range(len(dataset)):
    print(dataset[i])

6
****************************************************************************************************
(tensor([  40,  367, 2885, 1464]), tensor([ 367, 2885, 1464, 1807]))
(tensor([1807, 3619,  402,  271]), tensor([ 3619,   402,   271, 10899]))
(tensor([10899,  2138,   257,  7026]), tensor([ 2138,   257,  7026, 15632]))
(tensor([15632,   438,  2016,   257]), tensor([ 438, 2016,  257,  922]))
(tensor([ 922, 5891, 1576,  438]), tensor([5891, 1576,  438,  568]))
(tensor([568, 340, 373, 645]), tensor([340, 373, 645, 308]))


In [56]:
def GPTdataloader(texts, max_length=8, stride=8,
                 batch_size = 2, shuffle=True, num_workers=0, drop_last=True):
    
    tokenizer = tiktoken.get_encoding("gpt2")
    dataset = GPTdataset(texts, tokenizer=tokenizer, max_length=max_length, stride=stride)
    dataloader = DataLoader(
                            dataset = dataset,
                            batch_size = batch_size,
                            shuffle =shuffle,
                            num_workers =num_workers,
                            drop_last =drop_last
                            )
    return dataloader
    

In [57]:
dataloader = GPTdataloader(texts[:100], max_length=8, stride=8,
              batch_size = 2, shuffle=True, num_workers=0, drop_last=True)

In [58]:
data_iter = iter(dataloader)    
first_batch = next(data_iter)
print('input:',first_batch[0],"\n",'target:', first_batch[-1])

input: tensor([[10899,  2138,   257,  7026, 15632,   438,  2016,   257],
        [   40,   367,  2885,  1464,  1807,  3619,   402,   271]]) 
 target: tensor([[ 2138,   257,  7026, 15632,   438,  2016,   257,   922],
        [  367,  2885,  1464,  1807,  3619,   402,   271, 10899]])
