In [26]:
import torch
import torch.nn as nn
import math

### nn.Embedding(vocab_size, d_model)

- **Input to `nn.Embedding`**: The input to an `nn.Embedding` layer is a tensor of indices with shape `(..., I)`, where `...` represents any number of preceding dimensions, and `I` is the size of the last dimension, containing the indices.
  
- **Output of `nn.Embedding`**: The `nn.Embedding` layer replaces each index in its last dimension with an embedding vector of size `E` (where `E` is the embedding size specified when the `nn.Embedding` layer was created). So, for each index in the input tensor, you get an `E`-dimensional vector in the output tensor. This does indeed increase the dimensionality of the input tensor by one.

- **Output Shape**: If the input tensor to the `nn.Embedding` layer has a shape of `(..., I)`, the output tensor will have a shape of `(..., I, E)`. Each index in the last dimension of the input tensor is replaced by an `E`-dimensional embedding vector, so the last dimension of the input tensor (`I`) directly corresponds to the second-to-last dimension of the output tensor, and the embedding size `E` becomes the size of the new last dimension.



#### Key Difference in Input Types

- **`nn.Linear`** expects a tensor of floating-point numbers and applies a linear transformation.
- **`nn.Embedding`** expects a tensor of indices (integer values) and retrieves embeddings from an internal lookup table.

Despite these differences, the commonality is that both layers transform the last dimension of their input tensor: `nn.Linear` transforms feature vectors, while `nn.Embedding` transforms indices into embeddings.

In [27]:
# Define the embedding layer
batch_size = 4
seq_len = 2
vocab_size = 10  # Number of words in the vocabulary
d_model = 10  # Dimensionality of the embedding vectors
embedding_layer = nn.Embedding(vocab_size, d_model)

# Example indices for 2 words from the vocabulary
word_indices_1 = torch.tensor([2, 3], dtype=torch.long)
word_indices_2 = torch.tensor([3, 6], dtype=torch.long)
word_indices_3 = torch.tensor([5, 2], dtype=torch.long)

batch_indices = torch.tensor([[2, 3], [3, 6],[5, 2]])


# Get the embedding vectors for these words
embedding_vectors = embedding_layer(batch_indices)


print("Batch_indices:\n", batch_indices.size())
print("Embedding Vectors:\n", embedding_vectors.size())

Batch_indices:
 torch.Size([3, 2])
Embedding Vectors:
 torch.Size([3, 2, 10])


In [28]:
batch_indices

tensor([[2, 3],
        [3, 6],
        [5, 2]])

### unsqueeze(0) and unsqueeze(1)

In [1]:
import torch
'''
unsqueeze(0) adds a new dimension at the beginning of the tensor's shape. Often used when you need to add a batch dimension to a tensor for operations that expect batches.
unsqueeze(1) adds a new dimension as the second dimension of the tensor's shape.
'''
x = torch.tensor([
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12]
])
print("Original tensor shape:", x.shape)  # Outputs: torch.Size([3, 4])

x_unsqueeze_0 = x.unsqueeze(0)
print("Shape after unsqueeze(0):", x_unsqueeze_0.shape)  # Outputs: torch.Size([1, 3, 4])

x_unsqueeze_1 = x.unsqueeze(1)
print("Shape after unsqueeze(1):", x_unsqueeze_1.shape)  # Outputs: torch.Size([3, 1, 4])


Original tensor shape: torch.Size([3, 4])
Shape after unsqueeze(0): torch.Size([1, 3, 4])
Shape after unsqueeze(1): torch.Size([3, 1, 4])


In [9]:
import torch

# Tensor with shape (3, 4)
a = torch.zeros(3, 4)
print(a.squeeze(0).shape)  # Output: torch.Size([3, 4])

# Tensor with shape (1, 3, 4)
b = torch.zeros(1, 3, 4)
print(b.squeeze(0).shape)  # Output: torch.Size([3, 4])

# Tensor with shape (1, 3, 1, 4)
c = torch.zeros(1, 3, 1, 4)
print(c.squeeze().shape)  # Output: torch.Size([3, 4])


torch.Size([3, 4])
torch.Size([3, 4])
torch.Size([3, 4])


In [30]:
x = torch.arange(0, 10, 2) # [5]
y = torch.arange(0, 3) # [3]
x.unsqueeze(1) * y.unsqueeze(0) # [5, 1] * [1, 3]

tensor([[ 0,  0,  0],
        [ 0,  2,  4],
        [ 0,  4,  8],
        [ 0,  6, 12],
        [ 0,  8, 16]])

### Tensor slicing

In [31]:
import torch
# Input embeddings tensor with shape [batch_size, seq_len, d_model]
# Here, batch_size=2, seq_len=3, d_model=4
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]],
                  [[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0], [6.0, 7.0, 8.0, 9.0]]])

# Simplified positional encodings tensor with shape [1, seq_len, d_model]
pe = torch.tensor([[[0.1, 0.2, 0.3, 0.4], [0.2, 0.3, 0.4, 0.5], [0.3, 0.4, 0.5, 0.6]]])

x_with_pe = x + pe[:,:x.shape[1] ,:]

print("Input Embeddings x:\n", x.size())
print("\nPositional Encodings pe:\n", pe.size())
print("\nx with Positional Encodings:\n", x_with_pe.size())
#print("\nx after Dropout:\n", x_after_dropout)


Input Embeddings x:
 torch.Size([2, 3, 4])

Positional Encodings pe:
 torch.Size([1, 3, 4])

x with Positional Encodings:
 torch.Size([2, 3, 4])


### Mean of tensor

In [32]:
x.size()

torch.Size([2, 3, 4])

In [33]:
x.mean(dim = -1, keepdim=True).size() # Take the mean on the last dimension

torch.Size([2, 3, 1])

### Tensor operations

In [34]:
import torch

# Example tensors
A = torch.rand(2, 4, 3)  # Shape: (batch_size, seq_len, d_model)
B = torch.rand(2, 4, 1)  # Shape: (batch_size, seq_len, 1)
# Manually broadcasting B to match A's shape
d_model = A.shape[2]  # Get the size of the last dimension of A
B_expanded = B.expand(-1, -1, d_model)  # Expand B to match A's shape, -1 is used to indicate dimensions that should not be changed, and d_model is the desired size of the last dimension. 
# Adding the manually broadcasted tensors
C = A + B_expanded


In [35]:
C ==  A+B


tensor([[[True, True, True],
         [True, True, True],
         [True, True, True],
         [True, True, True]],

        [[True, True, True],
         [True, True, True],
         [True, True, True],
         [True, True, True]]])

### nn.Linear(), nn.Dropout
#### `nn.Linear(d_model, d_ff,)`

The `nn.Linear` layer, also known as a fully connected or dense layer, performs a linear transformation on the incoming data. It applies a transformation to the input data using a matrix multiplication with its weight matrix and adds a bias.

- **Parameters**:
  - `d_model`: The size of each input sample (number of input features).
  - `d_ff`: The size of each output sample (number of output features).

- **Operation**: Given an input `x` of shape `[batch_size, seq_len, d_model]`, the `nn.Linear(d_model, d_ff)` layer transforms `x` to a new shape `[batch_size, seq_len, d_ff]` by applying the following linear transformation:

\[ \text{output} = x \cdot W^T + b \]

where:
- `x` is the input matrix.
- `W` is the weight matrix of the layer (of shape `[d_model, d_ff]`).
- `b` is the bias vector (of shape `[d_ff]`).

- **Example**:
  - If `d_model = 4` and `d_ff = 8`, then the layer will transform input data from 4-dimensional space to 8-dimensional space at each position in the sequence.

#### `nn.Dropout(dropout)`

`nn.Dropout` is a regularization technique used to prevent overfitting in neural networks. During training, it randomly zeros some of the elements of the input tensor with probability `dropout`, and scales up the remaining elements by `1/(1-dropout)` to maintain the average activation value. During evaluation, `Dropout` does not modify the input and becomes a no-op.

- **Parameter**:
  - `dropout`: The probability of an element to be zeroed.

- **Operation**: Randomly zeroes some of the elements of the input tensor with probability `dropout`.

- **Example**:
  - If `dropout = 0.2`, then on average 20% of the input elements are set to zero during training.




In [36]:
class FeedForwardBlock(nn.Module):
    '''
    A fully connected feed-forward network, which is is applied to each position separately and identically.
    Consists of two linear transformations with ReLU activation between
    '''
    def __init__(self, d_model: int, d_ff: int, dropout: float)->None:
        '''
        d_model: the dim of input and output
        d_ff: the dim of inner-layer
        '''
        super().__init__()
        self.linear_1 = nn.Linear(d_model, d_ff,) # W1 and b1
        self.linear_2 = nn.Linear(d_ff, d_model) # W2 and b2
        self.dropout = nn.Dropout(dropout)

        
    def forward(self, x):
        # (batch_size, seq_len, d_model) --> (batch_size, seq_len, d_ff) --> (batch_size, seq_len, d_model)
        x = self.linear_1(x) # (batch_size, seq_len, d_model) --> (batch_size, seq_len, d_ff)
        x = torch.relu(x)
        x = self.dropout(x)
        x = self.linear_2(x)
        return x

### Attention

In [37]:
import torch.nn.functional as F

class Attention(nn.Module):
    
    def __init__(self, d_model: int)->None:
        super().__init__()
        self.d_model = d_model
        self.Q = nn.Linear(d_model, d_model) # W_q
        self.K = nn.Linear(d_model, d_model) # W_k
        self.V = nn.Linear(d_model, d_model) # W_v

    def forward(self, x):
        '''
        x: (batch_size, seq_len, d_model)
        '''
        Q = self.Q(x) # (batch_size, seq_len, d_model)
        K = self.K(x) # (batch_size, seq_len, d_model)
        V = self.V(x) # (batch_size, seq_len, d_model)

        attention_score = F.softmax((Q @ K.transpose(1, 2)) / torch.sqrt(torch.tensor(self.d_model)), dim=-1) # (batch_size, seq_len, seq_len)
        #print(attention_score)
        output = attention_score @ V # (batch_size, seq_len, d_model)
        return attention_score, output

# Here, batch_size=2, seq_len=3, d_model=4
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]],
                  [[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0], [6.0, 7.0, 8.0, 9.0]]])

att = Attention(d_model=4)
att(x)

(tensor([[[0.5621, 0.2892, 0.1488],
          [0.6794, 0.2376, 0.0831],
          [0.7727, 0.1837, 0.0437]],
 
         [[0.8420, 0.1360, 0.0220],
          [0.8914, 0.0979, 0.0108],
          [0.9257, 0.0691, 0.0052]]], grad_fn=<SoftmaxBackward0>),
 tensor([[[-1.1423, -0.6524, -0.8483, -3.1029],
          [-1.0977, -0.6293, -0.8202, -2.9807],
          [-1.0654, -0.6126, -0.7997, -2.8920]],
 
         [[-1.7739, -0.9792, -1.2479, -4.8360],
          [-1.7592, -0.9716, -1.2385, -4.7955],
          [-1.7494, -0.9666, -1.2324, -4.7688]]], grad_fn=<UnsafeViewBackward0>))

### View

In [38]:
import torch

# Original tensor of shape [4, 3]
x = torch.arange(12).view(4, 3)
print("Original tensor:\n", x)

# Reshape tensor to shape [2, 6] using view
y = x.view(2, 6)
print("Reshaped tensor:\n", y)

# Reshape with inferred dimension
# Here, -1 will be inferred as 4 to keep the total number of elements the same
z = x.view(3, -1)
print("Reshaped tensor with inferred dimension:\n", z)


Original tensor:
 tensor([[ 0,  1,  2],
        [ 3,  4,  5],
        [ 6,  7,  8],
        [ 9, 10, 11]])
Reshaped tensor:
 tensor([[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11]])
Reshaped tensor with inferred dimension:
 tensor([[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]])


In [39]:
# Here, batch_size=2, seq_len=3, d_model=4
x = torch.tensor([[[1.0, 2.0, 3.0, 4.0], [2.0, 3.0, 4.0, 5.0], [3.0, 4.0, 5.0, 6.0]],
                  [[4.0, 5.0, 6.0, 7.0], [5.0, 6.0, 7.0, 8.0], [6.0, 7.0, 8.0, 9.0]]])
x.transpose(1,2).contiguous().view(x.shape[0], 4, -1)

tensor([[[1., 2., 3.],
         [2., 3., 4.],
         [3., 4., 5.],
         [4., 5., 6.]],

        [[4., 5., 6.],
         [5., 6., 7.],
         [6., 7., 8.],
         [7., 8., 9.]]])

In [27]:
import torch # type: ignore
a = torch.tensor(0.9).unsqueeze(0)
b = torch.tensor(0.9).unsqueeze(0)
c = torch.tensor(0.9).unsqueeze(0)
x = torch.cat((a,b,c))
torch.count_nonzero(x)

tensor(3)

### String.format() & Path

In [44]:
from pathlib import Path

a = Path("../abc/word_{}.csv".format(10))
if not Path("../abc/word_{}.csv".format(10)).exists():
    print("File doesn't exists.")


File doesn't exists.


### load_dataset from HuggingFace

In [2]:
from datasets import load_dataset # type: ignore
config = {
    "lang_src": 'en',
    "lang_tgt": 'it'
}

ds_raw = load_dataset(path="Helsinki-NLP/opus_books", name=f'{config["lang_src"]}-{config["lang_tgt"]}', split='train') # 'en-it'

In [3]:
len(ds_raw)

32332

In [4]:
# Select the first N samples
N = 5000  # Number of samples you want to select
subsampled_ds_raw = ds_raw.shuffle(seed=42).select(range(N))

In [5]:
len(subsampled_ds_raw)

5000

In [7]:
# import shutil
# from datasets import config # type:ignore
# '''
# Remove the entire cache directory specified by config.HF_DATASETS_CACHE from the Hugging Face datasets library. 
# This means it deletes all cached datasets and files stored by the datasets library, 
# '''
# cache_dir = config.HF_DATASETS_CACHE
# print(cache_dir)

# # Remove the cache directory
# shutil.rmtree(cache_dir, ignore_errors=True)

# print(f"Cache directory {cache_dir} removed.")



### yield generator

In [51]:
def sent_gen():
    i = 0
    for item in ds_raw:
        if i < 10:
            yield item['translation']['en']
        i+=1


In [54]:
gen = sent_gen() 
print(next(gen))
print(next(gen))
print(next(gen))


Source: Project Gutenberg
Jane Eyre
Charlotte Bronte


In [61]:
a = (i**2 for i in [1,2,3])
for i in a:
    print(i)

1
4
9


### torch.cat()

In [68]:
a = torch.tensor([1,2,3,4,5], dtype=torch.int32)
b = torch.tensor([101,1], dtype=torch.int64)
c = torch.Tensor([0] * 5)

In [120]:
torch.tensor([1]).int()

tensor([1], dtype=torch.int32)

In [103]:
assert torch.tensor([[1,2],[2,3]]).dim() == 2

### torch.triu

In [119]:
import torch
def causal_mask(size):
    mask = torch.triu(torch.ones(1, size, size), diagonal=1).type(torch.int)
    return mask == 0

causal_mask(4)

tensor([[[ True, False, False, False],
         [ True,  True, False, False],
         [ True,  True,  True, False],
         [ True,  True,  True,  True]]])

In [115]:
import torch

# Create a square matrix of size 5x5
matrix = torch.ones(5, 5)

# Apply torch.triu() to keep the upper triangular part
upper_triangular_matrix = (torch.triu(matrix,diagonal=1).to(torch.int32)==0)

print(upper_triangular_matrix)


tensor([[ True, False, False, False, False],
        [ True,  True, False, False, False],
        [ True,  True,  True, False, False],
        [ True,  True,  True,  True, False],
        [ True,  True,  True,  True,  True]])


### tensor & tensor, (tensor == value), tensor.masked_fill_


In [139]:
a = torch.tensor([1,2,3,4,5], dtype=torch.int32)
b = torch.tensor([101,1], dtype=torch.int64)
c = torch.tensor([0] * 5)
seq_len = a.size(0) + b.size(0) + c.size(0)

print((torch.cat([b,a,c]) !=0).unsqueeze(0).unsqueeze(0).int())

tensor([[[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]]], dtype=torch.int32)


In [137]:
((torch.cat([b,a,c]) !=0).unsqueeze(0).unsqueeze(0) & causal_mask(seq_len)).size()

torch.Size([1, 12, 12])

### Attention_mask

In [58]:
import torch

# Simulating decoder_input != self.pad_token).unsqueeze(0)
mask_seq_len = torch.tensor([True, True, False])  # Shape: (3)

# Simulating causal_mask(decoder_input.size(0))
causal_mask = torch.tensor([[[True, False, False], [True, True, False], [True, True, True]]])  # Shape: (1, 3, 3)

attention_score = torch.tensor(range(0, 36)).view(1, 4, 3, 3) # (batch_size, h, seq_len, seq_len)
encoder_mask =  mask_seq_len.unsqueeze(0).unsqueeze(0).int() # (1, 1, seq_len)
decoder_mask = mask_seq_len.unsqueeze(0) & causal_mask # (1, seq_len, seq_len)


In [59]:
print(attention_score)

tensor([[[[ 0,  1,  2],
          [ 3,  4,  5],
          [ 6,  7,  8]],

         [[ 9, 10, 11],
          [12, 13, 14],
          [15, 16, 17]],

         [[18, 19, 20],
          [21, 22, 23],
          [24, 25, 26]],

         [[27, 28, 29],
          [30, 31, 32],
          [33, 34, 35]]]])


In [60]:

print(attention_score.masked_fill_(encoder_mask==0, -1e9))

tensor([[[[          0,           1, -1000000000],
          [          3,           4, -1000000000],
          [          6,           7, -1000000000]],

         [[          9,          10, -1000000000],
          [         12,          13, -1000000000],
          [         15,          16, -1000000000]],

         [[         18,          19, -1000000000],
          [         21,          22, -1000000000],
          [         24,          25, -1000000000]],

         [[         27,          28, -1000000000],
          [         30,          31, -1000000000],
          [         33,          34, -1000000000]]]])


### nn.CrossEntropyLoss

In [88]:
import torch
import torch.nn as nn

batch_size = 2
seq_len = 3
vocab_size = 5

# Simulating model predictions with softmax applied on the last dimension
model_prediction = torch.rand(batch_size, seq_len, vocab_size).softmax(dim=-1) # (batch_size, seq_len, vocab_size)

# Correcting the labels to be within the valid range [0, vocab_size-1]
# Assuming vocab_size = 5, valid indices are 0, 1, 2, 3, and 4
label = torch.tensor([2, 3, 0, 4, 1, 0]).view(batch_size, seq_len) # (batch_size, seq_len)

# Define the loss function with ignore_index=0
loss_fn = nn.CrossEntropyLoss(ignore_index=0)

# Compute the loss
loss = loss_fn(model_prediction.view(-1, vocab_size), label.view(-1))

print(loss.item())


1.5849608182907104


In [89]:
# ignore_index=0 manully, slicing to ignore the 0
nn.CrossEntropyLoss()(model_prediction[:,:2,:].reshape(-1, vocab_size), label[:, :2].reshape(-1)).item()

1.5849608182907104

### torch.empty(1,1).fill_(sos_idx).type_as(source)

### torch.max(tensor, dim=1)

In [16]:
import torch
a = torch.rand(2,3,4)[:,-1]
print(a)
print(torch.max(a,dim=1))

tensor([[0.4300, 0.5427, 0.1185, 0.6023],
        [0.4877, 0.6191, 0.5670, 0.9608]])
torch.return_types.max(
values=tensor([0.6023, 0.9608]),
indices=tensor([3, 3]))
