In [1]:
import torch

x = torch.tensor([[1.0, 2.0], [3.0, 4.0]])
print(x)
print(x.shape)
print(x.device)  # En qué dispositivo está (CPU o GPU)

tensor([[1., 2.],
        [3., 4.]])
torch.Size([2, 2])
cpu


In [None]:
a = torch.tensor([2.0, 3.0], requires_grad=True)
b = a * 2
c = b.sum()
print(c)
c.backward()  # Calcula gradientes
print(a.grad)  # da [2, 2] porque d(a*2)/da = 2

tensor(10., grad_fn=<SumBackward0>)
tensor([2., 2.])


nn.Embedding

In [26]:
import torch
import torch.nn as nn
import torch.nn.functional as F

torch.manual_seed(0)

<torch._C.Generator at 0x7abff4d24390>

In [27]:
# Create a simple embedding layer

vocab_size = 10        # number of token IDs: 0..9
embedding_dim = 4      # size of the embedding vector

emb = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)

print("Embedding weight matrix shape:", emb.weight.shape)
print(emb.weight)

Embedding weight matrix shape: torch.Size([10, 4])
Parameter containing:
tensor([[-1.1258, -1.1524, -0.2506, -0.4339],
        [ 0.8487,  0.6920, -0.3160, -2.1152],
        [ 0.3223, -1.2633,  0.3500,  0.3081],
        [ 0.1198,  1.2377,  1.1168, -0.2473],
        [-1.3527, -1.6959,  0.5667,  0.7935],
        [ 0.5988, -1.5551, -0.3414,  1.8530],
        [-0.2159, -0.7425,  0.5627,  0.2596],
        [-0.1740, -0.6787,  0.9383,  0.4889],
        [ 1.2032,  0.0845, -1.2001, -0.0048],
        [-0.5181, -0.3067, -1.5810,  1.7066]], requires_grad=True)


In [28]:
# Pass a single token ID

token_id = torch.tensor(3)
vec = emb(token_id)

print("Token ID:", token_id.item())
print("Output vector shape:", vec.shape)
print(vec)

Token ID: 3
Output vector shape: torch.Size([4])
tensor([ 0.1198,  1.2377,  1.1168, -0.2473], grad_fn=<EmbeddingBackward0>)


In [30]:
# Pass a sequence of token IDs (1D tensor)

seq = torch.tensor([1, 4, 3, 9])
vecs = emb(seq)

print("Input sequence:", seq)
print("Output shape:", vecs.shape)
print(vecs)

Input sequence: tensor([1, 4, 3, 9])
Output shape: torch.Size([4, 4])
tensor([[ 0.8487,  0.6920, -0.3160, -2.1152],
        [-1.3527, -1.6959,  0.5667,  0.7935],
        [ 0.1198,  1.2377,  1.1168, -0.2473],
        [-0.5181, -0.3067, -1.5810,  1.7066]], grad_fn=<EmbeddingBackward0>)


In [31]:
# Pass a batch of sequences (2D tensor)

batch = torch.tensor([
    [1, 4, 3, 9, 0],
    [2, 2, 5, 0, 0],
])

vecs_batch = emb(batch)

print("Batch of token IDs:\n", batch)
print("Output shape:", vecs_batch.shape)  # (batch_size, seq_len, embedding_dim)
print(vecs_batch)

Batch of token IDs:
 tensor([[1, 4, 3, 9, 0],
        [2, 2, 5, 0, 0]])
Output shape: torch.Size([2, 5, 4])
tensor([[[ 0.8487,  0.6920, -0.3160, -2.1152],
         [-1.3527, -1.6959,  0.5667,  0.7935],
         [ 0.1198,  1.2377,  1.1168, -0.2473],
         [-0.5181, -0.3067, -1.5810,  1.7066],
         [-1.1258, -1.1524, -0.2506, -0.4339]],

        [[ 0.3223, -1.2633,  0.3500,  0.3081],
         [ 0.3223, -1.2633,  0.3500,  0.3081],
         [ 0.5988, -1.5551, -0.3414,  1.8530],
         [-1.1258, -1.1524, -0.2506, -0.4339],
         [-1.1258, -1.1524, -0.2506, -0.4339]]], grad_fn=<EmbeddingBackward0>)


In [32]:
# Create embedding with a padding index

pad_id = 0

emb_pad = nn.Embedding(
    num_embeddings=vocab_size,
    embedding_dim=embedding_dim,
    padding_idx=pad_id
)

print("Embedding weights:")
print(emb_pad.weight)

print("\nEmbedding for padding index (should be zeros):")
print(emb_pad.weight[pad_id])

Embedding weights:
Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.5943,  1.5419,  0.5073, -0.5910],
        [-1.3253,  0.1886, -0.0691, -0.4949],
        [-1.4959, -0.1938,  0.4455,  1.3253],
        [ 1.5091,  2.0820,  1.7067,  2.3804],
        [-1.1256, -0.3170, -1.0925, -0.0852],
        [ 1.6459, -1.3602,  0.3446,  0.5199],
        [-2.6133, -1.6965, -0.2282,  0.2800],
        [ 0.2469,  0.0769,  0.3380,  0.4544],
        [ 0.4569, -0.8654,  0.7813, -0.9268]], requires_grad=True)

Embedding for padding index (should be zeros):
tensor([0., 0., 0., 0.], grad_fn=<SelectBackward0>)


In [34]:
# Pass sequences containing padding (ID 0)

batch_with_pad = torch.tensor([
    [1, 2, 3, 0, 0],
    [4, 5, 0, 0, 0],
])

vecs_with_pad = emb_pad(batch_with_pad)

print("Batch with padding:\n", batch_with_pad)
print("Embeddings (notice the 0-vectors for padding positions):")
print(vecs_with_pad)

print("\nEmbedding values for padding positions (all zeros):")
print(vecs_with_pad[batch_with_pad == pad_id])

Batch with padding:
 tensor([[1, 2, 3, 0, 0],
        [4, 5, 0, 0, 0]])
Embeddings (notice the 0-vectors for padding positions):
tensor([[[ 0.5943,  1.5419,  0.5073, -0.5910],
         [-1.3253,  0.1886, -0.0691, -0.4949],
         [-1.4959, -0.1938,  0.4455,  1.3253],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]],

        [[ 1.5091,  2.0820,  1.7067,  2.3804],
         [-1.1256, -0.3170, -1.0925, -0.0852],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000],
         [ 0.0000,  0.0000,  0.0000,  0.0000]]], grad_fn=<EmbeddingBackward0>)

Embedding values for padding positions (all zeros):
tensor([[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]], grad_fn=<IndexBackward0>)


In [40]:
# Cell: Visualizing how dropout works on a single vector

import torch
import torch.nn as nn

torch.manual_seed(0)   # for reproducibility

# Create a dropout layer with p = 0.5
drop = nn.Dropout(p=0.5)

# A simple vector input
x = torch.tensor([[3.0, 5.0, 2.0, 7.0, 4.0, 1.0]])

# Apply dropout
y = drop(x)

# Manually build the dropout mask (for understanding)
# PyTorch internally draws random Bernoulli values (0 or 1)
mask = (y != 0).float() * (1 - drop.p)

# The inverted-dropout scaling factor (1 / (1 - p))
scale = 1.0 / (1.0 - drop.p)

# Print everything
print("Input x:")
print(x)

print("\nDropout output y:")
print(y)

print("\nDropout mask (1 = kept unit, 0 = dropped):")
print((y != 0).float())

print("\nApplied scale factor:", scale)

print("\nReconstructed output (mask * x * scale):")
print(mask * x * scale)

Input x:
tensor([[3., 5., 2., 7., 4., 1.]])

Dropout output y:
tensor([[0., 0., 4., 0., 0., 0.]])

Dropout mask (1 = kept unit, 0 = dropped):
tensor([[0., 0., 1., 0., 0., 0.]])

Applied scale factor: 2.0

Reconstructed output (mask * x * scale):
tensor([[0., 0., 2., 0., 0., 0.]])
