In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [6]:
class CNNTransformerModel(nn.Module):
    def __init__(self, input_channels=3, image_height=60, image_width=200, transformer_hidden_size=128):
        super(CNNTransformerModel, self).__init__()

        # CNN layers to process the input images
        self.cnn = nn.Sequential(
            nn.Conv2d(input_channels, 16, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(16, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Conv2d(32, 64, kernel_size=3, stride=1, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            nn.Flatten()
        )

        # Transformer layers to process sequential data (in this case, the output of CNN)
        self.transformer = nn.TransformerEncoderLayer(d_model=image_height * image_width, nhead=8)
        self.transformer_encoder = nn.TransformerEncoder(self.transformer, num_layers=2)

        # Fully connected layers to predict 2 values
        self.fc1 = nn.Linear(image_height * image_width, transformer_hidden_size)
        self.fc2 = nn.Linear(transformer_hidden_size, 2)

    def forward(self, x):
        # Assuming x is of shape (batch_size, 3, 60, 200)

        # Process the images using the CNN
        cnn_output = self.cnn(x)

        # Reshape the CNN output to be compatible with the Transformer input
        cnn_output = cnn_output.permute(1, 0, 2)

        # Process the CNN output through the Transformer
        transformer_output = self.transformer_encoder(cnn_output)

        # Reshape the Transformer output to be compatible with the fully connected layers
        transformer_output = transformer_output.permute(1, 0, 2).reshape(x.size(0), -1)

        # Predict 2 values using the fully connected layers
        output = F.relu(self.fc1(transformer_output))
        output = self.fc2(output)

        return output

In [7]:
model = CNNTransformerModel()
model

CNNTransformerModel(
  (cnn): Sequential(
    (0): Conv2d(3, 16, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): ReLU()
    (2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU()
    (5): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU()
    (8): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (9): Flatten(start_dim=1, end_dim=-1)
  )
  (transformer): TransformerEncoderLayer(
    (self_attn): MultiheadAttention(
      (out_proj): NonDynamicallyQuantizableLinear(in_features=12000, out_features=12000, bias=True)
    )
    (linear1): Linear(in_features=12000, out_features=2048, bias=True)
    (dropout): Dropout(p=0.1, inplace=False)
    (linear2): Linear(in_features=2048, out_features=12000, bias=True)
    (norm1): Lay

In [8]:
x = torch.rand((1,3,60,80))
print(x.shape)
model(x)

torch.Size([1, 3, 60, 80])


RuntimeError: permute(sparse_coo): number of dimensions in the tensor input does not match the length of the desired ordering of dimensions i.e. input.dim() = 2 is not equal to len(dims) = 3

# All you need is Attention Tutorial

In [1]:
import torch
import torch.nn as nn 

In [3]:
class SelfAttention(nn.Module):
    def __init__(self, embed_size, heads) -> None:
        super(SelfAttention).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size//heads

        assert (self.head_dim * heads == embed_size), "Embed size needs to be div by heads"

        self.values = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.keys = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.queries = nn.Linear(self.head_dim, self.head_dim, bias=False)
        self.fc_out = nn.Linear(heads * self.head_dim, embed_size)
    
    def forward(self, values, keys, query, mask):
        N = query.shape[0]
        value_len, key_len, query_len = values.shape[1], keys.shape[1], query.shape[1]

        # split embedding into selfheads pieces
        values = values.reshape(N, value_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        queries = query.reshape(N, key_len, self.heads, self.head_dim)

        energy = torch.einsum("nqhd,nkhd -> nhqk", [queries, keys])
        # queries shape (N, query_len, heads, heads_dim)
        # keys : (N, key_len, heads, heads_dim)
        # energy : (N, heads, query_len, key_len)
        if mask is not None:
            energy = energy.masked_fill(mask==0,float("-1e20"))
        
        attention = torch.softmax(energy/(self.embed_size ** (1/2)), dim=3)
        out = torch.einsum("nhql,nlhd->nqhd", [attention, values]).reshape(
            N, query_len, self.heads*self.head_dim
        )

        out = self.fc_out(out)

In [None]:
class TransformerBlock(nn.Module):
    def __init__(self, embed_size, heads, dropout, forward_expansion) -> None:
        super(TransformerBlock, self).__init__()
        self.attention = SelfAttention(embed_size, heads)
        self.norm1 = nn.LayerNorm(embed_size)
        self.norm2 = nn.LayerNorm(embed_size)

        self.feed_forward = nn.Sequential(
            nn.Linear(embed_size, forward_expansion*embed_size)
            nn.ReLU()
            nn.Linear(forward_expansion*embed_size, embed_size)
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, value, key, query, mask):
        attention = self.attention(value, key, query, mask)
        x = self.dropout(self.norm1(attention + query))
        forward = self.feed_forward(x)
        out = self.dropout(self.norm2(forward + x))
        return out