In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install calflops

In [2]:
from calflops import calculate_flops
from torchvision import models

model = models.alexnet()
batch_size = 1
input_shape = (batch_size, 3, 224, 224)
flops, macs, params = calculate_flops(model=model, 
                                      input_shape=input_shape,
                                      output_as_string=True,
                                      output_precision=4)
print("Alexnet FLOPs:%s   MACs:%s   Params:%s \n" %(flops, macs, params))


------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  61.1 M  
fwd MACs:                                                               714.188 MMACs
fwd FLOPs:                                                              1.4297 GFLOPS
fwd+bwd MACs:                                                           2.1426 GMACs
fwd+bwd FLOPs:                                                          4.2892 GFLOPS

-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Each modul

In [6]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np

class MyModel(nn.Module):
    def __init__(self, video_length, num_features, num_classes):
        super(MyModel, self).__init__()
        # GRU layer
        self.gru = nn.GRU(input_size=num_features, hidden_size=16, batch_first=True)
        # Regularization via weight decay in the optimizer
        # Apply LeakyReLU separately due to PyTorch's design
        self.leaky_relu = nn.LeakyReLU(0.2)
        # Dense layers
        self.dense1 = nn.Linear(16, 256)
        self.dense2 = nn.Linear(256, num_classes)

    def forward(self, x):
        # Forward pass through GRU
        out, _ = self.gru(x)
        # Since return_sequences=False, only take the output of the last time step
        out = out[:, -1, :]
        out = self.leaky_relu(out)
        out = self.dense1(out)
        out = F.gelu(out)
        out = self.dense2(out)
        out = F.softmax(out, dim=-1)
        return out

# Constants
VIDEO_LENGTH = 60  # Assuming this is defined somewhere in your config
NUM_FEATURES = 1629  # Number of input features
NUM_CLASSES = 5  # Number of output classes

# Instantiate the model
model = MyModel(video_length=VIDEO_LENGTH, num_features=NUM_FEATURES, num_classes=NUM_CLASSES)


batch = np.random.randn(1, VIDEO_LENGTH, NUM_FEATURES)
model(torch.Tensor(batch))

flops, macs, params = calculate_flops(model=model, 
                                      input_shape=batch.shape,
                                      output_as_string=True,
                                      output_precision=4)
print("GRU FLOPs:%s   MACs:%s   Params:%s \n" %(flops, macs, params))


------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  84.69 K 
fwd MACs:                                                               5.376 KMACs
fwd FLOPs:                                                              9.4929 MFLOPS
fwd+bwd MACs:                                                           16.128 KMACs
fwd+bwd FLOPs:                                                          28.4788 MFLOPS

-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Each module

In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = nn.MultiheadAttention(embed_dim=embed_dim, num_heads=num_heads)
        self.ffn = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.layernorm1 = nn.LayerNorm(embed_dim)
        self.layernorm2 = nn.LayerNorm(embed_dim)
        self.dropout1 = nn.Dropout(rate)
        self.dropout2 = nn.Dropout(rate)

    def forward(self, x):
        # Attention part
        attn_output, _ = self.att(x, x, x)
        attn_output = self.dropout1(attn_output)
        out1 = self.layernorm1(x + attn_output)
        # Feed forward part
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output)
        out2 = self.layernorm2(out1 + ffn_output)
        return out2

class TokenAndPositionEmbedding(nn.Module):
    def __init__(self, maxlen, embed_dim):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = nn.Embedding(maxlen, embed_dim)
        self.pos_emb = nn.Embedding(maxlen, embed_dim)

    def forward(self, x):
        #positions = torch.arange(0, x.size(1)).unsqueeze(0).repeat(x.size(0), 1).to(x.device)
        #print(positions.shape)
        #pos = self.pos_emb(positions)
        #print(x.shape, pos.shape)
        #x = x + pos
        return x

class TransformerModel(nn.Module):
    def __init__(self, input_shape, num_heads, ff_dim, num_classes):
        super(TransformerModel, self).__init__()
        self.embedding_layer = TokenAndPositionEmbedding(input_shape[0], input_shape[1])
        self.transformer_block = TransformerBlock(input_shape[1], num_heads, ff_dim)
        self.pooling = nn.AdaptiveAvgPool1d(1)
        self.dropout = nn.Dropout(0.1)
        self.dense1 = nn.Linear(input_shape[1], 128)
        self.outputs = nn.Linear(128, num_classes)

    def forward(self, x):
        x = self.embedding_layer(x)
        x = self.transformer_block(x)
        x = x.permute(0, 2, 1)
        x = self.pooling(x).squeeze(-1)
        x = self.dropout(x)
        x = F.relu(self.dense1(x))
        x = self.dropout(x)
        x = F.softmax(self.outputs(x), dim=-1)
        return x

# Assuming the shape of your input and number of unique labels are defined
batch = np.random.randn(1, VIDEO_LENGTH, NUM_FEATURES)
num_classes = NUM_CLASSES  # Number of output classes
model = TransformerModel(batch.shape, 2, 64, num_classes)

model(torch.Tensor(batch))

AssertionError: was expecting embedding dimension of 60, but got 1629

In [38]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super(PositionalEncoding, self).__init__()
        # Create a long enough 'position encoding' matrix
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        # Register buffer: Not a parameter, but should be part of the state
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

class TransformerModel(nn.Module):
    def __init__(self, input_dim, d_model, nhead, num_encoder_layers, dim_feedforward, num_classes, dropout=0.1):
        super(TransformerModel, self).__init__()
        self.pos_encoder = PositionalEncoding(d_model)
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=nhead,
                                                        dim_feedforward=dim_feedforward, dropout=dropout)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=num_encoder_layers)
        self.input_proj = nn.Linear(input_dim, d_model)  # Project input feature dimension to model dimension
        print(d_model, num_classes)
        self.output_proj = nn.Linear(d_model, num_classes)  # Final classification layer
        self.dropout = nn.Dropout(dropout)

    def forward(self, src):
        # Project the src from input_dim to d_model
        src = self.input_proj(src)
        # Add positional encoding
        src = self.pos_encoder(src)
        # Transformer Encoder
        output = self.transformer_encoder(src)
        # Average Pooling across the sequence dimension (frames)
        output = output.mean(dim=1)
        output = self.dropout(output)
        # Classification head
        output = self.output_proj(output)
        return output

# Parameters
input_dim = 1639  # Dimension of each input vector (already embedded)
d_model = 64  # Dimensionality of the model (should be a multiple of nhead)
nhead = 2  # Number of attention heads
num_encoder_layers = 6  # Number of transformer layers
dim_feedforward = 128  # Dimension of feedforward network
num_classes = 5  # Number of classes for classification
dropout = 0.1  #

# Initialize model
model = TransformerModel(input_dim, d_model, nhead, num_encoder_layers, dim_feedforward, num_classes, dropout)

# Example input: batch_size x frames x features
batch_size = 1
frames = 60
features = 1639  # Features per frame

# Create a random sample
src = torch.randn(batch_size, frames, features)


flops, macs, params = calculate_flops(model=model, 
                                      input_shape=tuple(src.shape),
                                      output_as_string=True,
                                      output_precision=4)
print("Alexnet FLOPs:%s   MACs:%s   Params:%s \n" %(flops, macs, params))

64 5

------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  339.59 K
fwd MACs:                                                               54.2717 MMACs
fwd FLOPs:                                                              109.235 MFLOPS
fwd+bwd MACs:                                                           162.815 MMACs
fwd+bwd FLOPs:                                                          327.704 MFLOPS

-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Ea

In [40]:
import torch
import torchvision.models as models
import torch.nn as nn

# Load the pre-trained EfficientNet B0 model
model = models.efficientnet_b0(pretrained=True)

# Number of features in the last layer (bottleneck layer output size)
num_ftrs = model.classifier[1].in_features

# Replace the classifier head with a new one adjusted to 5 classes
model.classifier[1] = nn.Linear(num_ftrs, 5)

# Example of how to forward a sample through the model (assuming you have a transformed input)
dummy_input = torch.randn(1, 3, 224, 224)  # Adjust size as necessary for your application

flops, macs, params = calculate_flops(model=model, 
                                      input_shape=tuple(dummy_input.shape),
                                      output_as_string=True,
                                      output_precision=4)
print("EfficientNetB0 FLOPs:%s   MACs:%s   Params:%s \n" %(flops, macs, params))


------------------------------------- Calculate Flops Results -------------------------------------
Notations:
number of parameters (Params), number of multiply-accumulate operations(MACs),
number of floating-point operations (FLOPs), floating-point operations per second (FLOPS),
fwd FLOPs (model forward propagation FLOPs), bwd FLOPs (model backward propagation FLOPs),
default model backpropagation takes 2.00 times as much computation as forward propagation.

Total Training Params:                                                  4.01 M  
fwd MACs:                                                               1.5382 GMACs
fwd FLOPs:                                                              3.1644 GFLOPS
fwd+bwd MACs:                                                           4.6145 GMACs
fwd+bwd FLOPs:                                                          9.4931 GFLOPS

-------------------------------- Detailed Calculated FLOPs Results --------------------------------
Each module