In [1]:
import torch 
import torch.nn as nn
import torch.nn.functional as F

# Optic Flow Decoder

In [2]:
class OpticFlowDecoder(nn.Module):
    """Two-layer convolutional decoder that maps a feature map to a 2-channel flow field.

    Each layer is Conv2d(5x5, padding=2) -> BatchNorm2d -> Softplus -> Dropout(p=0.5).
    The second layer emits 3 channels: the first two are divided element-wise by the
    third (plus 1e-8) to produce the flow, which is then optionally resized with
    bilinear interpolation.

    Args:
        in_channels: Number of channels of the incoming feature map. Defaults to 34,
            the channel count the ResNet-based networks in this notebook reduce to.
        input_size: Target spatial size of the returned flow, as an int or an
            (H, W) tuple. ``None`` (the default) returns the flow at the spatial
            resolution of the incoming feature map.
    """

    def __init__(self, in_channels=34, input_size=None):
        super(OpticFlowDecoder, self).__init__()

        # Remember the requested output size on the instance. Previously forward()
        # read a module-level `input_size`, which only existed if an unrelated
        # notebook cell had been executed first.
        self.input_size = input_size

        # First layer: `in_channels` input channels, 8 output channels
        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=8, kernel_size=5, padding=2)
        self.batchnorm1 = nn.BatchNorm2d(8)

        # Second layer: 8 input channels, 3 output channels (2 flow + 1 normalization)
        self.conv2 = nn.Conv2d(in_channels=8, out_channels=3, kernel_size=5, padding=2)
        self.batchnorm2 = nn.BatchNorm2d(3)

        # Activation function: Softplus (strictly positive output)
        self.softplus = nn.Softplus()

        # Dropout layer for regularization
        self.dropout = nn.Dropout(p=0.5)

        # Initialize conv weights and biases homogeneously at 0.001
        self._initialize_weights()

    def _initialize_weights(self):
        """Set every conv weight/bias to 0.001 and every BatchNorm to scale 1, shift 0."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv2d):
                nn.init.constant_(layer.weight, 0.001)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0.001)
            elif isinstance(layer, nn.BatchNorm2d):
                nn.init.constant_(layer.weight, 1)
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Decode a feature map of shape (N, in_channels, H, W) into a flow of shape (N, 2, H', W').

        (H', W') is ``input_size`` when one was given, otherwise (H, W).
        """
        # First conv layer
        x = self.batchnorm1(self.conv1(x))
        x = self.softplus(x)
        x = self.dropout(x)

        # Second conv layer
        x = self.batchnorm2(self.conv2(x))
        x = self.softplus(x)
        x = self.dropout(x)

        # Split the output into the 2D flow (2 channels) and normalization (1 channel)
        flow = x[:, :2, :, :]  # First 2 channels are the flow (x and y)
        normalization = x[:, 2:3, :, :]  # Third channel is the normalization

        # NOTE(review): in training mode the dropout above can zero entries of the
        # normalization channel, leaving only the 1e-8 epsilon as divisor and
        # producing very large flow values. Confirm that dropout on the output
        # layer is intended.
        flow = flow / (normalization + 1e-8)

        # Resize flow to the requested output size, if any
        if self.input_size is not None:
            flow = F.interpolate(flow, size=self.input_size, mode='bilinear', align_corners=False)

        return flow


# 2 layers

In [8]:
class Net_2_layers(nn.Module):
    """Small hand-written residual network followed by the optic-flow decoder.

    Architecture: a 3x3 stem conv (6 -> 64 channels), one residual block at 64
    channels, one stride-2 residual block (64 -> 128 channels) whose skip path is a
    1x1 stride-2 convolution, then ``OpticFlowDecoder(in_channels=128)``.

    The stem expects 6 input channels; when ``forward`` is given a tuple/list of
    tensors they are concatenated along the channel axis first (e.g. two
    3-channel frames, as in the example cell below this class).

    Args:
        input_size: Spatial size handed to ``OpticFlowDecoder`` as the size the
            returned flow should be resized to.
    """

    def __init__(self, input_size):
        super(Net_2_layers, self).__init__()

        # First convolutional layer
        self.conv1 = nn.Conv2d(in_channels=6, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU(inplace=True)

        # First residual block (keeps 64 channels and spatial size)
        self.conv2_1 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, bias=False)
        self.bn2_1 = nn.BatchNorm2d(64)
        self.conv2_2 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, padding=1, bias=False)
        self.bn2_2 = nn.BatchNorm2d(64)

        # Second residual block (64 -> 128 channels, stride 2 halves the spatial size)
        self.conv3_1 = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2, padding=1, bias=False)
        self.bn3_1 = nn.BatchNorm2d(128)
        self.conv3_2 = nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn3_2 = nn.BatchNorm2d(128)

        # 1x1 stride-2 conv so the skip connection matches the second block's output shape
        self.adjust_channels = nn.Conv2d(in_channels=64, out_channels=128, kernel_size=1, stride=2, padding=0, bias=False)

        # OpticFlowDecoder
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=128, input_size=input_size)

    def forward(self, x):
        """Compute the flow for ``x``.

        Args:
            x: Either a single tensor with 6 channels, or a tuple/list of tensors
                that are concatenated along dim=1 (channels) to give 6 channels.

        Returns:
            The tensor produced by ``OpticFlowDecoder`` (2 channels).
        """
        # If the input is a tuple/list of frames, stack them along the channel axis.
        # (The previous comment said "batch dimension", but dim=1 is channels.)
        if isinstance(x, (tuple, list)):
            x = torch.cat(x, dim=1)
        print(f"After concatenation: {x.shape}")

        # First conv layer
        x = self.relu(self.bn1(self.conv1(x)))
        print(f"After conv1: {x.shape}")

        # First residual block
        residual = x  # Store residual
        x = self.relu(self.bn2_1(self.conv2_1(x)))
        x = self.bn2_2(self.conv2_2(x))
        x += residual  # Add skip connection
        x = self.relu(x)
        print(f"After first residual block: {x.shape}")

        # Second residual block
        residual = x  # Store residual again
        x = self.relu(self.bn3_1(self.conv3_1(x)))
        x = self.bn3_2(self.conv3_2(x))

        # Project the residual to 128 channels at half resolution so it can be added
        residual = self.adjust_channels(residual)
        x += residual  # Add skip connection
        x = self.relu(x)
        print(f"After second residual block: {x.shape}")

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        return flow

# Smoke test: push a pair of random 3-channel frames through Net_2_layers and report the output shape
input_size = 16
model = Net_2_layers(input_size)
frame_shape = (1, 3, input_size, input_size)
input_tensors = (torch.randn(frame_shape), torch.randn(frame_shape))
output = model(input_tensors)
print(f"Final output shape: {output.shape}")


After concatenation: torch.Size([1, 6, 16, 16])
After conv1: torch.Size([1, 64, 16, 16])
After first residual block: torch.Size([1, 64, 16, 16])
After second residual block: torch.Size([1, 128, 8, 8])
Final output shape: torch.Size([1, 2, 16, 16])


# 16 x 16 pixels, 3 layers

In [31]:
class Net_3_16(nn.Module):
    """ResNet-18 front end (stride-1 stem + layer1..layer3) feeding the optic-flow decoder.

    The stem is a fresh 7x7 stride-1 convolution (instead of ResNet's stride-2 stem);
    ``bn1``, ``relu`` and ``layer1``..``layer3`` are taken from
    ``models.resnet18(weights=None)``. A 1x1 convolution reduces the 256 channels
    coming out of ``layer3`` to the 34 channels the decoder consumes. In the recorded
    run below this class, a (1, 3, 16, 16) input is 4x4 after ``layer3``.

    Args:
        output_size: (H, W) the returned flow is resized to. Defaults to (16, 16),
            the size this network was written for.
    """

    def __init__(self, output_size=(16, 16)):
        super(Net_3_16, self).__init__()
        # NOTE(review): `models` is not imported in the visible import cell;
        # presumably `from torchvision import models` -- confirm and add it there,
        # otherwise this line fails on a fresh kernel.
        resnet = models.resnet18(weights=None)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False)  # Modified stride=1
        self.bn1 = resnet.bn1
        self.relu = resnet.relu

        self.layer1 = resnet.layer1  # First residual block
        self.layer2 = resnet.layer2  # Second residual block
        self.layer3 = resnet.layer3  # Third residual block

        # Reduce channels to 34 before passing to OpticFlowDecoder
        self.conv_reduce = nn.Conv2d(in_channels=256, out_channels=34, kernel_size=1, stride=1, padding=0)

        self.output_size = output_size

        # OpticFlowDecoder: pass its arguments explicitly; the decoder signature
        # visible in this notebook is (in_channels, input_size), so a bare
        # OpticFlowDecoder() call raises TypeError.
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=34, input_size=output_size)

    def forward(self, x):
        """Return the flow for a 3-channel image batch ``x``, resized to ``output_size``."""
        x = self.conv1(x)
        print(f"After conv1: {x.shape}")
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        x = self.layer2(x)
        print(f"After layer2: {x.shape}")
        x = self.layer3(x)
        print(f"After layer3: {x.shape}")

        # Reduce channels to match OpticFlowDecoder input
        x = self.conv_reduce(x)

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        # Final resize guarantees the output size regardless of what the decoder returned
        flow = F.interpolate(flow, size=self.output_size, mode='bilinear')

        return flow

# Smoke test: run one random 3-channel 16x16 frame through Net_3_16 and report the output shape
model = Net_3_16()
frame_shape = (1, 3, 16, 16)
input_tensor = torch.randn(frame_shape)
output = model(input_tensor)
print(f"Final output shape: {output.shape}")


After conv1: torch.Size([1, 64, 16, 16])
After layer1: torch.Size([1, 64, 16, 16])
After layer2: torch.Size([1, 128, 8, 8])
After layer3: torch.Size([1, 256, 4, 4])
Final output shape: torch.Size([1, 2, 16, 16])


# 32 x 32 pixels, 2 layers

In [32]:
class Net_2_32(nn.Module):
    """ResNet-18 front end (stride-1 stem + layer1..layer2) feeding the optic-flow decoder.

    The stem is a fresh 7x7 stride-1 convolution (instead of ResNet's stride-2 stem);
    ``bn1``, ``relu``, ``layer1`` and ``layer2`` are taken from
    ``models.resnet18(weights=None)``. A 1x1 convolution reduces the 128 channels
    coming out of ``layer2`` to the 34 channels the decoder consumes. In the recorded
    run below this class, a (1, 3, 32, 32) input is 16x16 after ``layer2``.

    Args:
        output_size: (H, W) the returned flow is resized to. Defaults to (32, 32),
            the size this network was written for.
    """

    def __init__(self, output_size=(32, 32)):
        super(Net_2_32, self).__init__()
        # NOTE(review): `models` is not imported in the visible import cell;
        # presumably `from torchvision import models` -- confirm and add it there,
        # otherwise this line fails on a fresh kernel.
        resnet = models.resnet18(weights=None)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False)  # Modified stride=1
        self.bn1 = resnet.bn1
        self.relu = resnet.relu

        self.layer1 = resnet.layer1  # First residual block
        self.layer2 = resnet.layer2  # Second residual block

        # Reduce channels to 34 before passing to OpticFlowDecoder
        self.conv_reduce = nn.Conv2d(in_channels=128, out_channels=34, kernel_size=1, stride=1, padding=0)

        self.output_size = output_size

        # OpticFlowDecoder: pass its arguments explicitly; the decoder signature
        # visible in this notebook is (in_channels, input_size), so a bare
        # OpticFlowDecoder() call raises TypeError.
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=34, input_size=output_size)

    def forward(self, x):
        """Return the flow for a 3-channel image batch ``x``, resized to ``output_size``."""
        x = self.conv1(x)
        print(f"After conv1: {x.shape}")
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        x = self.layer2(x)
        print(f"After layer2: {x.shape}")

        # Reduce channels to match OpticFlowDecoder input
        x = self.conv_reduce(x)

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        # Final resize guarantees the output size regardless of what the decoder returned
        flow = F.interpolate(flow, size=self.output_size, mode='bilinear')

        return flow

# Smoke test: run one random 3-channel 32x32 frame through Net_2_32 and report the output shape
model = Net_2_32()
frame_shape = (1, 3, 32, 32)
input_tensor = torch.randn(frame_shape)
output = model(input_tensor)
print(f"Final output shape: {output.shape}")


After conv1: torch.Size([1, 64, 32, 32])
After layer1: torch.Size([1, 64, 32, 32])
After layer2: torch.Size([1, 128, 16, 16])
Final output shape: torch.Size([1, 2, 32, 32])


# 32 x 32 pixels, 3 layers

In [33]:
class Net_3_32(nn.Module):
    """ResNet-18 front end (stride-1 stem + layer1..layer3) feeding the optic-flow decoder.

    The stem is a fresh 7x7 stride-1 convolution (instead of ResNet's stride-2 stem);
    ``bn1``, ``relu`` and ``layer1``..``layer3`` are taken from
    ``models.resnet18(weights=None)``. A 1x1 convolution reduces the 256 channels
    coming out of ``layer3`` to the 34 channels the decoder consumes. In the recorded
    run below this class, a (1, 3, 32, 32) input is 8x8 after ``layer3``.

    Args:
        output_size: (H, W) the returned flow is resized to. Defaults to (32, 32),
            the size this network was written for.
    """

    def __init__(self, output_size=(32, 32)):
        super(Net_3_32, self).__init__()
        # NOTE(review): `models` is not imported in the visible import cell;
        # presumably `from torchvision import models` -- confirm and add it there,
        # otherwise this line fails on a fresh kernel.
        resnet = models.resnet18(weights=None)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False)  # Modified stride=1
        self.bn1 = resnet.bn1
        self.relu = resnet.relu

        self.layer1 = resnet.layer1  # First residual block
        self.layer2 = resnet.layer2  # Second residual block
        self.layer3 = resnet.layer3  # Third residual block

        # Reduce channels to 34 before passing to OpticFlowDecoder
        self.conv_reduce = nn.Conv2d(in_channels=256, out_channels=34, kernel_size=1, stride=1, padding=0)

        self.output_size = output_size

        # OpticFlowDecoder: pass its arguments explicitly; the decoder signature
        # visible in this notebook is (in_channels, input_size), so a bare
        # OpticFlowDecoder() call raises TypeError.
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=34, input_size=output_size)

    def forward(self, x):
        """Return the flow for a 3-channel image batch ``x``, resized to ``output_size``."""
        x = self.conv1(x)
        print(f"After conv1: {x.shape}")
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        x = self.layer2(x)
        print(f"After layer2: {x.shape}")
        x = self.layer3(x)
        print(f"After layer3: {x.shape}")

        # Reduce channels to match OpticFlowDecoder input
        x = self.conv_reduce(x)

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        # Final resize guarantees the output size regardless of what the decoder returned
        flow = F.interpolate(flow, size=self.output_size, mode='bilinear')

        return flow

# Smoke test: run one random 3-channel 32x32 frame through Net_3_32 and report the output shape
model = Net_3_32()
frame_shape = (1, 3, 32, 32)
input_tensor = torch.randn(frame_shape)
output = model(input_tensor)
print(f"Final output shape: {output.shape}")


After conv1: torch.Size([1, 64, 32, 32])
After layer1: torch.Size([1, 64, 32, 32])
After layer2: torch.Size([1, 128, 16, 16])
After layer3: torch.Size([1, 256, 8, 8])
Final output shape: torch.Size([1, 2, 32, 32])


# 64 x 64 pixels, 2 layers

In [36]:
class Net_2_64(nn.Module):
    """ResNet-18 front end (stride-1 stem + layer1..layer2) feeding the optic-flow decoder.

    The stem is a fresh 7x7 stride-1 convolution (instead of ResNet's stride-2 stem);
    ``bn1``, ``relu``, ``layer1`` and ``layer2`` are taken from
    ``models.resnet18(weights=None)``. A 1x1 convolution reduces the 128 channels
    coming out of ``layer2`` to the 34 channels the decoder consumes. In the recorded
    run below this class, a (1, 3, 64, 64) input is 32x32 after ``layer2``.

    Args:
        output_size: (H, W) the returned flow is resized to. Defaults to (64, 64),
            the size this network was written for.
    """

    def __init__(self, output_size=(64, 64)):
        super(Net_2_64, self).__init__()
        # NOTE(review): `models` is not imported in the visible import cell;
        # presumably `from torchvision import models` -- confirm and add it there,
        # otherwise this line fails on a fresh kernel.
        resnet = models.resnet18(weights=None)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False)  # Modified stride=1
        self.bn1 = resnet.bn1
        self.relu = resnet.relu

        self.layer1 = resnet.layer1  # First residual block
        self.layer2 = resnet.layer2  # Second residual block

        # Reduce channels to 34 before passing to OpticFlowDecoder
        self.conv_reduce = nn.Conv2d(in_channels=128, out_channels=34, kernel_size=1, stride=1, padding=0)

        self.output_size = output_size

        # OpticFlowDecoder: pass its arguments explicitly; the decoder signature
        # visible in this notebook is (in_channels, input_size), so a bare
        # OpticFlowDecoder() call raises TypeError.
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=34, input_size=output_size)

    def forward(self, x):
        """Return the flow for a 3-channel image batch ``x``, resized to ``output_size``."""
        x = self.conv1(x)
        print(f"After conv1: {x.shape}")
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        x = self.layer2(x)
        print(f"After layer2: {x.shape}")

        # Reduce channels to match OpticFlowDecoder input
        x = self.conv_reduce(x)

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        # Final resize guarantees the output size regardless of what the decoder returned
        flow = F.interpolate(flow, size=self.output_size, mode='bilinear')

        return flow

# Smoke test: run one random 3-channel 64x64 frame through Net_2_64 and report the output shape
model = Net_2_64()
frame_shape = (1, 3, 64, 64)
input_tensor = torch.randn(frame_shape)
output = model(input_tensor)
print(f"Final output shape: {output.shape}")


After conv1: torch.Size([1, 64, 64, 64])
After layer1: torch.Size([1, 64, 64, 64])
After layer2: torch.Size([1, 128, 32, 32])
Final output shape: torch.Size([1, 2, 64, 64])


# 64 x 64 pixels, 3 layers

In [38]:
class Net_3_64(nn.Module):
    """ResNet-18 front end (stride-1 stem + layer1..layer3) feeding the optic-flow decoder.

    The stem is a fresh 7x7 stride-1 convolution (instead of ResNet's stride-2 stem);
    ``bn1``, ``relu`` and ``layer1``..``layer3`` are taken from
    ``models.resnet18(weights=None)``. A 1x1 convolution reduces the 256 channels
    coming out of ``layer3`` to the 34 channels the decoder consumes. In the recorded
    run below this class, a (1, 3, 64, 64) input is 16x16 after ``layer3``.

    Args:
        output_size: (H, W) the returned flow is resized to. Defaults to (64, 64),
            the size this network was written for.
    """

    def __init__(self, output_size=(64, 64)):
        super(Net_3_64, self).__init__()
        # NOTE(review): `models` is not imported in the visible import cell;
        # presumably `from torchvision import models` -- confirm and add it there,
        # otherwise this line fails on a fresh kernel.
        resnet = models.resnet18(weights=None)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=1, padding=3, bias=False)  # Modified stride=1
        self.bn1 = resnet.bn1
        self.relu = resnet.relu

        self.layer1 = resnet.layer1  # First residual block
        self.layer2 = resnet.layer2  # Second residual block
        self.layer3 = resnet.layer3  # Third residual block

        # Reduce channels to 34 before passing to OpticFlowDecoder
        self.conv_reduce = nn.Conv2d(in_channels=256, out_channels=34, kernel_size=1, stride=1, padding=0)

        self.output_size = output_size

        # OpticFlowDecoder: pass its arguments explicitly; the decoder signature
        # visible in this notebook is (in_channels, input_size), so a bare
        # OpticFlowDecoder() call raises TypeError.
        self.optic_flow_decoder = OpticFlowDecoder(in_channels=34, input_size=output_size)

    def forward(self, x):
        """Return the flow for a 3-channel image batch ``x``, resized to ``output_size``."""
        x = self.conv1(x)
        print(f"After conv1: {x.shape}")
        x = self.bn1(x)
        x = self.relu(x)

        x = self.layer1(x)
        print(f"After layer1: {x.shape}")
        x = self.layer2(x)
        print(f"After layer2: {x.shape}")
        x = self.layer3(x)
        print(f"After layer3: {x.shape}")

        # Reduce channels to match OpticFlowDecoder input
        x = self.conv_reduce(x)

        # Pass through OpticFlowDecoder
        flow = self.optic_flow_decoder(x)

        # Final resize guarantees the output size regardless of what the decoder returned
        flow = F.interpolate(flow, size=self.output_size, mode='bilinear')

        return flow

# Smoke test: run one random 3-channel 64x64 frame through Net_3_64 and report the output shape
model = Net_3_64()
frame_shape = (1, 3, 64, 64)
input_tensor = torch.randn(frame_shape)
output = model(input_tensor)
print(f"Final output shape: {output.shape}")


After conv1: torch.Size([1, 64, 64, 64])
After layer1: torch.Size([1, 64, 64, 64])
After layer2: torch.Size([1, 128, 32, 32])
After layer3: torch.Size([1, 256, 16, 16])
Final output shape: torch.Size([1, 2, 64, 64])


# Questions:
- Is this the right number of layers? (I added a first input layer, and then the first 2–3 layers from ResNet18.) Or should the first layer just be layer1 from ResNet?
- The ResNet layers have downsampling in them; did you mean no downsampling in the input layer only, or no downsampling in general?
- The ResNet input layer uses stride=2, padding=3, kernel=7. Since this is for lower-resolution images, should we change to kernel=3 (padding=1)? (I already changed to stride=1.)
- For the Optic Flow Decoder, I used what was described in the fly paper. It takes 34 features. Why?
- Is the fly decoder what I was supposed to use, or is there another one I should try?
- The NN is taking one frame at a time; when are the two frames passed to the optic flow decoder?
- I am reducing the channels after the ResNet layers to fit D=34. Is that OK, or should I change the OFDecoder to take in what the CNN outputs (64 or 128)?
- I am not outputting .flo files. How do I compare the ground truth to my output? Should I also be outputting .flo files? https://stackoverflow.com/questions/28013200/reading-middlebury-flow-files-with-python-bytes-array-numpy

