In [1]:
import os
import shutil
import cv2
import math
import random
import numpy as np
import datetime as dt
import tensorflow 
import keras
from collections import deque
import matplotlib.pyplot as plt
plt.style.use("seaborn")

%matplotlib inline
 
from sklearn.model_selection import train_test_split
 


In [2]:
from IPython.display import HTML
from base64 import b64encode

# To Show a Video in Notebook
def Play_Video(filepath):
    html = ''
    video = open(filepath,'rb').read()
    src = 'data:video/avi;base64,' + b64encode(video).decode()
    html += '<video width=640 muted controls autoplay loop><source src="%s" type="video/avi"></video>' % src 
    return HTML(html)

In [3]:
# Classes Directories
Videos_Dir = "/kaggle/input/hockey-fight-vidoes/data/"
files_names_list = os.listdir(Videos_Dir)

# Randomly select a video file from the Classes Directory.

Random_Video = random.choice(files_names_list)

In [4]:
Play_Video(f"{Videos_Dir}/{Random_Video}")

In [5]:
# Specify the height and width to which each video frame will be resized in our dataset.
IMAGE_HEIGHT , IMAGE_WIDTH = 128,128
 
# Specify the number of frames of a video that will be fed to the model as one sequence.
SEQUENCE_LENGTH = 16
 

DATASET_DIR = "/kaggle/input/hockey-fight-vidoes/data/"
 
CLASSES_LIST = ["NonViolence","Violence"]

In [6]:
def frames_extraction(video_path):
 
    frames_list = []
    
    # Read the Video File
    video_reader = cv2.VideoCapture(video_path)
 
    # Get the total number of frames in the video.
    video_frames_count = int(video_reader.get(cv2.CAP_PROP_FRAME_COUNT))
 
    # Calculate the the interval after which frames will be added to the list.
    skip_frames_window = max(int(video_frames_count/SEQUENCE_LENGTH), 1)
 
    # Iterate through the Video Frames.
    for frame_counter in range(SEQUENCE_LENGTH):
 
        # Set the current frame position of the video.
        video_reader.set(cv2.CAP_PROP_POS_FRAMES, frame_counter * skip_frames_window)
 
        # Reading the frame from the video. 
        success, frame = video_reader.read() 
 
        if not success:
            break
 
        # Resize the Frame to fixed height and width.
        resized_frame = cv2.resize(frame, (IMAGE_HEIGHT, IMAGE_WIDTH))
        
        # Normalize the resized frame
        normalized_frame = resized_frame / 255
        
        # Append the normalized frame into the frames list
        frames_list.append(normalized_frame)
    
 
    video_reader.release()
 
    return frames_list

In [7]:
# # vf='/kaggle/input/hockey-fight-vidoes/data/fi411_xvid.avi'
# fc = frames_extraction(vf)

In [8]:
# x=frames[0]

In [9]:
# plt.imshow(x)

In [10]:
# x.shape

In [11]:
# from PIL import Image
# im = Image.fromarray(x)
# im.save("viot4.jpeg")

In [12]:
def create_dataset():
 
    features = []
    labels = []
    video_files_paths = []
    
    # Iterating through all the classes.
   
    files_list = os.listdir(DATASET_DIR)
     # Iterate through all the files present in the files list.
    c=0
    for file_name in files_list:
        
        if c%50 == 0:
            print(c)
        c = c+1
       
         #if c>450 and c<500:
#             continue
#         if c>950 and c<1000:
#             continue
        
        # Get the complete video path.
        video_file_path = os.path.join(DATASET_DIR, file_name)
 
            # Extract the frames of the video file.
        frames = frames_extraction(video_file_path)
 
            # Check if the extracted frames are equal to the SEQUENCE_LENGTH specified.
            # So ignore the vides having frames less than the SEQUENCE_LENGTH.
        if len(frames) == SEQUENCE_LENGTH:
 
                # Append the data to their repective lists.
            features.append(frames)
            if file_name[0:2] == 'fi':
                labels.append(1)
                
            elif file_name[0:2] == 'no':
                labels.append(0)
                
            video_files_paths.append(video_file_path)
    
    features = np.asarray(features)                                # 10-16-64-64-3
    #features = np.moveaxis(np.asarray(features), -1, 1)              #for 10,3,16,64,64 for 3d cnn
    features = np.moveaxis(features, -1, 2)              #for 10,16,3,64,64 for 2dcnn+lstm
    labels = np.array(labels)  

    return features, labels, video_files_paths

In [13]:
# Create the dataset.
features, labels, video_files_paths = create_dataset()

0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950


In [14]:
# Split the Data into Train ( 90% ) and Test Set ( 10% ).
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.18,
                                                                            shuffle = True, random_state = 42)

In [15]:
del features
del labels

In [16]:
print(X_train.shape,y_train.shape )
print(X_test.shape, y_test.shape)

(820, 16, 3, 128, 128) (820,)
(180, 16, 3, 128, 128) (180,)


In [17]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import DataLoader, Dataset # Gives easier dataset managment and creates mini batches
from torchvision.datasets import ImageFolder
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
from PIL import Image

device(type='cuda')

In [19]:

# ImageLoader Class

class vv(Dataset):
    def __init__(self, images, labels, transform=None):
        self.images = images
        self.labels=labels
        self.transform = transform
        
    
    def __getitem__(self, index):
        label = self.labels[index]
        image = self.images[index]
        image = image.astype('float32')
        if self.transform is not None:
            image = self.transform(image)
        

        return image, label

    def __len__(self):
        return len(self.images)
    
    


train_dataset = vv(X_train, y_train)
test_dataset = vv(X_test, y_test)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=16, shuffle=True)

a = iter(train_loader)
img,mask = a.next()
print(img.shape,mask.shape)

torch.Size([16, 16, 3, 128, 128]) torch.Size([16])


# temporal Vit

In [11]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # use gpu or cpu
device

device(type='cuda')

In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import torch
import torchvision
import torch.nn as nn # All neural network modules, nn.Linear, nn.Conv2d, BatchNorm, Loss functions
import torchvision.datasets as datasets # Has standard datasets we can import in a nice way
import torchvision.transforms as transforms # Transformations we can perform on our dataset
import torch.nn.functional as F # All functions that don't have any parameters
from torch.utils.data import DataLoader, Dataset # Gives easier dataset managment and creates mini batches
from torchvision.datasets import ImageFolder
import torch.optim as optim # For all Optimization algorithms, SGD, Adam, etc.
from PIL import Image

In [3]:
pip install vit-pytorch


Collecting vit-pytorch
  Downloading vit_pytorch-1.6.5-py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
  Downloading vit_pytorch-1.6.4-py3-none-any.whl (100 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m100.3/100.3 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Downloading vit_pytorch-1.6.3-py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading vit_pytorch-1.6.2-py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m6.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading vit_pytorch-1.6.1-py3-none-any.whl (98 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.5/98.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Downloading vit_pytorch-1.

In [4]:
import torch
from vit_pytorch import ViT



In [5]:
vit = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )

In [6]:
vit # 1 x 16 # 1

ViT(
  (to_patch_embedding): Sequential(
    (0): Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=16, p2=16)
    (1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (2): Linear(in_features=768, out_features=1024, bias=True)
    (3): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (transformer): Transformer(
    (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (layers): ModuleList(
      (0): ModuleList(
        (0): Attention(
          (norm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (attend): Softmax(dim=-1)
          (dropout): Dropout(p=0.1, inplace=False)
          (to_qkv): Linear(in_features=1024, out_features=3072, bias=False)
          (to_out): Sequential(
            (0): Linear(in_features=1024, out_features=1024, bias=True)
            (1): Dropout(p=0.1, inplace=False)
          )
        )
        (1): FeedForward(
          (net): Sequential(
            (

In [7]:
def frameStacker(frames): # Shape: 16 (batch size) x 3 (channels) x 128 (height) x 128 (width)
    stacked_frames = frames.view(4, 4, 3, 128, 128)
    stacked_frames = stacked_frames.permute(0, 2, 1, 3, 4).contiguous()
    stacked_frames = stacked_frames.view(4 * 128, 4 * 128, 3)
    stacked_frames = stacked_frames.permute(2, 0, 1) # # Reshape the tensor to 3 x (4*128) x (4*128)
    # print(stacked_frames.shape)
    return stacked_frames

In [8]:
class VVViT(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=256):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(VVViT, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        #resnet = models.resnet152(pretrained=True)
        #modules = list(resnet.children())[:-1]      # delete the last fc layer.
        #self.resnet = nn.Sequential(*modules)
        
        '''
        vit = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
        '''
        c = 3
        vit_temp = ViT(
            image_size = 512,
            patch_size = 128,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
        vit_spat = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
    
        #modules = list(mlp_mixer.children())[:-2]
        #self.mlp_mixer = nn.Sequential(*modules)
        finput_size = 1024
        foutput_size = 16 * 3 * 128 * 128
        self.vit_temp = vit_temp
        self.timetospace = nn.Linear(finput_size, foutput_size)
        self.vit_spat = vit_spat
        self.fc1 = nn.Linear(1024, fc_hidden1)
        #self.fc1 = nn.Linear(mlp_mixer. , fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        print(x_3d.shape)
        # cnn_embed_seq = []
        x_temp = []
        for t in range(x_3d.size(0)):
            # ResNet CNN            
            x = frameStacker(x_3d[t, :, :, :, :]) # 3 x 512 x 512
            x_temp.append(x) # 10 x 3 x 512 x 512
        x_temp_torch = torch.stack(x_temp, dim=0)
        print(x_temp_torch.shape) # 10 x 3 x 512 x 512
        x = self.vit_temp(x_temp_torch)  # ResNet # t = num of frame # 10 x 16 x 3 x 128 x 128  -- > # 10 x 16 x 3 x 128 x 128
        print(x.shape)
#         with torch.no_grad():
#             x = self.vit_temp(x_temp_torch)  # ResNet # t = num of frame # 10 x 16 x 3 x 128 x 128  -- > # 10 x 16 x 3 x 128 x 128
#             print(x.shape)
#             x = self.timetospace(x)
#             print(x.shape)
#         print(x.shape) 
#         cnn_embed_seq = []
#         for t in range(x.size(1)):
#             # ResNet CNN
#             with torch.no_grad():
#                 print(x[:, t, :, :, :].shape)
#                 x = self.vit_spat(x[:, t, :, :, :])  # problem here
#                 x = x.view(x.size(0), -1)             # flatten output of conv

#             # FC layers
#             x = self.bn1(self.fc1(x))
#             x = F.relu(x)
#             x = self.bn2(self.fc2(x))
#             x = F.relu(x)
#             x = F.dropout(x, p=self.drop_p, training=self.training)
#             x = self.fc3(x)

#             cnn_embed_seq.append(x)

#         # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
#         cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
#         # cnn_embed_seq: shape=(batch, time_step, input_size)
        
#         print(cnn_embed_seq.shape) 
#         return cnn_embed_seq

            
        
# #         # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
# #         cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
# #         # cnn_embed_seq: shape=(batch, time_step, input_size)

#         return cnn_embed_seq



        

In [12]:
model = nn.Sequential(VVViT()).to(device)
model(torch.randn(10, 16, 3, 128, 128).to(device))

torch.Size([10, 16, 3, 128, 128])
torch.Size([10, 3, 512, 512])
torch.Size([10, 1024])


In [13]:
pip install torchsummary 

Collecting torchsummary
  Downloading torchsummary-1.5.1-py3-none-any.whl (2.8 kB)
Installing collected packages: torchsummary
Successfully installed torchsummary-1.5.1
[0mNote: you may need to restart the kernel to use updated packages.


In [14]:
from torchsummary import summary
from vit_pytorch import ViT

# Define your ViT model
vit = ViT(
    image_size=512,
    patch_size=128,
    num_classes=1024,
    dim=1024,
    depth=16,
    heads=16,
    mlp_dim=2048,
    dropout=0.1,
    emb_dropout=0.1
)

# Move the model to the desired device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
vit = vit.to(device)

# Use torchsummary to print the model summary
summary(vit, (3, 512,512))  # Assuming RGB images with size 128x128


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
         Rearrange-1            [-1, 16, 49152]               0
         LayerNorm-2            [-1, 16, 49152]          98,304
            Linear-3             [-1, 16, 1024]      50,332,672
         LayerNorm-4             [-1, 16, 1024]           2,048
           Dropout-5             [-1, 17, 1024]               0
         LayerNorm-6             [-1, 17, 1024]           2,048
            Linear-7             [-1, 17, 3072]       3,145,728
           Softmax-8           [-1, 16, 17, 17]               0
           Dropout-9           [-1, 16, 17, 17]               0
           Linear-10             [-1, 17, 1024]       1,049,600
          Dropout-11             [-1, 17, 1024]               0
        Attention-12             [-1, 17, 1024]               0
        LayerNorm-13             [-1, 17, 1024]           2,048
           Linear-14             [-1, 1

In [16]:
import torch.nn as nn
import torchvision.models as models
from torchsummary import summary

class YourModel(nn.Module):
    def __init__(self):
        super(YourModel, self).__init__()
        
        # Load pre-trained ResNet152 model
        resnet = models.resnet152(pretrained=True)
        
        # Extract all layers except the last fully connected layer
        modules = list(resnet.children())[:-1]
        
        # Create a sequential model with all layers except the last fully connected layer
        self.resnet = nn.Sequential(*modules)

    def forward(self, x):
        # Forward pass through the ResNet model
        x = self.resnet(x)
        return x

# Create an instance of YourModel
your_model = YourModel()

# Move the model to the desired device (CPU or GPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
your_model = your_model.to(device)

# Print the model summary
summary(your_model, (3, 128,128))  # Assuming RGB images with size 224x224


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1           [-1, 64, 64, 64]           9,408
       BatchNorm2d-2           [-1, 64, 64, 64]             128
              ReLU-3           [-1, 64, 64, 64]               0
         MaxPool2d-4           [-1, 64, 32, 32]               0
            Conv2d-5           [-1, 64, 32, 32]           4,096
       BatchNorm2d-6           [-1, 64, 32, 32]             128
              ReLU-7           [-1, 64, 32, 32]               0
            Conv2d-8           [-1, 64, 32, 32]          36,864
       BatchNorm2d-9           [-1, 64, 32, 32]             128
             ReLU-10           [-1, 64, 32, 32]               0
           Conv2d-11          [-1, 256, 32, 32]          16,384
      BatchNorm2d-12          [-1, 256, 32, 32]             512
           Conv2d-13          [-1, 256, 32, 32]          16,384
      BatchNorm2d-14          [-1, 256,

In [18]:
import torch
import torch.nn as nn

# Assuming input tensor shape is (16, 2048)
input_size = 2048
output_size = 49152
batch_size = 16

# Create a linear layer to perform the transformation
linear_layer = nn.Linear(input_size, output_size)

# Generate a random input tensor for demonstration
input_tensor = torch.randn((10,batch_size, input_size))

# Apply the linear layer to the input tensor
output_tensor = linear_layer(input_tensor)

# Print the shapes of input and output tensors
print("Input Tensor Shape:", input_tensor.shape)
print("Output Tensor Shape:", output_tensor.shape)


Input Tensor Shape: torch.Size([10, 16, 2048])
Output Tensor Shape: torch.Size([10, 16, 49152])


In [None]:
wt

# 2d cnn + lstm

In [None]:
import os
import numpy as np
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import json
from matplotlib import pyplot as plt
from skimage import color
from skimage.feature import hog
from sklearn import svm
from sklearn.metrics import classification_report,accuracy_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

from subprocess import check_output
print(check_output(["ls", "../input"]).decode("utf8"))

# Any results you write to the current directory are saved as output.

# mlp mixer + resnet with lstm


In [None]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        
#         mlp_mixer = MLPMixer(image_size = (96,96),channels = 3,patch_size = 8,dim = 512,depth = 16,num_classes = 1024)
        
#         self.mlp_mixer = mlp_mixer
        self.fc11 = nn.Linear(1024, fc_hidden1)
        
        self.fc1 = nn.Linear(resnet.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        cnn_embed_seq2 = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.resnet(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq.shape)
#         for t in range(x_3d.size(1)):
#             # ResNet CNN
#             with torch.no_grad():
#                 x = self.mlp_mixer(x_3d[:, t, :, :, :])  # ResNet
#                 x = x.view(x.size(0), -1)             # flatten output of conv

#             # FC layers
#             x = self.bn1(self.fc11(x))
#             x = F.relu(x)
#             x = self.bn2(self.fc2(x))
#             x = F.relu(x)
#             x = F.dropout(x, p=self.drop_p, training=self.training)
#             x = self.fc3(x)

#             cnn_embed_seq2.append(x)

#         # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
#         cnn_embed_seq2 = torch.stack(cnn_embed_seq2, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq2.shape)
        #return cnn_embed_seq + cnn_embed_seq2
    
        return cnn_embed_seq 

    



class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        
model = nn.Sequential(ResCNNEncoder(),
                      DecoderRNN()
                        )


In [None]:
x=torch.rand(16,16,3,128,128)
y=model(x)
y.shape

In [None]:
model.to(device)

# mlp mixer + lstm

In [None]:
import os
import numpy as np
from PIL import Image
from torch.utils import data
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torchvision.transforms as transforms
from tqdm import tqdm
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [None]:
pip install mlp-mixer-pytorch

In [None]:
import torch
from mlp_mixer_pytorch import MLPMixer

In [None]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=256):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        #resnet = models.resnet152(pretrained=True)
        #modules = list(resnet.children())[:-1]      # delete the last fc layer.
        #self.resnet = nn.Sequential(*modules)
        mlp_mixer = MLPMixer(image_size = (96,96),channels = 3,patch_size = 8,dim = 512,depth = 16,num_classes = 1024)
        #modules = list(mlp_mixer.children())[:-2]
        #self.mlp_mixer = nn.Sequential(*modules)
        self.mlp_mixer = mlp_mixer
        self.fc1 = nn.Linear(1024, fc_hidden1)
        #self.fc1 = nn.Linear(mlp_mixer. , fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq


class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=256, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        

In [None]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        resnet = models.resnet152(pretrained=True)
        modules = list(resnet.children())[:-1]      # delete the last fc layer.
        self.resnet = nn.Sequential(*modules)
        
        mlp_mixer = MLPMixer(image_size = (96,96),channels = 3,patch_size = 8,dim = 512,depth = 16,num_classes = 1024)
        
        self.mlp_mixer = mlp_mixer
        self.fc11 = nn.Linear(1024, fc_hidden1)
        
        self.fc1 = nn.Linear(resnet.fc.in_features, fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        cnn_embed_seq2 = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.resnet(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc11(x))     
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq2.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq2 = torch.stack(cnn_embed_seq2, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq + cnn_embed_seq2
    

    



class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        

In [None]:
model = nn.Sequential(ResCNNEncoder(),
                      DecoderRNN()
                        )

In [None]:
model.to(device)

# unet+mlp-mixer+lstm - 95%

In [None]:
def double_conv(in_ch, out_ch):
    conv = nn.Sequential(
        nn.Conv2d(in_channels=in_ch,out_channels=out_ch,kernel_size=5,stride=1,padding=1),
        nn.BatchNorm2d(out_ch),                                                            
        nn.ReLU(inplace=True),
        nn.Conv2d(in_channels=out_ch,out_channels=out_ch,kernel_size=5,stride=1,padding=1), 
        nn.BatchNorm2d(out_ch),                                                            
        nn.ReLU(inplace=True)
    )
    
    return conv

#def cropper(og_tensor, target_tensor):
#    og_shape = og_tensor.shape[2]
#    target_shape = target_tensor.shape[2]
#    delta = (og_shape - target_shape) // 2
#    cropped_og_tensor = og_tensor[:,:,delta:og_shape-delta,delta:og_shape-delta]
#    return cropped_og_tensor
 

def addPadding(srcShapeTensor, tensor_whose_shape_isTobechanged):

    if(srcShapeTensor.shape != tensor_whose_shape_isTobechanged.shape):
        target = torch.zeros(srcShapeTensor.shape)
        target[:, :, :tensor_whose_shape_isTobechanged.shape[2],
               :tensor_whose_shape_isTobechanged.shape[3]] = tensor_whose_shape_isTobechanged
        return target.to(device)
    return tensor_whose_shape_isTobechanged.to(device)
    
def padder(left_tensor, right_tensor): 
    # left_tensor is the tensor on the encoder side of UNET
    # right_tensor is the tensor on the decoder side  of the UNET
    
    if left_tensor.shape != right_tensor.shape:
        padded = torch.zeros(left_tensor.shape)
        padded[:, :, :right_tensor.shape[2], :right_tensor.shape[3]] = right_tensor
        return padded.to(device)
    
    return right_tensor.to(device)

In [None]:
class UNET(nn.Module):
    def __init__(self):
        super(UNET,self).__init__()
      
        
        self.max_pool = nn.MaxPool2d(kernel_size=2,stride=2)
        
        self.down_conv_1 = double_conv(in_ch=3,out_ch=64)
        self.down_conv_2 = double_conv(in_ch=64,out_ch=128)
        self.down_conv_3 = double_conv(in_ch=128,out_ch=256)
        self.down_conv_4 = double_conv(in_ch=256,out_ch=512)
        self.down_conv_5 = double_conv(in_ch=512,out_ch=1024)
        #print(self.down_conv_1)
        
        self.up_conv_trans_1 = nn.ConvTranspose2d(in_channels=1024,out_channels=512,kernel_size=2,stride=2)
        self.up_conv_trans_2 = nn.ConvTranspose2d(in_channels=512,out_channels=256,kernel_size=2,stride=2)
        self.up_conv_trans_3 = nn.ConvTranspose2d(in_channels=256,out_channels=128,kernel_size=2,stride=2)
        self.up_conv_trans_4 = nn.ConvTranspose2d(in_channels=128,out_channels=64,kernel_size=2,stride=2)
        
        self.up_conv_1 = double_conv(in_ch=1024,out_ch=512)
        self.up_conv_2 = double_conv(in_ch=512,out_ch=256)
        self.up_conv_3 = double_conv(in_ch=256,out_ch=128)
        self.up_conv_4 = double_conv(in_ch=128,out_ch=64)
        
        self.conv_1x1 = nn.Conv2d(in_channels=64,out_channels=2,kernel_size=1,stride=1)
        
    def forward(self,x):
        
        # encoding
        x1 = self.down_conv_1(x)
        print("X1", x1.shape)
        p1 = self.max_pool(x1)
        print("p1", p1.shape)
        x2 = self.down_conv_2(p1)
        print("X2", x2.shape)
        p2 = self.max_pool(x2)
        print("p2", p2.shape)
        x3 = self.down_conv_3(p2)
        print("X2", x3.shape)
        p3 = self.max_pool(x3)
        print("p3", p3.shape)
        x4 = self.down_conv_4(p3)
        print("X4", x4.shape)
        p4 = self.max_pool(x4)
        print("p4", p4.shape)
        
        # decoding
        
        d2 = self.up_conv_trans_2(x4)
        crop2 = padder(x3,d2)
        cat2 = torch.cat([x3,crop2],dim=1)
        uc2 = self.up_conv_2(cat2)
        print("uc2", uc2.shape)
        
        d3 = self.up_conv_trans_3(uc2)
        crop3 = padder(x2,d3)
        cat3 = torch.cat([x2,crop3],dim=1)
        uc3 = self.up_conv_3(cat3)
        print("uc3", uc3.shape)
        
        d4 = self.up_conv_trans_4(uc3)
        crop4 = padder(x1,d4)
        cat4 = torch.cat([x1,crop4],dim=1)
        uc4 = self.up_conv_4(cat4)
        print("uc4", uc4.shape)
        
        conv_1x1 = self.conv_1x1(uc4)
        return uc4
        #print(conv_1x1.shape)

In [None]:
!pip install -q -U segmentation-models-pytorch albumentations > /dev/null
import segmentation_models_pytorch as smp

In [None]:
ENCODER = 'resnet50'
ENCODER_WEIGHTS = 'imagenet'

ACTIVATION = 'sigmoid' # could be None for logits or 'softmax2d' for multiclass segmentation

# create segmentation model with pretrained encoder
UNET = smp.Unet(
    encoder_name=ENCODER, 
    encoder_weights=ENCODER_WEIGHTS, 
    classes=3, 
    activation=ACTIVATION,
)

preprocessing_fn = smp.encoders.get_preprocessing_fn(ENCODER, ENCODER_WEIGHTS)

In [None]:
class DoubleConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DoubleConv, self).__init__()
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        return self.double_conv(x)
    
    
class DownBlock(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DownBlock, self).__init__()
        self.double_conv = DoubleConv(in_channels, out_channels)
        self.down_sample = nn.MaxPool2d(2)

    def forward(self, x):
        skip_out = self.double_conv(x)
        down_out = self.down_sample(skip_out)
        return (down_out, skip_out)

    
class UpBlock(nn.Module):
    def __init__(self, in_channels, out_channels, up_sample_mode):
        super(UpBlock, self).__init__()
        if up_sample_mode == 'conv_transpose':
            self.up_sample = nn.ConvTranspose2d(in_channels-out_channels, in_channels-out_channels, kernel_size=2, stride=2)        
        elif up_sample_mode == 'bilinear':
            self.up_sample = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        else:
            raise ValueError("Unsupported `up_sample_mode` (can take one of `conv_transpose` or `bilinear`)")
        self.double_conv = DoubleConv(in_channels, out_channels)

    def forward(self, down_input, skip_input):
        x = self.up_sample(down_input)
        x = torch.cat([x, skip_input], dim=1)
        return self.double_conv(x)

    
class UNet(nn.Module):
    def __init__(self, out_classes=3, up_sample_mode='conv_transpose'):
        super(UNet, self).__init__()
        self.up_sample_mode = up_sample_mode
        # Downsampling Path
        self.down_conv1 = DownBlock(3, 64)
        self.down_conv2 = DownBlock(64, 128)
        self.down_conv3 = DownBlock(128, 256)
        self.down_conv4 = DownBlock(256, 512)
        # Bottleneck
        self.double_conv = DoubleConv(512, 1024)
        # Upsampling Path
        self.up_conv4 = UpBlock(512 + 1024, 512, self.up_sample_mode)
        self.up_conv3 = UpBlock(256 + 512, 256, self.up_sample_mode)
        self.up_conv2 = UpBlock(128 + 256, 128, self.up_sample_mode)
        self.up_conv1 = UpBlock(128 + 64, 64, self.up_sample_mode)
        # Final Convolution
        self.conv_last = nn.Conv2d(64, out_classes, kernel_size=1)

    def forward(self, x):
        x, skip1_out = self.down_conv1(x)
        x, skip2_out = self.down_conv2(x)
        x, skip3_out = self.down_conv3(x)
        x, skip4_out = self.down_conv4(x)
        x = self.double_conv(x)
        x = self.up_conv4(x, skip4_out)
        x = self.up_conv3(x, skip3_out)
        x = self.up_conv2(x, skip2_out)
        x = self.up_conv1(x, skip1_out)
        x = self.conv_last(x)
        return x
    

# Get UNet model
model = UNet()

In [None]:
m=UNet()

In [None]:
x=torch.rand(10,3,96,96)

In [None]:
y=m(x)
y.shape

In [None]:
#new
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.unet = UNet()
        
        mlp_mixer = MLPMixer(image_size = (96,96),channels = 3,patch_size = 16,dim = 512,depth = 16,num_classes = 1024)
        
        self.mlp_mixer = mlp_mixer
        self.fc11 = nn.Linear(1024, fc_hidden1)
        
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        cnn_embed_seq2 = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.unet(x_3d[:, t, :, :, :])  # ResNet

            cnn_embed_seq.append(x)
        
        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq.shape)
        for t in range(cnn_embed_seq.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(cnn_embed_seq[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc11(x))     
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq2.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq2 = torch.stack(cnn_embed_seq2, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq2
    

    



class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        

In [None]:
#new
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=300):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        self.unet = UNet()
        
        mlp_mixer = MLPMixer(image_size = (96,96),channels = 3,patch_size = 8,dim = 512,depth = 16,num_classes = 1024)
        
        self.mlp_mixer = mlp_mixer
        self.fc11 = nn.Linear(1024, fc_hidden1)
        
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        cnn_embed_seq2 = []
        for t in range(x_3d.size(1)):
            # UNet CNN
            with torch.no_grad():
                x = self.unet(x_3d[:, t, :, :, :])  # unet

            cnn_embed_seq.append(x)
        
        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq.shape)
        for t in range(cnn_embed_seq.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(cnn_embed_seq[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc11(x))     
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq2.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq2 = torch.stack(cnn_embed_seq2, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq2
    

    



class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=300, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        

In [None]:
model = nn.Sequential(ResCNNEncoder(),
                      DecoderRNN()
                        )



In [None]:
model.to(device)

# 3dcnn 96%-batch size 16, shape-128*128 self.layer6, fc4

In [None]:
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer6 = torch.nn.Sequential(
            torch.nn.Conv3d(512, 1024, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer7 = torch.nn.Sequential(
            torch.nn.Conv3d(1024, 2048 ,kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer8 = torch.nn.Sequential(
            torch.nn.Conv3d(2048,2048, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(16384, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        self.fc11 = torch.nn.Linear( 1024,512, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        self.fc4 = torch.nn.Linear(128, 2, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer6(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer7(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer8(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc4(output)
        if self.flag==1:
            print(output.shape)
        return output
model = CNN()


In [None]:
m=CNN(1)

In [None]:
x=torch.rand(16,16,3,128,128)
y=m(x)
y.shape

In [None]:
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(25600, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        return output
model = CNN()


In [None]:
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(25600, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        return output
model = CNN()

In [None]:
model.to(device)

# vit+lstm 90%

In [None]:
pip install vit-pytorch


In [None]:
import torch
from vit_pytorch import ViT



In [None]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, drop_p=0.3, CNN_embed_dim=256):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        #resnet = models.resnet152(pretrained=True)
        #modules = list(resnet.children())[:-1]      # delete the last fc layer.
        #self.resnet = nn.Sequential(*modules)
        mlp_mixer = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
        #modules = list(mlp_mixer.children())[:-2]
        #self.mlp_mixer = nn.Sequential(*modules)
        self.mlp_mixer = mlp_mixer
        self.fc1 = nn.Linear(1024, fc_hidden1)
        #self.fc1 = nn.Linear(mlp_mixer. , fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
    def forward(self, x_3d):
        cnn_embed_seq = []
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)

        return cnn_embed_seq


class DecoderRNN(nn.Module):
    def __init__(self, CNN_embed_dim=256, h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.1, num_classes=2):
        super(DecoderRNN, self).__init__()

        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc1 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc2 = nn.Linear(self.h_FC_dim, self.num_classes)

    def forward(self, x_RNN):
        
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(x_RNN, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc1(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = self.fc2(x)

        return x

        

In [None]:
model = nn.Sequential(ResCNNEncoder(),
                      DecoderRNN()
                        )

model.to(device)

# merge 3dcnn + (vitLstm)

In [None]:
pip install vit-pytorch


In [None]:
import torch
from vit_pytorch import ViT



In [None]:
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(25600, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        return output


In [None]:
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer6 = torch.nn.Sequential(
            torch.nn.Conv3d(512, 1024, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(18432, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        self.fc4 = torch.nn.Linear(128, 2, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer6(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        
        return output



In [None]:
class ResCNNEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, CNN_embed_dim=256 , h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        """Load the pretrained ResNet-152 and replace top fc layer."""
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p

        #resnet = models.resnet152(pretrained=True)
        #modules = list(resnet.children())[:-1]      # delete the last fc layer.
        #self.resnet = nn.Sequential(*modules)
        mlp_mixer = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
        
        cnn_3d = CNN()
        self.cnn_3d= cnn_3d
        #modules = list(mlp_mixer.children())[:-2]
        #self.mlp_mixer = nn.Sequential(*modules)
        self.mlp_mixer = mlp_mixer
        self.fc1 = nn.Linear(1024, fc_hidden1)
        #self.fc1 = nn.Linear(mlp_mixer. , fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc4 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc5 = nn.Linear(self.h_FC_dim, self.num_classes)
        self.fc6 = nn.Linear(self.h_FC_dim, 64)
        self.fc7 = nn.Linear(64, self.num_classes)

        
    def forward(self, x_3d):
        cnn_embed_seq = []
        x1=self.cnn_3d(x_3d)
        #print(x1.shape)
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(x_3d[:, t, :, :, :])  # ResNet
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)
            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq.shape)
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(cnn_embed_seq, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc4(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        #print(x.shape)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = x + x1
        x = self.fc5(x)
        x = F.relu(x)
        
        
        return x
        

        

In [None]:
from vit_pytorch import ViT
keepprobab = 1
class CNN(torch.nn.Module):

    def __init__(self,flag=0):
        super(CNN, self).__init__()
        self.flag=flag
        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv3d(3, 32, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv3d(32, 64, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2),
            torch.nn.Dropout(p=1 - keepprobab))
        self.layer3 = torch.nn.Sequential(
            torch.nn.Conv3d(64, 128, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))

        self.layer4 = torch.nn.Sequential(
            torch.nn.Conv3d( 128, 256, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.layer5 = torch.nn.Sequential(
            torch.nn.Conv3d( 256,512, kernel_size=(3,3,3), stride=1, padding=1),
            torch.nn.ReLU(),
            torch.nn.MaxPool3d(kernel_size=(2,2,2), stride=2, padding=1),
            torch.nn.Dropout(p=1 - keepprobab))
        
        self.fc1 = torch.nn.Linear(25600, 1024, bias=True)
        torch.nn.init.xavier_uniform(self.fc1.weight)
        
        self.fc2 = torch.nn.Linear(1024, 256, bias=True)
        # initialize parameters
        torch.nn.init.xavier_uniform_(self.fc2.weight) 
        
        self.fc3 = torch.nn.Linear(256, 128, bias=True)
        
    def forward(self, y):
        
        if self.flag==1:
            print(y.shape)
        y = torch.moveaxis(y,1,-1)
        if self.flag==1:
            print(y.shape)
        output = self.layer1(y)
        if self.flag==1:
            print(output.shape)
        output = self.layer2(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer3(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer4(output)
        if self.flag==1:
            print(output.shape)
        output = self.layer5(output)
        if self.flag==1:
            print(output.shape)
         # Flatten them for FC
        output = output.view(output.size(0), -1)
        if self.flag==1:
            print(output.shape)
        output = self.fc1(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc2(output)
        if self.flag==1:
            print(output.shape)
        output = self.fc3(output)
        if self.flag==1:
            print(output.shape)
        return output
class VitEncoder(nn.Module):
    def __init__(self, fc_hidden1=512, fc_hidden2=512, CNN_embed_dim=256 , h_RNN_layers=3, h_RNN=256, h_FC_dim=128, drop_p=0.3, num_classes=2):
        super(ResCNNEncoder, self).__init__()

        self.fc_hidden1, self.fc_hidden2 = fc_hidden1, fc_hidden2
        self.drop_p = drop_p
        mlp_mixer = ViT(
            image_size = 128,
            patch_size = 16,
            num_classes = 1024,
            dim = 1024,
            depth = 16,
            heads = 16,
            mlp_dim = 2048,
            dropout = 0.1,
            emb_dropout = 0.1
        )
        
        cnn_3d = CNN()
        self.cnn_3d= cnn_3d
        
        self.mlp_mixer = mlp_mixer
        self.fc1 = nn.Linear(1024, fc_hidden1)
        #self.fc1 = nn.Linear(mlp_mixer. , fc_hidden1)
        self.bn1 = nn.BatchNorm1d(fc_hidden1, momentum=0.01)
        self.fc2 = nn.Linear(fc_hidden1, fc_hidden2)
        self.bn2 = nn.BatchNorm1d(fc_hidden2, momentum=0.01)
        self.fc3 = nn.Linear(fc_hidden2, CNN_embed_dim)
        
        self.RNN_input_size = CNN_embed_dim
        self.h_RNN_layers = h_RNN_layers   # RNN hidden layers
        self.h_RNN = h_RNN                 # RNN hidden nodes
        self.h_FC_dim = h_FC_dim
        self.drop_p = drop_p
        self.num_classes = num_classes

        self.LSTM = nn.LSTM(
            input_size=self.RNN_input_size,
            hidden_size=self.h_RNN,        
            num_layers=h_RNN_layers,       
            batch_first=True,       # input & output will has batch size as 1s dimension. e.g. (batch, time_step, input_size)
        )

        self.fc4 = nn.Linear(self.h_RNN, self.h_FC_dim)
        self.fc5 = nn.Linear(self.h_FC_dim, self.num_classes)
        self.fc6 = nn.Linear(self.h_FC_dim, 64)
        self.fc7 = nn.Linear(64, self.num_classes)

        
    def forward(self, x_3d):
        cnn_embed_seq = []
        x1=self.cnn_3d(x_3d)
        #print(x1.shape)
        for t in range(x_3d.size(1)):
            # ResNet CNN
            with torch.no_grad():
                x = self.mlp_mixer(x_3d[:, t, :, :, :])  
                x = x.view(x.size(0), -1)             # flatten output of conv

            # FC layers
            x = self.bn1(self.fc1(x))
            x = F.relu(x)
            x = self.bn2(self.fc2(x))
            x = F.relu(x)
            x = F.dropout(x, p=self.drop_p, training=self.training)
            x = self.fc3(x)

            cnn_embed_seq.append(x)

        # swap time and sample dim such that (sample dim, time dim, CNN latent dim)
        cnn_embed_seq = torch.stack(cnn_embed_seq, dim=0).transpose_(0, 1)
        # cnn_embed_seq: shape=(batch, time_step, input_size)
        #print(cnn_embed_seq.shape)
        self.LSTM.flatten_parameters()
        RNN_out, (h_n, h_c) = self.LSTM(cnn_embed_seq, None)  
        """ h_n shape (n_layers, batch, hidden_size), h_c shape (n_layers, batch, hidden_size) """ 
        """ None represents zero initial hidden state. RNN_out has shape=(batch, time_step, output_size) """

        # FC layers
        x = self.fc4(RNN_out[:, -1, :])   # choose RNN_out at the last time step
        x = F.relu(x)
        #print(x.shape)
        x = F.dropout(x, p=self.drop_p, training=self.training)
        x = x + x1
        x = self.fc5(x)
        x = F.relu(x)
        
        
        return x
        

        

In [None]:
model = ResCNNEncoder()


In [None]:
x=torch.rand(10,16,3,96,96)
y=model(x)
y.shape

In [None]:

model.to(device)

# new run

In [None]:
@torch.no_grad()
def evaluate(model, val_loader):
    model.eval()
    outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)

def get_lr(optimizer):
    for param_group in optimizer.param_groups:
        return param_group['lr']

def fit_one_cycle(epochs, max_lr, model, train_loader, val_loader, 
                  weight_decay=0, grad_clip=None, opt_func=torch.optim.SGD):
    torch.cuda.empty_cache()
    history = []
    
    # Set up cutom optimizer with weight decay
    optimizer = opt_func(model.parameters(), max_lr, weight_decay=weight_decay)
    # Set up one-cycle learning rate scheduler
    sched = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr, epochs=epochs, 
                                                steps_per_epoch=len(train_loader))
    
    for epoch in range(epochs):
        # Training Phase 
        model.train()
        train_losses = []
        lrs = []
        for batch in train_loader:
            loss = model.training_step(batch)
            train_losses.append(loss)
            loss.backward()
            
            # Gradient clipping
            if grad_clip: 
                nn.utils.clip_grad_value_(model.parameters(), grad_clip)
            
            optimizer.step()
            optimizer.zero_grad()
            
            # Record & update learning rate
            lrs.append(get_lr(optimizer))
            sched.step()
        
        # Validation phase
        result = evaluate(model, val_loader)
        result['train_loss'] = torch.stack(train_losses).mean().item()
        result['lrs'] = lrs
        model.epoch_end(epoch, result)
        history.append(result)
    return history

In [None]:
history = [evaluate(model, test_loader)]
history

# Run and predict

In [None]:
from tqdm import tqdm
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0005)


# Train and test

def train(num_epoch, model):
    for epoch in range(0, num_epoch):
#         current_loss = 0.0
#         current_corrects = 0
        losses = []
        model.train()
        loop = tqdm(enumerate(train_loader), total=len(train_loader)) # create a progress bar
        for batch_idx, (data, targets) in loop:
            data = data.to(device=device)
            targets = targets.to(device=device)
            scores = model(data)
            
            
            
            loss = criterion(scores, targets)
            optimizer.zero_grad()
            losses.append(loss)
            loss.backward()
            optimizer.step()
            _, preds = torch.max(scores, 1)
#             current_loss += loss.item() * data.size(0)
#             current_corrects += (preds == targets).sum().item()
#             accuracy = int(current_corrects / len(train_loader.dataset) * 100)
            loop.set_description(f"Epoch {epoch+1}/{num_epoch} process: {int((batch_idx / len(train_loader)) * 100)}")
            loop.set_postfix(loss=loss.data.item())
        
        # save model
#         torch.save({ 
#                     'model_state_dict': model.state_dict(), 
#                     'optimizer_state_dict': optimizer.state_dict(), 
#                     }, 'checpoint_epoch_'+str(epoch)+'.pt')


        
# model.eval() is a kind of switch for some specific layers/parts of the model that behave differently,
# during training and inference (evaluating) time. For example, Dropouts Layers, BatchNorm Layers etc. 
# You need to turn off them during model evaluation, and .eval() will do it for you. In addition, 
# the common practice for evaluating/validation is using torch.no_grad() in pair with model.eval() 
# to turn off gradients computation:
        
def test():
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for x, y in test_loader:
            x = x.to(device)
            #print(x.shape)
            y = y.to(device)
            #print(y.shape)
            output = model(x)
            #print(output.shape)
            _, predictions = torch.max(output, 1)
            
            #print(predictions.shape)
            correct += (predictions == y).sum().item()
            test_loss = criterion(output, y)
            
    test_loss /= len(test_loader.dataset)
    print("Average Loss: ", test_loss, "  Accuracy: ", correct, " / ",
    len(test_loader.dataset), "  ", int(correct / len(test_loader.dataset) * 100), "%")

In [None]:
from tqdm import tqdm
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0001)
n_total_steps = len(train_loader)


# Train and test
trainingEpoch_loss = []
validationEpoch_loss = []
def train(num_epoch, model):
    for epoch in range(0, num_epoch):
        step_loss = []
        model.train()
        loop = tqdm(enumerate(train_loader), total=len(train_loader)) # create a progress bar
        for batch_idx, (data, targets) in loop:
            data = data.to(device=device)
            targets = targets.to(device=device)
            scores = model(data)
            loss = criterion(scores, targets)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            _, preds = torch.max(scores, 1)

            step_loss.append(loss.item())
            i=batch_idx
#             if (i+1) % 1 == 0:
#                 print (f'Epoch [{epoch+1}/{epochs}], Step [{i+1}/{n_total_steps}], Loss: {loss.item():.4f}')
            loop.set_description(f"Epoch {epoch+1}/{num_epoch} process: {int((batch_idx / len(train_loader)) * 100)}")
            loop.set_postfix(loss=loss.data.item())
                
        trainingEpoch_loss.append(np.array(step_loss).mean())
        
        test()

def test():
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for x, y in test_loader:
            validationStep_loss = []
            x = x.to(device)
            y = y.to(device)
            output = model(x)
            _, predictions = torch.max(output, 1)
            correct += (predictions == y).sum().item()
            test_loss = criterion(output, y)
            validationStep_loss.append(test_loss.item())
        validationEpoch_loss.append(np.array(validationStep_loss).mean())
        
    test_loss /= len(test_loader.dataset)
    print("Average Loss: ", test_loss, "  Accuracy: ", correct, " / ",
    len(test_loader.dataset), "  ", int(correct / len(test_loader.dataset) * 100), "%")

In [None]:
if __name__ == "__main__":
    epochs=10
    train(epochs, model) # train
    test() # test

In [None]:
from matplotlib import pyplot as plt
plt.plot(trainingEpoch_loss, label='train_loss')
plt.plot(validationEpoch_loss,label='val_loss')
plt.legend()
plt.show

In [None]:
%matplotlib inline

def print_graph(item, index, history):
    plt.figure()
    train_values = history.history[item][0:index]
    plt.plot(train_values)
    test_values = history.history['val_' + item][0:index]
    plt.plot(test_values)
    plt.legend(['training','validation'])
    plt.title('Training and validation '+ item)
    plt.xlabel('epoch')
    plt.show()
    plot = '{}.png'.format(item)
    plt.savefig(plot)


def get_best_epoch(test_loss, history):
    for key, item in enumerate(history.history.items()):
        (name, arr) = item
        if name == 'val_loss':
            for i in range(len(arr)):
                if round(test_loss, 2) == round(arr[i], 2):
                    return i
                
def model_summary(model, history):
    print('---'*30)
    test_loss, test_accuracy = model.evaluate(X_test_nn, y_test, verbose=0)

    if history:
        index = get_best_epoch(test_loss, history)
        print('Best Epochs: ', index)

        train_accuracy = history.history['accuracy'][index]
        train_loss = history.history['loss'][index]

        print('Accuracy on train:',train_accuracy,'\tLoss on train:',train_loss)
        print('Accuracy on test:',test_accuracy,'\tLoss on test:',test_loss)
        print_graph('loss', index, history)
        print_graph('accuracy', index, history)
        print('---'*30)                

In [None]:
model_summary(model, history)

In [None]:
import matplotlib.pyplot as plt

# assuming train_losses and val_losses are lists of loss values for each epoch
train_losses = [0.5, 0.4, 0.3, 0.2, 0.1]
val_losses = [1.0, 0.9, 0.8, 0.7, 0.6]

# plot the train and validation losses
plt.plot(train_losses, label='Train Loss')
plt.plot(val_losses, label='Validation Loss')

# add labels, title, and legend to the plot
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()

# show the plot
plt.show()
