<a href="https://colab.research.google.com/github/nefario7/cmu-deeplearning/blob/working-hw2/Homework%202/Continue_ConvNext_HW2P2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Mount Drive and Download Data

In [None]:
# One-time Colab setup: install google-drive-ocamlfuse, mount Google Drive at
# /content/cmudrive, install the Kaggle / W&B tooling and register credentials.
from IPython.display import clear_output 
! apt-get install -y -qq software-properties-common python-software-properties module-init-tools
! add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
! apt-get update -qq 2>&1 > /dev/null
! apt-get -y install -qq google-drive-ocamlfuse fuse

# Authenticate the Colab user and fetch the application-default credentials
# whose client id / secret are handed to the FUSE client below.
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass

# First invocation prints the authorisation URL; the verification code typed
# into the prompt is then piped into the second invocation.
! google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
! echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}
# Create the mount point and mount Drive on it.
% cd /content
! mkdir cmudrive
% cd ..
! google-drive-ocamlfuse /content/cmudrive
! pip install kaggle wandb torch-summary
# The Kaggle token is copied from Drive into ~/.kaggle and made owner-only (chmod 600).
! mkdir ~/.kaggle
! cp /content/cmudrive/IDL/kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! pip install --upgrade --force-reinstall --no-deps kaggle 
! kaggle config set -n path -v /content

# NOTE(review): this W&B API key is committed in plain text. Rotate it and
# load it from a secret or an interactive prompt instead of hard-coding it.
! wandb login 4bdbe9c204105e1264fe3f54df2732fd1fff8040

# Hide the install / login output.
clear_output()

In [None]:
# Download the classification and verification competition archives with the
# Kaggle CLI, then unpack both archives into /content.
! kaggle competitions download -c 11-785-s22-hw2p2-classification
! kaggle competitions download -c 11-785-s22-hw2p2-verification

! unzip -q /content/competitions/11-785-s22-hw2p2-classification/11-785-s22-hw2p2-classification.zip -d /content
! unzip -q /content/competitions/11-785-s22-hw2p2-verification/11-785-s22-hw2p2-verification.zip -d /content

# Hide the download / unzip output.
clear_output()

## Dependencies

In [None]:
import yaml
import wandb
import os
import os.path as osp
import numpy as np
import math

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from torch.optim.lr_scheduler import ExponentialLR, ReduceLROnPlateau

import torchvision
import torchvision.transforms as ttf
import torchvision.models as models

!pip install albumentations==0.4.6
import albumentations as A
from albumentations.pytorch import ToTensorV2

from torchsummary import summary
from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
from functools import partial
from IPython.display import clear_output 

# from pytorch_lightning.callbacks.early_stopping import EarlyStopping

# Switch off autograd debugging helpers before training.
# NOTE(review): `profile(False)` and `emit_nvtx(False)` only construct objects
# that are never entered with `with`, so those two lines look like no-ops —
# confirm against the PyTorch profiler documentation.
torch.autograd.set_detect_anomaly(False)
torch.autograd.profiler.profile(False)
torch.autograd.profiler.emit_nvtx(False)

clear_output()

# FaceNet Architectures

### MobileNetV2

In [None]:
import torch
import torch.nn as nn
import math


class InvertedResidualBlock(nn.Module):
    """MobileNetV2 inverted residual block.

    Pipeline: 1x1 expansion conv -> 3x3 depthwise conv -> 1x1 projection conv.
    Each convolution is followed by batch norm; ReLU6 follows the first two.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the final 1x1 projection.
        stride: Stride of the 3x3 depthwise convolution.
        expand_ratio: Multiplier applied to ``in_channels`` to obtain the
            hidden width used inside the block.
    """

    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__()

        # Can only do identity residual connection if input & output are the
        # same channel & spatial shape.
        self.do_identity = stride == 1 and in_channels == out_channels

        # Expand Ratio is like 6, so hidden_dim >> in_channels
        hidden_dim = in_channels * expand_ratio

        self.feature_mixing = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=hidden_dim, kernel_size=(1, 1), stride=1, padding=0, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6()
        )

        # Depthwise 3x3 (groups == channels). The batch norm must match the
        # hidden width, not the block's input width.
        self.spatial_mixing = nn.Sequential(
            nn.Conv2d(in_channels=hidden_dim, out_channels=hidden_dim, kernel_size=(3, 3), stride=stride, padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6()
        )

        # Linear bottleneck: project down to the requested output width
        # (no activation after this stage).
        self.bottleneck_channels = nn.Sequential(
            nn.Conv2d(in_channels=hidden_dim, out_channels=out_channels, kernel_size=(1, 1), stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels)
        )

    def forward(self, x):
        """Run the block; add the input back when shapes are preserved."""
        out = self.feature_mixing(x)
        out = self.spatial_mixing(out)
        out = self.bottleneck_channels(out)

        if self.do_identity:
            return x + out
        return out

class MobileNetV2(nn.Module):
    """
    MobileNetV2 classifier: stem -> stack of InvertedResidualBlocks -> final
    1x1 feature mixing -> pooled linear classification head.

    The heavy lifting is already done in InvertedResidualBlock.

    Why MobileNetV2 and not V3? V2 is the foundation for V3, which uses "neural
    architecture search" to find better configurations of V2. If you understand
    V2 well, you can totally implement V3!

    Args:
        num_classes: Output size of the final linear layer.
    """
    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        # First couple of layers are special, just do them here.
        # This is called the "stem". Usually, methods use it to downsample
        # once or twice. It ends at 16 channels, which is what the first
        # stage below consumes.
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=(3, 3), stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(in_channels=32, out_channels=32, kernel_size=(3, 3), stride=1, padding=1, groups=32, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(in_channels=32, out_channels=16, kernel_size=(1, 1), stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16),
        )

        # Since we're just repeating InvertedResidualBlocks again and again,
        # we want to specify their parameters like this.
        # The four numbers in each row (a stage) are shown below.
        # - Expand ratio: We talked about this in InvertedResidualBlock
        # - Channels: This specifies the channel size before expansion
        # - # blocks: Each stage has many blocks, how many?
        # - Stride of first block: For some stages, we want to downsample. In
        #   a downsampling stage, we set the first block in that stage to have
        #   stride = 2, and the rest just have stride = 1.
        #
        # Again, note that almost every stage here is downsampling! By the
        # time we get to the last stage, what is the image resolution? Can it
        # still be called an image for our dataset? Think about this, and make
        # changes as you want.
        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6,  24, 2, 2],
            [6,  32, 3, 2],
            [6,  64, 4, 2],
            [6,  96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # Remember that our stem left us off at 16 channels. We're going to
        # keep updating this in_channels variable as we go
        in_channels = 16

        # Let's make the layers
        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only have non-trivial stride if first block
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels

        self.layers = nn.Sequential(*layers) # Done, save them to the class

        # Some final feature mixing
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels, 1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6()
        )

        # Final classification layer.
        self.cls_layer = nn.Sequential(
            # Pool over & collapse the spatial dimensions to (1, 1)
            nn.AdaptiveAvgPool2d((1, 1)),
            # Collapse the trivial (1, 1) dimensions
            nn.Flatten(),
            # Project to our # of classes
            nn.Linear(1280, num_classes),
        )

        self._initialize_weights()

    def _initialize_weights(self):
        """
        Usually, I like to use default pytorch initialization for stuff, but
        MobileNetV2 made a point of putting in some custom ones, so let's just
        use them.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        """Return class logits, or the pooled 1280-d features when
        ``return_feats`` is true (same convention as the other models here)."""
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_block(out)

        # Pool + flatten once; the linear layer is the last entry of cls_layer.
        feats = self.cls_layer[:-1](out)
        if return_feats:
            return feats
        return self.cls_layer[-1](feats)

### ConvNext

In [None]:
import torch 
import torch.nn as nn
import math 
import torchvision

class ConvNextBlock(nn.Module):
    """ConvNeXt-style residual block.

    7x7 depthwise conv + batch norm, then a 1x1 expansion with GELU, then a
    1x1 projection. The input is always added to the result in ``forward``.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the 1x1 projection.
        stride: Only used to compute ``do_identity``; no convolution in this
            block is strided.
        expand_ratio: Multiplier for the hidden width of the 1x1 expansion.
    """

    def __init__(self, in_channels, out_channels, stride, expand_ratio):
        super().__init__()
        # Recorded for reference; forward() does not consult this flag.
        self.do_identity = bool(stride == 1 and in_channels == out_channels)
        self.drop_prob = 0.0

        # expand ratio = 4 for convnext
        widened = in_channels * expand_ratio

        depthwise = nn.Conv2d(in_channels, in_channels, kernel_size=7, padding=3,
                              groups=in_channels, bias=False)
        self.depth_conv = nn.Sequential(depthwise, nn.BatchNorm2d(in_channels))

        expand = nn.Conv2d(in_channels, widened, kernel_size=1, stride=1, bias=False)
        self.pw_conv_inc = nn.Sequential(expand, nn.GELU())

        project = nn.Conv2d(widened, out_channels, kernel_size=1, stride=1, bias=False)
        self.pw_conv_dec = nn.Sequential(project)

    def forward(self, x):
        """Apply depthwise -> expand -> project and add the input back."""
        # Stochastic depth is intentionally not applied (drop_prob stays unused).
        return x + self.pw_conv_dec(self.pw_conv_inc(self.depth_conv(x)))
        
class ConvNext(nn.Module):
    """ConvNeXt-style classifier assembled from ``ConvNextBlock``s.

    Layout: 4x4/stride-4 stem -> four stages (each preceded by a BN + 2x2
    stride-2 downsampling conv whenever the channel width changes) -> batch
    norm -> global average pool -> linear classifier.

    Args:
        num_classes: Output size of the final linear layer.
        dims: Channel width of each of the four stages.
    """

    def __init__(self, num_classes=7000, dims=(96, 192, 384, 768)):
        super().__init__()
        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, dims[0], kernel_size=4, stride=4),
            nn.BatchNorm2d(dims[0])
        )
        in_channels = dims[0]
        # expand_ratio, channels, # blocks, stride (forwarded to ConvNextBlock)
        self.stages = [
            [4, dims[0], 3, 2],
            [4, dims[1], 3, 2],
            [4, dims[2], 9, 2],
            [4, dims[3], 3, 2]
        ]

        layers = []

        for current_stage in self.stages:
            expand_ratio, num_channels, num_blocks, stride = current_stage

            out_channels = num_channels
            if in_channels != out_channels:
                # Channel width changes: halve the resolution and widen.
                downsample_layer = nn.Sequential(
                    nn.BatchNorm2d(in_channels),
                    nn.Conv2d(in_channels, out_channels, kernel_size=2, stride=2)
                )
            else:
                # First stage keeps the stem's width, so nothing to do.
                downsample_layer = nn.Identity()

            in_channels = out_channels
            layers.append(downsample_layer)
            for _ in range(num_blocks):
                layers.append(ConvNextBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    stride=stride,
                    expand_ratio=expand_ratio
                ))

        self.layers = nn.Sequential(*layers)
        self.final_norm = nn.BatchNorm2d(dims[-1])

        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            # Use dropout if you want post flatten
            nn.Linear(dims[-1], num_classes)
        )

    def forward(self, x, return_feats=False):
        """Return class logits, or the pooled features when ``return_feats``."""
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_norm(out)

        # Pool + flatten exactly once, then reuse the result for the logits
        # instead of running the whole head a second time.
        feats = self.cls_layer[:2](out)
        if return_feats:
            return feats
        return self.cls_layer[2](feats)

In [None]:
class myConvNextBlock(nn.Module):
    """ConvNeXt-style residual block with a selectable activation.

    7x7 depthwise conv + batch norm, 1x1 expansion + activation, 1x1
    projection; the input is always added to the result.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels produced by the 1x1 projection.
        stride: Stored and used for ``do_identity`` only; every convolution in
            the block uses stride 1.
        expansion: Multiplier for the hidden width of the 1x1 expansion.
        activation: ``'gelu'`` or ``'relu6'``.

    Raises:
        ValueError: If ``activation`` is not one of the supported names.
    """

    def __init__(self, in_channels, out_channels, stride, expansion, activation='gelu'):
        super().__init__()
        self.stride, self.expansion, self.activation = stride, expansion, activation

        # Recorded for reference; forward() always adds the residual.
        self.do_identity = self.stride == 1 and in_channels == out_channels

        # Fail fast on an unsupported name instead of leaving `self.activate`
        # undefined and crashing later with an unrelated AttributeError.
        activations = {'gelu': nn.GELU, 'relu6': nn.ReLU6}
        if self.activation not in activations:
            raise ValueError(
                "Unsupported activation {!r}; expected one of {}".format(
                    self.activation, sorted(activations)))
        self.activate = activations[self.activation]()

        expansion_channels = in_channels * self.expansion

        self.depthwise = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=in_channels,
                      kernel_size=(7, 7), stride=1, padding=3, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
        )

        self.increase_channels = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=expansion_channels,
                      kernel_size=(1, 1), stride=1, padding=0, bias=False),
            self.activate
        )

        self.decrease_channels = nn.Sequential(
            nn.Conv2d(in_channels=expansion_channels, out_channels=out_channels,
                      kernel_size=(1, 1), stride=1, padding=0, bias=False),
        )

    def forward(self, x):
        """Apply depthwise -> expand -> project and add the input back."""
        out = self.depthwise(x)
        out = self.increase_channels(out)
        out = self.decrease_channels(out)
        return x + out

class myConvNext(nn.Module):
    """Configurable ConvNeXt-style network built from ``myConvNextBlock``s.

    Args:
        in_channels: Channels of the input image.
        num_classes: Output size of the linear classifier.
        channel_set: Channel width per stage.
        block_set: Number of blocks per stage (same length as ``channel_set``).
    """

    def __init__(self, in_channels, num_classes, channel_set, block_set):
        super().__init__()

        assert len(channel_set) == len(block_set)

        first_width = channel_set[0]
        self.stem = nn.Sequential(
            nn.Conv2d(in_channels, first_width, kernel_size=(4, 4), stride=4, padding=0, bias=False),
            nn.BatchNorm2d(first_width),
        )

        final_stage = len(block_set) - 1
        stack = []
        for stage_idx, depth in enumerate(block_set):
            width = channel_set[stage_idx]
            stack.extend(
                myConvNextBlock(in_channels=width, out_channels=width, stride=2, expansion=4)
                for _ in range(depth)
            )

            # Every stage except the last is followed by BN + 2x2/stride-2 conv
            # that moves to the next stage's width.
            if stage_idx != final_stage:
                stack.append(nn.Sequential(
                    nn.BatchNorm2d(width),
                    nn.Conv2d(width, channel_set[stage_idx + 1], kernel_size=(2, 2), stride=2),
                ))

        last_width = channel_set[-1]
        stack.append(nn.BatchNorm2d(last_width))
        self.main_blocks = nn.Sequential(*stack)

        self.flatten_layers = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.BatchNorm2d(last_width),
            nn.Flatten(),
        )
        self.cls_layer = nn.Sequential(
            nn.Linear(last_width, num_classes)
        )

        self.initialize_weights()

    def forward(self, x, return_feats=False):
        """Return class logits, or the flattened features when ``return_feats``."""
        feats = self.flatten_layers(self.main_blocks(self.stem(x)))
        logits = self.cls_layer(feats)
        return feats if return_feats else logits

    def initialize_weights(self):
        """Truncated-normal (std 0.2) init for conv / linear weights; zero linear biases."""
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.trunc_normal_(module.weight, std=0.2)
                continue
            if isinstance(module, nn.Linear):
                nn.init.trunc_normal_(module.weight, std=0.2)
                nn.init.constant_(module.bias, 0)

### ResNet

In [None]:
class Conv2dAuto(nn.Conv2d):
    """``nn.Conv2d`` whose padding is overwritten with half the kernel size per axis."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Any padding passed by the caller is replaced here.
        kernel_h, kernel_w = self.kernel_size
        self.padding = (kernel_h // 2, kernel_w // 2)


# Factory for a 3x3, bias-free Conv2dAuto.
conv3x3 = partial(Conv2dAuto, kernel_size=3, bias=False)
# conv = conv3x3(in_channels=32, out_channels=64)

def get_activation(option):
    """Return a newly constructed activation module selected by name.

    Supported names: ``'relu'``, ``'leaky_relu'``, ``'selu'`` (all in-place)
    and ``'none'`` (identity). Any other name raises ``KeyError``.
    """
    factories = {
        'relu': lambda: nn.ReLU(inplace=True),
        'leaky_relu': lambda: nn.LeakyReLU(negative_slope=0.01, inplace=True),
        'selu': lambda: nn.SELU(inplace=True),
        'none': lambda: nn.Identity(),
    }
    return factories[option]()

def resnet_block(in_channels, out_channels, conv, *args, **kwargs):
    """Build ``conv(in_channels, out_channels, ...)`` followed by a BatchNorm2d.

    Extra positional / keyword arguments are forwarded to ``conv``.
    """
    convolution = conv(in_channels, out_channels, *args, **kwargs)
    normalisation = nn.BatchNorm2d(out_channels)
    return nn.Sequential(convolution, normalisation)

class ResidualBlock(nn.Module):
    """Generic residual wrapper: ``activate(blocks(x) + shortcut-or-x)``.

    ``blocks`` and ``shortcut`` default to identity; subclasses replace them.
    The shortcut is applied only when ``apply_shortcut`` is true.
    """

    def __init__(self, in_channels, out_channels, activation='relu'):
        super().__init__()
        self.in_channels, self.out_channels = in_channels, out_channels
        self.activation = activation
        self.blocks = nn.Identity()
        self.activate = get_activation(activation)
        self.shortcut = nn.Identity()

    @property
    def apply_shortcut(self):
        """True when the input and output channel counts differ."""
        return self.in_channels != self.out_channels

    def forward(self, x):
        """Add the (optionally projected) input to ``blocks(x)`` and activate."""
        skip = self.shortcut(x) if self.apply_shortcut else x
        summed = self.blocks(x) + skip
        return self.activate(summed)

class ResNetResidualBlock(ResidualBlock):
    """Residual block with a 1x1-conv + BN projection shortcut.

    The projection is built whenever the input width differs from the
    *expanded* output width (``out_channels * expansion``).

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Base output width (before expansion).
        expansion: Multiplier giving the block's real output width.
        downsampling: Stride of the shortcut projection.
        conv: Convolution factory that subclasses use for the main path.
        *args, **kwargs: Forwarded to ``ResidualBlock`` (e.g. ``activation``).
    """

    def __init__(self, in_channels, out_channels, expansion=1, downsampling=1, conv=conv3x3, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        self.expansion = expansion
        self.downsampling = downsampling
        self.conv = conv
        # Identity (rather than None) keeps the attribute callable even when
        # no projection is required.
        self.shortcut = nn.Sequential(
            nn.Conv2d(self.in_channels, self.expanded_channels, kernel_size=1, stride=self.downsampling, bias=False),
            nn.BatchNorm2d(self.expanded_channels)
            ) if self.should_apply_shortcut else nn.Identity()

    @property
    def expanded_channels(self):
        """Actual number of channels the block outputs."""
        return self.out_channels * self.expansion

    @property
    def should_apply_shortcut(self):
        """True when the input width differs from the expanded output width."""
        return self.in_channels != self.expanded_channels

    @property
    def apply_shortcut(self):
        """Gate used by the inherited ``forward``.

        The base class compares ``in_channels`` with ``out_channels``. That
        disagrees with the condition under which the projection above is
        built whenever ``expansion != 1``, so forward and construction are
        made to use the same rule here.
        """
        return self.should_apply_shortcut

class ResNetBasicBlock(ResNetResidualBlock):
    """Two-convolution residual block: conv-BN, activation, conv-BN."""

    expansion = 1

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, **kwargs)
        # Only the first convolution carries the downsampling stride.
        first_stage = resnet_block(self.in_channels, self.out_channels,
                                   conv=self.conv, bias=False, stride=self.downsampling)
        inner_activation = get_activation(self.activation)
        second_stage = resnet_block(self.out_channels, self.expanded_channels,
                                    conv=self.conv, bias=False)
        self.blocks = nn.Sequential(first_stage, inner_activation, second_stage)

class ResNetBottleNeckBlock(ResNetResidualBlock):
    """Bottleneck residual block: 1x1 -> 3x3 (strided) -> 1x1, expansion 4."""

    expansion = 4

    def __init__(self, in_channels, out_channels, *args, **kwargs):
        super().__init__(in_channels, out_channels, *args, expansion=4, **kwargs)
        # (input width, output width, conv keyword arguments) per stage.
        stage_specs = (
            (self.in_channels, self.out_channels, {'kernel_size': 1}),
            (self.out_channels, self.out_channels, {'kernel_size': 3, 'stride': self.downsampling}),
            (self.out_channels, self.expanded_channels, {'kernel_size': 1}),
        )
        pieces = []
        for position, (width_in, width_out, conv_kwargs) in enumerate(stage_specs):
            # An activation sits between consecutive conv-BN stages.
            if position > 0:
                pieces.append(get_activation(self.activation))
            pieces.append(resnet_block(width_in, width_out, self.conv, **conv_kwargs))
        self.blocks = nn.Sequential(*pieces)
    

class ResNetLayer(nn.Module):
    """A stage of ``n`` residual blocks stacked one after another.

    The first block downsamples by 2 when the stage changes the channel
    width; the remaining ``n - 1`` blocks keep the resolution.
    """

    def __init__(self, in_channels, out_channels, block=ResNetBasicBlock, n=1, *args, **kwargs):
        super().__init__()

        downsampling = 1 if in_channels == out_channels else 2
        stage = [block(in_channels, out_channels, *args, **kwargs, downsampling=downsampling)]
        for _ in range(n - 1):
            # Later blocks consume the expanded output of the previous block.
            stage.append(block(out_channels * block.expansion,
                               out_channels, downsampling=1, *args, **kwargs))
        self.blocks = nn.Sequential(*stage)

    def forward(self, x):
        """Run the input through every block of the stage."""
        return self.blocks(x)

class ResNetEncoder(nn.Module):
    """ResNet feature extractor: a stem ("gate") followed by ``ResNetLayer`` stages.

    Args:
        in_channels: Channels of the input image.
        blocks_sizes: Base channel width of each stage.
        deepths: Number of blocks in each stage.
        activation: Activation name understood by ``get_activation``.
        block: Residual block class used in every stage.
        *args, **kwargs: Forwarded to every ``ResNetLayer``.
    """

    def __init__(self, in_channels=3, blocks_sizes=[64, 128, 256, 512], deepths=[2,2,2,2], activation='relu', block=ResNetBasicBlock, *args, **kwargs):
        super().__init__()
        self.blocks_sizes = blocks_sizes

        stem_width = self.blocks_sizes[0]
        # 7x7/2 conv -> BN -> activation -> 3x3/2 max pool.
        self.gate = nn.Sequential(
            nn.Conv2d(in_channels, stem_width, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(stem_width),
            get_activation(activation),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        )

        # Consecutive (input width, output width) pairs for stages 2..N.
        self.in_out_block_sizes = list(zip(blocks_sizes, blocks_sizes[1:]))

        stages = [
            ResNetLayer(blocks_sizes[0], blocks_sizes[0], n=deepths[0], activation=activation,
                        block=block, *args, **kwargs)
        ]
        for (width_in, width_out), depth in zip(self.in_out_block_sizes, deepths[1:]):
            stages.append(
                ResNetLayer(width_in * block.expansion,
                            width_out, n=depth, activation=activation,
                            block=block, *args, **kwargs)
            )
        self.blocks = nn.ModuleList(stages)

    def forward(self, x):
        """Apply the stem and then each stage in order."""
        features = self.gate(x)
        for stage in self.blocks:
            features = stage(features)
        return features

class ResNetDecoder(nn.Module):
    """Classification head: global average pool, flatten, linear layer."""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.avg = nn.AdaptiveAvgPool2d((1, 1))
        self.decoder = nn.Linear(in_channels, out_channels)

    def forward(self, x):
        """Pool to 1x1, collapse to (batch, features) and project."""
        pooled = self.avg(x)
        flat = pooled.view(pooled.size(0), -1)
        return self.decoder(flat)

class myResNet(nn.Module):
    """ResNet = ``ResNetEncoder`` followed by a ``ResNetDecoder`` head.

    Args:
        inc: Channels of the input image.
        outc: Number of outputs of the decoder's linear layer.
        *args, **kwargs: Forwarded to ``ResNetEncoder``.
    """

    def __init__(self, inc, outc, *args, **kwargs):
        super().__init__()
        self.encoder = ResNetEncoder(inc, *args, **kwargs)
        # The decoder's input width is the output width of the very last block.
        final_block = self.encoder.blocks[-1].blocks[-1]
        self.decoder = ResNetDecoder(final_block.expanded_channels, outc)

    def forward(self, x):
        """Encode the input and classify the resulting feature map."""
        return self.decoder(self.encoder(x))


In [None]:
# model = resnet34(in_channels=3, n_classes=7000)
# layers = list(model.children())[:-1]
# layers.append(nn.AdaptiveAvgPool2d((1, 1)))
# layers.append(nn.Flatten())
# layers.append(nn.Linear(512, 7000))
# backbone = nn.Sequential(*layers)
# # model = nn.Flatten(model)
# # model = models.resnet34(pretrained=False)
# summary(backbone, (3, 224, 224))

# FaceNet

In [None]:
def convnext(in_channels, n_classes, variant):
    """Build a ``myConvNext`` variant and return its layers without the classifier.

    Args:
        in_channels: Channels of the input image.
        n_classes: Number of classes passed to ``myConvNext``.
        variant: One of ``'tiny'``, ``'small'``, ``'big'``, ``'large'``.

    Returns:
        ``(layers, feature_size)`` where ``layers`` is every child module of
        the network except the last one, and ``feature_size`` is the width of
        the variant's final stage.

    Raises:
        KeyError: If ``variant`` is not a known name.
    """
    params = {
        'tiny': {'channel_set': [96, 192, 384, 768], 'block_set': [3, 3, 9, 3]},
        'small': {'channel_set': [96, 192, 384, 768], 'block_set': [3, 3, 27, 3]},
        'big': {'channel_set': [128, 256, 512, 1024], 'block_set': [3, 3, 27, 3]},
        'large': {'channel_set': [192, 384, 768, 1536], 'block_set': [3, 3, 27, 3]},
    }

    spec = params[variant]
    model = myConvNext(in_channels, n_classes, **spec)
    layers = list(model.children())[:-1]
    # Report the variant's real feature width instead of a fixed 768, which is
    # only correct for 'tiny' and 'small'.
    return layers, spec['channel_set'][-1]


def resnet34(in_channels, n_classes, block=ResNetBasicBlock, *args, **kwargs):
    """Build a [3, 4, 6, 3]-deep ``myResNet`` and return it without its last child.

    Args:
        in_channels: Channels of the input image.
        n_classes: Number of classes passed to ``myResNet``.
        block: Residual block class used in every stage.
        *args, **kwargs: Forwarded to ``myResNet``.

    Returns:
        ``(layers, feature_size)`` where ``layers`` is every child module of
        the network except the last one, and ``feature_size`` is the input
        width of the dropped linear layer (512 for the default basic block).
    """
    resnet = myResNet(in_channels, n_classes, block=block, deepths=[3, 4, 6, 3], *args, **kwargs)
    layers = list(resnet.children())[:-1]
    # Read the width from the model so blocks with expansion != 1 are
    # reported correctly instead of a hard-coded 512.
    return layers, resnet.decoder.decoder.in_features

In [None]:
# cm, size = convnext(3, 7000, 'tiny')
# backbone = nn.Sequential(*cm)
# cls_layer = nn.Sequential(
#     nn.AdaptiveAvgPool2d((1,1)),
#     nn.Flatten(),
#     nn.Linear(size, 7000))

# summary(backbone, (3, 224, 224))

In [None]:
class FaceNet(nn.Module):
    """Face classifier with a configurable backbone and a 7000-way linear head.

    ``config['backbone']`` selects the feature extractor:

    * ``'simple'``    - conv/BN/ReLU stack described by ``config['arch']``. Each
      entry has ``'conv'`` (``[in, out, kernel, stride, padding]``) and
      ``'pool'`` (``None`` or ``{'max': bool, 'output': size}``). The flattened
      output must have 512 features to match the classifier.
    * ``'resnet_34'`` - encoder returned by ``resnet34``.
    * ``'convnext'``  - layers returned by ``convnext`` (variant ``'tiny'``).

    Raises:
        ValueError: If ``config['backbone']`` is not one of the names above.
    """

    def list_to_kwarg(self, inc, outc, kernel, s, p):
        """Turn a positional conv spec into ``nn.Conv2d`` keyword arguments."""
        return {
            "in_channels": inc,
            "out_channels": outc,
            "kernel_size": kernel,
            "stride": s,
            "padding": p,
        }

    def __init__(self, config):
        super().__init__()
        num_classes = 7000
        num_channels = 3
        backbone_name = config['backbone']

        if backbone_name == 'simple':
            backbone_layers = []
            for l_params in config['arch'].values():
                conv_params = self.list_to_kwarg(*l_params["conv"])
                backbone_layers.append(nn.Conv2d(**conv_params))
                backbone_layers.append(nn.BatchNorm2d(conv_params["out_channels"]))
                backbone_layers.append(nn.ReLU())
                if l_params["pool"] is not None:
                    if l_params["pool"]["max"]:
                        backbone_layers.append(nn.AdaptiveMaxPool2d(l_params["pool"]["output"]))
                    else:
                        backbone_layers.append(nn.AdaptiveAvgPool2d(l_params["pool"]["output"]))
            backbone_layers.append(nn.Flatten())
            self.backbone = nn.Sequential(*backbone_layers)
            # The backbone already ends with Flatten; forward() still needs
            # this attribute to exist.
            self.flatten_layers = nn.Identity()
            self.cls_layer = nn.Linear(512, num_classes)

        elif backbone_name == 'resnet_34':
            backbone_layers, size = resnet34(num_channels, num_classes)
            self.backbone = nn.Sequential(*backbone_layers)
            self.flatten_layers = nn.Sequential(
                nn.AdaptiveAvgPool2d((1, 1)),
                nn.BatchNorm2d(size),
                nn.Flatten(),
            )
            self.cls_layer = nn.Sequential(nn.Linear(size, num_classes))

        elif backbone_name == 'convnext':
            backbone_layers, size = convnext(num_channels, num_classes, 'tiny')
            self.backbone = nn.Sequential(*backbone_layers)
            # The layers returned by convnext() already pool and flatten.
            self.flatten_layers = nn.Identity()
            self.cls_layer = nn.Sequential(nn.Linear(size, num_classes))

        else:
            # Fail at construction time rather than with an AttributeError on
            # the first forward pass.
            raise ValueError(
                "Unknown backbone {!r}; expected 'simple', 'resnet_34' or 'convnext'".format(backbone_name))

        self.initialize_weights()

    def forward(self, x, return_feats=False):
        """Return class logits, or the flattened features when ``return_feats``."""
        feats = self.backbone(x)
        feats = self.flatten_layers(feats)
        if return_feats:
            return feats
        return self.cls_layer(feats)

    def initialize_weights(self):
        """Truncated-normal (std 0.2) init for conv / linear weights; zero linear biases.

        Batch-norm layers keep PyTorch's default initialisation.
        """
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.trunc_normal_(m.weight, std=0.2)
            elif isinstance(m, nn.Linear):
                nn.init.trunc_normal_(m.weight, std=0.2)
                nn.init.constant_(m.bias, 0)

# FaceNet Training

In [None]:
class ArcFace(torch.nn.Module):
    """ ArcFace (https://arxiv.org/pdf/1801.07698v1.pdf):

    Adds an angular margin to the target-class logit of every labelled row
    and scales all logits by ``s``.

    Args:
        s: Scale applied to every logit.
        margin: Angular margin (radians) added to the target-class angle.
    """
    def __init__(self, s=64.0, margin=0.5):
        super(ArcFace, self).__init__()
        self.scale = s
        self.cos_m = math.cos(margin)
        self.sin_m = math.sin(margin)
        self.theta = math.cos(math.pi - margin)
        self.sinmm = math.sin(math.pi - margin) * margin
        self.easy_margin = False

    def forward(self, logits: torch.Tensor, labels: torch.Tensor):
        """Return margin-adjusted, scaled logits.

        Args:
            logits: Per-class scores, indexed as ``[row, class]``. Treated as
                cosines by the margin formula.
            labels: Target class per row; rows labelled ``-1`` are only scaled.

        Returns:
            A new tensor; ``logits`` itself is left untouched.
        """
        index = torch.where(labels != -1)[0]
        target_cols = labels[index].view(-1)
        target_logit = logits[index, target_cols]

        # Clamp so a logit that drifts marginally past +-1 gives sin = 0
        # instead of sqrt(negative) = NaN.
        sin_theta = torch.sqrt((1.0 - torch.pow(target_logit, 2)).clamp(min=0.0))
        cos_theta_m = target_logit * self.cos_m - sin_theta * self.sin_m  # cos(target+margin)
        if self.easy_margin:
            final_target_logit = torch.where(
                target_logit > 0, cos_theta_m, target_logit)
        else:
            final_target_logit = torch.where(
                target_logit > self.theta, cos_theta_m, target_logit - self.sinmm)

        # Work on a copy: writing into the caller's tensor would silently
        # change it and fails outright on leaf tensors that require grad.
        adjusted = logits.clone()
        adjusted[index, target_cols] = final_target_logit
        return adjusted * self.scale

class CosFace(torch.nn.Module):
    """Additive cosine-margin head: subtract ``m`` from the target logit, scale by ``s``.

    Args:
        s: Scale applied to every logit.
        m: Margin subtracted from the target-class logit.
    """
    def __init__(self, s=64.0, m=0.40):
        super(CosFace, self).__init__()
        self.s = s
        self.m = m

    def forward(self, logits: torch.Tensor, labels: torch.Tensor):
        """Return margin-adjusted, scaled logits.

        Rows whose label is ``-1`` are only scaled. The input tensor is left
        untouched; a new tensor is returned.
        """
        index = torch.where(labels != -1)[0]
        target_cols = labels[index].view(-1)
        final_target_logit = logits[index, target_cols] - self.m

        # Work on a copy: writing into the caller's tensor would silently
        # change it and fails outright on leaf tensors that require grad.
        adjusted = logits.clone()
        adjusted[index, target_cols] = final_target_logit
        return adjusted * self.s

In [None]:
from torchvision.transforms.transforms import ToTensor
class FaceNetSetup:
    """Build, train, validate and checkpoint the face-classification model.

    Args:
        config: Experiment dictionary. Keys read by this class: ``''``
            (experiment label), ``log``, ``subset``, ``randomize``,
            ``transforms``, ``batch_size``, ``epochs``, ``loss``,
            ``optimizer``, ``optim`` (optimizer kwargs) and ``scheduler``.
        save_path: Directory under which the per-model folder is created.
    """

    def __init__(self, config, save_path):
        self.config = config
        self.log = config['log']

        if config['subset']:
            train_path = r"train_subset/train_subset"
        else:
            train_path = r"classification/classification/train"
        self.SAVE_DIR = save_path
        self.DATA_DIR = r"/content"
        self.TRAIN_DIR = osp.join(self.DATA_DIR, train_path)
        self.VAL_DIR = osp.join(self.DATA_DIR, r"classification/classification/dev")

    def __check_model_params(self):
        """Print the parameter count and assert it is at most 35M."""
        # Counts every parameter of the model, whether or not it requires grad.
        num_trainable_parameters = sum(p.numel() for p in self.model.parameters())
        print("Number of Params: {}".format(num_trainable_parameters))
        assert num_trainable_parameters <= 35000000

    def __gen_model_name(self):
        """Derive the model folder name from the config entries.

        Each entry contributes ``<abbr><value>_`` where ``abbr`` is the first
        letter of the key (or the whole key when it has at most two
        characters). The first dict-valued entry contributes ``lr<value>`` and
        ends the name, so keys placed after it are not encoded.
        """
        save_name = ''
        if not self.config['subset']:
            save_name += "Full_"
        for key, val in self.config.items():
            abbr = key[0] if len(key) > 2 else key
            if isinstance(val, dict):
                save_name += 'lr' + str(val["lr"])
                break
            save_name += abbr + str(val) + '_'
        if self.config['randomize']:
            save_name = save_name + "-v" + str(np.random.randint(10, 1000))
        print("\nModel Name: ", save_name)

        return save_name

    def __save_model_params(self, continue_train):
        """Create the model directory (fresh runs only) and return its path.

        For a fresh run the directory, its ``Checkpoints`` sub-folder and a
        ``model_config.yaml`` dump are created. When continuing training
        nothing is created and the existing path is returned.
        """
        # Create Model Directory
        save_path = os.path.join(self.SAVE_DIR, self.model_name)
        if not continue_train:
            try:
                os.mkdir(save_path)
            except FileExistsError:
                d = input("Model name already exists. Delete existing model? (y/n)")
                if d == 'y':
                    import shutil
                    shutil.rmtree(save_path)
                    os.mkdir(save_path)
                else:
                    print("Exiting!")
                    exit(0)
                    # NOTE(review): presumably kept because exit() can return
                    # in some interactive shells - confirm before removing.
                    return None

            os.mkdir(os.path.join(save_path, 'Checkpoints'))
            # Saving Model Configuration
            with open(os.path.join(save_path, 'model_config.yaml'), 'w') as metadata:
                yaml.dump({'Experiment': self.config['']}, metadata, indent=4, default_flow_style=False)
                yaml.dump(self.config, metadata, indent=4, default_flow_style=False)
            print("Model to be saved at: ", save_path)
        return save_path

    def __dataloaders(self):
        """Create the train/validation datasets and their loaders.

        Transforms (data augmentation) is quite important for this task.
        Go explore https://pytorch.org/vision/stable/transforms.html for more details
        """
        if self.config["transforms"]:
            self.train_transforms = [
                # ttf.RandomRotation(10, expand=False),
                ttf.ColorJitter(brightness=(0.8, 1.2), contrast=(0.8, 1.2), saturation=(0.8, 1.2)),
                ttf.RandomHorizontalFlip(p=0.4),
                ttf.RandomResizedCrop((224, 224), scale=(0.3, 1)),
                ttf.RandAugment(),
                ttf.ToTensor(),
                # ttf.PILToTensor(),
                # ttf.ConvertImageDtype(torch.float),
                # ttf.RandomErasing(p=0.4),
            ]
        else:
            self.train_transforms = [ttf.ToTensor()]
        # Validation images are never augmented.
        self.val_transforms = [ttf.ToTensor()]

        self.train_dataset = torchvision.datasets.ImageFolder(self.TRAIN_DIR, transform=ttf.Compose(self.train_transforms))
        self.val_dataset = torchvision.datasets.ImageFolder(self.VAL_DIR, transform=ttf.Compose(self.val_transforms))

        self.train_loader = DataLoader(self.train_dataset, batch_size=self.config['batch_size'], shuffle=True, drop_last=True, num_workers=2, pin_memory=True)
        # drop_last must stay False here: validate() divides by the full
        # dataset size, so dropping the last partial batch would count those
        # samples as wrong and under-report accuracy.
        self.val_loader = DataLoader(self.val_dataset, batch_size=self.config['batch_size'], shuffle=False, drop_last=False, num_workers=2)

    def setup(self, continue_train=False, chkpt=None):
        """Instantiate data loaders, model, loss, optimizer and scheduler.

        Args:
            continue_train: When True, model and optimizer state are restored
                from ``Checkpoints/chkpt_<chkpt>.pth`` in the model directory.
            chkpt: Checkpoint number to resume from (required when
                ``continue_train`` is True).

        Raises:
            ValueError: If the configured loss or optimizer name is unknown.
        """
        self.__dataloaders()
        # Model
        # self.model = FaceNet(self.config)
        # conv_params = {'channel_set': [96, 192, 384, 768], 'block_set': [3, 3, 9, 3]}
        # self.model = myConvNext(3, 7000, **conv_params)
        self.model = ConvNext()
        self.model.cuda()
        summary(self.model, (3, 224, 224))

        self.__check_model_params()
        self.model_name = self.__gen_model_name()
        self.model_path = self.__save_model_params(continue_train)

        # Loss
        loss_name = self.config["loss"]
        if loss_name == 'CrossEL':
            self.criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
        elif loss_name == 'ArcFace':
            # Assignment (the previous `==` only compared and never set the attribute).
            # NOTE(review): ArcFace.forward returns margin-adjusted logits, not
            # a scalar loss - it likely needs to be followed by cross-entropy
            # before train() can call backward(); confirm intended usage.
            self.criterion = ArcFace()
        else:
            raise ValueError("Unknown loss: {!r}".format(loss_name))

        # Optimizer
        optimizer_classes = {'SGD': optim.SGD, 'Adam': optim.Adam, 'AdamW': optim.AdamW}
        optimizer_name = self.config["optimizer"]
        if optimizer_name not in optimizer_classes:
            raise ValueError("Unknown optimizer: {!r}".format(optimizer_name))
        self.optimizer = optimizer_classes[optimizer_name](self.model.parameters(), **self.config['optim'])

        if continue_train:
            self.chkpt = chkpt
            assert chkpt is not None
            chkpt_path = os.path.join(self.model_path, 'Checkpoints', 'chkpt_' + str(chkpt) + '.pth')
            checkpoint = torch.load(chkpt_path)
            print(checkpoint.keys())
            self.model.load_state_dict(checkpoint['model_state_dict'])
            self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
            print("Continuing training from epoch ", checkpoint["epoch"] )

        # Scheduler (any other name means "no scheduler"; train() checks the name too)
        if self.config["scheduler"] == 'CosineAnnealingLR':
            self.scheduler = optim.lr_scheduler.CosineAnnealingLR(self.optimizer, T_max=(len(self.train_loader) * self.config['epochs']))
        elif self.config["scheduler"] == 'ReduceLRonPlateau':
            self.scheduler = optim.lr_scheduler.ReduceLROnPlateau(self.optimizer, mode='max',factor=0.5, patience=2)

        self.scaler = torch.cuda.amp.GradScaler()

    def train(self, validate=False, continue_train=False):
        """Run the mixed-precision training loop and return the model.

        Args:
            validate: Run validation after every epoch. Validation always runs
                when the scheduler is ``ReduceLRonPlateau`` because the
                scheduler steps on validation accuracy.
            continue_train: Offset checkpoint numbers by the checkpoint that
                ``setup`` resumed from.
        """
        chkpt = self.chkpt if continue_train else 0
        # epochs = self.config['epochs'] - chkpt
        epochs = self.config['epochs']
        batch_size = self.config['batch_size']
        scheduler_name = self.config["scheduler"]
        num_batches = len(self.train_loader)

        if self.log:
            wandb.init(project="hw2-letsgo", entity="nefario7", config=self.config)
            wandb.watch(self.model, criterion=self.criterion, log="all", log_freq=batch_size, idx=None,log_graph=True)

        for epoch in range(epochs):
            self.model.train()
            print("-"*25 + "Epoch " + str(epoch) + "-"*25)
            # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
            batch_bar = tqdm(total=num_batches, dynamic_ncols=True, leave=False, position=0, desc='Train')

            num_correct = 0
            total_loss = 0.0

            for i, (x, y) in enumerate(self.train_loader):
                self.optimizer.zero_grad()

                x = x.cuda()
                y = y.cuda()

                # Don't be surprised - we just wrap these two lines to make it work for FP16
                with torch.cuda.amp.autocast():
                    outputs = self.model(x)
                    loss = self.criterion(outputs, y)

                # Update # correct & loss as we go
                num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
                # Accumulate a plain float: summing the loss tensors themselves
                # would keep every batch's autograd graph alive for the epoch.
                total_loss += float(loss.item())

                # tqdm lets you add some details so you can monitor training as you train.
                batch_bar.set_postfix(
                    acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
                    loss="{:.04f}".format(float(total_loss / (i + 1))),
                    num_correct=num_correct,
                    lr="{:.04f}".format(float(self.optimizer.param_groups[0]['lr'])))

                # Another couple things you need for FP16.
                self.scaler.scale(loss).backward() # This is a replacement for loss.backward()
                self.scaler.step(self.optimizer) # This is a replacement for optimizer.step()
                self.scaler.update() # This is something added just for FP16
                if scheduler_name == 'CosineAnnealingLR':
                    self.scheduler.step() # We told scheduler T_max that we'd call step() (len(train_loader) * epochs) many times.

                batch_bar.update() # Update tqdm bar

            batch_bar.close() # You need this to close the tqdm bar

            # The train loader drops the last partial batch, so every batch is full.
            trainacc = 100 * num_correct / (num_batches * batch_size)
            trainlos = float(total_loss / num_batches)
            trainlra = float(self.optimizer.param_groups[0]['lr'])
            print(f"Epoch {epoch + 1}/{epochs}: Train Acc {trainacc:.04f}%, Train Loss {trainlos:.04f}, Learning Rate {trainlra:.04f}")
            print(f"Total Loss = {total_loss}, Train Loader = {num_batches}")

            if self.log:
                wandb.log({
                    "Training Accuracy": trainacc,
                    "Training Loss": trainlos,
                    "Num Correct": num_correct,
                    "Learning Rate": trainlra
                })
            if scheduler_name == 'ReduceLRonPlateau':
                val_acc = self.validate(self.model)
                self.model.train()
                # The plateau scheduler steps once per epoch on validation accuracy (mode='max').
                self.scheduler.step(val_acc)
            elif validate:
                val_acc = self.validate(self.model)

            if epoch % 10 == 0:
                self.save_model(epoch)

            # Save Checkpoint
            self.save_checkpoint(epoch + chkpt + 1, self.model, self.optimizer, total_loss / num_batches)

        return self.model

    def validate(self, val_model):
        """Return the accuracy (in percent) of ``val_model`` on the validation set."""
        val_model.eval()
        batch_bar = tqdm(total=len(self.val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
        num_correct = 0
        num_seen = 0
        for x, y in self.val_loader:
            x = x.cuda()
            y = y.cuda()

            with torch.no_grad():
                outputs = val_model(x)

            num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
            # Track the real sample count so the running figure stays correct
            # on the final, possibly smaller, batch.
            num_seen += y.size(0)
            batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / num_seen))

            batch_bar.update()

        batch_bar.close()
        val_acc = 100 * num_correct / len(self.val_dataset)
        print("\nValidation: {:.04f}%".format(val_acc))
        if self.log:
            wandb.log({"Validation Accuracy": val_acc})

        return val_acc

    def save_checkpoint(self, epoch, model, optimizer, loss):
        """Write model and optimizer state to ``Checkpoints/chkpt_<epoch>.pth``."""
        print("\nSaving Checkpoint!")
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'loss': loss
            }, os.path.join(self.model_path, 'Checkpoints', 'chkpt_' + str(epoch) + '.pth'))

    def save_model(self, epoch, onnx=False):
        """Save the model weights as ``model_<epoch>.pth`` in the model directory.

        Args:
            epoch: Label used in the file name.
            onnx: Also export an ONNX file next to the weights and hand it to
                ``wandb.save``.
        """
        # if save_best:
        #     torch.save(self.model.state_dict(), os.path.join(self.model_path, "best_model.pth"))
        # else:
        print("\nSaving Model!")
        name = os.path.join(self.model_path, "model_" + str(epoch) + ".pth")
        torch.save(self.model.state_dict(), name)
        if onnx:
            # splitext strips only the final extension; splitting on the first
            # '.' would truncate paths such as ".../..._lr0.1/model_0.pth".
            onnx_path = os.path.splitext(name)[0] + '.onnx'
            # The exporter needs an example input; use the same 3x224x224
            # shape that setup() passes to summary().
            device = next(self.model.parameters()).device
            dummy_input = torch.randn(1, 3, 224, 224, device=device)
            torch.onnx.export(self.model, dummy_input, onnx_path)
            wandb.save(onnx_path)

        print("Model saved at : ", self.model_path)

In [None]:
def run(config, val, folder = 'working'):
    """Train one model from scratch, save it, validate it and return the setup object.

    Args:
        config: Experiment configuration passed to ``FaceNetSetup``.
        val: Whether to validate after every training epoch.
        folder: Suffix of the ``hw2-<folder>`` directory on the mounted drive.
    """
    torch.cuda.empty_cache()

    # FaceNet
    folder_path = r'/content/cmudrive/IDL/hw2-' + folder
    face = FaceNetSetup(config, save_path =folder_path)
    face.setup()
    # Model Training
    facenet_model = face.train(validate=val)
    # Save Trained Model
    # save_model() requires an epoch label; calling it without one raised a
    # TypeError. The configured epoch count labels the final weights.
    face.save_model(config['epochs'])
    # Validation
    face.validate(facenet_model)
    if face.log:
        wandb.finish()

    return face

def loop_run(config, tests, folder):
    """Run one training session per override in ``tests``.

    Each element of ``tests`` is a dict whose first item names the config key
    to override. A list value is read as ``[new_value, optimizer_kwargs]`` and
    also replaces ``config['optim']``. Overrides are written into ``config``
    itself, so they persist into the following runs.
    """
    for override in tests:
        key, value = tuple(override.items())[0]
        if not isinstance(value, list):
            config[key] = value
        else:
            config[key] = value[0]
            config['optim'] = value[1]
        print(config)

        finished = run(config, True, folder=folder)

        # Drop the reference to the trained model before the next run.
        del finished.model


# Hyperparameters and Run

In [None]:
# config = {
#     '': 'convnext',
#     'batch_size': 256,
#     'transforms': True,
#     'epochs': 100,
#     'backbone': 'convnext',
#     'dropout': None,
#     'optimizer': 'SGD',         # SGD, Adam, AdamW
#     'loss': 'CrossEL',          # CrossEL, 
#     'optim': {'lr': 0.1, 'momentum':0.9, 'weight_decay':1e-4},
#     'scheduler': 'CosineAnnealingLR', 
#     'subset': False,
#     'save': True,
#     'log': True,
#     'randomize': False,
# }

# folder_path = r'/content/cmudrive/IDL/hw2-' + 'letsgo'
# face = FaceNetSetup(config, save_path=folder_path)
# face.setup(continue_train=True, chkpt=67)
# face.train(continue_train=True, validate=False)

In [None]:
# Experiment configuration consumed by FaceNetSetup.
# Key order matters: the model folder name is built from the entries in order
# and stops at the first dict value ('optim').
config = {
    "": "ConvNext",                      # experiment label
    "batch_size": 256,
    "transforms": True,                  # enable training-time augmentation
    "epochs": 50,
    "backbone": "convnext",
    "dropout": None,
    "optimizer": "SGD",                  # SGD, Adam, AdamW
    "loss": "CrossEL",                   # CrossEL
    "scheduler": "ReduceLRonPlateau",    # CosineAnnealingLR, ReduceLRonPlateau
    "optim": {                           # keyword arguments for the optimizer
        "lr": 0.1,
        "momentum": 0.9,
        "weight_decay": 1e-4,
        "nesterov": True,
    },
    "subset": False,                     # train on the reduced training subset
    "save": True,
    "log": True,                         # log to Weights & Biases
    "randomize": False,                  # append a random suffix to the model name
}

In [None]:
# Loop Test
# tests = [
    # {'transforms': False}, 
    # {'transforms': True}, 
    # {'optimizer': ['Adam', {'lr':0.01}]}, 
    # {'optimizer': ['AdamW', {'lr':0.01, 'weight_decay':1e-4}]}
# ]

# Standalone Test
# Release cached GPU memory left over from earlier cells.
torch.cuda.empty_cache()

# Fresh-run variant, kept for reference:
# # FaceNet
# folder_path = r'/content/cmudrive/IDL/hw2-' + 'please'
# face = FaceNetSetup(config, save_path = folder_path)
# face.setup()

# # Model Training
# facenet_model = face.train(validate=True)

# # Save Trained Model
# face.save_model()

# # Validation
# face.validate(facenet_model)

# if face.log:
#     wandb.finish()


# Resume the 'please' experiment from checkpoint 50 and keep training.
folder_path = r'/content/cmudrive/IDL/hw2-please'
face = FaceNetSetup(config, save_path=folder_path)
face.setup(chkpt=50, continue_train=True)
face.train(validate=False, continue_train=True)

Layer (type:depth-idx)                   Output Shape              Param #
├─Sequential: 1-1                        [-1, 96, 56, 56]          --
|    └─Conv2d: 2-1                       [-1, 96, 56, 56]          4,704
|    └─BatchNorm2d: 2-2                  [-1, 96, 56, 56]          192
├─Sequential: 1-2                        [-1, 768, 7, 7]           --
|    └─Identity: 2-3                     [-1, 96, 56, 56]          --
|    └─ConvNextBlock: 2-4                [-1, 96, 56, 56]          --
|    |    └─Sequential: 3-1              [-1, 96, 56, 56]          4,896
|    |    └─Sequential: 3-2              [-1, 384, 56, 56]         36,864
|    |    └─Sequential: 3-3              [-1, 96, 56, 56]          36,864
|    └─ConvNextBlock: 2-5                [-1, 96, 56, 56]          --
|    |    └─Sequential: 3-4              [-1, 96, 56, 56]          4,896
|    |    └─Sequential: 3-5              [-1, 384, 56, 56]         36,864
|    |    └─Sequential: 3-6              [-1, 96, 56, 56]      

[34m[1mwandb[0m: Currently logged in as: [33mnefario7[0m (use `wandb login --relogin` to force relogin)


[34m[1mwandb[0m: logging graph, to disable use `wandb.watch(log_graph=False)`


-------------------------Epoch 0-------------------------




Epoch 1/50: Train Acc 99.0113%, Train Loss 1.4731, Learning Rate 0.0063
Total Loss = 804.28759765625, Train Loader = 546





Validation: 85.9543%

Saving Model!
Model saved at :  /content/cmudrive/IDL/hw2-please/Full_ConvNext_b256_tTrue_e50_bconvnext_dNone_oSGD_lCrossEL_sReduceLRonPlateau_lr0.1

Saving Checkpoint!
-------------------------Epoch 1-------------------------




Epoch 2/50: Train Acc 99.0685%, Train Loss 1.4697, Learning Rate 0.0063
Total Loss = 802.434326171875, Train Loader = 546





Validation: 85.8514%

Saving Checkpoint!
-------------------------Epoch 2-------------------------




Epoch 3/50: Train Acc 99.0256%, Train Loss 1.4719, Learning Rate 0.0063
Total Loss = 803.6770629882812, Train Loader = 546





Validation: 86.1771%

Saving Checkpoint!
-------------------------Epoch 3-------------------------




Epoch 4/50: Train Acc 99.0220%, Train Loss 1.4693, Learning Rate 0.0063
Total Loss = 802.2288208007812, Train Loader = 546





Validation: 86.1229%

Saving Checkpoint!
-------------------------Epoch 4-------------------------




Epoch 5/50: Train Acc 99.0399%, Train Loss 1.4682, Learning Rate 0.0063
Total Loss = 801.661376953125, Train Loader = 546





Validation: 85.9886%

Saving Checkpoint!
-------------------------Epoch 5-------------------------




Epoch 6/50: Train Acc 99.0063%, Train Loss 1.4679, Learning Rate 0.0063
Total Loss = 801.4757690429688, Train Loader = 546





Validation: 86.1457%

Saving Checkpoint!
-------------------------Epoch 6-------------------------




Epoch 7/50: Train Acc 99.1393%, Train Loss 1.4578, Learning Rate 0.0031
Total Loss = 795.959716796875, Train Loader = 546





Validation: 86.6057%

Saving Checkpoint!
-------------------------Epoch 7-------------------------




Epoch 8/50: Train Acc 99.1758%, Train Loss 1.4555, Learning Rate 0.0031
Total Loss = 794.7139282226562, Train Loader = 546





Validation: 86.5657%

Saving Checkpoint!
-------------------------Epoch 8-------------------------




Epoch 9/50: Train Acc 99.1293%, Train Loss 1.4564, Learning Rate 0.0031
Total Loss = 795.1715698242188, Train Loader = 546





Validation: 86.4943%

Saving Checkpoint!
-------------------------Epoch 9-------------------------




Epoch 10/50: Train Acc 99.1272%, Train Loss 1.4558, Learning Rate 0.0031
Total Loss = 794.8919677734375, Train Loader = 546





Validation: 86.5286%

Saving Checkpoint!
-------------------------Epoch 10-------------------------




Epoch 11/50: Train Acc 99.1851%, Train Loss 1.4500, Learning Rate 0.0016
Total Loss = 791.7093505859375, Train Loader = 546





Validation: 86.6571%

Saving Model!
Model saved at :  /content/cmudrive/IDL/hw2-please/Full_ConvNext_b256_tTrue_e50_bconvnext_dNone_oSGD_lCrossEL_sReduceLRonPlateau_lr0.1

Saving Checkpoint!
-------------------------Epoch 11-------------------------




Epoch 12/50: Train Acc 99.1873%, Train Loss 1.4506, Learning Rate 0.0016
Total Loss = 792.03173828125, Train Loader = 546





Validation: 86.8371%

Saving Checkpoint!
-------------------------Epoch 12-------------------------




Epoch 13/50: Train Acc 99.2109%, Train Loss 1.4484, Learning Rate 0.0016
Total Loss = 790.8237915039062, Train Loader = 546





Validation: 86.6886%

Saving Checkpoint!
-------------------------Epoch 13-------------------------




Epoch 14/50: Train Acc 99.2230%, Train Loss 1.4476, Learning Rate 0.0016
Total Loss = 790.3978271484375, Train Loader = 546





Validation: 86.7457%

Saving Checkpoint!
-------------------------Epoch 14-------------------------




Epoch 15/50: Train Acc 99.1994%, Train Loss 1.4485, Learning Rate 0.0016
Total Loss = 790.870849609375, Train Loader = 546





Validation: 86.7000%

Saving Checkpoint!
-------------------------Epoch 15-------------------------




Epoch 16/50: Train Acc 99.2202%, Train Loss 1.4466, Learning Rate 0.0008
Total Loss = 789.86669921875, Train Loader = 546





Validation: 86.8429%

Saving Checkpoint!
-------------------------Epoch 16-------------------------




Epoch 17/50: Train Acc 99.1994%, Train Loss 1.4473, Learning Rate 0.0008
Total Loss = 790.2464599609375, Train Loader = 546





Validation: 86.8229%

Saving Checkpoint!
-------------------------Epoch 17-------------------------




Epoch 18/50: Train Acc 99.2517%, Train Loss 1.4454, Learning Rate 0.0008
Total Loss = 789.1870727539062, Train Loader = 546





Validation: 86.8429%

Saving Checkpoint!
-------------------------Epoch 18-------------------------




Epoch 19/50: Train Acc 99.2152%, Train Loss 1.4451, Learning Rate 0.0004
Total Loss = 789.0362548828125, Train Loader = 546





Validation: 86.8457%

Saving Checkpoint!
-------------------------Epoch 19-------------------------




Epoch 20/50: Train Acc 99.2352%, Train Loss 1.4444, Learning Rate 0.0004
Total Loss = 788.6648559570312, Train Loader = 546





Validation: 86.8800%

Saving Checkpoint!
-------------------------Epoch 20-------------------------




Epoch 21/50: Train Acc 99.2137%, Train Loss 1.4449, Learning Rate 0.0004
Total Loss = 788.9308471679688, Train Loader = 546





Validation: 86.9143%

Saving Model!
Model saved at :  /content/cmudrive/IDL/hw2-please/Full_ConvNext_b256_tTrue_e50_bconvnext_dNone_oSGD_lCrossEL_sReduceLRonPlateau_lr0.1

Saving Checkpoint!
-------------------------Epoch 21-------------------------




Epoch 22/50: Train Acc 99.2595%, Train Loss 1.4441, Learning Rate 0.0004
Total Loss = 788.4917602539062, Train Loader = 546





Validation: 86.8457%

Saving Checkpoint!
-------------------------Epoch 22-------------------------




Epoch 23/50: Train Acc 99.1916%, Train Loss 1.4444, Learning Rate 0.0004
Total Loss = 788.6183471679688, Train Loader = 546





Validation: 86.8571%

Saving Checkpoint!
-------------------------Epoch 23-------------------------




Epoch 24/50: Train Acc 99.2288%, Train Loss 1.4436, Learning Rate 0.0004
Total Loss = 788.2271118164062, Train Loader = 546





Validation: 86.8314%

Saving Checkpoint!
-------------------------Epoch 24-------------------------




Epoch 25/50: Train Acc 99.2552%, Train Loss 1.4435, Learning Rate 0.0002
Total Loss = 788.138671875, Train Loader = 546





Validation: 86.8514%

Saving Checkpoint!
-------------------------Epoch 25-------------------------




Epoch 26/50: Train Acc 99.2624%, Train Loss 1.4424, Learning Rate 0.0002
Total Loss = 787.55712890625, Train Loader = 546





Validation: 86.8714%

Saving Checkpoint!
-------------------------Epoch 26-------------------------




Epoch 27/50: Train Acc 99.2731%, Train Loss 1.4422, Learning Rate 0.0002
Total Loss = 787.4480590820312, Train Loader = 546





Validation: 86.8914%

Saving Checkpoint!
-------------------------Epoch 27-------------------------




Epoch 28/50: Train Acc 99.2338%, Train Loss 1.4427, Learning Rate 0.0001
Total Loss = 787.6932373046875, Train Loader = 546





Validation: 86.9057%

Saving Checkpoint!
-------------------------Epoch 28-------------------------




Epoch 29/50: Train Acc 99.2438%, Train Loss 1.4426, Learning Rate 0.0001
Total Loss = 787.6332397460938, Train Loader = 546





Validation: 86.8857%

Saving Checkpoint!
-------------------------Epoch 29-------------------------




Epoch 30/50: Train Acc 99.2531%, Train Loss 1.4423, Learning Rate 0.0001
Total Loss = 787.5050659179688, Train Loader = 546





Validation: 86.8771%

Saving Checkpoint!
-------------------------Epoch 30-------------------------




Epoch 31/50: Train Acc 99.2445%, Train Loss 1.4432, Learning Rate 0.0000
Total Loss = 787.9873657226562, Train Loader = 546





Validation: 86.9229%

Saving Model!
Model saved at :  /content/cmudrive/IDL/hw2-please/Full_ConvNext_b256_tTrue_e50_bconvnext_dNone_oSGD_lCrossEL_sReduceLRonPlateau_lr0.1

Saving Checkpoint!
-------------------------Epoch 31-------------------------




Epoch 32/50: Train Acc 99.2710%, Train Loss 1.4421, Learning Rate 0.0000
Total Loss = 787.37548828125, Train Loader = 546





Validation: 86.8914%

Saving Checkpoint!
-------------------------Epoch 32-------------------------


Train:  55%|█████▌    | 301/546 [08:56<07:05,  1.74s/it, acc=99.2395%, loss=1.4424, lr=0.0000, num_correct=76470]

KeyboardInterrupt: ignored

# Classification Task: Submit to Kaggle

In [None]:
class ClassificationTestSet(Dataset):
    """Unlabelled image dataset over every file in a flat directory.

    It's possible to load test set data using ImageFolder without making a
    custom class. See if you can think it through!
    """

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Full paths of the directory entries, ordered by file name.
        self.img_paths = [
            osp.join(self.data_dir, fname)
            for fname in sorted(os.listdir(self.data_dir))
        ]

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        """Open image ``idx`` and return it after applying the transforms."""
        image = Image.open(self.img_paths[idx])
        return self.transforms(image)

In [None]:
class ClassificationSubmission():
    """Run inference with saved models and write a Kaggle submission file."""

    def __init__(self, data_path, csv_path):
        # NOTE(review): data_path and csv_path are accepted but not used; the
        # locations below are hard-coded.
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.drive_dir = r'/content/cmudrive/IDL'
        self.DATA_DIR = r"/content"
        self.TEST_DIR = osp.join(self.DATA_DIR, r"classification/classification/test")

    def __get_labels(self, imodel, iargs):
        """Return the arg-max class predicted by ``imodel`` for every test item.

        NOTE(review): reads ``self.test_samples``, which nothing in this class
        assigns - it must be set on the instance before calling.
        """
        imodel.eval()
        labels = []
        print(f"Context = {iargs['context']} | Batch Size = {iargs['batch_size']} | Arch = {iargs['arch']}")
        with torch.no_grad():
            for X in self.test_samples:
                test_items = SubmissionItems(X, context=iargs['context'])
                test_loader = torch.utils.data.DataLoader(test_items, batch_size=iargs['batch_size'], num_workers=2, pin_memory=True, shuffle=False)

                for data in tqdm(test_loader):
                    data = data.float().to(self.device)
                    output = imodel(data)
                    y = torch.argmax(output, axis=1)
                    labels.extend(y.tolist())
        return labels

    def __load_model(self, model_name, model_type):
        """Load ``model.pth`` and its ``model_parameters.yaml`` from the drive folder.

        Returns:
            ``(model, args)`` where ``args`` is the parsed YAML dictionary.
        """
        meta_path = os.path.join(self.drive_dir,  model_type, model_name, 'model_parameters.yaml')
        with open(meta_path, 'r') as meta:
            args = yaml.safe_load(meta)

        model_path = os.path.join(self.drive_dir, model_type, model_name, 'model.pth')
        model = Network(args["arch"], args['context'], args['drop']).to(self.device)
        # summary(model)
        model.load_state_dict(torch.load(model_path))
        return model, args

    def simple_inference(self, model_name, model_type):
        """Predict labels with a single saved model."""
        print("Running inference...")
        self.timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')
        model, args = self.__load_model(model_name, model_type)
        labels = self.__get_labels(model, args)

        return labels

    @staticmethod
    def _majority_vote(prelim_labels):
        """Combine per-model label lists into one label per sample.

        Args:
            prelim_labels: One list of predicted labels per model, all of the
                same length.

        Returns:
            ``(labels_df, ensembled_labels)``: a DataFrame with one row per
            sample and one column per model, and the row-wise mode (the first
            mode column is taken when several labels tie).
        """
        labels_df = pd.DataFrame(prelim_labels).transpose()
        ensembled_labels = labels_df.mode(axis=1, dropna=False).iloc[:, 0].tolist()
        return labels_df, ensembled_labels

    def ensemble_inference(self, model_names, model_type):
        """Predict with several saved models and combine them by majority vote.

        Returns:
            ``(labels_df, ensembled_labels)`` as produced by ``_majority_vote``.
        """
        print("Running ensembled inference...")
        self.timestamp = datetime.datetime.now().strftime('%Y-%m-%d_%H-%M-%S')

        prelim_labels = []
        for name in model_names:
            print("\n\n\tModel : ", name)
            model, args = self.__load_model(name, model_type)
            prelim_labels.append(self.__get_labels(model, args))

        # The vote is unweighted. The former accuracy-weight computation
        # divided a Python list by a float (TypeError) and its result was
        # never used, so it has been removed.
        print("Combining predictions...")
        # ensembled_labels = np.where((df.iloc[:,1] == df.iloc[:, 2]), df.iloc[:, 1], df.iloc[:, 0]).tolist()
        return self._majority_vote(prelim_labels)

    def generate_submission(self, save_path, labels):
        """Write ``labels`` as a submission CSV and return the file path.

        The file is ``<drive_dir>/<save_path><timestamp>/submission.csv`` with
        an ``id,label`` header; row ``i`` is named ``<i zero-padded to 6>.jpg``.
        ``self.timestamp`` is set by the inference methods.
        """
        sub_dir = os.path.join(self.drive_dir, save_path + self.timestamp)
        sub_path = os.path.join(sub_dir, 'submission.csv')

        # Write the labels that were passed in, to the path that is reported
        # and returned (previously module-level `res`/`test_dataset` were
        # written to an unrelated hard-coded file).
        os.makedirs(sub_dir, exist_ok=True)
        with open(sub_path, "w") as f:
            f.write("id,label\n")
            f.writelines(
                "{},{}\n".format(str(i).zfill(6) + ".jpg", label)
                for i, label in enumerate(labels)
            )

        print(f"File saved at : {sub_path}")
        return sub_path

### Validation

In [None]:
# Location of the unlabelled classification test images.
DATA_DIR = r"/content"
TEST_DIR = osp.join(DATA_DIR, r"classification/classification/test")

# Test images are only converted to tensors.
val_transforms = [ttf.ToTensor()]
test_dataset = ClassificationTestSet(TEST_DIR, ttf.Compose(val_transforms))

# Keep file order and every sample so predictions line up with the file list.
test_loader = DataLoader(
    test_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    drop_last=False,
    num_workers=2,
    pin_memory=True,
)

In [None]:
# Folder of the trained run whose weights are loaded below.
folder_path = r'/content/cmudrive/IDL/hw2-please/Full_ConvNext_b256_tTrue_e50_bconvnext_dNone_oSGD_lCrossEL_sReduceLRonPlateau_lr0.1'

# Checkpoint Loading
# valmodel = ConvNext()
# valmodel.cuda()
# no = 50
# val_path = os.path.join(folder_path, 'Checkpoints', 'chkpt_' + str(no) + '.pth')

# Model Loading
val_path = os.path.join(folder_path, 'model_50.pth')
valmodel = ConvNext()
valmodel.cuda()
valmodel.load_state_dict(torch.load(val_path))

# Switch to inference mode; as the last expression this also shows the model.
valmodel.eval()

# NOTE(review): the loop below is what fills `res`, which the CSV-writing cell
# reads - it has to be re-enabled for a clean top-to-bottom run.
# batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')

# res = []
# for i, (x) in enumerate(test_loader):
#     with torch.no_grad():
#         x = x.cuda()

#         outputs = valmodel(x)

#         y = torch.argmax(outputs, axis=1)
#         res.extend(y.tolist())

#         batch_bar.update()

# batch_bar.close()

ConvNext(
  (stem): Sequential(
    (0): Conv2d(3, 96, kernel_size=(4, 4), stride=(4, 4))
    (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (layers): Sequential(
    (0): Identity()
    (1): ConvNextBlock(
      (depth_conv): Sequential(
        (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96, bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (pw_conv_inc): Sequential(
        (0): Conv2d(96, 384, kernel_size=(1, 1), stride=(1, 1), bias=False)
        (1): GELU()
      )
      (pw_conv_dec): Sequential(
        (0): Conv2d(384, 96, kernel_size=(1, 1), stride=(1, 1), bias=False)
      )
    )
    (2): ConvNextBlock(
      (depth_conv): Sequential(
        (0): Conv2d(96, 96, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=96, bias=False)
        (1): BatchNorm2d(96, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   

In [None]:
# Write one "<zero-padded index>.jpg,<label>" row per test image, reading the
# predictions from `res`.
with open(r"/content/classification_submission_convnext.csv", "w+") as csv_file:
    csv_file.write("id,label\n")
    for row in tqdm(range(len(test_dataset))):
        image_name = str(row).zfill(6) + ".jpg"
        csv_file.write(f"{image_name},{res[row]}\n")

100%|██████████| 35000/35000 [00:00<00:00, 537011.33it/s]


In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-classification -f /content/classification_submission_convnext.csv -m "MyConvNext"

100% 541k/541k [00:00<00:00, 948kB/s]
Successfully submitted to Face Recognition

# Verification Task

There are 6K verification dev images, but 166K "pairs" for you to compare. So, it's much more efficient to compute the features for the 6K verification images, and just compare afterwards.

This will be done by creating a dictionary mapping the image file names to the features. Then, you'll use this dictionary to compute the similarities for each pair.

In [None]:
class FaceVerNet(nn.Module):
    """Placeholder verification network holding three ConvNext instances.

    The anchor, positive and negative networks are created independently;
    ``forward`` does not use any of them yet.
    """

    def __init__(self):
        super(FaceVerNet, self).__init__()
        self.anchor_net = ConvNext()
        self.positive_net = ConvNext()
        self.negative_net = ConvNext()

    def forward(self, x):
        """Return the input unchanged."""
        return x

In [None]:
class VerificationDataset(Dataset):
    """Dataset over the files of one directory, yielding (transformed image, relative path)."""

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # Every directory entry, in sorted name order, joined onto data_dir.
        file_names = sorted(os.listdir(self.data_dir))
        self.img_paths = [osp.join(self.data_dir, file_name) for file_name in file_names]

    def __len__(self):
        return len(self.img_paths)

    def __getitem__(self, idx):
        img_path = self.img_paths[idx]
        image = self.transforms(Image.open(img_path))
        # The second element is the image path expressed relative to data_dir.
        return image, osp.relpath(img_path, self.data_dir)

In [None]:
class TripletDataset(torchvision.datasets.VisionDataset):
    """Random (anchor, positive, negative) triplets drawn from an ImageFolder-style dataset.

    Indexing is by *class*, not by image: item ``k`` treats class ``k`` as the anchor
    identity and picks the concrete images at random, so reading the same index twice
    generally yields different triplets.
    """

    def __init__(self, root, transform=None):
        """Index the images under ``root`` by class.

        Args:
            root: Directory in ``torchvision.datasets.ImageFolder`` layout
                (this dataset is built on top of the regular classification dataset).
            transform: Optional callable forwarded to ``ImageFolder``.

        Raises:
            ValueError: If fewer than two classes are found, because a negative
                sample must come from a class other than the anchor's.
        """
        self.dataset = torchvision.datasets.ImageFolder(root=root, transform=transform)

        # Map class indices to dataset image indices.
        self.classes_to_img_indices = [[] for _ in range(len(self.dataset.classes))]
        for img_idx, (_, class_id) in enumerate(self.dataset.samples):
            self.classes_to_img_indices[class_id].append(img_idx)

        if len(self.dataset.classes) < 2:
            raise ValueError("TripletDataset needs at least 2 classes to sample a negative")

        # VisionDataset attributes for display.
        self.root = root
        # Pseudo length! This is the number of classes (7000 for this dataset), *not*
        # the number of images. Train for more epochs to see more triplets.
        self.length = len(self.dataset.classes)
        self.transforms = self.dataset.transforms

    def __len__(self):
        return self.length

    def __getitem__(self, anchor_class_idx):
        """Treat the given index as the anchor class and pick a triplet randomly.

        Returns:
            Three ``(image, class label)`` tuples from the wrapped ``ImageFolder``:
            anchor, positive (same class as the anchor) and negative (another class).
            Use element ``[0]`` of each tuple if only the images are needed.
        """
        anchor_class = self.classes_to_img_indices[anchor_class_idx]
        # Choose the positive pair (assumes each class has at least 2 images;
        # np.random.choice raises ValueError otherwise because replace=False).
        anchor, positive = np.random.choice(a=anchor_class, size=2, replace=False)

        # Choose the negative image from any class except the anchor's.
        # Hint for further exploration: choosing 2 negatives gives a Quadruplet Loss.
        classes_to_choose_negative_class_from = list(range(self.length))
        classes_to_choose_negative_class_from.pop(anchor_class_idx)  # drop the anchor class
        negative_class = np.random.choice(classes_to_choose_negative_class_from)
        negative = np.random.choice(self.classes_to_img_indices[negative_class])

        return self.dataset[anchor], self.dataset[positive], self.dataset[negative]


In [None]:
# Build the triplet sampler over the classification dev split.
# TripletDataset requires a transform; use the same plain ToTensor pipeline as the
# verification datasets, and resolve the folder against DATA_DIR like the other cells
# do rather than relying on the current working directory.
triplets = TripletDataset(
    root=osp.join(DATA_DIR, "classification/classification/dev"),
    transform=ttf.Compose([ttf.ToTensor()]),
)

In [None]:
# Verification dev images, converted to tensors only.
val_veri_dataset = VerificationDataset(
    data_dir=osp.join(DATA_DIR, "verification/verification/dev"),
    transforms=ttf.Compose([ttf.ToTensor()]),
)
# Unshuffled loader over the dev images.
val_ver_loader = DataLoader(
    val_veri_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
)

In [None]:
# Cache one feature vector per verification-dev image, keyed by the path name the
# dataset returns, so each image is pushed through the model once rather than once per pair.
valmodel.eval()  # put the model in inference mode, as the test-set extraction cell does

gelu = nn.GELU()  # stateless activation: build it once instead of once per batch
feats_dict = dict()
with torch.no_grad():
    for imgs, path_names in tqdm(val_ver_loader, total=len(val_ver_loader), position=0, leave=False):
        imgs = imgs.cuda()
        # NOTE: return_feats=False is passed here, so these are presumably the model's
        # final outputs rather than intermediate features (confirm against the model's
        # forward). Pass return_feats=True to compare on the features instead.
        features = gelu(valmodel(imgs, return_feats=False))
        for path_name, feature in zip(path_names, features):
            feats_dict[path_name] = feature



In [None]:
# Peek at one cached entry to see the shape of a single stored feature vector.
_first_path, first_feat = list(feats_dict.items())[0]
print(first_feat.shape)

torch.Size([7000])


In [None]:
# Score every dev pair with the cosine similarity of the two cached feature vectors,
# then report ROC AUC against the ground-truth match labels.
# dim=0 because each cached feature is a single 1-D vector rather than a batch.
cosine_sim = nn.CosineSimilarity(dim=0, eps=1e-8)
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")

# Read the pair list inside a context manager so the file handle is closed promptly
# instead of being left open until garbage collection.
with open(val_veri_csv) as pairs_file:
    pair_lines = pairs_file.read().splitlines()[1:]  # skip header

# Loop through the pairs and compare each one, collecting similarity and label.
pred_similarities = []
gt_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2, gt = line.split(",")
    # Keep only the last path component of each CSV path before the feats_dict lookup.
    img_path1 = img_path1.split('/')[-1]
    img_path2 = img_path2.split('/')[-1]
    feat1 = feats_dict[img_path1]
    feat2 = feats_dict[img_path2]

    sim_score = cosine_sim(feat1, feat2)
    pred_similarities.append(sim_score.item())
    gt_similarities.append(int(gt))

pred_similarities = np.array(pred_similarities)
gt_similarities = np.array(gt_similarities)

print("AUC:", roc_auc_score(gt_similarities, pred_similarities))



AUC: 0.9629680014176387


# Verification Task: Submit to Kaggle

In [None]:
# Verification test images, converted to tensors only.
test_veri_dataset = VerificationDataset(
    data_dir=osp.join(DATA_DIR, "verification/verification/test"),
    transforms=ttf.Compose([ttf.ToTensor()]),
)
# Unshuffled loader over the test images.
test_ver_loader = DataLoader(
    test_veri_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=2,
)

In [None]:
# Cache one feature vector per verification-test image, keyed by the path name the
# dataset returns, so each image is pushed through the model once rather than once per pair.
valmodel.eval()

gelu = nn.GELU()  # stateless activation: build it once instead of once per batch
feats_dict = dict()
with torch.no_grad():
    for imgs, path_names in tqdm(test_ver_loader, total=len(test_ver_loader), position=0, leave=False):
        imgs = imgs.cuda()
        # NOTE: return_feats=False is passed here, so these are presumably the model's
        # final outputs rather than intermediate features (confirm against the model's
        # forward). Pass return_feats=True to compare on the features instead.
        feats = gelu(valmodel(imgs, return_feats=False))
        for path_name, feat in zip(path_names, feats):
            feats_dict[path_name] = feat



In [None]:
# Score every test pair with the cosine similarity of the two cached feature vectors.
# dim=0 because each cached feature is a single 1-D vector rather than a batch.
cosine_sim = nn.CosineSimilarity(dim=0, eps=1e-8)
# NOTE(review): despite its name this variable now points at the *test* pair list;
# the name is kept unchanged in case other cells read it.
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")

# Read the pair list inside a context manager so the file handle is closed promptly
# instead of being left open until garbage collection.
with open(val_veri_csv) as pairs_file:
    pair_lines = pairs_file.read().splitlines()[1:]  # skip header

# Loop through the pairs and compare each one; the test CSV has no label column.
pred_similarities = []
for line in tqdm(pair_lines, position=0, leave=False):
    img_path1, img_path2 = line.split(",")
    # Keep only the last path component of each CSV path before the feats_dict lookup.
    img_path1 = img_path1.split('/')[-1]
    img_path2 = img_path2.split('/')[-1]
    feat1 = feats_dict[img_path1]
    feat2 = feats_dict[img_path2]
    sim_score = cosine_sim(feat1, feat2)
    pred_similarities.append(sim_score.item())



In [None]:
# Write one "<pair index>,<similarity score>" row per scored pair, after the header row.
with open(r"/content/verification_submission.csv", "w+") as submission:
    submission.write("id,match\n")
    for pair_id, score in enumerate(pred_similarities):
        submission.write("{},{}\n".format(pair_id, score))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-verification -f /content/verification_submission.csv -m 'MyConvNext'

100% 16.7M/16.7M [00:00<00:00, 43.3MB/s]
Successfully submitted to Face Verification