In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import torchvision
import torchvision.transforms as ttf

import os
import os.path as osp

from tqdm import tqdm
from PIL import Image
from sklearn.metrics import roc_auc_score
import numpy as np
import random
from torchsummary import summary

print(torch.__version__)

1.10.0+cu111


# TODOs
As you go, please read the code and keep an eye out for TODOs!

# Download Data

In [2]:
!pip install --upgrade --force-reinstall --no-deps kaggle==1.5.8
!mkdir /root/.kaggle

with open("/root/.kaggle/kaggle.json", "w+") as f:
    f.write('{"username":"meiirbekislamov","key":"af197071383b4332b004369ebae2a753"}') # Put your kaggle username & key here

!chmod 600 /root/.kaggle/kaggle.json

Collecting kaggle==1.5.8
  Downloading kaggle-1.5.8.tar.gz (59 kB)
[?25l[K     |█████▌                          | 10 kB 39.4 MB/s eta 0:00:01[K     |███████████                     | 20 kB 44.7 MB/s eta 0:00:01[K     |████████████████▋               | 30 kB 28.1 MB/s eta 0:00:01[K     |██████████████████████▏         | 40 kB 14.9 MB/s eta 0:00:01[K     |███████████████████████████▊    | 51 kB 17.6 MB/s eta 0:00:01[K     |████████████████████████████████| 59 kB 5.1 MB/s 
[?25hBuilding wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25l[?25hdone
  Created wheel for kaggle: filename=kaggle-1.5.8-py3-none-any.whl size=73275 sha256=5d9c79adaa5f03c49df5697397d72d2a005930376ba27bdc4c28e91d090b9b27
  Stored in directory: /root/.cache/pip/wheels/de/f7/d8/c3902cacb7e62cb611b1ad343d7cc07f42f7eb76ae3a52f3d1
Successfully built kaggle
Installing collected packages: kaggle
  Attempting uninstall: kaggle
    Found existing installation: kaggle 1.5.12


In [3]:
!kaggle competitions download -c 11-785-s22-hw2p2-classification
!kaggle competitions download -c 11-785-s22-hw2p2-verification

!unzip -q 11-785-s22-hw2p2-classification.zip
!unzip -q 11-785-s22-hw2p2-verification.zip

!ls

Downloading 11-785-s22-hw2p2-classification.zip to /content
 99% 2.34G/2.35G [00:34<00:00, 66.5MB/s]
100% 2.35G/2.35G [00:34<00:00, 72.4MB/s]
Downloading 11-785-s22-hw2p2-verification.zip to /content
 95% 249M/263M [00:02<00:00, 108MB/s]
100% 263M/263M [00:02<00:00, 119MB/s]
11-785-s22-hw2p2-classification.zip   sample_data
11-785-s22-hw2p2-verification.zip     train_subset
classification			      verification
classification_sample_submission.csv  verification_sample_submission.csv


# Hyperparameters for Face Verification

In [20]:
"""
The well-accepted SGD batch_size & lr combination for CNN classification is 256 batch size for 0.1 learning rate.
When changing batch size for SGD, follow the linear scaling rule - halving batch size -> halve learning rate, etc.
This is less theoretically supported for Adam, but in my experience, it's a decent ballpark estimate.
"""
batch_size = 256
lr = 0.1
epochs = 50 # Just for the early submission. We'd want you to train like 50 epochs for your main submissions.

# Hyperparameters for Face Classification

# Very Simple Network

In [8]:
class Network(nn.Module):
    """
    The Very Low early deadline architecture is a 4-layer CNN.
    The first Conv layer has 64 channels, kernel size 7, and stride 4.
    The next three have 128, 256, and 512 channels. Each have kernel size 3 and stride 2.
    Think about what the padding should be for each layer to not change spatial resolution.
    Each Conv layer is accompanied by a Batchnorm and ReLU layer.
    Finally, you want to average pool over the spatial dimensions to reduce them to 1 x 1.
    Then, remove (Flatten?) these trivial 1x1 dimensions away.
    Look through https://pytorch.org/docs/stable/nn.html 
    TODO: Fill out the model definition below! 

    Why does a very simple network have 4 convolutions?
    Input images are 224x224. Note that each of these convolutions downsample.
    Downsampling 2x effectively doubles the receptive field, increasing the spatial
    region each pixel extracts features from. Downsampling 32x is standard
    for most image models.

    Why does a very simple network have high channel sizes?
    Every time you downsample 2x, you do 4x less computation (at same channel size).
    To maintain the same level of computation, you 2x increase # of channels, which 
    increases computation by 4x. So, balances out to same computation.
    Another intuition is - as you downsample, you lose spatial information. Want
    to preserve some of it in the channel dimension.
    """
    def __init__(self, num_classes=7000):
        super().__init__()

        self.backbone = nn.Sequential(
            # Note that first conv is stride 4. It is (was?) standard to downsample.
            # 4x early on, as with 224x224 images, 4x4 patches are just low-level details.
            # Food for thought: Why is the first conv kernel size 7, not kernel size 3?

            nn.Conv2d(in_channels=3, out_channels=64, kernel_size=7, stride=4),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, stride=2),
            nn.BatchNorm2d(128), 
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, stride=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2),
            nn.BatchNorm2d(512),
            nn.AvgPool2d(kernel_size=6),
            nn.Flatten()

            # TODO: Average pool over & reduce the spatial dimensions to (1, 1)
            # TODO: Collapse (Flatten) the trivial (1, 1) dimensions
            ) 
        
        self.cls_layer = nn.Linear(512, num_classes)
    
    def forward(self, x, return_feats=False):
        """
        What is return_feats? It essentially returns the second-to-last-layer
        features of a given image. It's a "feature encoding" of the input image,
        and you can use it for the verification task. You would use the outputs
        of the final classification layer for the classification task.

        You might also find that the classification outputs are sometimes better
        for verification too - try both.
        """
        feats = self.backbone(x)
        out = self.cls_layer(feats)

        if return_feats:
            return feats
        else:
            return out

# MobileNetV2

In [9]:
import torch
import torch.nn as nn
import math

class InvertedResidualBlock(nn.Module):
    
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__() # Just have to do this for all nn.Module classes

        # Can only do identity residual connection if input & output are the
        # same channel & spatial shape.
        if stride == 1 and in_channels == out_channels:
            self.do_identity = True
        else:
            self.do_identity = False

        # Expand Ratio is like 6, so hidden_dim >> in_channels
        hidden_dim = in_channels * expand_ratio

        self.feature_mixing = nn.Sequential(
            nn.Conv2d(in_channels, hidden_dim, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )

        self.spatial_mixing = nn.Sequential(
            nn.Conv2d(hidden_dim, hidden_dim, kernel_size=3, stride=stride, padding=1, groups=hidden_dim, bias=False),
            nn.BatchNorm2d(hidden_dim),
            nn.ReLU6(),
        )
        
        self.bottleneck_channels = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        out = self.feature_mixing(x)
        out = self.spatial_mixing(out)
        out = self.bottleneck_channels(out)

        if self.do_identity:
            return x + out
        else:
            return out

class MobileNetV2(nn.Module):
   
    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, 32, kernel_size=3, stride=2, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(32, 32, kernel_size=3, stride=1, padding=1, groups=32, bias=False),
            nn.BatchNorm2d(32),
            nn.ReLU6(),
            nn.Conv2d(32, 16, kernel_size=1, stride=1, padding=0, bias=False),
            nn.BatchNorm2d(16),
        )

        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [6,  24, 2, 2],
            [6,  32, 3, 2],
            [6,  64, 4, 2],
            [6,  96, 3, 1],
            [6, 160, 3, 2],
            [6, 320, 1, 1],
        ]

        # Remember that our stem left us off at 16 channels. We're going to
        # keep updating this in_channels variable as we go
        in_channels = 16

        # Let's make the layers
        layers = []
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only have non-trivial stride if first block
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels

        self.layers = nn.Sequential(*layers) # Done, save them to the class

        # Some final feature mixing
        self.final_block = nn.Sequential(
            nn.Conv2d(in_channels, 1280, kernel_size=1, padding=0, stride=1, bias=False),
            nn.BatchNorm2d(1280),
            nn.ReLU6()
        )

        # Now, we need to build the final classification layer.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            # nn.Linear(1280, num_classes)
        )
        self.cls_layer_final = nn.Linear(1280, num_classes)

        self._initialize_weights()

    def _initialize_weights(self):
        
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        out = self.stem(x)
        out = self.layers(out)
        out = self.final_block(out)
        feats = self.cls_layer(out)
        out = self.cls_layer_final(feats)

        if return_feats:
            return feats
        else:
            return out


# ConvNeXt

In [10]:
import torch
import torch.nn as nn
import math

class InvertedResidualBlock(nn.Module):
    
    def __init__(self,
                 in_channels,
                 out_channels,
                 stride,
                 expand_ratio):
        super().__init__() # Just have to do this for all nn.Module classes

        # Can only do identity residual connection if input & output are the
        # same channel & spatial shape.
        if stride == 1 and in_channels == out_channels:
            self.do_identity = True
        else:
            self.do_identity = False

        # Expand Ratio is like 6, so hidden_dim >> in_channels
        hidden_dim = in_channels * expand_ratio

        self.spatial_mixing = nn.Sequential(
            nn.Conv2d(in_channels, in_channels, kernel_size=7, stride=stride, padding=3, groups=in_channels, bias=False),
            nn.BatchNorm2d(in_channels),
            # nn.ReLU6(),
        )

        self.feature_mixing = nn.Sequential(
            nn.Conv2d(in_channels, hidden_dim, kernel_size=1, stride=1, padding=0, bias=False),
            # nn.BatchNorm2d(hidden_dim),
            nn.GELU(),
        )

        self.bottleneck_channels = nn.Sequential(
            nn.Conv2d(hidden_dim, out_channels, kernel_size=1, stride=1, padding=0, bias=False),
            # nn.BatchNorm2d(out_channels),
        )

    def forward(self, x):
        out = self.spatial_mixing(x)
        out = self.feature_mixing(out)
        out = self.bottleneck_channels(out)

        if self.do_identity:
            return x + out
        else:
            return out

class ConvNeXt(nn.Module):
   
    def __init__(self, num_classes= 7000):
        super().__init__()

        self.num_classes = num_classes

        self.stem = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=4, stride=4, padding=0, bias=False),
            nn.BatchNorm2d(96),
        )

        self.stage_cfgs = [
            # expand_ratio, channels, # blocks, stride of first block
            [4,  96, 3, 1],
            [4,  192, 3, 1],
            [4,  384, 9, 1],
            [4,  768, 3, 1],
        ]

        self.downsampling_layer = [
                                   nn.Sequential(
                                       nn.BatchNorm2d(96),
                                       nn.Conv2d(96, 192, kernel_size=2, stride=2)),

                                   nn.Sequential(
                                       nn.BatchNorm2d(192),
                                       nn.Conv2d(192, 384, kernel_size=2, stride=2)),

                                   nn.Sequential(
                                       nn.BatchNorm2d(384),
                                       nn.Conv2d(384, 768, kernel_size=2, stride=2))
                                   ]

        in_channels = 96

        # Let's make the layers
        layers = []
        ix = 0
        for curr_stage in self.stage_cfgs:
            expand_ratio, num_channels, num_blocks, stride = curr_stage

            for block_idx in range(num_blocks):
                out_channels = num_channels
                layers.append(InvertedResidualBlock(
                    in_channels=in_channels,
                    out_channels=out_channels,
                    # only have non-trivial stride if first block
                    stride=stride if block_idx == 0 else 1,
                    expand_ratio=expand_ratio
                ))
                # In channels of the next block is the out_channels of the current one
                in_channels = out_channels
            if ix < 3:
              layers.append(self.downsampling_layer[ix])
              ix += 1
              in_channels = 2 * in_channels

        self.layers = nn.Sequential(*layers) # Done, save them to the class


        # Now, we need to build the final classification layer.
        self.cls_layer = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.BatchNorm2d(768),
            nn.Flatten(),
            nn.Linear(768, num_classes)
        )
        # self.cls_layer_final = nn.Linear(768, num_classes)
        

        self._initialize_weights()

    def _initialize_weights(self):
        
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward(self, x, return_feats=False):
        out = self.stem(x)
        out = self.layers(out)
        out = self.cls_layer(out)
        # out = self.cls_layer_final(feats)

        if return_feats:
            return feats
        else:
            return out


# Dataset & DataLoader

In [11]:
from torch._C import dtype
"""
Transforms (data augmentation) is quite important for this task.
Go explore https://pytorch.org/vision/stable/transforms.html for more details
"""
DATA_DIR = "/content"
TRAIN_DIR = osp.join(DATA_DIR, "classification/classification/train") # This is a smaller subset of the data. Should change this to classification/classification/train
VAL_DIR = osp.join(DATA_DIR, "classification/classification/dev")
TEST_DIR = osp.join(DATA_DIR, "classification/classification/test")

train_transforms = [#ttf.ToPILImage(mode='RGB'),
                    ttf.RandAugment(), 
                    ttf.ToTensor(), 
                    ttf.RandomErasing(p=0.25)
                    # ttf.RandomApply(transforms=[ttf.ColorJitter(brightness=.5, hue=.3), 
                    #                             ttf.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75))
                    #                             ], p=0.2)]
]

                    
val_transforms = [ttf.ToTensor()]

train_dataset = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_transforms))
val_dataset = torchvision.datasets.ImageFolder(VAL_DIR,
                                               transform=ttf.Compose(val_transforms))


# Image Augmentation 
train_augment1 = [ttf.ToTensor(), 
                  ttf.RandomHorizontalFlip(p=0.4), 
                  ttf.RandomVerticalFlip(0.4),
                  ttf.ColorJitter(brightness=.5, hue=.3)]

train_augment2 = [ttf.ToTensor(), 
                  # ttf.RandomRotation(degrees=(0, 180)), 
                  # ttf.RandomAffine(degrees=(30, 70), translate=(0.1, 0.3), scale=(0.5, 0.75)),
                  ttf.RandomErasing(p=0.3, scale=(0.02, 0.33), ratio=(0.3, 3.3), value=0, inplace=False)
                  ]

train_augmented_dataset1 = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_augment1))
train_augmented_dataset2 = torchvision.datasets.ImageFolder(TRAIN_DIR,
                                                 transform=ttf.Compose(train_augment2))

# Concatenate two datasets
image_datasets = torch.utils.data.ConcatDataset([train_dataset, train_augmented_dataset2])

train_loader = DataLoader(train_dataset, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=2)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False,
                        drop_last=True, num_workers=1)

# Triple Loss

In [12]:
class TripletDataset(torchvision.datasets.VisionDataset):
  def __init__(self, root, transform):  
    # For "root", note that you're making this dataset on top of the regular classification dataset.
    self.dataset = torchvision.datasets.ImageFolder(root=root, transform=transform)
    
    # map class indices to dataset image indices
    self.classes_to_img_indices = [[] for _ in range(len(self.dataset.classes))]
    for img_idx, (_, class_id) in enumerate(self.dataset.samples):
      self.classes_to_img_indices[class_id].append(img_idx)
    
    # VisionDataset attributes for display
    self.root = root
    self.length = len(self.dataset.classes) # pseudo length! Length of this dataset is 7000, *not* the actual # of images in the dataset. You can just increase the # of epochs you train for.
    self.transforms = self.dataset.transforms
          
  def __len__(self):
    return self.length
    
  def __getitem__(self, anchor_class_idx):
    """Treat the given index as the anchor class and pick a triplet randomly"""
    anchor_class = self.classes_to_img_indices[anchor_class_idx]
    # choose positive pair (assuming each class has at least 2 images)
    anchor, positive = np.random.choice(a=anchor_class, size=2, replace=False)
    # choose negative image
    # hint for further exploration: you can choose 2 negative images to make it a Quadruplet Loss

    classes_to_choose_negative_class_from = list(range(self.length))
    classes_to_choose_negative_class_from.pop(anchor_class_idx) # TODO: What are we removing?
    negative_class_idx = random.choice(classes_to_choose_negative_class_from)
    negative_class = self.classes_to_img_indices[negative_class_idx]
    negative = np.random.choice(a=negative_class, size=1, replace=False)
    
    # self.dataset[idx] will return a tuple (image tensor, class label). You can use its outputs to train for classification alongside verification
    # If you do not want to train for classification, you can use self.dataset[idx][0] to get the image tensor
    return self.dataset[anchor][0], self.dataset[positive][0], self.dataset[int(negative)][0]


In [13]:
train_dataset_triple_loss = TripletDataset(TRAIN_DIR, transform=ttf.Compose(train_transforms))

In [14]:
train_loader_triple_loss = DataLoader(train_dataset_triple_loss, batch_size=batch_size,
                          shuffle=True, drop_last=True, num_workers=2)

In [15]:
import matplotlib.pyplot as plt

def show_img(img):
  plt.figure(figsize=(5,3))
  npimg=img.numpy()
  plt.imshow(np.transpose(npimg,(1,2,0)))
  plt.show()

def show(imgs):
    fix, axs = plt.subplots(ncols=len(imgs), squeeze=False)
    for i, img in enumerate(imgs):
        img = ttf.ToPILImage()(img.to('cpu'))
        axs[0, i].imshow(np.asarray(img))
        axs[0, i].set(xticklabels=[], yticklabels=[], xticks=[], yticks=[])

# Setup everything for training

In [29]:
model = MobileNetV2()
model.cuda()
# model.load_state_dict(torch.load("/content/model_face_verif_triple_loss_MobilNetV2_epoch_49.pt"))


# For this homework, we're limiting you to 35 million trainable parameters, as
# outputted by this. This is to help constrain your search space and maintain
# reasonable training times & expectations
num_trainable_parameters = 0
for p in model.parameters():
    num_trainable_parameters += p.numel()
print("Number of Params: {}".format(num_trainable_parameters))

# TODO: What criterion do we use for this task?
criterion = torch.nn.CrossEntropyLoss(label_smoothing=0.1)
# criterion = torch.nn.TripletMarginLoss(margin=0.25)
optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=1e-4)
# scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=(len(train_loader) * epochs))
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.6, patience=2, mode='max', threshold=0.01)
# T_max is "how many times will i call scheduler.step() until it reaches 0 lr?"

# For this homework, we strongly strongly recommend using FP16 to speed up training.
# It helps more for larger models.
# Go to https://effectivemachinelearning.com/PyTorch/8._Faster_training_with_mixed_precision
# and compare "Single precision training" section with "Mixed precision training" section
scaler = torch.cuda.amp.GradScaler()

Number of Params: 11155928


# Let's train!

In [26]:
for epoch in range(epochs):
    model.train()
    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    num_correct = 0
    total_loss = 0

    for i, (x, y) in enumerate(train_loader):
        optimizer.zero_grad()

        x = x.cuda()
        y = y.cuda()

        # Don't be surprised - we just wrap these two lines to make it work for FP16
        with torch.cuda.amp.autocast():     
            outputs = model(x)
            loss = criterion(outputs, y)

        # Update # correct & loss as we go
        num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
        total_loss += float(loss)

        # tqdm lets you add some details so you can monitor training as you train.
        batch_bar.set_postfix(
            acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)),
            loss="{:.04f}".format(float(total_loss / (i + 1))),
            num_correct=num_correct,
            lr="{:.04f}".format(float(optimizer.param_groups[0]['lr'])))
        
        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16

         
        scheduler.step() # We told scheduler T_max that we'd call step() (len(train_loader) * epochs) many times.

        batch_bar.update() # Update tqdm bar
    batch_bar.close() # You need this to close the tqdm bar
    torch.save(model.state_dict(), f"model_ConvNeXt_epoch_{epoch}.pt")

    print("Epoch {}/{}: Train Acc {:.04f}%, Train Loss {:.04f}, Learning Rate {:.04f}".format(
        epoch + 1,
        epochs,
        100 * num_correct / (len(train_loader) * batch_size),
        float(total_loss / len(train_loader)),
        float(optimizer.param_groups[0]['lr'])))
    
    # You can add validation per-epoch here if you would like
    model.eval()
    batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
    num_correct = 0
    for i, (x, y) in enumerate(val_loader):

      x = x.cuda()
      y = y.cuda()

      with torch.no_grad():
          outputs = model(x)

      num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
      batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)))

      batch_bar.update()
    
    batch_bar.close()
    print("Validation: {:.04f}%".format(100 * num_correct / len(val_dataset)))
    # scheduler.step((100 * num_correct / len(val_dataset)))



Epoch 1/2: Train Acc 0.0136%, Train Loss 8.8644, Learning Rate 0.0005




Validation: 0.0200%




Epoch 2/2: Train Acc 0.0143%, Train Loss 8.8465, Learning Rate 0.0000


                                                                   

Validation: 0.0114%




# Face verification: Triple Loss Training

In [28]:
for epoch in range(epochs):
    model.train()
    # Quality of life tip: leave=False and position=0 are needed to make tqdm usable in jupyter
    batch_bar = tqdm(total=len(train_loader_triple_loss), dynamic_ncols=True, leave=False, position=0, desc='Train') 

    # num_correct = 0
    # total_loss = 0

    for i, (anchor, positive, negative) in enumerate(train_loader_triple_loss):
        optimizer.zero_grad()

        anchor = anchor.cuda()
        positive = positive.cuda()
        negative = negative.cuda()

        # Don't be surprised - we just wrap these two lines to make it work for FP16
        with torch.cuda.amp.autocast():     
            outputs_anchor = model(anchor, return_feats=True)
            outputs_positive = model(positive, return_feats=True)
            outputs_negative = model(negative, return_feats=True)
            loss = criterion(outputs_anchor, outputs_positive, outputs_negative)
        
        # Another couple things you need for FP16. 
        scaler.scale(loss).backward() # This is a replacement for loss.backward()
        scaler.step(optimizer) # This is a replacement for optimizer.step()
        scaler.update() # This is something added just for FP16
        scheduler.step() # We told scheduler T_max that we'd call step() (len(train_loader) * epochs) many times.

        batch_bar.update() # Update tqdm bar
    batch_bar.close() # You need this to close the tqdm bar
    torch.save(model.state_dict(), f"model_face_verif_triple_loss_MobilNetV2_epoch_{epoch}.pt")
    print("Epoch {}| Learning Rate {:.04f}".format(
        epoch + 1,
        epochs,
        float(optimizer.param_groups[0]['lr'])))

    # scheduler.step((100 * num_correct / len(val_dataset)))



Epoch 1| Learning Rate 2.0000


                                                        

Epoch 2| Learning Rate 2.0000




# Classification Task: Validation

In [30]:
# model.load_state_dict(torch.load("model_MobilNetV2_epoch_9.pt"))
model.eval()
batch_bar = tqdm(total=len(val_loader), dynamic_ncols=True, position=0, leave=False, desc='Val')
num_correct = 0
for i, (x, y) in enumerate(val_loader):

    x = x.cuda()
    y = y.cuda()

    with torch.no_grad():
        outputs = model(x)

    num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
    batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)))

    batch_bar.update()
    
batch_bar.close()
print("Validation: {:.04f}%".format(100 * num_correct / len(val_dataset)))

                                                                   

Validation: 0.0171%




# Classification Task: Submit to Kaggle

In [None]:
class ClassificationTestSet(Dataset):
    # It's possible to load test set data using ImageFolder without making a custom class.
    # See if you can think it through!

    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # This one-liner basically generates a sorted list of full paths to each image in data_dir
        self.img_paths = list(map(lambda fname: osp.join(self.data_dir, fname), sorted(os.listdir(self.data_dir))))

    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        return self.transforms(Image.open(self.img_paths[idx]))

In [None]:
test_dataset = ClassificationTestSet(TEST_DIR, ttf.Compose(val_transforms))
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False,
                         drop_last=False, num_workers=1)

In [None]:
model.eval()
batch_bar = tqdm(total=len(test_loader), dynamic_ncols=True, position=0, leave=False, desc='Test')

res = []
for i, (x) in enumerate(test_loader):

    x = x.cuda()

    with torch.no_grad():
        outputs = model(x)
        pred_y = torch.argmax(outputs, axis=1)
        # res.append(outputs)
        res.extend(pred_y.tolist())

    # num_correct += int((torch.argmax(outputs, axis=1) == y).sum())
    # batch_bar.set_postfix(acc="{:.04f}%".format(100 * num_correct / ((i + 1) * batch_size)))
    

    batch_bar.update()
    
batch_bar.close()



In [None]:
with open("classification_early_submission.csv", "w+") as f:
    f.write("id,label\n")
    for i in range(len(test_dataset)):
        f.write("{},{}\n".format(str(i).zfill(6) + ".jpg", res[i]))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-classification -f classification_early_submission.csv -m " ConvNeXt submission 35 epochs (randAug, erase)"

100% 541k/541k [00:03<00:00, 150kB/s]
Successfully submitted to Face Recognition

# Verification Task: Validation

There are 6K verification dev images, but 166K "pairs" for you to compare. So, it's much more efficient to compute the features for the 6K verification images, and just compare afterwards.

This will be done by creating a dictionary mapping the image file names to the features. Then, you'll use this dictionary to compute the similarities for each pair.

In [None]:
!ls verification/verification/dev | wc -l
!cat verification/verification/verification_dev.csv | wc -l

6000
166801


In [None]:
class VerificationDataset(Dataset):
    def __init__(self, data_dir, transforms):
        self.data_dir = data_dir
        self.transforms = transforms

        # This one-liner basically generates a sorted list of full paths to each image in data_dir
        self.img_paths = list(map(lambda fname: osp.join(self.data_dir, fname), sorted(os.listdir(self.data_dir))))

    def __len__(self):
        return len(self.img_paths)
    
    def __getitem__(self, idx):
        # We return the image, as well as the path to that image (relative path)
        return self.transforms(Image.open(self.img_paths[idx])), osp.relpath(self.img_paths[idx], self.data_dir)

In [None]:
val_veri_dataset = VerificationDataset(osp.join(DATA_DIR, "verification/verification/dev"),
                                       ttf.Compose(val_transforms))
val_ver_loader = torch.utils.data.DataLoader(val_veri_dataset, batch_size=batch_size, 
                                             shuffle=False, num_workers=1)

In [None]:
# model.cuda()
model.eval()

feats_dict = dict()
for batch_idx, (imgs, path_names) in tqdm(enumerate(val_ver_loader), total=len(val_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # Note that we return the feats here, not the final outputs
        # Feel free to try the final outputs too!
        feats = model(imgs, return_feats=True) 
        # feats = nn.AdaptiveAvgPool2d((1, 1))(feats)
        # feats = nn.BatchNorm2d(768)(feats)
        # feats = nn.ReLU()(feats)
        for i in range(len(path_names)):
          feats_dict[path_names[i]] = feats[i].cpu()





In [None]:
from sys import call_tracing
call_tracing# What does this dict look like?
# print(list(feats_dict.items())[0])

<function sys.call_tracing>

In [None]:
# We use cosine similarity between feature embeddings.
# TODO: Find the relevant function in pytorch and read its documentation.
similarity_metric = nn.CosineSimilarity(dim=0, eps=1e-6)

val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_dev.csv")


# Now, loop through the csv and compare each pair, getting the similarity between them
pred_similarities = []
gt_similarities = []
for line in tqdm(open(val_veri_csv).read().splitlines()[1:], position=0, leave=False): # skip header
    img_path1, img_path2, gt = line.split(",")

    # TODO: Use the similarity metric
    # How to use these img_paths? What to do with the features?
    similarity = similarity_metric(feats_dict[img_path1.split("/")[1]], feats_dict[img_path2.split("/")[1]])

    gt_similarities.append(int(gt))
    pred_similarities.append(similarity)

pred_similarities = np.array(pred_similarities)
gt_similarities = np.array(gt_similarities)

print("AUC:", roc_auc_score(gt_similarities, pred_similarities))



AUC: 0.9671810353372368


# Verification Task: Submit to Kaggle

In [None]:
test_veri_dataset = VerificationDataset(osp.join(DATA_DIR, "verification/verification/test"),
                                        ttf.Compose(val_transforms))
test_ver_loader = torch.utils.data.DataLoader(test_veri_dataset, batch_size=batch_size, 
                                              shuffle=False, num_workers=1)

In [None]:
model.eval()

feats_dict = dict()
for batch_idx, (imgs, path_names) in tqdm(enumerate(test_ver_loader), total=len(test_ver_loader), position=0, leave=False):
    imgs = imgs.cuda()

    with torch.no_grad():
        # Note that we return the feats here, not the final outputs
        # Feel free to try to final outputs too!
        feats = model(imgs,return_feats=True) 
        # feats = nn.AdaptiveAvgPool2d((1, 1))(feats)
        # feats = nn.BatchNorm2d(768)(feats)
        # feats = nn.GELU()(feats)
        for i in range(len(path_names)):
          feats_dict[path_names[i]] = feats[i].cpu()
    
    # TODO: Now we have features and the image path names. What to do with them?
    # Hint: use the feats_dict somehow.



In [None]:
# We use cosine similarity between feature embeddings.
# TODO: Find the relevant function in pytorch and read its documentation.
similarity_metric = nn.CosineSimilarity(dim=0, eps=1e-6)
val_veri_csv = osp.join(DATA_DIR, "verification/verification/verification_test.csv")


# Now, loop through the csv and compare each pair, getting the similarity between them
pred_similarities = []
for line in tqdm(open(val_veri_csv).read().splitlines()[1:], position=0, leave=False): # skip header
    img_path1, img_path2 = line.split(",")

    # TODO: Finish up verification testing.
    # How to use these img_paths? What to do with the features?
    similarity = similarity_metric(feats_dict[img_path1.split("/")[1]], feats_dict[img_path2.split("/")[1]])
    pred_similarities.append(similarity)



In [None]:
with open("verification_early_submission.csv", "w+") as f:
    f.write("id,match\n")
    for i in range(len(pred_similarities)):
        f.write("{},{}\n".format(i, pred_similarities[i]))

In [None]:
!kaggle competitions submit -c 11-785-s22-hw2p2-verification -f verification_early_submission.csv -m "MobileNetv2 (model_epoch_26) triple loss 150 epochs"

100% 16.4M/16.4M [00:04<00:00, 3.50MB/s]
Successfully submitted to Face Verification

# Extras

In [None]:
# If you keep re-initializing your model in Colab, can run out of GPU memory, need to restart.
# These three lines can help that - run this before you re-initialize your model

del model
torch.cuda.empty_cache()
!nvidia-smi

Thu Mar 17 02:53:21 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   38C    P0    33W / 250W |  16209MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces