In [1]:
import torch
torch.cuda.is_available()

True

In [2]:
!pip install git+https://github.com/OML-Team/open-metric-learning.git
!pip install wandb
!wandb login "2a3ffdce0110826a26805443c7575053621bc696"

!wget "https://data.caltech.edu/records/65de6-vp158/files/CUB_200_2011.tgz"
!tar -zxvf CUB_200_2011.tgz

!wget "https://raw.githubusercontent.com/OML-Team/open-metric-learning/main/pipelines/datasets_converters/convert_cub.py"
!python convert_cub.py --dataset_root=/kaggle/working/CUB_200_2011 --no_bboxes

Collecting git+https://github.com/OML-Team/open-metric-learning.git
  Cloning https://github.com/OML-Team/open-metric-learning.git to /tmp/pip-req-build-q0typa2h
  Running command git clone --filter=blob:none --quiet https://github.com/OML-Team/open-metric-learning.git /tmp/pip-req-build-q0typa2h
  Resolved https://github.com/OML-Team/open-metric-learning.git to commit f5f8e8b182b5ff466f7d739f710ff282ec85f1b8
  Preparing metadata (setup.py) ... [?25ldone
Collecting pytorch-lightning<1.7.0,>=1.5.9
  Downloading pytorch_lightning-1.6.5-py3-none-any.whl (585 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m585.9/585.9 kB[0m [31m24.6 MB/s[0m eta [36m0:00:00[0m
Collecting jupyter>=1.0.0
  Downloading jupyter-1.0.0-py2.py3-none-any.whl (2.7 kB)
Collecting grad-cam>=1.4.6
  Downloading grad-cam-1.4.6.tar.gz (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m55.5 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0mm
[?25h  Installing

In [3]:
import datetime as dt
from pathlib import Path

import numpy as np
import pandas as pd
import timm
import torch
from torch.utils.data import DataLoader
from tqdm import tqdm

from oml.datasets.base import DatasetWithLabels
from oml.inference.flat import inference_on_images
from oml.losses.triplet import TripletLossWithMiner
from oml.miners.cross_batch import TripletMinerWithMemory
from oml.models.vit.vit import ViTExtractor
from oml.samplers.balance import BalanceSampler
from oml.transforms.images.albumentations import (
    get_augs_albu,
    get_normalisation_resize_albu
)

import wandb

In [4]:
from oml.functional.metrics import (
    calc_gt_mask,
    calc_mask_to_ignore,
    calc_retrieval_metrics,
)


def compute_metrics(dist_mat, labels, is_query, is_gallery, **metrics):
    mask_gt = calc_gt_mask(labels=labels, is_query=is_query, is_gallery=is_gallery)
    mask_to_ignore = calc_mask_to_ignore(is_query=is_query, is_gallery=is_gallery)
    return calc_retrieval_metrics(dist_mat, mask_gt, mask_to_ignore, **metrics)


def transform_metrics_for_wandb_logging(metrics_value):
    res = {}
    for metric_name in metrics_value:
        for k in metrics_value[metric_name]:
            res[metric_name + '/' + str(k)] = metrics_value[metric_name][k].item()
    return res


def save_model(path, num_epochs, model, optimizer, scheduler=None):
    '''Save on GPU'''
    data = {
        'num_epochs': num_epochs,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'scheduler_state_dict': scheduler.state_dict() if scheduler is not None else None
    }
    torch.save(data, path)


def load_model(path, device, model, optimizer=None, scheduler=None):
    '''Load on GPU'''
    data = torch.load(path)
    model.load_state_dict(data['model_state_dict'])
    model.to(device)
    if optimizer is not None:
        optimizer.load_state_dict(data['optimizer_state_dict'])
    if scheduler is not None:
        scheduler.load_state_dict(data['scheduler_state_dict'])
    return data['num_epochs']


@torch.no_grad()
def inference(model, valid_loader, device):
    embeds, labels = [], []
    for batch in valid_loader:
        embeds += [model(batch['input_tensors'].to(device))]
        labels += [batch['labels']]
    return torch.cat(embeds, dim=0).cpu(), torch.cat(labels, dim=0).cpu()


@torch.no_grad()
def validation(model, valid_loader, metrics, device):
    model.eval()
    embeds, labels = inference(model, valid_loader, device)
    print(f'Inference finished: {dt.datetime.now()}')

    dist_mat = torch.cdist(embeds, embeds, p=2)
    mask = torch.ones(len(embeds))
    metrics_value = compute_metrics(dist_mat, labels, mask, mask, **metrics)
    wandb_metrics_value = transform_metrics_for_wandb_logging(metrics_value)
    print(wandb_metrics_value, end='\n\n')
    
    return wandb_metrics_value

In [5]:
dataset_root = '/kaggle/working/CUB_200_2011'
num_workers = 2
valid_batch_size = 128
n_labels = 8
n_instances = 4

df = pd.read_csv(dataset_root + '/df.csv')
# use trainval split as in DML articles
df[['is_query', 'is_gallery']] = np.nan
df.loc[df['label'] <= 100, 'split'] = 'train'
df.loc[df['label'] > 100, 'split'] = 'validation'
df.loc[df['label'] > 100, ['is_query', 'is_gallery']] = True

df_train = df[df['split'] == 'train']
df_valid = df[df['split'] == 'validation']

train_transforms = get_augs_albu(224)
valid_transforms = get_normalisation_resize_albu(224)

train_dataset = DatasetWithLabels(df_train, transform=train_transforms, dataset_root=dataset_root)
valid_dataset = DatasetWithLabels(df_valid, transform=valid_transforms, dataset_root=dataset_root)

sampler = BalanceSampler(train_dataset.get_labels(), n_labels=n_labels, n_instances=n_instances)
train_loader = DataLoader(train_dataset, batch_sampler=sampler, num_workers=num_workers)
valid_loader = DataLoader(valid_dataset, batch_size=valid_batch_size, num_workers=num_workers)

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# model = ViTExtractor('vits16_dino', arch='vits16', normalise_features=False).to(device)
model = timm.create_model('vit_small_patch16_224', pretrained=True).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
scheduler = None
criterion = TripletLossWithMiner(margin=None, miner=TripletMinerWithMemory(50, 3))

# Training

In [6]:
torch.manual_seed(42)

n_epochs = 10000
valid_period = 10

metrics = {
    'cmc_top_k': [1],  # to calculate cmc@1
    'map_top_k': [5],  # to calculate map@5
    'precision_top_k': [],
    'fmr_vals': []
}

wandb_init_data = {
    'project': 'TP3',
    'name': 'run',
    'save_code': True,
    'config': {
        'model': 'ViT',
        'optimizer': optimizer,
        'scheduler': scheduler,
        'sampler': {
            'name': 'balanced',
            'n_labels': n_labels,
            'n_instances': n_instances
        },
        
        'valid_period': valid_period,

        'dataset': 'CUB_200_2011',
        'num_epochs': n_epochs,
        'dataloader_num_workers': num_workers,
        'script': _ih[-1]
    }
}

with wandb.init(**wandb_init_data) as run:
    print('Evaluating pre-trained model before training')
    wandb_metrics_value = validation(model, valid_loader, metrics, device)
    wandb.log(wandb_metrics_value)
    best_cmc1 = wandb_metrics_value['cmc/1']
    
    for epoch in range(n_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            embeddings = model(batch['input_tensors'].to(device))
            loss = criterion(embeddings, batch['labels'].to(device))
            loss.backward()
            optimizer.step()

        if (epoch + 1) % valid_period == 0:
            print(f'{epoch + 1} training epochs finished\nValidation started: {dt.datetime.now()}')
            with torch.inference_mode():
                wandb_metrics_value = validation(model, valid_loader, metrics, device)
                wandb.log(wandb_metrics_value)
                
                if wandb_metrics_value['cmc/1'] > best_cmc1:
                    best_cmc1 = wandb_metrics_value['cmc/1']
                    save_model('best.pt', epoch + 1, model, optimizer, scheduler)
                    wandb.save('best.pt')
                    print(f'\nNew best CMC@1 {best_cmc1} at {epoch + 1} epoch\n')

[34m[1mwandb[0m: Currently logged in as: [33mnik-fedorov[0m. Use [1m`wandb login --relogin`[0m to force relogin


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.01666968155000177, max=1.0)…

Evaluating pre-trained model before training
Inference finished: 2023-05-04 22:16:41.147658
{'cmc/1': 0.6166441440582275, 'map/5': 0.6818005442619324}

10 training epochs finished
Validation started: 2023-05-04 22:17:29.125823
Inference finished: 2023-05-04 22:18:03.714586
{'cmc/1': 0.6806212067604065, 'map/5': 0.7372493743896484}


New best CMC@1 0.6806212067604065 at 10 epoch

20 training epochs finished
Validation started: 2023-05-04 22:18:53.744027
Inference finished: 2023-05-04 22:19:28.905114
{'cmc/1': 0.6899054646492004, 'map/5': 0.7409703135490417}


New best CMC@1 0.6899054646492004 at 20 epoch

30 training epochs finished
Validation started: 2023-05-04 22:20:17.301917
Inference finished: 2023-05-04 22:20:51.967556
{'cmc/1': 0.6912559270858765, 'map/5': 0.7407087087631226}


New best CMC@1 0.6912559270858765 at 30 epoch

40 training epochs finished
Validation started: 2023-05-04 22:21:41.230532
Inference finished: 2023-05-04 22:22:16.161263
{'cmc/1': 0.7018905878067017, 'map/5

VBox(children=(Label(value='253.415 MB of 253.415 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0,…

0,1
cmc/1,▇█████████▁▁
map/5,▇█████████▁▁

0,1
cmc/1,0.0
map/5,0.00573


KeyboardInterrupt: 

In [None]:
for x in model.parameters():
    print(torch.norm(x))

# Validation only

In [7]:
# model = ViTExtractor('vits16_cub', arch='vits16', normalise_features=False).to(device)

metrics = {
    'cmc_top_k': [1],  # to calculate cmc@1
    'map_top_k': [5],  # to calculate map@5
    'precision_top_k': [],
    'fmr_vals': []
}

with torch.inference_mode():
    model.eval()
    embeds = inference(model, valid_loader, device)
    labels = df_valid['label'].values

    dist_mat = torch.cdist(embeds, embeds, p=2)
    mask = torch.ones(len(embeds))
    metrics_value = compute_metrics(dist_mat, labels, mask, mask, **metrics)
    
    print(metrics_value)

{'cmc': {1: tensor(0.6300)}, 'map': {5: tensor(0.6912)}}


# Saving model in wandb

In [None]:
with wandb.init(**{'project': 'TP3', 'name': 'run'}) as run:
    save_model('model.pt', 10, model, optimizer)
    wandb.save('model.pt')

# Loading model from wandb and resume training

In [6]:
import wandb
best_model = wandb.restore('best.pt', run_path="nik-fedorov/TP3/f0ian1ey")

# model = ViTExtractor('vits16_dino', arch='vits16', normalise_features=False).to(device)
# model = timm.create_model('vit_small_patch16_224', pretrained=True).to(device)
load_model(best_model.name, device, model, optimizer)
best_model.name

'/kaggle/working/best.pt'

In [7]:
torch.manual_seed(42)

n_epochs = 10000
valid_period = 10

metrics = {
    'cmc_top_k': [1],  # to calculate cmc@1
    'map_top_k': [5],  # to calculate map@5
    'precision_top_k': [],
    'fmr_vals': []
}

wandb_init_data = {
    'project': 'TP3',
    'name': 'run',
    'save_code': True,
    'config': {
        'model': 'ViT',
        'optimizer': optimizer,
        'scheduler': scheduler,
        'sampler': {
            'name': 'balanced',
            'n_labels': n_labels,
            'n_instances': n_instances
        },
        
        'valid_period': valid_period,

        'dataset': 'CUB_200_2011',
        'num_epochs': n_epochs,
        'dataloader_num_workers': num_workers,
        'script': _ih[-1]
    }
}

with wandb.init(**wandb_init_data) as run:
    print('Evaluating pre-trained model before training')
    wandb_metrics_value = validation(model, valid_loader, metrics, device)
    wandb.log(wandb_metrics_value)
    best_cmc1 = wandb_metrics_value['cmc/1']
    
    for epoch in range(n_epochs):
        model.train()
        for batch in train_loader:
            optimizer.zero_grad()
            embeddings = model(batch['input_tensors'].to(device))
            loss = criterion(embeddings, batch['labels'].to(device))
            loss.backward()
            optimizer.step()
        
        
        mp = max(torch.max(x) for x in model.parameters())
        params_norms = [torch.norm(x) for x in model.parameters()]
        pn = torch.norm(torch.stack(params_norms))
        mg = max(torch.max(x.grad) for x in model.parameters())
        grads_norms = [torch.norm(x.grad) for x in model.parameters()]
        gn = torch.norm(torch.stack(grads_norms))
        
        print(mp, pn, '     ', mg, gn)
            

        if (epoch + 1) % valid_period == 0:
            print(f'{epoch + 1} training epochs finished\nValidation started: {dt.datetime.now()}')
            with torch.inference_mode():
                wandb_metrics_value = validation(model, valid_loader, metrics, device)
                wandb.log(wandb_metrics_value)
                
                if wandb_metrics_value['cmc/1'] > best_cmc1:
                    best_cmc1 = wandb_metrics_value['cmc/1']
                    save_model('best.pt', epoch + 1, model, optimizer, scheduler)
                    wandb.save('best.pt')
                    print(f'\nNew best CMC@1 {best_cmc1} at {epoch + 1} epoch\n')

[34m[1mwandb[0m: Currently logged in as: [33mnik-fedorov[0m. Use [1m`wandb login --relogin`[0m to force relogin


Evaluating pre-trained model before training
Inference finished: 2023-05-05 00:01:34.231275
{'cmc/1': 0.71049964427948, 'map/5': 0.7619401216506958}

tensor(22.1182, device='cuda:0', grad_fn=<MaxBackward1>) tensor(404.7826, device='cuda:0', grad_fn=<NormBackward1>)       tensor(0.1297, device='cuda:0') tensor(4.3940, device='cuda:0')
tensor(22.1182, device='cuda:0', grad_fn=<MaxBackward1>) tensor(404.7826, device='cuda:0', grad_fn=<NormBackward1>)       tensor(0.0587, device='cuda:0') tensor(3.0870, device='cuda:0')
tensor(22.1183, device='cuda:0', grad_fn=<MaxBackward1>) tensor(404.7826, device='cuda:0', grad_fn=<NormBackward1>)       tensor(0.0334, device='cuda:0') tensor(2.0690, device='cuda:0')
tensor(22.1183, device='cuda:0', grad_fn=<MaxBackward1>) tensor(404.7826, device='cuda:0', grad_fn=<NormBackward1>)       tensor(0.0954, device='cuda:0') tensor(3.4967, device='cuda:0')
tensor(22.1183, device='cuda:0', grad_fn=<MaxBackward1>) tensor(404.7827, device='cuda:0', grad_fn=<NormBa

0,1
cmc/1,█████▁
map/5,█████▁

0,1
cmc/1,0.0
map/5,0.00573


KeyboardInterrupt: 