# Classification of the feature vectors acquired from the Contrastive Learning model
- Linear Classification
- Support Vector Machine

### 1. Imports

In [19]:
import sklearn
import scipy
#import skimage
import pandas
import numpy as np
from PIL import Image
import bokeh
from copy import deepcopy

## Imports for plotting
import matplotlib.pyplot as plt
plt.set_cmap('cividis')
%matplotlib inline
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg', 'pdf') # For export
import matplotlib
matplotlib.rcParams['lines.linewidth'] = 2.0
import seaborn as sns
sns.set()

## tqdm for loading bars
from tqdm.notebook import tqdm

## PyTorch
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import torchvision.models as models

## Torchvision
import torchvision
from torchvision import transforms

# PyTorch Lightning
import pytorch_lightning as pl
from pytorch_lightning.callbacks import LearningRateMonitor, ModelCheckpoint

# Import tensorboard
%load_ext tensorboard

from tensorboard.plugins import projector

import cv2
import pathlib
import os
import datetime
#import tensorflow as tf

from os import listdir, walk
from os.path import isfile, join

from tensorboardX import SummaryWriter

The tensorboard extension is already loaded. To reload it, use:
  %reload_ext tensorboard


  set_matplotlib_formats('svg', 'pdf') # For export


In [None]:
!pip3 install ipywidgets==7.7.2

### 2. Dataset Preparation

In [20]:
from os import listdir, walk
from os.path import isfile, join


def _collect_files(root):
    """Return the paths of all files found anywhere below ``root``."""
    found = []
    for dirpath, _dirnames, filenames in walk(root):
        found.extend(join(dirpath, name) for name in filenames)
    return found


# Locations of the image folders and of the annotation file.
data_path_train = "/gpfs/data/fs71186/kadic/train_images"
data_path_test = "/gpfs/data/fs71186/kadic/test_images"

ground_truths = "/gpfs/data/fs71186/kadic/train_metadata.json"

train_image_files = _collect_files(data_path_train)
print(len(train_image_files))

test_image_files = _collect_files(data_path_test)
print(len(test_image_files))

# Sort in place so that the file order is deterministic.
train_image_files.sort()
test_image_files.sort()

679989
210408


In [21]:
import json

# Parse the annotation metadata. The context manager closes the file even when
# parsing raises, which the manual open()/close() pair did not guarantee.
with open(ground_truths) as f:
    ground_truth_data = json.load(f)

print(ground_truth_data.keys())
print(len(ground_truth_data["annotations"]))
print(ground_truth_data["annotations"][0])
print(train_image_files[0])

print(len(ground_truth_data["categories"]))
print(len(ground_truth_data["genera"]))

# List of per-image annotation records (each carries 'image_id' and 'category_id').
gt_annot = ground_truth_data["annotations"]

dict_keys(['annotations', 'images', 'categories', 'genera', 'institutions', 'distances', 'license'])
839772
{'genus_id': 1, 'institution_id': 0, 'category_id': 0, 'image_id': '00000__001'}
/gpfs/data/fs71186/kadic/train_images/000/00/00000__001.jpg
15501
2564


In [22]:
train_data = []
test_data = []

# Attach labels through the image id encoded in the file name instead of by
# list position: there are fewer image files than annotation records, so
# positional pairing is only correct while both lists happen to stay aligned.
category_by_image_id = {annot['image_id']: annot['category_id'] for annot in gt_annot}

for i, img in enumerate(train_image_files):
    # The file stem (e.g. "00000__001") is the annotation's image_id.
    image_id = os.path.splitext(os.path.basename(img))[0]
    if i % 100000 == 0:
        # Periodic sanity print of the path and the id it was matched with.
        print(img)
        print(image_id)
    # A missing annotation raises KeyError rather than silently mislabelling.
    train_data.append((img, category_by_image_id[image_id]))

print(len(train_image_files))
print(len(train_data))

/gpfs/data/fs71186/kadic/train_images/000/00/00000__001.jpg
00000__001
/gpfs/data/fs71186/kadic/train_images/019/33/01933__009.jpg
01933__009
/gpfs/data/fs71186/kadic/train_images/037/74/03774__113.jpg
03774__113
/gpfs/data/fs71186/kadic/train_images/055/74/05574__051.jpg
05574__051
/gpfs/data/fs71186/kadic/train_images/074/12/07412__018.jpg
07412__018
/gpfs/data/fs71186/kadic/train_images/092/23/09223__036.jpg
09223__036
/gpfs/data/fs71186/kadic/train_images/111/17/11117__031.jpg
11117__031
679989
679989


In [23]:
from collections import Counter

# Distinct category ids in first-seen order. dict.fromkeys de-duplicates in one
# pass; the previous version scanned the whole `labels` list for every image.
labels = list(dict.fromkeys(annot for img, annot in train_data))

# Number of images per category, keyed by the category id as a string
# (same keys and insertion order as the hand-written counting loop produced).
label_count = dict(Counter(str(annot) for img, annot in train_data))

# Categories ordered from least to most frequent (stable for equal counts).
sorted_count = dict(sorted(label_count.items(), key=lambda item: item[1]))

#print(list(sorted_count.items())[:200])

In [24]:
# Number of most frequent categories kept for the classification experiment.
NUM_TOP_CATEGORIES = 200

print(len(list(sorted_count.items())))
# sorted_count is ordered by ascending frequency, so the largest categories sit
# at the end. Slicing from the end keeps the last NUM_TOP_CATEGORIES entries
# without hard-coding the total number of categories (previously index 12333).
biggest_categories = list(sorted_count.items())[-NUM_TOP_CATEGORIES:]

print(len(biggest_categories))
# Total number of images covered by the retained categories.
cat_sum = sum(count for _, count in biggest_categories)
print(cat_sum)

12533
200
15965


In [25]:
# Map every retained category id (as a string) to its position in
# biggest_categories, giving contiguous class indices starting at 0.
dict_keys_back = {
    str(category): index
    for index, (category, _count) in enumerate(biggest_categories)
}
#print(dict_keys_back)

In [26]:
# Keep only the images whose category is one of the retained ones. A set gives
# constant-time membership tests instead of comparing against every retained
# category for every image.
retained_category_keys = {str(key) for key, val in biggest_categories}
reduced_train_data = [
    (img, annot)
    for img, annot in train_data
    if str(annot) in retained_category_keys
]

print(len(reduced_train_data))

15965


In [27]:
# Same selection as reduced_train_data, but with the category id replaced by
# its contiguous class index. dict_keys_back has exactly the retained category
# ids as keys, so one dictionary lookup replaces the inner loop over
# biggest_categories.
reduced_train_data_subbed = [
    (img, dict_keys_back[str(annot)])
    for img, annot in train_data
    if str(annot) in dict_keys_back
]

print(len(reduced_train_data_subbed))

15965


### 3. Parameters for loading the SimCLR model and for Training

In [28]:
# Root folder of the training images
DATASET_PATH = data_path_train
# Path to the folder where the pretrained models are saved
CHECKPOINT_PATH = "../saved_models/contrastive_models"
# Number of data-loader worker processes: up to 128, but never more than the
# number of CPU cores the machine reports, so the notebook also runs on
# smaller hosts. os.cpu_count() can return None, hence the fallback to 1.
NUM_WORKERS = min(128, os.cpu_count() or 1)

# Setting the seed
pl.seed_everything(42)

# Ensure that all operations are deterministic on GPU (if used) for reproducibility
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)
print("Number of workers:", NUM_WORKERS)

[rank: 0] Global seed set to 42


Device: cuda:0
Number of workers: 128


In [29]:
class ImageDataset(Dataset):
    """Dataset yielding ``(image, label)`` pairs read lazily from image files.

    Args:
        paths: Sequence of ``(image_path, label)`` pairs.
        transform: Callable applied to the RGB PIL image. If it is falsy
            (e.g. ``None``) the PIL image is returned untransformed.
    """

    def __init__(self, paths,transform):
        self.paths = [i[0] for i in paths]
        self.transform = transform
        # Despite the attribute name, this holds the labels (second element of each pair).
        self.target_paths = [i[1] for i in paths]

    def __len__(self):
        """Return the number of samples."""
        return len(self.paths)

    def __getitem__(self, index):
        """Load the image at ``index`` as RGB and return ``(image, label)``."""
        # The context manager closes the file handle as soon as the pixels are
        # decoded; convert() returns a new image that remains valid afterwards.
        with Image.open(self.paths[index]) as image_file:
            image = image_file.convert('RGB')

        image_tensor = image
        if self.transform:
            image_tensor = self.transform(image)

        target = self.target_paths[index]

        return (image_tensor, target)

### 4. Load The Pretrained Model

In [12]:
class SimCLR(pl.LightningModule):
    """ResNet-18 encoder with an MLP projection head, trained with the InfoNCE loss.

    Args:
        hidden_dim: Output size of the projection head; the layer before it
            has ``4*hidden_dim`` units.
        lr: Learning rate passed to AdamW; the cosine schedule anneals down to ``lr/50``.
        temperature: Positive scalar the cosine similarities are divided by.
        weight_decay: Weight decay passed to AdamW.
        max_epochs: ``T_max`` of the cosine annealing schedule.
    """

    def __init__(self, hidden_dim, lr, temperature, weight_decay, max_epochs=500):
        super().__init__()
        # Stores the constructor arguments in self.hparams.
        self.save_hyperparameters()
        assert self.hparams.temperature > 0.0, 'The temperature must be a positive float!'
        # Base model f(.)
        self.convnet = torchvision.models.resnet18(num_classes=4*hidden_dim)  # Output of last linear layer
        # The MLP for g(.) consists of Linear->ReLU->Linear
        self.convnet.fc = nn.Sequential(
            self.convnet.fc,  # Linear(ResNet output, 4*hidden_dim)
            nn.ReLU(inplace=True),
            nn.Linear(4*hidden_dim, hidden_dim)
        )

    def configure_optimizers(self):
        """Return AdamW together with a cosine-annealing learning-rate schedule."""
        optimizer = optim.AdamW(self.parameters(),
                                lr=self.hparams.lr,
                                weight_decay=self.hparams.weight_decay)
        lr_scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer,
                                                            T_max=self.hparams.max_epochs,
                                                            eta_min=self.hparams.lr/50)
        return [optimizer], [lr_scheduler]

    def info_nce_loss(self, batch, mode='train'):
        """Compute the InfoNCE loss for a batch and log loss and ranking metrics.

        Args:
            batch: Pair whose first element is a sequence of image tensors that
                are concatenated along dim 0; the second element is ignored.
            mode: Prefix for the logged metric names (e.g. ``'train'``, ``'val'``).

        Returns:
            The mean negative log-likelihood over all rows of the similarity matrix.
        """
        imgs, _ = batch
        # assumes imgs holds two augmented views of the same images, so that the
        # positive of row i sits half a batch away -- TODO confirm against the data pipeline
        imgs = torch.cat(imgs, dim=0)

        # Encode all images
        feats = self.convnet(imgs)
        # Calculate cosine similarity
        cos_sim = F.cosine_similarity(feats[:,None,:], feats[None,:,:], dim=-1)
        # Mask out cosine similarity to itself
        self_mask = torch.eye(cos_sim.shape[0], dtype=torch.bool, device=cos_sim.device)
        # In-place fill: the diagonal stays masked for every use of cos_sim below.
        cos_sim.masked_fill_(self_mask, -9e15)
        # Find positive example -> batch_size//2 away from the original example
        pos_mask = self_mask.roll(shifts=cos_sim.shape[0]//2, dims=0)
        # InfoNCE loss
        cos_sim = cos_sim / self.hparams.temperature
        nll = -cos_sim[pos_mask] + torch.logsumexp(cos_sim, dim=-1)
        nll = nll.mean()

        # Logging loss
        self.log(mode+'_loss', nll)
        # Get ranking position of positive example
        comb_sim = torch.cat([cos_sim[pos_mask][:,None],  # First position positive example
                              cos_sim.masked_fill(pos_mask, -9e15)],
                             dim=-1)
        # Column 0 is the positive; argmin over the descending argsort gives the
        # rank at which column 0 appears.
        sim_argsort = comb_sim.argsort(dim=-1, descending=True).argmin(dim=-1)
        # Logging ranking metrics
        self.log(mode+'_acc_top1', (sim_argsort == 0).float().mean())
        self.log(mode+'_acc_top5', (sim_argsort < 5).float().mean())
        self.log(mode+'_acc_mean_pos', 1+sim_argsort.float().mean())

        return nll

    def training_step(self, batch, batch_idx):
        """Return the InfoNCE loss, logged under the ``train`` prefix."""
        return self.info_nce_loss(batch, mode='train')

    def validation_step(self, batch, batch_idx):
        """Log the InfoNCE loss and metrics under the ``val`` prefix; returns nothing."""
        self.info_nce_loss(batch, mode='val')

In [13]:
# Build the checkpoint path from CHECKPOINT_PATH instead of repeating the
# directory literal, so the location is configured in one place only.
pretrained_filename = os.path.join(CHECKPOINT_PATH, "SimCLR_Eval_2_Validated.ckpt")

# Restore the contrastive model (weights and hyperparameters) from the checkpoint.
simclr_model = SimCLR.load_from_checkpoint(pretrained_filename)
print(type(simclr_model))
#train_feats_simclr = prepare_data_features(simclr_model, dataset_full_nocut)

<class '__main__.SimCLR'>


## 5. Linear Classification

In [14]:
class LogisticRegression(pl.LightningModule):
    """Linear classifier on top of fixed feature vectors, trained with cross-entropy."""

    def __init__(self, feature_dim, num_classes, lr, weight_decay, max_epochs=100):
        super().__init__()
        self.save_hyperparameters()
        # Single linear layer mapping a feature vector to class logits.
        self.model = nn.Linear(feature_dim, num_classes)

    def configure_optimizers(self):
        """AdamW with the learning rate scaled by 0.1 at 60% and 80% of the epochs."""
        hp = self.hparams
        optimizer = optim.AdamW(self.parameters(), lr=hp.lr, weight_decay=hp.weight_decay)
        decay_epochs = [int(hp.max_epochs*0.6), int(hp.max_epochs*0.8)]
        lr_scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=decay_epochs, gamma=0.1)
        return [optimizer], [lr_scheduler]

    def _calculate_loss(self, batch, mode='train'):
        """Return the cross-entropy loss of a batch; log loss and accuracy under ``mode``."""
        features, targets = batch
        logits = self.model(features)
        loss = F.cross_entropy(logits, targets)
        correct = logits.argmax(dim=-1) == targets
        acc = correct.float().mean()

        for metric_name, value in (('_loss', loss), ('_acc', acc)):
            self.log(mode + metric_name, value)
        return loss

    def training_step(self, batch, batch_idx):
        return self._calculate_loss(batch, mode='train')

    def validation_step(self, batch, batch_idx):
        self._calculate_loss(batch, mode='val')

    def test_step(self, batch, batch_idx):
        self._calculate_loss(batch, mode='test')

In [None]:
import random

img_transforms = transforms.Compose([transforms.Resize((1000, 666)),
                                     transforms.ToTensor(),
                                     transforms.Normalize((0.5,), (0.5,))])

# Use the samples whose labels were remapped to contiguous class indices: the
# linear classifier is built with 200 outputs, so cross-entropy needs targets
# in [0, 200), which the raw category ids in reduced_train_data do not satisfy.
# Shuffle a copy with a dedicated seeded generator so that re-running the cell
# neither reorders the source list nor changes the split.
cpy = list(reduced_train_data_subbed)
random.Random(42).shuffle(cpy)

# 80/20 train/test split, computed from the data size instead of hard-coded
# indices (12772 / 3193 for the 15965 retained images).
split_index = len(cpy) * 4 // 5
tr_files = cpy[:split_index]
ts_files = cpy[split_index:]

print(len(tr_files))

# Summarise the held-out labels instead of printing one line per test sample.
print("Distinct classes in test split:", len({label for _, label in ts_files}))

train_img_data = ImageDataset(tr_files, transform = img_transforms)

test_img_data = ImageDataset(ts_files, transform = img_transforms)

print("Number of training examples:", len(train_img_data))
print("Number of test examples:", len(test_img_data))

print(type(train_img_data[0]))

In [38]:
@torch.no_grad()
def prepare_data_features(model, dataset, batch_size=64):
    """Encode a dataset with the model's backbone and return features sorted by label.

    Args:
        model: Object exposing a ``convnet`` attribute whose ``fc`` head can be
            replaced; the model itself is not modified (a deep copy is used).
        dataset: Dataset yielding ``(image_tensor, label)`` pairs.
        batch_size: Number of images encoded per forward pass.

    Returns:
        A ``TensorDataset`` of ``(features, labels)`` on the CPU, ordered by
        ascending label.
    """
    # Prepare model
    network = deepcopy(model.convnet)
    network.fc = nn.Identity()  # Removing projection head g(.)
    network.eval()
    network.to(device)

    # Encode all images
    data_loader = data.DataLoader(dataset, batch_size=batch_size, num_workers=NUM_WORKERS,
                                  shuffle=False, drop_last=False)
    feats, labels = [], []
    for batch_imgs, batch_labels in tqdm(data_loader):
        batch_imgs = batch_imgs.to(device)
        # Gradients are disabled by the decorator, so no detach() is needed.
        feats.append(network(batch_imgs).cpu())
        labels.append(batch_labels)

    feats = torch.cat(feats, dim=0)
    labels = torch.cat(labels, dim=0)

    # Sort images by labels
    labels, idxs = labels.sort()
    feats = feats[idxs]

    return data.TensorDataset(feats, labels)

In [39]:
# Encode the training split first, then the test split, with the SimCLR backbone.
train_feats_simclr, test_feats_simclr = (
    prepare_data_features(simclr_model, split)
    for split in (train_img_data, test_img_data)
)

  0%|          | 0/200 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [45]:
def train_logreg(batch_size, train_feats_data, test_feats_data, model_suffix, max_epochs=100, **kwargs):
    """Train (or load) a LogisticRegression model on feature vectors and evaluate it.

    Args:
        batch_size: Batch size of both data loaders.
        train_feats_data: Dataset of ``(features, labels)`` used for training.
        test_feats_data: Dataset of ``(features, labels)`` used for validation
            during training and for the final test.
        model_suffix: Suffix of the checkpoint file name
            ``LogisticRegression_{model_suffix}.ckpt`` looked up in CHECKPOINT_PATH.
        max_epochs: Maximum number of training epochs.
        **kwargs: Forwarded to the ``LogisticRegression`` constructor.

    Returns:
        ``(model, result)`` where ``result`` holds the accuracies under the
        keys ``"train"`` and ``"test"``.
    """
    trainer = pl.Trainer(default_root_dir=os.path.join(CHECKPOINT_PATH, "LogisticRegression"),
                         accelerator="gpu" if str(device).startswith("cuda") else "cpu",
                         devices=1,
                         max_epochs=max_epochs,
                         callbacks=[ModelCheckpoint(save_weights_only=True, mode='max', monitor='val_acc'),
                                    LearningRateMonitor("epoch")],
                         enable_progress_bar=False,
                         check_val_every_n_epoch=10,
                         )
    trainer.logger._default_hp_metric = None

    # Data loaders. The features are already tensors in memory, so batches are
    # built in the main process (num_workers=0): spawning NUM_WORKERS processes
    # every epoch only adds start-up and inter-process overhead here.
    train_loader = data.DataLoader(train_feats_data, batch_size=batch_size, shuffle=True,
                                   drop_last=False, num_workers=0)
    test_loader = data.DataLoader(test_feats_data, batch_size=batch_size, shuffle=False,
                                  drop_last=False, num_workers=0)

    # Check whether pretrained model exists. If yes, load it and skip training
    pretrained_filename = os.path.join(CHECKPOINT_PATH, f"LogisticRegression_{model_suffix}.ckpt")
    if os.path.isfile(pretrained_filename):
        print(f"Found pretrained model at {pretrained_filename}, loading...")
        model = LogisticRegression.load_from_checkpoint(pretrained_filename)
    else:
        pl.seed_everything(42)  # To be reproducible
        model = LogisticRegression(**kwargs)
        # NOTE(review): the test loader doubles as the validation loader, so the
        # checkpoint is selected on the same data that is reported as "test".
        trainer.fit(model, train_loader, test_loader)
        model = LogisticRegression.load_from_checkpoint(trainer.checkpoint_callback.best_model_path)

    # Test best model on train and validation set
    train_result = trainer.test(model, train_loader, verbose=False)
    test_result = trainer.test(model, test_loader, verbose=False)
    result = {"train": train_result[0]["test_acc"], "test": test_result[0]["test_acc"]}

    return model, result

In [None]:
# Arguments forwarded to the LogisticRegression constructor.
logreg_hparams = {
    "feature_dim": train_feats_simclr.tensors[0].shape[1],
    "num_classes": 200,
    "lr": 1e-3,
    "weight_decay": 1e-3,
}

# Only the accuracy dictionary is kept; the trained model is discarded.
_, set_results = train_logreg(
    batch_size=64,
    train_feats_data=train_feats_simclr,
    test_feats_data=test_feats_simclr,
    model_suffix="first_one",
    **logreg_hparams,
)

In [None]:
dataset_size = 15000
# The training call above stores its accuracies in `set_results`; the name
# `results` is never defined in this notebook and raised a NameError.
test_scores = set_results["test"]

print(test_scores)

## 6. Support Vector Machine Classification