# SimCLR
PyTorch implementation of SimCLR: A Simple Framework for Contrastive Learning of Visual Representations by T. Chen et al. With support for the LARS (Layer-wise Adaptive Rate Scaling) optimizer and global batch norm.

[Link to paper](https://arxiv.org/pdf/2002.05709.pdf)


In [1]:
gpu_info = !nvidia-smi 
gpu_info = '\n'.join(gpu_info)
print(gpu_info)

Fri Jun 17 10:26:17 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 450.51.05    Driver Version: 450.51.05    CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  TITAN Xp            Off  | 00000000:17:00.0 Off |                  N/A |
| 23%   28C    P8     8W / 250W |   4189MiB / 12196MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
|   1  TITAN Xp            Off  | 00000000:65:00.0 Off |                  N/A |
| 23%   36C    P8     9W / 250W |    939MiB / 12194MiB |      0%      Default |
|       

## Setup the repository

In [2]:
%pip install  pyyaml --upgrade

Note: you may need to restart the kernel to use updated packages.


In [3]:
%pip install simCLR

Note: you may need to restart the kernel to use updated packages.


# Part 1:
## SimCLR pre-training

In [2]:
import os
import torch
import numpy as np  
import argparse

apex = False
try:
    from apex import amp
    apex = True
except ImportError:
    print(
        "Install the apex package from https://www.github.com/nvidia/apex to use fp16 for training"
    )


from simclr import SimCLR
from simclr.modules import get_resnet, NT_Xent
from simclr.modules.transformations import TransformsSimCLR

Install the apex package from https://www.github.com/nvidia/apex to use fp16 for training


### Load arguments from `config/config.yaml`

In [3]:
from pprint import pprint
import argparse
from utils import yaml_config_hook

parser = argparse.ArgumentParser(description="SimCLR")
config = yaml_config_hook("./config/config.yaml")
for k, v in config.items():
    parser.add_argument(f"--{k}", default=v, type=type(v))

args = parser.parse_args([])
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
#device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [4]:
### override any configuration parameters here, e.g. to adjust for use on GPUs on the Colab platform:
args.batch_size = 64
args.resnet = "resnet18"
pprint(vars(args))

{'batch_size': 64,
 'dataparallel': 0,
 'dataset': '',
 'dataset_dir': './data',
 'epoch_num': 10,
 'epochs': 10,
 'gpus': ['0 1'],
 'image_size': 64,
 'logistic_batch_size': 256,
 'logistic_epochs': 500,
 'model_path': 'model18_simclr.pt',
 'nodes': 1,
 'nr': 1,
 'optimizer': 'Adam',
 'pretrain': True,
 'projection_dim': 64,
 'reload': False,
 'resnet': 'resnet18',
 'seed': 9,
 'start_epoch': 0,
 'temperature': 0.5,
 'weight_decay': 1e-06,
 'workers': 12}


### Load dataset into train loader

In [5]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
    print("CSV data do not exist. Downloading...")
    !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')

#Compute sample and year information
data['year'] = data['Sample'].str[6:10].astype(str) #Compute the year
samples=data.groupby('Sample').first()
samples = samples[["year"]]
print(data)

                        Sample  roi_number        OriginalClass  \
0        IFCB1_2006_158_000036           1                  mix   
1        IFCB1_2006_158_000036           2  Tontonia_gracillima   
2        IFCB1_2006_158_000036           3                  mix   
3        IFCB1_2006_158_000036           4                  mix   
4        IFCB1_2006_158_000036           5                  mix   
...                        ...         ...                  ...   
3457814  IFCB5_2014_353_205141        6850       Leptocylindrus   
3457815  IFCB5_2014_353_205141        6852                  mix   
3457816  IFCB5_2014_353_205141        6855                  mix   
3457817  IFCB5_2014_353_205141        6856                  mix   
3457818  IFCB5_2014_353_205141        6857                  mix   

              AutoClass FunctionalGroup  year  
0                   mix      Flagellate  2006  
1           ciliate_mix         Ciliate  2006  
2                   mix      Flagellate  2006  
3  

In [6]:
import progressbar
from tqdm import tqdm
from shutil import copyfile
import numpy as np

tqdm.pandas()

classcolumn = "AutoClass" #AutoClass means 51 classes
yearstraining = ['2006','2007'] #Years to consider as training
yearstest = ['2008'] #Years to consider as test

samplestraining = list(samples[samples['year'].isin(yearstraining)].index) #Samples to consider for training
samplestest = list(samples[samples['year'].isin(yearstest)].index) #Samples to consider for testing

classes=np.unique(data[classcolumn])
classes.sort()


In [8]:
import torchvision.transforms as T
from h5ifcbdataset import H5IFCBDataset
from torch.utils.data import DataLoader

hdf5_files_path = "/media/nas/olayap/env_olaya/TFM/IFBC_HDF5_olaya/output/"

#files to load
filestraining = [hdf5_files_path+s+'.hdf5' for s in samplestraining]
filestest = [hdf5_files_path+s+'.hdf5' for s in samplestest]

train_dset = H5IFCBDataset(filestraining,classes,classattribute=classcolumn, verbose=1,trainingset=False,transform=TransformsSimCLR(size=args.image_size))


In [16]:
pctg = 0.2
percentage = round(pctg*len(train_dset))
train_dset_pctg_label, train_dset_pctg_unlabel = torch.utils.data.random_split(train_dset, [percentage, len(train_dset)-percentage], generator=torch.Generator().manual_seed(0))
train_loader_unlabel = DataLoader(train_dset_pctg_unlabel,batch_size=args.batch_size,num_workers=args.workers,shuffle=True,pin_memory=True,drop_last=True)


In [8]:
args.epochs = 100
args.model_path = "model18_simclr_20_pr.pt"
args.batch_size = 128

### Load the SimCLR model, optimizer and learning rate scheduler

In [21]:
CUDA_LAUNCH_BLOCKING=1

In [10]:
import torch.nn as nn
from simclr.modules import LARS

# initialize ResNet
encoder = get_resnet(args.resnet, pretrained=False)
n_features = encoder.fc.in_features  


# initialize model
model = SimCLR(encoder, args.projection_dim, n_features)
model = model.to(device)

# optimizer / loss
scheduler = None
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)  


### Initialize the criterion (NT-Xent loss)

In [11]:
criterion = NT_Xent(args.batch_size, args.temperature, world_size=1)

### Train function

In [12]:
def train(args, train_loader, model, criterion, optimizer): 
    loss_epoch = 0
    for step, ((x_i, x_j),_, _)  in enumerate(train_loader):
        optimizer.zero_grad()
        x_i = x_i.cuda(non_blocking=True)
        x_j = x_j.cuda(non_blocking=True)
    
        # positive pair, with encoding
        h_i, h_j, z_i, z_j = model(x_i, x_j)

        loss = criterion(z_i, z_j)
        loss.backward()

        optimizer.step()

        if step % 100 == 0:
            print(f"Step [{step}/{len(train_loader)}]\t Loss: {loss.item()}")

        loss_epoch += loss.item()
        args.global_step += 1
    return loss_epoch

### Start training

In [None]:
torch.cuda.empty_cache()
torch.cuda.memory_summary(device=None, abbreviated=False)

args.global_step = 0
args.current_epoch = 0
for epoch in range(args.start_epoch, args.epochs):
    lr = optimizer.param_groups[0]["lr"]
    loss_epoch = train(args, train_loader_unlabel, model, criterion, optimizer) 

    if scheduler:
        scheduler.step()

    # save every 10 epochs
    if epoch % 50 == 0:
        torch.save(model.state_dict(), args.model_path)

    print(
        f"Epoch [{epoch+1}/{args.epochs}]\t Loss: {loss_epoch / len(train_loader_unlabel)}\t lr: {round(lr, 5)}"
    )
    args.current_epoch += 1

# end training
torch.save(model.state_dict(), args.model_path)

# Part 2:
## Linear evaluation using logistic regression, using weights from frozen, pre-trained SimCLR model

In [1]:
import torch
import torchvision
import numpy as np
import argparse

In [2]:
import torch.nn as nn
class LogisticRegression(nn.Module):
    def __init__(self, n_features, n_classes):
        super(LogisticRegression, self).__init__()

        self.model = nn.Linear(n_features, n_classes)

    def forward(self, x):
        return self.model(x)

In [75]:
def train(args, loader, simclr_model, model, criterion, optimizer):
    loss_epoch = 0
    accuracy_epoch = 0
    model.train()
    for step, (x, y) in enumerate(loader):
        optimizer.zero_grad()

        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        predicted = output.argmax(1)
        acc = (predicted == y).sum().item() / y.size(0)
        accuracy_epoch += acc

        loss.backward()
        optimizer.step()

        loss_epoch += loss.item()
        # if step % 100 == 0:
        #     print(
        #         f"Step [{step}/{len(loader)}]\t Loss: {loss.item()}\t Accuracy: {acc}"
        #     )

    return loss_epoch, accuracy_epoch

In [76]:
import torch.nn.functional as nnf

def test(args, loader, simclr_model, model, criterion, optimizer, percentage, results_save_path):
    loss_epoch = 0
    accuracy_epoch = 0
    model.eval()
    y_true = []
    y_pred = []
    y_probs = []

    for step, (x, y) in enumerate(loader):
        model.zero_grad()

        x = x.to(device)
        y = y.to(device)

        output = model(x)
        loss = criterion(output, y)

        predicted = output.argmax(1)
        acc = (predicted == y).sum().item() / y.size(0)
        accuracy_epoch += acc
        
        prob = nnf.softmax(output, dim=1)
        y_probs.extend(prob.cpu().detach().numpy())
        y_true.extend(y.cpu().numpy())
        y_pred.extend(predicted.cpu().numpy())

        loss_epoch += loss.item()
        
    np.savetxt("{}/{}_true.csv".format(results_save_path,round(percentage*100)),y_true,fmt='%d')
    np.savetxt("{}/{}_pred.csv".format(results_save_path,round(percentage*100)),y_pred,fmt='%d')
    np.savetxt("{}/{}_probs.csv".format(results_save_path,round(percentage*100)), y_probs, delimiter=",",fmt='%f')

    return loss_epoch, accuracy_epoch


In [77]:
from pprint import pprint
from utils import yaml_config_hook

parser = argparse.ArgumentParser(description="SimCLR")
config = yaml_config_hook("./config/config.yaml")
for k, v in config.items():
    parser.add_argument(f"--{k}", default=v, type=type(v))

args = parser.parse_args([])

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
#device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

### Load dataset into train/test dataloaders

Vuelvo a cargar el trainset porque sino se entrena la red con las transformaciones de data augmentation. 

In [48]:
from trans_2 import TransformsSimCLR2
train_dset_ = H5IFCBDataset(filestraining,classes,classattribute=classcolumn, verbose=1,trainingset=False,transform=TransformsSimCLR2(size=args.image_size))
pctg = 0.2
percentage = round(pctg*len(train_dset_))
train_dset_pctg_label, _ = torch.utils.data.random_split(train_dset_, [percentage, len(train_dset_)-percentage], generator=torch.Generator().manual_seed(0))


Loading samples: 100%|██████████| 164/164 [07:35<00:00,  2.78s/it]


In [63]:
test_dset = H5IFCBDataset(filestest,classes,classattribute=classcolumn, verbose=1,trainingset=False,transform=TransformsSimCLR2(size=args.image_size)) #.test_transform)

Loading samples: 100%|██████████| 122/122 [08:49<00:00,  4.34s/it]


In [64]:

train_loader_label = torch.utils.data.DataLoader(
    train_dset_pctg_label,
    batch_size=args.logistic_batch_size,
    shuffle=True,
    drop_last=True,
    num_workers=args.workers,
)

test_loader = torch.utils.data.DataLoader(
    test_dset,
    batch_size=args.logistic_batch_size,
    shuffle=False,
    drop_last=True,
    num_workers=args.workers,
)

### Load ResNet encoder / SimCLR and load model weights

In [79]:
encoder = get_resnet(args.resnet, pretrained=False) # don't load a pre-trained model from PyTorch repo
n_features = encoder.fc.in_features  # get dimensions of fc layer

# load pre-trained model from checkpoint
simclr_model = SimCLR(encoder, args.projection_dim, n_features)

simclr_model.load_state_dict(torch.load("model18_simclr_20.pt")) #, map_location=device.type)
simclr_model = simclr_model.to(device)
    

In [80]:
## Logistic Regression
n_classes = len(classes) 
model = LogisticRegression(simclr_model.n_features, n_classes)
model = model.to(device)

In [81]:
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = torch.nn.CrossEntropyLoss()

### Helper functions to map all input data $X$ to their latent representations $h$ that are used in linear evaluation (they only have to be computed once)

In [82]:
def inference(loader, simclr_model, device):
    feature_vector = []
    labels_vector = []
    for step, (x, y, _) in enumerate(loader):
        
        x = x[0].to(device)

        # get encoding
        with torch.no_grad():
            h, _, z, _ = simclr_model(x, x)

        h = h.detach()

        feature_vector.extend(h.cpu().detach().numpy())
        labels_vector.extend(y.numpy())

        if step % 20 == 0:
            print(f"Step [{step}/{len(loader)}]\t Computing features...")

    feature_vector = np.array(feature_vector)
    labels_vector = np.array(labels_vector)
    print("Features shape {}".format(feature_vector.shape))
    return feature_vector, labels_vector


def get_features(context_model, train_loader, test_loader, device):
    train_X, train_y = inference(train_loader, context_model, device)
    test_X, test_y = inference(test_loader, context_model, device)
    return train_X, train_y, test_X, test_y


def create_data_loaders_from_arrays(X_train, y_train, X_test, y_test, batch_size):
    train = torch.utils.data.TensorDataset(
        torch.from_numpy(X_train), torch.from_numpy(y_train)
    )
    train_loader = torch.utils.data.DataLoader(
        train, batch_size=batch_size, shuffle=False
    )

    test = torch.utils.data.TensorDataset(
        torch.from_numpy(X_test), torch.from_numpy(y_test)
    )
    test_loader = torch.utils.data.DataLoader(
        test, batch_size=batch_size, shuffle=False
    )
    return train_loader, test_loader

In [83]:
print("### Creating features from pre-trained context model ###")
(train_X, train_y, test_X, test_y) = get_features(
    simclr_model, train_loader_label, test_loader, device
)

arr_train_loader, arr_test_loader = create_data_loaders_from_arrays(
    train_X, train_y, test_X, test_y, args.logistic_batch_size
)

### Creating features from pre-trained context model ###
Step [0/315]	 Computing features...
Step [20/315]	 Computing features...
Step [40/315]	 Computing features...
Step [60/315]	 Computing features...
Step [80/315]	 Computing features...
Step [100/315]	 Computing features...
Step [120/315]	 Computing features...
Step [140/315]	 Computing features...
Step [160/315]	 Computing features...
Step [180/315]	 Computing features...
Step [200/315]	 Computing features...
Step [220/315]	 Computing features...
Step [240/315]	 Computing features...
Step [260/315]	 Computing features...
Step [280/315]	 Computing features...
Step [300/315]	 Computing features...
Features shape (80640, 512)
Step [0/1669]	 Computing features...
Step [20/1669]	 Computing features...
Step [40/1669]	 Computing features...
Step [60/1669]	 Computing features...
Step [80/1669]	 Computing features...
Step [100/1669]	 Computing features...
Step [120/1669]	 Computing features...
Step [140/1669]	 Computing features...
Step [1

In [86]:
args.logistic_epochs = 500

In [87]:
import matplotlib.pyplot as plt
import numpy as np

fig = plt.figure()
acc = np.zeros(int(args.logistic_epochs/10))
x_axe = np.linspace(0, args.logistic_epochs, int(args.logistic_epochs/10), endpoint=True)

for epoch in range(args.logistic_epochs):
    loss_epoch, accuracy_epoch = train(args, arr_train_loader, simclr_model, model, criterion, optimizer)

    if epoch % 10 == 0:
        print(f"Epoch [{epoch}/{args.logistic_epochs}]\t Loss: {loss_epoch / len(train_loader_label)}\t Accuracy: {accuracy_epoch / len(train_loader_label)}")
        acc[int(epoch/10)-1] = accuracy_epoch / len(train_loader_label)

# final testing
results_save_path = "model18_simclr_20"
if not os.path.isdir(results_save_path):
    os.mkdir(results_save_path)
else:
    print("Output directory already exists, will override everything there...")
    
loss_epoch, accuracy_epoch = test(args, arr_test_loader, simclr_model, model, criterion, optimizer, 0.2, results_save_path)

print(f"\n[FINAL]\t Loss: {loss_epoch / len(test_loader)}\t Accuracy: {accuracy_epoch / len(test_loader)}")

Epoch [0/500]	 Loss: 0.2674724187642809	 Accuracy: 0.9021825396825397
Epoch [10/500]	 Loss: 0.2657891698772945	 Accuracy: 0.9026785714285714
Epoch [20/500]	 Loss: 0.2642168392264654	 Accuracy: 0.9031125992063492
Epoch [30/500]	 Loss: 0.2627426349454456	 Accuracy: 0.9038938492063492
Epoch [40/500]	 Loss: 0.2613558163245519	 Accuracy: 0.9042286706349206
Epoch [50/500]	 Loss: 0.2600473162200716	 Accuracy: 0.9045882936507936
Epoch [60/500]	 Loss: 0.25880938224376193	 Accuracy: 0.9048983134920635
Epoch [70/500]	 Loss: 0.25763536554480354	 Accuracy: 0.9052083333333333
Epoch [80/500]	 Loss: 0.25651949625166637	 Accuracy: 0.905704365079365
Epoch [90/500]	 Loss: 0.2554567648777886	 Accuracy: 0.9060143849206349
Epoch [100/500]	 Loss: 0.2544427591183829	 Accuracy: 0.906312003968254
Epoch [110/500]	 Loss: 0.2534736036308228	 Accuracy: 0.906671626984127
Epoch [120/500]	 Loss: 0.25254586338996887	 Accuracy: 0.9070560515873016
Epoch [130/500]	 Loss: 0.25165646445183526	 Accuracy: 0.9073040674603174
E

<Figure size 432x288 with 0 Axes>