<a href="https://colab.research.google.com/github/pglez82/IFCB_semisupervised/blob/master/IFCB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Showing system info

In [0]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, 
and then re-execute this cell.
Your runtime has 13.7 gigabytes of available RAM



# Download SimCLR code
In this step we download the SimCLR code for **PyTorch** and install its dependencies



In [0]:
import os

if not os.path.isdir("SimCLR"):
  !git clone https://github.com/pglez82/SimCLR.git
 
%cd SimCLR
!sh setup.sh || python3 -m pip install -r requirements.txt || exit 1
!pip install  pyyaml --upgrade

/content/SimCLR
setup.sh: 2: setup.sh: conda: not found
setup.sh: 2: setup.sh: conda: not found
Requirement already up-to-date: pyyaml in /usr/local/lib/python3.6/dist-packages (5.3.1)


# Download the images
In this section, we **donwload** the data and **uncompress** it. The code has checks in order to ensure that already downloaded data is not redownloaded

In [0]:
if not os.path.isfile("IFCB_data.tar") and not os.path.isdir("data"):
  print("Data do not exist in local. Downloading...")
  !wget -O IFCB_data.tar https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/Ec2z0uC4lghEg-9MjzoJ9QkBK5n74QjS-LszB9dlNrPfaw?download=1
else:
  print("Data already exists. Skipping download.")

if not os.path.isdir("data"):
  print("Extracting the tar file...")
  !tar -xf "IFCB_data.tar"
  print("Done. Removing the tar file.")
  !rm -f IFCB_data.tar #Remove the original file to save space

Data already exists. Skipping download.


# Download CSV with information about the images


In [0]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
  print("CSV data do not exist. Downloading...")
  !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')
print(data)

                        Sample  roi_number  ...       AutoClass FunctionalGroup
0        IFCB1_2006_158_000036           1  ...             mix      Flagellate
1        IFCB1_2006_158_000036           2  ...     ciliate_mix         Ciliate
2        IFCB1_2006_158_000036           3  ...             mix      Flagellate
3        IFCB1_2006_158_000036           4  ...             mix      Flagellate
4        IFCB1_2006_158_000036           5  ...             mix      Flagellate
...                        ...         ...  ...             ...             ...
3457814  IFCB5_2014_353_205141        6850  ...  Leptocylindrus          Diatom
3457815  IFCB5_2014_353_205141        6852  ...             mix      Flagellate
3457816  IFCB5_2014_353_205141        6855  ...             mix      Flagellate
3457817  IFCB5_2014_353_205141        6856  ...             mix      Flagellate
3457818  IFCB5_2014_353_205141        6857  ...             mix      Flagellate

[3457819 rows x 5 columns]


# Create training set

Here we make a reestructuration of the images depending on which class we consider

In [0]:
import progressbar
from tqdm import tqdm
tqdm.pandas()

classcolumn = "AutoClass" #Autoclass means 51 classes
yearstraining = ['2006'] #Years to consider as training
yearsvalidation = ['2007']
trainingfolder = "training"
validationfolder = "validation"

classes = pd.unique(data[classcolumn])
print("Considering %i classes" % len(classes))

print("Computing image paths...")
#Compute data paths
data['year'] = data['Sample'].str[6:10].astype(str)
data['path']="data"+'/'+data['year']+'/'+data['OriginalClass'].astype(str)+'/'+data['Sample'].astype(str)+'_'+data['roi_number'].apply(lambda x: str(x).zfill(5))+'.png'
print('Done')

if not os.path.isdir(trainingfolder):
  print("Create folder structure for training set...")
  os.mkdir(trainingfolder)
  for folder in classes:
    os.mkdir(os.path.join(trainingfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['year'].isin(yearstraining)].progress_apply(lambda row: os.rename(row['path'],os.path.join(trainingfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")
else:
  print("Training data already there... Doing nothing")

if not os.path.isdir(validationfolder):
  print("Create folder structure for the validation set...")
  os.mkdir(validationfolder)
  for folder in classes:
    os.mkdir(os.path.join(validationfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['year'].isin(yearsvalidation)].progress_apply(lambda row: os.rename(row['path'],os.path.join(validationfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")  
else:
  print("Validation data already there... Doing nothing")


Considering 51 classes
Computing image paths...
Done
Training data already there... Doing nothing
Validation data already there... Doing nothing


# Lets configure SimCLR
Number of epocs, optimizer, resnet version to use ...
Things that we have to configure:


*   cuda:0 -> Change to cuda:1 to use second gpu
*   args.batch_size -> higher value its slower but better
*   args.resnet -> resnet18 | resnet50



In [0]:
import torch
from utils.yaml_config_hook import yaml_config_hook
import argparse

config = yaml_config_hook("./config/config.yaml")
args = argparse.Namespace(**config)

#Here we need to select which graphics card we want to use in case of having more than one
args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using %s" % args.device)

Using cpu


In [0]:
from pprint import pprint
args.dataset = "IFCB" #This value will be used only for the output dir
args.image_size = 128 #@param
args.batch_size =  128 #@param
args.resnet = "resnet18" #@param ['resnet18','resnet50']
#Means that we want to start training in this epoch. We should have a file checkpoint_{}.tar in the args.model_path dir
args.epoch_num =  10 #@param 
#How many epochs we want. If epochs = epoch num we just load the model and do nothing
args.epochs = 10 #@param  
#We want to save the checkpoints to google drive
args.out_dir = "../drive/My Drive/Colab Notebooks/{}_{}_b{}_s{}".format(args.dataset,args.resnet,args.batch_size,args.image_size)
args.model_path = args.out_dir #This is the directory from where we want to restore checkpoints
args.proportions = [1,10,100] #how many labeled data we are going to use for training

if not os.path.isdir(args.out_dir):
  raise SystemExit("The output folder {} does not exist!".format(args.out_dir))
pprint(vars(args))

{'batch_size': 128,
 'dataset': 'IFCB',
 'device': device(type='cpu'),
 'epoch_num': 10,
 'epochs': 10,
 'fp16': False,
 'fp16_opt_level': 'O2',
 'image_size': 128,
 'logistic_batch_size': 256,
 'logistic_epochs': 500,
 'model_path': '../drive/My Drive/Colab Notebooks/IFCB_resnet18_b128_s128',
 'normalize': True,
 'optimizer': 'Adam',
 'out_dir': '../drive/My Drive/Colab Notebooks/IFCB_resnet18_b128_s128',
 'pretrain': True,
 'projection_dim': 64,
 'proportions': [1, 10, 100],
 'resnet': 'resnet18',
 'seed': 42,
 'start_epoch': 0,
 'temperature': 0.5,
 'weight_decay': 1e-06,
 'workers': 16}


# Loading the training dataset

Use pytorch to load the training dataset

In [0]:
import torchvision
from torchvision import transforms
from modules.transformations import TransformsSimCLR

#This transform makes the magic and returns two augmented images from an original image
train_dataset = torchvision.datasets.ImageFolder(root=trainingfolder, transform=TransformsSimCLR(size=args.image_size))

train_sampler = None
train_loader = torch.utils.data.DataLoader(
  train_dataset,
  batch_size=args.batch_size,
  shuffle=(train_sampler is None),
  drop_last=True,
  num_workers=args.workers,
  sampler=train_sampler,
)

# Define the training function
This is the function that will do all the work for one epoch

In [0]:
import time

def train(args, train_loader, model, criterion, optimizer, writer):
  loss_epoch = 0
  start_time = time.time()
  for step, ((x_i, x_j), _) in enumerate(train_loader):
    optimizer.zero_grad()
    x_i = x_i.to(args.device)
    x_j = x_j.to(args.device)

    # positive pair, with encoding
    h_i, z_i = model(x_i)
    h_j, z_j = model(x_j)

    loss = criterion(z_i, z_j)

    #if apex and args.fp16:
    #    with amp.scale_loss(loss, optimizer) as scaled_loss:
    #        scaled_loss.backward()
    #else:
    loss.backward()

    optimizer.step()

    if step % 50 == 0:
      spent = time.time()-start_time
      print(f"Step [{step}/{len(train_loader)}]\t Loss: {loss.item()} \t Time: {spent} secs [{(args.batch_size*50)/spent} ej/sec]]")
      start_time = time.time()

    writer.add_scalar("Loss/Step", loss.item(), args.global_step)
    loss_epoch += loss.item()
    args.global_step += 1

  return loss_epoch

# Load the model
We only reload the model if **args.epoch_num** is different from zero. This case means that we want to continue training from a checkpoint (we should have the model in the **args.model_path** dir.

In [0]:
from model import load_model
model, optimizer, scheduler = load_model(args, train_loader,reload_model=(args.epoch_num!=0))

# Configure TensorBoard



In [0]:
from torch.utils.tensorboard import SummaryWriter

tb_dir = os.path.join(args.out_dir, "colab")
if not os.path.exists(tb_dir):
  os.makedirs(tb_dir)
writer = SummaryWriter(log_dir=tb_dir)

# Load the loss function
This function tries to minimize the difference between the two augmented variations of the image and maximize the difference between these and the rest of the batch

In [0]:
from modules import NT_Xent

criterion = NT_Xent(args.batch_size, args.temperature, args.device)

# Training the CNN
We make a checkpoint each 5 epochs just in case

In [0]:
from model import save_model

args.global_step = 0
if args.epoch_num!=0: #If we have loaded a model trained til an epoch, lets start training in the next
  args.start_epoch=args.epoch_num+1
args.current_epoch = args.start_epoch #Variable for controlling in which epoch we are

for epoch in range(args.start_epoch, args.epochs):
  lr = optimizer.param_groups[0]['lr']
  loss_epoch = train(args, train_loader, model, criterion, optimizer, writer)

  if scheduler:
    scheduler.step()

  if epoch % 5 == 0:
    save_model(args, model, optimizer)

  writer.add_scalar("Loss/train epoch", loss_epoch / len(train_loader), epoch)
  writer.add_scalar("Misc/learning_rate", lr, epoch)
  print(f"Epoch [{epoch+1}/{args.epochs}]\t Loss: {loss_epoch / len(train_loader)}\t lr: {round(lr, 5)}")
  args.current_epoch += 1

## end training
if args.start_epoch!=args.epochs:
  save_model(args, model, optimizer)

In [0]:
#!tensorboard dev upload --logdir "$tb_dir" --name "IFCBv3" --description "Training with 2006 image size 128 batch size 256"

# **Trainining the classifier using the deep features**
Now we will be trying to find out if the network has learnt something useful from the unlabeled data. We will train a Logistic Regression classifier with the labeled examples and testing against a validation dataset

# Define train and test functions

In [0]:
def train(args, loader, simclr_model, model, criterion, optimizer):
  loss_epoch = 0
  accuracy_epoch = 0
  for step, (x, y) in enumerate(loader):
    optimizer.zero_grad()

    x = x.to(args.device)
    y = y.to(args.device)

    output = model(x)
    loss = criterion(output, y)

    predicted = output.argmax(1)
    acc = (predicted == y).sum().item() / y.size(0)
    accuracy_epoch += acc

    loss.backward()
    optimizer.step()

    loss_epoch += loss.item()

  return loss_epoch, accuracy_epoch

In [0]:
def test(args, loader, simclr_model, model, criterion, optimizer):
  loss_epoch = 0
  accuracy_epoch = 0
  model.eval()
  for step, (x, y) in enumerate(loader):
    model.zero_grad()

    x = x.to(args.device)
    y = y.to(args.device)

    output = model(x)
    loss = criterion(output, y)

    predicted = output.argmax(1)
    acc = (predicted == y).sum().item() / y.size(0)
    accuracy_epoch += acc

    loss_epoch += loss.item()

  return loss_epoch, accuracy_epoch

# Load data
We have to load the data again because before the data loader was doing the special agumentation for the contrastive learning. Now we only want to resize the images.




In [0]:
train_dataset = torchvision.datasets.ImageFolder(root=trainingfolder, transform=TransformsSimCLR(size=args.image_size).test_transform,)
test_dataset = torchvision.datasets.ImageFolder(root=validationfolder, transform=TransformsSimCLR(size=args.image_size).test_transform,)

train_loader = torch.utils.data.DataLoader(
  train_dataset,
  batch_size=args.logistic_batch_size,
  shuffle=False,
  drop_last=True,
  num_workers=args.workers,
)

test_loader = torch.utils.data.DataLoader(
  test_dataset,
  batch_size=args.logistic_batch_size,
  shuffle=False,
  drop_last=True,
  num_workers=args.workers,
)

# Load de pretrained CNN and its weights

In [0]:
simclr_model, _, _ = load_model(args, train_loader, reload_model=True)
simclr_model = simclr_model.to(args.device)
simclr_model.eval()

SimCLR(
  (encoder): ResNet(
    (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (relu): ReLU(inplace=True)
    (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (layer1): Sequential(
      (0): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (1): BasicBlock(
        (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_runn

# Configure Logistic Regression

In [0]:
from modules import LogisticRegression

print("Configuring Logistic Regression. %i features and %i classes" % (simclr_model.n_features,len(classes)))
model = LogisticRegression(simclr_model.n_features, len(classes))
model = model.to(args.device)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
criterion = torch.nn.CrossEntropyLoss()

Configuring Logistic Regression. 512 features and 51 classes


# Compute deep features for both training and validation sets
Here we use our CNN pretrained using contrastive learning with unlabelled data for computing the features from all the examples.

In [0]:
import numpy as np

#This function computes the deep features
def inference(loader, context_model, device):
  feature_vector = []
  labels_vector = []
  for step, (x, y) in enumerate(loader):
    x = x.to(device)

    # get encoding
    with torch.no_grad():
      h, z = context_model(x)

    h = h.detach()

    feature_vector.extend(h.cpu().detach().numpy())
    labels_vector.extend(y.numpy())

    if step % 20 == 0:
      print(f"Step [{step}/{len(loader)}]")

  feature_vector = np.array(feature_vector)
  labels_vector = np.array(labels_vector)
  print("Features shape {}".format(feature_vector.shape))
  return feature_vector, labels_vector

def get_features(context_model, train_loader, test_loader, device):
  print("Computing deep features for training set...")
  train_X, train_y = inference(train_loader, context_model, device)
  print("Computing deep features for validation set...")
  test_X, test_y = inference(test_loader, context_model, device)
  return train_X, train_y, test_X, test_y

def create_data_loaders_from_arrays(X_train, y_train, X_test, y_test, batch_size, proportions):
  #We want to create multiple train loaders with different labelled data proportions
  train_loaders = []
  for p in proportions:
    X_train_sub,_,y_train_sub,_ = train_test_split(X_train, y_train, train_size=p, stratify=y_train, random_state=42)
    train = torch.utils.data.TensorDataset(torch.from_numpy(X_train_sub), torch.from_numpy(y_train_sub))
    train_loaders.append(torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=False))

  test = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
  test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
  return train_loader, test_loader

(train_X, train_y, test_X, test_y) = get_features(simclr_model, train_loader, test_loader, args.device)
print("Done")

#We create the data loaders from the arrays with the deep features
arr_train_loader, arr_test_loader = create_data_loaders_from_arrays(train_X, train_y, test_X, test_y, args.logistic_batch_size,args.proportions)


Computing deep features for training set...
Step [0/511]


# Lets train the classifier and see how it works!
We are going to train the classifier using different labeled data proportions

In [0]:
for epoch in range(args.logistic_epochs):
  loss_epoch, accuracy_epoch = train(args, arr_train_loader, simclr_model, model, criterion, optimizer)
  
  if epoch % 10 == 0:
    print(f"Epoch [{epoch}/{args.logistic_epochs}]\t Loss: {loss_epoch / len(train_loader)}\t Accuracy: {accuracy_epoch / len(train_loader)}")


# final testing
loss_epoch, accuracy_epoch = test(args, arr_test_loader, simclr_model, model, criterion, optimizer)
print(f"[FINAL]\t Loss: {loss_epoch / len(test_loader)}\t Accuracy: {accuracy_epoch / len(test_loader)}")