<a href="https://colab.research.google.com/github/pglez82/IFCB_semisupervised/blob/master/IFCB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Showing system info

In [1]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

Thu Jun 11 16:14:01 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 440.82       Driver Version: 418.67       CUDA Version: 10.1     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   70C    P8    30W / 149W |      0MiB / 11441MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID   Type   Process name                             Usage      |
|  No ru

# Download SimCLR code
In this step we download the SimCLR code for **PyTorch** and install its dependencies



In [2]:
import os

if not os.path.isdir("SimCLR"):
  !git clone https://github.com/spijkervet/SimCLR.git
 
%cd SimCLR
!sh setup.sh || python3 -m pip install -r requirements.txt || exit 1
!pip install  pyyaml --upgrade

/content/SimCLR
setup.sh: 2: setup.sh: conda: not found
setup.sh: 2: setup.sh: conda: not found
Requirement already up-to-date: pyyaml in /usr/local/lib/python3.6/dist-packages (5.3.1)


# Use of Google TPU [Optional]

In [0]:
use_tpu = False

if use_tpu:
  assert os.environ['COLAB_TPU_ADDR'] #make sure we are in a TPU enviroment
  VERSION = "20200325"  #@param ["1.5" , "20200325", "nightly"]
  !curl https://raw.githubusercontent.com/pytorch/xla/master/contrib/scripts/env-setup.py -o pytorch-xla-env-setup.py
  !python pytorch-xla-env-setup.py --version $VERSION

  # imports the torch_xla package for TPU support
  import torch_xla
  import torch_xla.core.xla_model as xm
  dev = xm.xla_device()
  print(dev)

# Download the images
In this section, we **donwload** the data and **uncompress** it. The code has checks in order to ensure that already downloaded data is not redownloaded

In [4]:
if not os.path.isfile("IFCB_data.tar") and not os.path.isdir("data"):
  print("Data do not exist in local. Downloading...")
  !wget -O IFCB_data.tar https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/Ec2z0uC4lghEg-9MjzoJ9QkBK5n74QjS-LszB9dlNrPfaw?download=1
else:
  print("Data already exists. Skipping download.")

if not os.path.isdir("data"):
  print("Extracting the tar file...")
  !tar -xf "IFCB_data.tar"
  print("Done. Removing the tar file.")
  !rm -f IFCB_data.tar #Remove the original file to save space

Data already exists. Skipping download.


# Download CSV with information about the images


In [5]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
  print("CSV data do not exist. Downloading...")
  !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')
print(data)

                        Sample  roi_number  ...       AutoClass FunctionalGroup
0        IFCB1_2006_158_000036           1  ...             mix      Flagellate
1        IFCB1_2006_158_000036           2  ...     ciliate_mix         Ciliate
2        IFCB1_2006_158_000036           3  ...             mix      Flagellate
3        IFCB1_2006_158_000036           4  ...             mix      Flagellate
4        IFCB1_2006_158_000036           5  ...             mix      Flagellate
...                        ...         ...  ...             ...             ...
3457814  IFCB5_2014_353_205141        6850  ...  Leptocylindrus          Diatom
3457815  IFCB5_2014_353_205141        6852  ...             mix      Flagellate
3457816  IFCB5_2014_353_205141        6855  ...             mix      Flagellate
3457817  IFCB5_2014_353_205141        6856  ...             mix      Flagellate
3457818  IFCB5_2014_353_205141        6857  ...             mix      Flagellate

[3457819 rows x 5 columns]


# Create training set

Here we make a reestructuration of the images depending on which class we consider

In [6]:
import progressbar
from tqdm import tqdm
tqdm.pandas()

classcolumn = "AutoClass" #Autoclass means 51 classes
yearstraining = ['2006'] #Years to consider as training
trainingfolder = "training"

classes = pd.unique(data[classcolumn])
print("Considering %i classes" % len(classes))

print("Creating training set...")

if not os.path.isdir(trainingfolder):
  print("Create folder structure...")
  os.mkdir(trainingfolder)
  for folder in classes:
    os.mkdir(os.path.join(trainingfolder,folder))
  print("Done.\nMoving images to the respective folders...")

  #Compute data paths
  data['year'] = data['Sample'].str[6:10].astype(str)
  data['path']="data"+'/'+data['year']+'/'+data['OriginalClass'].astype(str)+'/'+data['Sample'].astype(str)+'_'+data['roi_number'].apply(lambda x: str(x).zfill(5))+'.png'
  #Move images to the training directory following the structure
  data[data['year'].isin(yearstraining)].progress_apply(lambda row: os.rename(row['path'],os.path.join(trainingfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")
else:
  print("Data structure already created.")


Considering 51 classes
Creating training set...
Data structure already created.


# Loading the training dataset

Use pytorch to load the training dataset

In [0]:
import torch
import torchvision
from torchvision import transforms
from modules.transformations import TransformsSimCLR

#This transform makes the magic and returns two augmented images from an original image
train_dataset = torchvision.datasets.ImageFolder(root=trainingfolder, transform=TransformsSimCLR(size=64))

In [0]:
import time

def train(args, train_loader, model, criterion, optimizer, writer):
  loss_epoch = 0
  start_time = time.time()
  for step, ((x_i, x_j), _) in enumerate(train_loader):
    optimizer.zero_grad()
    x_i = x_i.to(args.device)
    x_j = x_j.to(args.device)

    # positive pair, with encoding
    h_i, z_i = model(x_i)
    h_j, z_j = model(x_j)

    loss = criterion(z_i, z_j)

    #if apex and args.fp16:
    #    with amp.scale_loss(loss, optimizer) as scaled_loss:
    #        scaled_loss.backward()
    #else:
    loss.backward()

    if use_tpu:
      xm.optimizer_step(optimizer)
      xm.mark_step()
    else:
      optimizer.step()

    if step % 50 == 0:
      spent = time.time()-start_time
      print(f"Step [{step}/{len(train_loader)}]\t Loss: {loss.item()} \t Time: {spent} secs [{(args.batch_size*50)/spent} ej/sec]]")
      start_time = time.time()

    writer.add_scalar("Loss/train_epoch", loss.item(), args.global_step)
    loss_epoch += loss.item()
    args.global_step += 1

  return loss_epoch

# Lets configure SimCLR
Number of epocs, optimizer, resnet version to use ...
Things that we have to configure:


*   cuda:0 -> Change to cuda:1 to use second gpu
*   args.batch_size -> higher value its slower but better
*   args.resnet -> resnet18 | resnet50



In [9]:
from utils.yaml_config_hook import yaml_config_hook
import argparse

config = yaml_config_hook("./config/config.yaml")
args = argparse.Namespace(**config)

#Here we need to select which graphics card we want to use in case of having more than one
if use_tpu:
  args.device = dev
else:
  args.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using %s" % args.device)

Using cuda:0


In [10]:
from pprint import pprint
args.batch_size = 64 #@param
args.resnet = "resnet18" #@param ['resnet18','resnet50']
args.epoch_num =  0#@param #Means that we want to start training in this epoch. We should have a file checkpoint_{}.tar in the args.model_path dir
#We want to save the checkpoints to google drive
args.out_dir = "../drive/My Drive/Colab Notebooks/IFCBv2" #change to local directory
args.model_path = args.out_dir #This is the directory from where we want to restore checkpoints
if not os.path.isdir(args.out_dir):
  raise SystemExit("The output folder does not exist!")
pprint(vars(args))

{'batch_size': 64,
 'dataset': 'CIFAR10',
 'device': device(type='cuda', index=0),
 'epoch_num': 0,
 'epochs': 100,
 'fp16': False,
 'fp16_opt_level': 'O2',
 'logistic_batch_size': 256,
 'logistic_epochs': 500,
 'model_path': '../drive/My Drive/Colab Notebooks/IFCBv2',
 'normalize': True,
 'optimizer': 'Adam',
 'out_dir': '../drive/My Drive/Colab Notebooks/IFCBv2',
 'pretrain': True,
 'projection_dim': 64,
 'resnet': 'resnet18',
 'seed': 42,
 'start_epoch': 0,
 'temperature': 0.5,
 'weight_decay': 1e-06,
 'workers': 16}


# Prepare the model for the data

In [0]:
train_sampler = None
train_loader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=args.batch_size,
    shuffle=(train_sampler is None),
    drop_last=True,
    num_workers=args.workers,
    sampler=train_sampler,
)

# Load the model
We only reload the model if **args.epoch_num** is different from zero. This case means that we want to continue training from a checkpoint (we should have the model in the **args.model_path** dir.

In [0]:
from model import load_model
model, optimizer, scheduler = load_model(args, train_loader,reload_model=(args.epoch_num!=0))

# Configure TensorBoard



In [0]:
from torch.utils.tensorboard import SummaryWriter

tb_dir = os.path.join(args.out_dir, "colab")
if not os.path.exists(tb_dir):
  os.makedirs(tb_dir)
writer = SummaryWriter(log_dir=tb_dir)

# Load the loss function
This function tries to minimize the difference between the two augmented variations of the image and maximize the difference between these and the rest of the batch

In [0]:
from modules import NT_Xent

criterion = NT_Xent(args.batch_size, args.temperature, args.device)

# Training the CNN
We make a checkpoint each 5 epochs just in case

In [0]:
from model import save_model

args.global_step = 0
if args.epoch_num!=0: #If we have loaded a model trained til an epoch, lets start training in the next
  args.start_epoch=args.epoch_num+1
args.current_epoch = args.start_epoch #Variable for controlling in which epoch we are

for epoch in range(args.start_epoch, args.epochs):
    lr = optimizer.param_groups[0]['lr']
    loss_epoch = train(args, train_loader, model, criterion, optimizer, writer)

    if scheduler:
        scheduler.step()

    if epoch % 5 == 0:
        save_model(args, model, optimizer)

    writer.add_scalar("Loss/train", loss_epoch / len(train_loader), epoch)
    writer.add_scalar("Misc/learning_rate", lr, epoch)
    print(
        f"Epoch [{epoch}/{args.epochs}]\t Loss: {loss_epoch / len(train_loader)}\t lr: {round(lr, 5)}"
    )
    args.current_epoch += 1

## end training
save_model(args, model, optimizer)

Step [0/2046]	 Loss: 4.853462219238281 	 Time: 5.148331642150879 secs [621.5605797032723 ej/sec]]
Step [50/2046]	 Loss: 4.8086700439453125 	 Time: 9.259265422821045 secs [345.5997699464422 ej/sec]]
Step [100/2046]	 Loss: 4.5811238288879395 	 Time: 9.154468536376953 secs [349.5560651373933 ej/sec]]
Step [150/2046]	 Loss: 4.57763671875 	 Time: 9.30465316772461 secs [343.91394738924356 ej/sec]]
Step [200/2046]	 Loss: 4.797720432281494 	 Time: 9.267889976501465 secs [345.27816019757796 ej/sec]]
Step [250/2046]	 Loss: 4.810873031616211 	 Time: 9.477900743484497 secs [337.6275070405029 ej/sec]]
Step [300/2046]	 Loss: 4.839878559112549 	 Time: 9.210098505020142 secs [347.4447095496078 ej/sec]]
Step [350/2046]	 Loss: 4.455430507659912 	 Time: 9.244152069091797 secs [346.1647943567839 ej/sec]]
Step [400/2046]	 Loss: 4.581033229827881 	 Time: 9.174200057983398 secs [348.8042532073798 ej/sec]]
Step [450/2046]	 Loss: 4.482931137084961 	 Time: 9.113032102584839 secs [351.14547649759135 ej/sec]]
Ste

In [0]:
!tensorboard dev upload --logdir "$tb_dir" --name "IFCB" --description "Training with 2006"