<a href="https://colab.research.google.com/github/pglez82/IFCB_semisupervised/blob/master/IFCB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Showing system info

In [None]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Select the Runtime > "Change runtime type" menu to enable a GPU accelerator, ')
  print('and then re-execute this cell.')
else:
  print(gpu_info)

from psutil import virtual_memory
ram_gb = virtual_memory().total / 1e9
print('Your runtime has {:.1f} gigabytes of available RAM\n'.format(ram_gb))

# Download SimCLR code
In this step we download the SimCLR code for **PyTorch** and install its dependencies



In [None]:
import os

# Clone the SimCLR fork only once; later runs reuse the existing checkout.
if not os.path.isdir("SimCLR"):
  !git clone https://github.com/pglez82/SimCLR.git

# NOTE: %cd changes the kernel's working directory for every later cell. The data
# download cell undoes it with `%cd ..`, so this cell must be run exactly once
# before that one (re-running it alone would try to enter SimCLR/SimCLR).
%cd SimCLR
# Try the repository's setup script first; fall back to a plain pip install of requirements.txt.
!sh setup.sh || python3 -m pip install -r requirements.txt || exit 1
# Upgrade pyyaml after the requirements have been installed.
!pip install  pyyaml --upgrade

# Download the images
In this section, we **download** the data and **uncompress** it. The code checks what is already present so that data that has already been downloaded is not downloaded again.

In [None]:
%cd ..
if not os.path.isfile("IFCB_data.tar") and not os.path.isdir("data"):
  print("Data do not exist in local. Downloading...")
  !wget -O IFCB_data.tar https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/Ec2z0uC4lghEg-9MjzoJ9QkBK5n74QjS-LszB9dlNrPfaw?download=1
else:
  print("Data already exists. Skipping download.")

if not os.path.isdir("data"):
  print("Extracting the tar file...")
  !tar -xf "IFCB_data.tar"
  print("Done. Removing the tar file.")
  !rm -f IFCB_data.tar #Remove the original file to save space

# Download CSV with information about the images


In [None]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
  print("CSV data do not exist. Downloading...")
  !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')
print(data)

# Create training set

Here we reorganize the images into one folder per class, according to the class column that we want to consider.

In [None]:
import progressbar
from tqdm import tqdm
from shutil import copyfile
import numpy as np

# Enable DataFrame.progress_apply (tqdm progress bars for pandas).
tqdm.pandas()

classcolumn = "AutoClass" #Autoclass means 51 classes
#nclasses = 51 #Pick this number of most abundant classes to make sure that we do not have empty classes
yearstraining = ['2012'] #Years to consider as training
yearsvalidation = ['2013']
trainingfolder = "training"
validationfolder = "validation"

classes=np.unique(data[classcolumn])
#In order to ensure that all tests have same number of classes, 
#classes=list(pd.crosstab(index=data[classcolumn],columns='count').sort_values('count',ascending=False)[0:nclasses].index)
#print("Considering %i classes" % len(classes))
#print(classes)

print("Computing image paths...")
#Compute data paths
# The year is taken from characters 6..9 of the sample identifier.
data['year'] = data['Sample'].str[6:10].astype(str)
# Source image path: data/<year>/<OriginalClass>/<Sample>_<roi_number zero-padded to 5 digits>.png
data['path']="data"+'/'+data['year']+'/'+data['OriginalClass'].astype(str)+'/'+data['Sample'].astype(str)+'_'+data['roi_number'].apply(lambda x: str(x).zfill(5))+'.png'
print('Done')

#data[classcolumn][~data[classcolumn].isin(classes)]='mix' #Put ignored classes into mix category

#Check data by year
print(pd.crosstab(index=data['year'],columns='count'))


def _copy_years_into_class_folders(frame, years, folder, class_names, label_column):
  """Create `folder/<class>/` for every class and copy the images of `years` into it.

  The tree is built in a staging directory (`<folder>.partial`) and renamed to
  `folder` only after every image has been copied. An interrupted run therefore
  never leaves a half-filled `folder` that later runs would mistake for a
  complete split.

  Args:
    frame: DataFrame with 'year', 'path' and `label_column` columns.
    years: list of year strings whose rows are copied.
    folder: destination root folder (must not exist yet).
    class_names: names of the class sub-folders to create.
    label_column: column whose value selects the class sub-folder of each image.
  """
  import shutil

  staging = folder + ".partial"
  # Discard the leftovers of a previously interrupted run.
  if os.path.isdir(staging):
    shutil.rmtree(staging)
  os.mkdir(staging)
  for class_name in class_names:
    os.mkdir(os.path.join(staging, class_name))
  print("Done.\nMoving images to the respective folders...")
  selected = frame[frame['year'].isin(years)]
  selected.progress_apply(
    lambda row: copyfile(row['path'], os.path.join(staging, row[label_column], os.path.basename(row['path']))),
    axis=1,
  )
  # Publish the finished tree in a single step.
  os.rename(staging, folder)
  print("Done")


if not os.path.isdir(trainingfolder):
  print("Create folder structure for training set... Using years:")
  print(yearstraining)
  _copy_years_into_class_folders(data, yearstraining, trainingfolder, classes, classcolumn)
else:
  print("Training data already there... Doing nothing")

if not os.path.isdir(validationfolder):
  print("Create folder structure for the validation set... Using years:")
  print(yearsvalidation)
  _copy_years_into_class_folders(data, yearsvalidation, validationfolder, classes, classcolumn)
else:
  print("Validation data already there... Doing nothing")

# Let's configure SimCLR
Number of epochs, optimizer, ResNet version to use, ...
Things that we have to configure:


*   cuda:0 -> Change to cuda:1 to use second gpu
*   args.batch_size -> higher values are slower but give better results
*   args.resnet -> resnet18 | resnet50



In [None]:
import torch
from SimCLR.utils.yaml_config_hook import yaml_config_hook
import argparse
import random

torch.manual_seed(0) #Reproducible
random.seed(0) #it seems that the transforms uses this random

# Load the default SimCLR hyper-parameters from the YAML file and expose them as attributes of `args`.
config = yaml_config_hook("./SimCLR/config/config.yaml")
args = argparse.Namespace(**config)

#Here we need to select which graphics card we want to use in case of having more than one
gpu_index = 1  # preferred GPU; set to 0 to use the first card
if torch.cuda.is_available():
  # Single-GPU runtimes (e.g. Colab) only expose cuda:0; asking for a missing
  # ordinal would fail later when tensors are moved to the device.
  if gpu_index >= torch.cuda.device_count():
    gpu_index = 0
  args.device = torch.device("cuda:%d" % gpu_index)
else:
  args.device = torch.device("cpu")
print("Using %s" % args.device)

In [None]:
from pprint import pprint
# Experiment configuration. Lines tagged with "#@param" are rendered by Colab as form fields,
# so those annotations must stay on the same line as the assignment.
args.dataset = "IFCB_12_13" #This value will be used only for the output dir
args.image_size = 224 #@param
args.batch_size =  256 #@param
args.resnet = "resnet18" #@param ['resnet18','resnet50']
#Means that we want to start training in this epoch. We should have a file checkpoint_{}.tar in the args.model_path dir
args.epoch_num =  0 #@param 
#How many epochs we want. If epochs = epoch num we just load the model and do nothing
args.epochs = 100 #@param  
#We want to save the checkpoints to google drive
# The folder name encodes dataset, backbone, batch size and image size.
args.out_dir = "drive/My Drive/Colab Notebooks/{}_{}_b{}_s{}".format(args.dataset,args.resnet,args.batch_size,args.image_size)
args.model_path = args.out_dir #This is the directory from where we want to restore checkpoints
args.proportions = [0.01,0.05,0.08,0.09,0.1,0.15,0.2,0.3,0.4,0.5,0.8,1] #how many labeled data we are going to use for training
# Settings of the logistic-regression classifier trained later on the deep features.
args.logistic_epochs = 500
args.logistic_batch_size = 512
# Number of DataLoader worker processes.
args.workers = 4

# Stop early when the output folder is missing. The path is relative, and this notebook
# does not create it (NOTE(review): presumably Google Drive must be mounted first).
if not os.path.isdir(args.out_dir):
  raise SystemExit("The output folder {} does not exist!".format(args.out_dir))
pprint(vars(args))

# Loading the training dataset

Use pytorch to load the training dataset

In [None]:
import torchvision
from torchvision import transforms
from SimCLR.modules.transformations import TransformsSimCLR

#This transform makes the magic and returns two augmented images from an original image
train_dataset = torchvision.datasets.ImageFolder(root=trainingfolder, transform=TransformsSimCLR(size=args.image_size))

# ImageFolder lists the samples class by class, so without shuffling almost every
# batch would contain images of a single class, and the contrastive loss would
# treat same-class images as its in-batch negatives. Shuffling stays reproducible
# because torch.manual_seed() is set in the configuration cell.
train_loader = torch.utils.data.DataLoader(
  train_dataset,
  batch_size=args.batch_size,
  shuffle=True,
  drop_last=True,  # keep every batch at exactly args.batch_size samples
  num_workers=args.workers,
)

# Show some example pictures

In [None]:
import matplotlib.pyplot as plt
import torchvision.transforms as T

# Same images as the training set, but only resized and center-cropped (still PIL images) for display.
train_dataset_display = torchvision.datasets.ImageFolder(root=trainingfolder,
                                                         transform=T.Compose([T.Resize(size=args.image_size),
                                                                              T.CenterCrop(size=args.image_size)]))
fig, axs = plt.subplots(nrows=4, ncols=8, constrained_layout=False, figsize=(16,8))
for ax in axs.flat:
  index = random.randint(0, len(train_dataset_display)-1)
  # Fetch the sample once: every indexing operation loads and transforms the image again.
  image, label = train_dataset_display[index]
  ax.imshow(image)
  ax.set_title(train_dataset_display.classes[label])
# Compute the layout after the titles exist so that they are taken into account.
fig.tight_layout()
plt.show()

# Define the training function
This is the function that will do all the work for one epoch

In [None]:
import time

def train(args, train_loader, model, criterion, optimizer, writer):
  """Run one epoch of contrastive training.

  Args:
    args: namespace providing `device`, `batch_size` and the running `global_step`
      counter (incremented once per batch).
    train_loader: iterable yielding `((x_i, x_j), label)` batches, i.e. two
      augmented views of the same images; the labels are ignored.
    model: callable returning an `(h, z)` pair for a batch; only `z` is used here.
    criterion: loss taking the two `z` outputs.
    optimizer: optimizer updated once per batch.
    writer: object with `add_scalar(tag, value, step)` (TensorBoard writer).

  Returns:
    The sum of the per-batch losses of the epoch.
  """
  report_every = 50
  loss_epoch = 0
  batches_since_report = 0
  start_time = time.time()
  for step, ((x_i, x_j), _) in enumerate(train_loader):
    optimizer.zero_grad()
    x_i = x_i.to(args.device)
    x_j = x_j.to(args.device)

    # positive pair, with encoding
    _h_i, z_i = model(x_i)
    _h_j, z_j = model(x_j)

    loss = criterion(z_i, z_j)

    loss.backward()

    optimizer.step()

    # Read the scalar once and reuse it below.
    batch_loss = loss.item()
    batches_since_report += 1

    if step % report_every == 0:
      spent = time.time()-start_time
      # Throughput over the batches actually processed since the last report
      # (only one batch at step 0, `report_every` afterwards).
      rate = (args.batch_size*batches_since_report)/spent if spent > 0 else float("inf")
      print(f"Step [{step}/{len(train_loader)}]\t Loss: {batch_loss} \t Time: {spent} secs [{rate} ej/sec]]")
      batches_since_report = 0
      start_time = time.time()

    writer.add_scalar("Loss/Step", batch_loss, args.global_step)
    loss_epoch += batch_loss
    args.global_step += 1

  return loss_epoch

# Load the model
We only reload the model if **args.epoch_num** is different from zero. This case means that we want to continue training from a checkpoint (the model must be available in the **args.model_path** directory).

In [None]:
from SimCLR.model import load_model
# Build the network, its optimizer and (optionally) a learning-rate scheduler.
# A checkpoint is restored only when args.epoch_num is non-zero, i.e. when resuming.
# NOTE(review): the checkpoint is presumably read from args.model_path — confirm in SimCLR.model.
model, optimizer, scheduler = load_model(args, train_loader,reload_model=(args.epoch_num!=0))

# Configure TensorBoard



In [None]:
from torch.utils.tensorboard import SummaryWriter

# TensorBoard logs are written to a "colab" sub-folder of the output directory.
tb_dir = os.path.join(args.out_dir, "colab")
# exist_ok=True makes this idempotent without a separate existence check.
os.makedirs(tb_dir, exist_ok=True)
writer = SummaryWriter(log_dir=tb_dir)

# Load the loss function
This function tries to minimize the difference between the two augmented variations of the image and maximize the difference between these and the rest of the batch

In [None]:
from SimCLR.modules import NT_Xent

# Contrastive loss used for the SimCLR training. args.temperature is not set anywhere in
# this notebook, so it has to come from the YAML configuration loaded into `args`.
# NOTE(review): the loss is built for a fixed args.batch_size; presumably this is why the
# training loader uses drop_last=True — confirm in SimCLR.modules.
criterion = NT_Xent(args.batch_size, args.temperature, args.device)

# Training the CNN
We save a checkpoint every 5 epochs, just in case.

In [None]:
from SimCLR.model import save_model

args.global_step = 0
if args.epoch_num!=0: #If we have loaded a model trained til an epoch, lets start training in the next
  args.start_epoch=args.epoch_num+1
  # Continue the per-step TensorBoard axis where the loaded run stopped instead of
  # writing the resumed steps on top of the existing ones starting at 0.
  args.global_step = args.start_epoch*len(train_loader)
args.current_epoch = args.start_epoch #Variable for controlling in which epoch we are

for epoch in range(args.start_epoch, args.epochs):
  # Learning rate in effect during this epoch (read before the scheduler updates it).
  lr = optimizer.param_groups[0]['lr']
  loss_epoch = train(args, train_loader, model, criterion, optimizer, writer)

  if scheduler:
    scheduler.step()

  # Periodic checkpoint every 5 epochs.
  if epoch % 5 == 0:
    save_model(args, model, optimizer)

  writer.add_scalar("Loss/train epoch", loss_epoch / len(train_loader), epoch)
  writer.add_scalar("Misc/learning_rate", lr, epoch)
  print(f"Epoch [{epoch+1}/{args.epochs}]\t Loss: {loss_epoch / len(train_loader)}\t lr: {round(lr, 5)}")
  args.current_epoch += 1

## end training
# Save the final model only when at least one epoch was actually run. When a finished
# model is merely reloaded (args.epoch_num == args.epochs) start_epoch is epochs + 1,
# so comparing with "!=" would write a spurious extra checkpoint.
if args.start_epoch < args.epochs:
  save_model(args, model, optimizer)

In [None]:
#!tensorboard dev upload --logdir "$tb_dir" --name "IFCBv3" --description "Training with 2006 image size 128 batch size 256"

# **Training the classifier using the deep features**
Now we will try to find out whether the network has learnt something useful from the unlabeled data. We will train a logistic regression classifier with the labeled examples and test it against a validation dataset.

# Define train and test functions

In [None]:
def train(args, loader, model, criterion, optimizer):
  """Train a classifier for one epoch.

  NOTE: this definition replaces the contrastive `train` function defined earlier
  in the notebook; it takes no TensorBoard writer.

  Args:
    args: namespace providing the target `device`.
    loader: iterable of `(inputs, targets)` batches.
    model: classifier mapping inputs to per-class scores.
    criterion: loss taking `(scores, targets)`.
    optimizer: optimizer updated once per batch.

  Returns:
    `(loss_sum, accuracy_sum)`: the per-batch losses and per-batch accuracies summed
    over the epoch (divide by the number of batches to obtain averages).
  """
  total_loss = 0
  total_accuracy = 0
  for inputs, targets in loader:
    inputs, targets = inputs.to(args.device), targets.to(args.device)

    optimizer.zero_grad()
    scores = model(inputs)
    batch_loss = criterion(scores, targets)

    # Fraction of samples in this batch whose highest-scoring class is the target.
    hits = (scores.argmax(1) == targets).sum().item()
    total_accuracy += hits / targets.size(0)

    batch_loss.backward()
    optimizer.step()

    total_loss += batch_loss.item()

  return total_loss, total_accuracy

In [None]:
def make_preds(args, model, loader):
  """
  Collect the ground-truth labels and the model predictions for a whole loader.

  The model is switched to eval mode and run without gradient tracking, since
  no backward pass is needed here.

  Args:
    args: namespace providing the target `device`.
    model: classifier mapping inputs to per-class scores.
    loader: iterable of `(inputs, targets)` batches.

  Returns:
    `(y_true, y_pred)`: two lists with one entry per sample, in loader order.
    The predicted class is the arg-max of the model output.
  """
  # Set the model to eval mode
  model.eval()
  y_true = []
  y_pred = []
  with torch.no_grad():
    for x, y in loader:
      x = x.to(args.device)
      # Run the model forward and keep the highest-scoring category.
      output = model(x)
      predicted = output.argmax(1)
      # The targets are only collected, so they never need to leave the CPU.
      y_true.extend(y.cpu().numpy())
      y_pred.extend(predicted.cpu().numpy())
  return y_true,y_pred

# Load data
We have to load the data again because, until now, the data loader was applying the special augmentation used for contrastive learning. Now we only want to resize the images.




In [None]:
# Re-create both datasets with the evaluation transform of TransformsSimCLR instead of the
# two-view contrastive augmentation. These names replace the contrastive
# train_dataset / train_loader defined earlier in the notebook.
train_dataset = torchvision.datasets.ImageFolder(
  root=trainingfolder,
  transform=TransformsSimCLR(size=args.image_size).test_transform,
)
test_dataset = torchvision.datasets.ImageFolder(
  root=validationfolder,
  transform=TransformsSimCLR(size=args.image_size).test_transform,
)

# Both loaders iterate in dataset order (no shuffling) with the classifier batch size.
feature_loader_options = dict(
  batch_size=args.logistic_batch_size,
  shuffle=False,
  num_workers=args.workers,
)
train_loader = torch.utils.data.DataLoader(train_dataset, **feature_loader_options)
test_loader = torch.utils.data.DataLoader(test_dataset, **feature_loader_options)

In [None]:
def print_class_distribution(labels,class_to_idx):
  """Print how many samples each class has.

  Args:
    labels: sequence of integer class indices, one per sample.
    class_to_idx: mapping from class name to class index.
  """
  # Invert the mapping so that the indices can be shown with their class names.
  idx_to_class = dict((idx, name) for name, idx in class_to_idx.items())
  named_labels = pd.DataFrame(data=labels).replace(idx_to_class)
  print(pd.crosstab(index=named_labels[0],columns='count'))


# Class index of every image, in dataset order, for both splits.
labels_train, labels_test = list(train_dataset.targets), list(test_dataset.targets)

print("Printing distribution of training set...")
print_class_distribution(labels_train, train_dataset.class_to_idx)

print("Printing distribution of testing set...")
print_class_distribution(labels_test, test_dataset.class_to_idx)


# Load the pretrained CNN and its weights

In [None]:
# Restore the pretrained network from a checkpoint; the optimizer and scheduler returned
# by load_model are not needed for feature extraction.
# NOTE(review): the restored checkpoint is presumably the one selected by args.epoch_num
# in args.model_path (see the configuration cell) — make sure it points at the trained model.
simclr_model, _, _ = load_model(args, train_loader, reload_model=True)
simclr_model = simclr_model.to(args.device)
# Evaluation mode: the network is only used to compute features from here on.
simclr_model.eval()

# Compute deep features for both training and validation sets
Here we use our CNN pretrained using contrastive learning with unlabelled data for computing the features from all the examples.

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split

def create_balanced_splits(y,proportions):
  """
  Build one index subset per proportion, keeping the class distribution of `y`.

  For each proportion `p` and each class present in `y`, the first
  `round(count * p)` samples of that class are selected, with a minimum of one
  sample per class.

  Args:
    y: array of class labels.
    proportions: iterable of fractions of the data to keep.

  Returns:
    dict mapping each proportion to the list of selected positions in `y`.
  """
  #Classes without any example do not appear in y, so they are not returned by np.unique
  labels, totals = np.unique(y, return_counts=True)
  subsets = {}
  for fraction in proportions:
    quota = np.rint(totals*fraction)
    quota[quota==0]=1  # never leave a class without examples
    print("Building subset with %d elements"%sum(quota))
    chosen = []
    for label, take in zip(labels, quota):
      members, = np.where(y==label)
      chosen.extend(members[0:int(take)])
    subsets[fraction] = chosen
  return subsets

def inference(loader, context_model, device):
  """Compute the deep features of every sample served by `loader`.

  Args:
    loader: iterable of `(inputs, labels)` batches.
    context_model: callable returning an `(h, z)` pair; `h` is kept as the feature.
    device: device on which the model is evaluated.

  Returns:
    `(features, labels)` as numpy arrays, in loader order.
  """
  collected_features = []
  collected_labels = []
  for batch_idx, (inputs, targets) in enumerate(loader):
    # get encoding (no gradients are needed for feature extraction)
    with torch.no_grad():
      encoding, _projection = context_model(inputs.to(device))

    collected_features.extend(encoding.detach().cpu().numpy())
    collected_labels.extend(targets.numpy())

    if batch_idx % 20 == 0:
      print(f"Step [{batch_idx}/{len(loader)}]")

  feature_vector = np.array(collected_features)
  labels_vector = np.array(collected_labels)
  print("Features shape {}".format(feature_vector.shape))
  return feature_vector, labels_vector

def get_features(context_model, train_loader, test_loader, device):
  """Compute the deep features of the training and validation loaders.

  Returns:
    `(train_X, train_y, test_X, test_y)`.
  """
  stages = (
    ("Computing deep features for training set...", train_loader),
    ("Computing deep features for validation set...", test_loader),
  )
  extracted = []
  for banner, loader in stages:
    print(banner)
    features, labels = inference(loader, context_model, device)
    extracted += [features, labels]
  return tuple(extracted)

def create_data_loaders_from_arrays(X_train, y_train, X_test, y_test, batch_size, proportions):
  """Wrap precomputed feature arrays into data loaders.

  One shuffled training loader is created per labeled-data proportion, using the
  index subsets returned by `create_balanced_splits`; the test loader always
  contains the whole test set, in order.

  Args:
    X_train, y_train: training features and labels (numpy arrays).
    X_test, y_test: test features and labels (numpy arrays).
    batch_size: batch size of every loader.
    proportions: fractions of the training data to use.

  Returns:
    `(train_loaders, test_loader)`, where `train_loaders` maps each proportion to its loader.
  """
  #We want to create multiple train loaders with different labelled data proportions
  # Use the `proportions` argument, not the global `args`, so the function honours its own parameter.
  indexes = create_balanced_splits(y_train,proportions)
  train_loaders = {}
  for p in proportions:
    d = torch.utils.data.TensorDataset(torch.from_numpy(X_train[indexes[p]]), torch.from_numpy(y_train[indexes[p]]))
    train_loaders[p]=torch.utils.data.DataLoader(d, batch_size=batch_size, shuffle=True)
  test = torch.utils.data.TensorDataset(torch.from_numpy(X_test), torch.from_numpy(y_test))
  test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
  return train_loaders, test_loader

# Run the pretrained network once over both loaders and keep the resulting features as numpy arrays.
(train_X, train_y, test_X, test_y) = get_features(simclr_model, train_loader, test_loader, args.device)
print("Done")

#We create the data loaders from the arrays with the deep features
# arr_train_loaders maps each labeled-data proportion to its own training loader.
arr_train_loaders, arr_test_loader = create_data_loaders_from_arrays(train_X, train_y, test_X, test_y, args.logistic_batch_size,args.proportions)


# Let's train the classifier and see how it works!
We are going to train the classifier using different labeled data proportions

In [None]:
from SimCLR.modules import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """Plot a confusion matrix with one annotated cell per (true, predicted) pair.

    Args:
        cm: square confusion matrix (numpy array), rows = true labels,
            columns = predicted labels.
        target_names: tick labels for both axes, or None for no tick labels.
        title: title of the figure.
        cmap: matplotlib colormap; defaults to 'Blues'.
        normalize: when True, every row is divided by its total, so the image and
            the cell texts both show per-class proportions. Rows without samples
            are shown as zeros. When False, raw counts are shown.

    The overall accuracy and misclassification rate (computed from the raw
    counts) are shown below the x axis.
    """
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    # Accuracy is always computed on the raw counts, before any normalization.
    accuracy = np.trace(cm) / np.sum(cm).astype('float')
    misclass = 1 - accuracy

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalize BEFORE drawing so that the colors, the text color threshold and
    # the printed numbers all refer to the same matrix.
    if normalize:
        row_totals = cm.sum(axis=1)[:, np.newaxis]
        # Avoid 0/0 for classes without true samples: their rows stay at zero.
        row_totals = np.where(row_totals == 0, 1, row_totals)
        cm = cm.astype('float') / row_totals

    plt.figure(figsize=(35,35),facecolor='white')
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # Cells darker than the threshold get white text so that it stays readable.
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        if normalize:
            plt.text(j, i, "{:0.4f}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")
        else:
            plt.text(j, i, "{:,}".format(cm[i, j]),
                     horizontalalignment="center",
                     color="white" if cm[i, j] > thresh else "black")

    plt.ylabel('True label')
    plt.xlabel('Predicted label\naccuracy={:0.4f}; misclass={:0.4f}'.format(accuracy, misclass))
    # Lay out the figure once the axis labels exist so that they are not clipped.
    plt.tight_layout()
    plt.show()

# Train one logistic-regression classifier per labeled-data proportion and evaluate
# each of them on the full validation set.
for p in args.proportions:
    model = LogisticRegression(simclr_model.n_features, len(train_dataset.classes))
    model = model.to(args.device)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    #optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
    criterion = torch.nn.CrossEntropyLoss()
    arr_train_loader = arr_train_loaders[p]
    # Report the real number of samples: batches * batch_size overstates it whenever
    # the last batch is not full.
    print("Training dataset with proportion %f and %d labeled examples in training set"%(p,len(arr_train_loader.dataset)))
    for epoch in range(args.logistic_epochs):
        loss_epoch, accuracy_epoch = train(args, arr_train_loader, model, criterion, optimizer)
        if epoch % 10 == 0:
            print(f"Epoch [{epoch}/{args.logistic_epochs}]\t Loss: {loss_epoch / len(arr_train_loader)}\t Accuracy: {accuracy_epoch / len(arr_train_loader)}")

    # final testing
    y_true,y_pred = make_preds(args,model,arr_test_loader)
    # TODO (from the original author): this is not quite right; we still need a proper way
    # to obtain the names of the labels that exist in the training subset and in the test set.
    # Only the labels that appear in y_true or y_pred are reported. They are passed
    # explicitly so that matrix rows, report rows and names are guaranteed to line up.
    labelswithexamples=np.union1d(np.unique(y_true),np.unique(y_pred))  
    labelswithexamples_names = np.array(list(test_dataset.class_to_idx.keys()))[labelswithexamples]
    #Build the confusion matrix here
    cm=confusion_matrix(y_true, y_pred, labels=labelswithexamples)
    plot_confusion_matrix(cm=cm,target_names=labelswithexamples_names,normalize=False)
    print(classification_report(y_true, y_pred,labels=labelswithexamples,target_names=labelswithexamples_names))