<a href="https://colab.research.google.com/github/pglez82/IFCB_semisupervised/blob/master/IFCB_FT_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the data
We are going to finetune a resnet and extract features with it

In [1]:
import os

if not os.path.isfile("IFCB_data.tar") and not os.path.isdir("data"):
  print("Data do not exist in local. Downloading...")
  !wget -O IFCB_data.tar https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/Ec2z0uC4lghEg-9MjzoJ9QkBK5n74QjS-LszB9dlNrPfaw?download=1
else:
  print("Data already exists. Skipping download.")

if not os.path.isdir("data"):
  print("Extracting the tar file...")
  !tar -xf "IFCB_data.tar"
  print("Done. Removing the tar file.")
  !rm -f IFCB_data.tar #Remove the original file to save space

Data already exists. Skipping download.


# Download CSV with information about the images


In [5]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
  print("CSV data do not exist. Downloading...")
  !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')
data['year'] = data['Sample'].str[6:10].astype(str) #Compute the year
samples=data.groupby('Sample').first()
samples=samples[['year']]
print(data)
print("Number of samples: {}".format(len(pd.unique(data['Sample']))))
print("FunctionalGroup: {}".format(len(pd.unique(data['FunctionalGroup']))))
print("AutoClass: {}".format(len(pd.unique(data['AutoClass']))))
print("OriginalClass: {}".format(len(pd.unique(data['OriginalClass']))))
print("Years: {}".format(pd.unique(data['year'])))

                        Sample  roi_number        OriginalClass  \
0        IFCB1_2006_158_000036           1                  mix   
1        IFCB1_2006_158_000036           2  Tontonia_gracillima   
2        IFCB1_2006_158_000036           3                  mix   
3        IFCB1_2006_158_000036           4                  mix   
4        IFCB1_2006_158_000036           5                  mix   
...                        ...         ...                  ...   
3457814  IFCB5_2014_353_205141        6850       Leptocylindrus   
3457815  IFCB5_2014_353_205141        6852                  mix   
3457816  IFCB5_2014_353_205141        6855                  mix   
3457817  IFCB5_2014_353_205141        6856                  mix   
3457818  IFCB5_2014_353_205141        6857                  mix   

              AutoClass FunctionalGroup  year  
0                   mix      Flagellate  2006  
1           ciliate_mix         Ciliate  2006  
2                   mix      Flagellate  2006  
3  

# Create training set

Here we make a reestructuration of the images depending on which years we consider

In [3]:
import progressbar
from tqdm import tqdm
from shutil import copyfile
import numpy as np

tqdm.pandas()

classcolumn = "AutoClass" #Autoclass means 51 classes
yearstraining = ['2006','2007','2008'] #Years to consider as training
samplesvalidation = ['IFCB1_2009_001_001602'] #Samples to consider for validation
sampleproduction = 'IFCB1_2009_001_001602' #Sample to consider for production. This will be used in other notebook
trainingfolder = "training"
validationfolder = "validation"
productionfolder = "production" #This is where images are going when we have no labels and we want to predict them

classes=np.unique(data[classcolumn])

print("Computing image paths...")
#Compute data paths
data['path']="data"+'/'+data['year']+'/'+data['OriginalClass'].astype(str)+'/'+data['Sample'].astype(str)+'_'+data['roi_number'].apply(lambda x: str(x).zfill(5))+'.png'
print('Done')

#Check data by year
print(pd.crosstab(index=data['year'],columns='count'))

if not os.path.isdir(trainingfolder):
  print("Create folder structure for training set... Using years:")
  print(yearstraining)
  os.mkdir(trainingfolder)
  for folder in classes:
    os.mkdir(os.path.join(trainingfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['year'].isin(yearstraining)].progress_apply(lambda row: copyfile(row['path'],os.path.join(trainingfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")
else:
  print("Training data already there... Doing nothing")

if not os.path.isdir(validationfolder):
  print("Create folder structure for the validation set... Using samples:")
  print(samplesvalidation)
  os.mkdir(validationfolder)
  for folder in classes:
    os.mkdir(os.path.join(validationfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['Sample'].isin(samplesvalidation)].progress_apply(lambda row: copyfile(row['path'],os.path.join(validationfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")  
else:
  print("Validation data already there... Doing nothing")

if not os.path.isdir(productionfolder):
  print("Create folder structure for production ... Using sample:")
  print(sampleproduction)
  os.mkdir(productionfolder)
  print("Done.\nMoving images to the same folder...")
  data[data['Sample']==sampleproduction].progress_apply(lambda row: copyfile(row['path'],os.path.join(productionfolder,os.path.basename(row['path']))),axis=1)
  print("Done")  
else:
  print("Production data already there... Doing nothing")


Computing image paths...
Done
col_0   count
year         
2006   131002
2007   273080
2008   427308
2009   732398
2010   327996
2011   419692
2012   394766
2013   422255
2014   329322
Training data already there... Doing nothing
Validation data already there... Doing nothing
Production data already there... Doing nothing


# Configure the process

In [4]:
import torch
import random
import numpy as np

torch.manual_seed(0) #Reproducible
random.seed(0) #it seems that the transforms uses this random
np.random.seed(0)

num_workers = 4 # @param
batch_size = 256 # @param 
batch_size_val = 512 # @param 

num_epochs_ft1 = 10 # @param
num_epochs_ft2 = 10 # @param

is_trained=False #Will take true if the network is already trained
model_save_path="model.pt" #Where to save the model once trained

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using %s"%device)

Using cuda:0


# Prepare de DataLoaders for the CNN
In this step it is important to consider that we have to use images with the same size than the original network (so we can reuse the weights)

In [5]:
import torchvision
import torchvision.transforms as T
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

#Define transofrmations
train_transform = T.Compose([
  T.Resize(size=256),
  T.RandomResizedCrop(size=224),
  T.RandomHorizontalFlip(),
  T.ToTensor(),            
  #T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

val_transform = T.Compose([
  T.Resize(size=256),
  T.CenterCrop(size=224),
  T.ToTensor(),
  #T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])
  

#Define data loader
num_classes = 51
train_dset = ImageFolder(trainingfolder, transform=train_transform)
train_loader = DataLoader(train_dset,batch_size=batch_size,num_workers=num_workers,shuffle=True)
val_dset = ImageFolder(validationfolder, transform=val_transform)
val_loader = DataLoader(val_dset,batch_size=batch_size_val,num_workers=num_workers)

# Define how to load the CNN
In this step we download a pretrained CNN with the weights from ImageNet. We change the last layer to match the number of classes that we have in our problem. In the case that model_trained_path is true, that means that we have already trained the network so we load the weights.

In [6]:
import torch.nn as nn

def load_network():
  global is_trained
  model = torchvision.models.resnet18(pretrained=True)
  print("Adjusting the CNN for %s classes" % num_classes)
  model.fc = nn.Linear(model.fc.in_features, num_classes)
  #Define loss function
  loss_fn = nn.CrossEntropyLoss()
  if os.path.isfile(model_save_path):
    model.load_state_dict(torch.load(model_save_path))
    is_trained=True
  model = model.to(device) #Send model to gpu
  return model,loss_fn

# Define finetuning util functions

In [7]:
import time
import torch.nn.functional as nnf

def run_epoch(model, loss_fn, loader, optimizer, device):
  """
  Train the model for one epoch.
  """
  loss_epoch = 0 
  start_time = time.time()
  # Set the model to training mode
  model.train()
  for step, (x, y) in enumerate(loader):
    
    x = x.to(device)
    y = y.to(device)

    # Run the model forward to compute scores and loss.
    scores = model(x)
    loss = loss_fn(scores, y)
    loss_epoch = loss_epoch + loss.item()
    # Run the model backward and take a step using the optimizer.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 50== 0:
      spent = time.time()-start_time
      print(f"Step [{step}/{len(loader)}]\t Loss: {loss.item()} \t Time: {spent} secs [{(batch_size*50)/spent} ej/sec]]")
      start_time = time.time()
  return loss_epoch

def make_preds(model, loader, device):
  """
  Check the accuracy of the model.
  """
  with torch.no_grad():
    # Set the model to eval mode
    model.eval()
    y_true = []
    y_pred = []
    y_probs = []
    for x, y in loader:
      x = x.to(device)
      y = y.to(device)
      # Run the model forward, and compare the argmax score with the ground-truth
      # category.
      output = model(x)
      predicted = output.argmax(1)
      prob = nnf.softmax(output, dim=1)
      y_probs.extend(prob.cpu().detach().numpy())
      y_true.extend(y.cpu().numpy())
      y_pred.extend(predicted.cpu().numpy())
  return y_true,y_pred,y_probs

# Define the finetuning
First we only update the last layer for a few epochs, then we update all the weights with a small learning rate

In [8]:
def finetune(model,loss_fn,train_loader,device):
  for param in model.parameters():
    param.requires_grad = False
  for param in model.fc.parameters():
    param.requires_grad = True

  optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)

  #First phase of finetuning
  for epoch in range(num_epochs_ft1):
    # Run an epoch over the training data.
    print('Starting epoch %d / %d' % (epoch + 1,num_epochs_ft1))
    loss_epoch = run_epoch(model, loss_fn, train_loader, optimizer, device)

    # Check accuracy on the train and val sets.
    print(f"Epoch [{epoch+1}/{num_epochs_ft1}]\t Loss: {loss_epoch / len(train_loader)}")

  #Allow updating all the weights in the second phase
  for param in model.parameters():
    param.requires_grad = True

  #Lower learning rate this time
  optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

  # Train the entire model for a few more epochs, checking accuracy on the
  # train sets after each epoch.
  for epoch in range(num_epochs_ft2):
    print('Starting epoch %d / %d' % (epoch + 1, num_epochs_ft2))
    loss_epoch = run_epoch(model, loss_fn, train_loader, optimizer, device)

    print(f"Epoch [{epoch+1}/{num_epochs_ft2}]\t Loss: {loss_epoch / len(train_loader)}")
    
  torch.save(model.state_dict(), "model.pt")
  print("Fine tune done and model saved.")
 


# Execute everything and print the results

In [None]:
from sklearn.metrics import classification_report

print('Starting process...')

model,loss_fn = load_network()

if not is_trained:
  finetune(model,loss_fn,train_loader,device)
else:
  print("Model was trained already")
    
print("Performing final validation in test examples...")
y_true,y_pred,y_probs = make_preds(model, val_loader, device)
labelswithexamples=np.union1d(np.unique(y_true),np.unique(y_pred))
labelswithexamples_names = np.array(list(val_dset.class_to_idx.keys()))[labelswithexamples]
print(classification_report(y_true, y_pred,target_names=labelswithexamples_names))

Starting process...
Adjusting the CNN for 51 classes
Starting epoch 1 / 10
Step [0/3248]	 Loss: 4.740101337432861 	 Time: 1.3697588443756104 secs [9344.710605489641 ej/sec]]
Step [50/3248]	 Loss: 0.9621421098709106 	 Time: 8.74581789970398 secs [1463.5566560828167 ej/sec]]
Step [100/3248]	 Loss: 0.8258771300315857 	 Time: 8.986668109893799 secs [1424.3321154709101 ej/sec]]
Step [150/3248]	 Loss: 0.8370218873023987 	 Time: 97.4044599533081 secs [131.4108204710115 ej/sec]]


# Preparing things for quantification
For quantification we need the training set classified with probabilities. As this takes a long time is better to do it here and save the results to a file. We can use this later to fit the quantifier.

In [None]:
torch.cuda.empty_cache()
y_true,y_pred,y_probs = make_preds(model, train_loader, device)
print("Printing results to check that everything is working...")
labelswithexamples=np.union1d(np.unique(y_true),np.unique(y_pred))
labelswithexamples_names = np.array(list(train_dset.class_to_idx.keys()))[labelswithexamples]
#We save it to disk so we can use it later
np.savetxt('results/classes.csv',labelswithexamples_names,fmt = '%s')
print(classification_report(y_true, y_pred,target_names=labelswithexamples_names))
print("Saving results to a csv...")
np.savetxt("results/traintrue.csv",y_true,fmt='%d')
np.savetxt("results/trainpred.csv",y_pred,fmt='%d')
np.savetxt("results/trainprobs.csv", y_probs, delimiter=",",fmt='%f')
print("Done.")