<a href="https://colab.research.google.com/github/pglez82/IFCB_semisupervised/blob/master/IFCB_FT_Baseline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load the data
We are going to finetune a resnet18 and extract features with it

In [1]:
import os


if not os.path.isfile("IFCB_data.tar") and not os.path.isdir("data"):
  print("Data do not exist in local. Downloading...")
  !wget -O IFCB_data.tar https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/Ec2z0uC4lghEg-9MjzoJ9QkBK5n74QjS-LszB9dlNrPfaw?download=1
else:
  print("Data already exists. Skipping download.")

if not os.path.isdir("data"):
  print("Extracting the tar file...")
  !tar -xf "IFCB_data.tar"
  print("Done. Removing the tar file.")
  !rm -f IFCB_data.tar #Remove the original file to save space

Data already exists. Skipping download.


# Download CSV with information about the images


In [2]:
import pandas as pd

if not os.path.isfile('IFCB.csv.zip'):
  print("CSV data do not exist. Downloading...")
  !wget -O IFCB.csv.zip "https://unioviedo-my.sharepoint.com/:u:/g/personal/gonzalezgpablo_uniovi_es/EfsVLhFsYJpPjO0KZlpWUq0BU6LaqJ989Re4XzatS9aG4Q?download=1"

data = pd.read_csv('IFCB.csv.zip',compression='infer', header=0,sep=',',quotechar='"')
print(data)

                        Sample  roi_number        OriginalClass  \
0        IFCB1_2006_158_000036           1                  mix   
1        IFCB1_2006_158_000036           2  Tontonia_gracillima   
2        IFCB1_2006_158_000036           3                  mix   
3        IFCB1_2006_158_000036           4                  mix   
4        IFCB1_2006_158_000036           5                  mix   
...                        ...         ...                  ...   
3457814  IFCB5_2014_353_205141        6850       Leptocylindrus   
3457815  IFCB5_2014_353_205141        6852                  mix   
3457816  IFCB5_2014_353_205141        6855                  mix   
3457817  IFCB5_2014_353_205141        6856                  mix   
3457818  IFCB5_2014_353_205141        6857                  mix   

              AutoClass FunctionalGroup  
0                   mix      Flagellate  
1           ciliate_mix         Ciliate  
2                   mix      Flagellate  
3                   mix    

# Create training set

Here we make a reestructuration of the images depending on which class we consider

In [3]:
import progressbar
from tqdm import tqdm
tqdm.pandas()

classcolumn = "AutoClass" #Autoclass means 51 classes
yearstraining = ['2006'] #Years to consider as training
yearsvalidation = ['2007']
trainingfolder = "training"
validationfolder = "validation"

classes = pd.unique(data[classcolumn])
print("Considering %i classes" % len(classes))

print("Computing image paths...")
#Compute data paths
data['year'] = data['Sample'].str[6:10].astype(str)
data['path']="data"+'/'+data['year']+'/'+data['OriginalClass'].astype(str)+'/'+data['Sample'].astype(str)+'_'+data['roi_number'].apply(lambda x: str(x).zfill(5))+'.png'
print('Done')

if not os.path.isdir(trainingfolder):
  print("Create folder structure for training set...")
  os.mkdir(trainingfolder)
  for folder in classes:
    os.mkdir(os.path.join(trainingfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['year'].isin(yearstraining)].progress_apply(lambda row: os.rename(row['path'],os.path.join(trainingfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")
else:
  print("Training data already there... Doing nothing")

if not os.path.isdir(validationfolder):
  print("Create folder structure for the validation set...")
  os.mkdir(validationfolder)
  for folder in classes:
    os.mkdir(os.path.join(validationfolder,folder))
  print("Done.\nMoving images to the respective folders...")
  data[data['year'].isin(yearsvalidation)].progress_apply(lambda row: os.rename(row['path'],os.path.join(validationfolder,row[classcolumn],os.path.basename(row['path']))),axis=1)
  print("Done")  
else:
  print("Validation data already there... Doing nothing")


Considering 51 classes
Computing image paths...


  from pandas import Panel


Done
Training data already there... Doing nothing
Validation data already there... Doing nothing


# Configure the process

In [4]:
import torch

num_workers = 16 # @param
batch_size = 256 # @param 
train_dir = './training'
num_epochs_ft1 = 10 # @param
num_epochs_ft2 = 10 # @param

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Using %s"%device)

Using cuda:0


# Prepare de DataLoaders for the CNN
In this step it is important to consider that we have to use images with the same size than the original network (so we can reuse the weights)

In [5]:
import torchvision
import torchvision.transforms as T
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

IMAGENET_MEAN = [0.485, 0.456, 0.406]
IMAGENET_STD = [0.229, 0.224, 0.225]

#Define transofrmations
train_transform = T.Compose([
  T.Resize(size=256),
  T.RandomResizedCrop(size=224),
  T.RandomHorizontalFlip(),
  T.ToTensor(),            
  T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
])

#Define data loader
train_dset = ImageFolder(train_dir, transform=train_transform)
train_loader = DataLoader(train_dset,batch_size=batch_size,num_workers=num_workers,shuffle=True)


# Load the CNN
In this step we download a pretrained CNN with the weights from ImageNet. We change the last layer to match the number of classes that we have in our problem

In [6]:
import torch.nn as nn

model = torchvision.models.resnet18(pretrained=True)
print("Adjusting the CNN for %s classes" % len(train_dset.classes))
num_classes = len(train_dset.classes)
model.fc = nn.Linear(model.fc.in_features, num_classes)
#Define loss function
loss_fn = nn.CrossEntropyLoss()
model = model.to(device)

Adjusting the CNN for 51 classes


# Perform finetuning
First we only update the last layer for a few epochs, then we update all the weights with a small learning rate

In [None]:
import time

def run_epoch(model, loss_fn, loader, optimizer, device):
  """
  Train the model for one epoch.
  """
  start_time = time.time()
  # Set the model to training mode
  model.train()
  for step, (x, y) in enumerate(loader):
    
    x = x.to(device)
    y = y.to(device)

    # Run the model forward to compute scores and loss.
    scores = model(x)
    loss = loss_fn(scores, y)

    # Run the model backward and take a step using the optimizer.
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if step % 50== 0:
      spent = time.time()-start_time
      print(f"Step [{step}/{len(loader)}]\t Loss: {loss.item()} \t Time: {spent} secs [{(batch_size*50)/spent} ej/sec]]")
      start_time = time.time()

def check_accuracy(model, loader, device):
  """
  Check the accuracy of the model.
  """
  # Set the model to eval mode
  model.eval()
  num_correct, num_examples = 0, 0
  for x, y in loader:
    x = x.to(device)

    # Run the model forward, and compare the argmax score with the ground-truth
    # category.
    scores = model(x)
    _, preds = scores.data.cpu().max(1)
    num_correct += (preds == y).sum()
    num_examples += x.size(0)

  # Return the fraction of datapoints that were correctly classified.
  acc = float(num_correct) / num_examples
  return acc

for param in model.parameters():
  param.requires_grad = False
for param in model.fc.parameters():
  param.requires_grad = True

optimizer = torch.optim.Adam(model.fc.parameters(), lr=1e-3)

#First phase of finetuning
for epoch in range(num_epochs_ft1):
  # Run an epoch over the training data.
  print('Starting epoch %d / %d' % (epoch + 1,num_epochs_ft1))
  run_epoch(model, loss_fn, train_loader, optimizer, device)

  # Check accuracy on the train and val sets.
  train_acc = check_accuracy(model, train_loader, device)
  print('Train accuracy: ', train_acc)

#Allow updating all the weights in the second phase
for param in model.parameters():
  param.requires_grad = True

#Lower learning rate this time
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

# Train the entire model for a few more epochs, checking accuracy on the
# train sets after each epoch.
for epoch in range(num_epochs_ft2):
  print('Starting epoch %d / %d' % (epoch + 1, num_epochs_ft2))
  run_epoch(model, loss_fn, train_loader, optimizer, device)

  train_acc = check_accuracy(model, train_loader, device)
  print('Train accuracy: ', train_acc)



Starting epoch 1 / 10
Step [0/512]	 Loss: 3.6819233894348145 	 Time: 2.361470937728882 secs [5420.350424600294 ej/sec]]
Step [50/512]	 Loss: 1.003257393836975 	 Time: 6.679912805557251 secs [1916.1926768492003 ej/sec]]
Step [100/512]	 Loss: 0.7287249565124512 	 Time: 6.611286163330078 secs [1936.0831892281444 ej/sec]]
Step [150/512]	 Loss: 0.8700928092002869 	 Time: 6.640687704086304 secs [1927.5112112445227 ej/sec]]
Step [200/512]	 Loss: 0.707870602607727 	 Time: 6.644641399383545 secs [1926.364303299726 ej/sec]]
Step [250/512]	 Loss: 0.6451399326324463 	 Time: 6.66452693939209 secs [1920.6164393068777 ej/sec]]
Step [300/512]	 Loss: 0.7302033305168152 	 Time: 6.6765053272247314 secs [1917.1706413242186 ej/sec]]
Step [350/512]	 Loss: 0.7266544103622437 	 Time: 6.684989929199219 secs [1914.7373646878905 ej/sec]]
Step [400/512]	 Loss: 0.7897779941558838 	 Time: 6.69628381729126 secs [1911.507987004317 ej/sec]]
Step [450/512]	 Loss: 0.6071487665176392 	 Time: 6.702399730682373 secs [1909.

Step [50/512]	 Loss: 0.4708089530467987 	 Time: 6.817950963973999 secs [1877.3968993961826 ej/sec]]
Step [100/512]	 Loss: 0.6480696797370911 	 Time: 6.759070873260498 secs [1893.7514105138282 ej/sec]]
Step [150/512]	 Loss: 0.6515858173370361 	 Time: 6.764707326889038 secs [1892.1735089885226 ej/sec]]
Step [200/512]	 Loss: 0.6472584009170532 	 Time: 6.768350601196289 secs [1891.1549880021923 ej/sec]]
Step [250/512]	 Loss: 0.5166594982147217 	 Time: 6.777374029159546 secs [1888.6370952714428 ej/sec]]
Step [300/512]	 Loss: 0.5609667301177979 	 Time: 6.772695302963257 secs [1889.9418071265684 ej/sec]]
Step [350/512]	 Loss: 0.5524332523345947 	 Time: 6.765438795089722 secs [1891.9689302769384 ej/sec]]
Step [400/512]	 Loss: 0.7188730835914612 	 Time: 6.759503126144409 secs [1893.630309969405 ej/sec]]
Step [450/512]	 Loss: 0.6563782691955566 	 Time: 6.755563497543335 secs [1894.7346146113093 ej/sec]]
Step [500/512]	 Loss: 0.572431206703186 	 Time: 6.736461400985718 secs [1900.1073765711817 ej

Step [100/512]	 Loss: 0.4388822913169861 	 Time: 18.6731379032135 secs [685.4766492029827 ej/sec]]
Step [150/512]	 Loss: 0.3754936754703522 	 Time: 18.68896174430847 secs [684.896259895128 ej/sec]]
Step [200/512]	 Loss: 0.475953608751297 	 Time: 18.706515789031982 secs [684.253558725506 ej/sec]]
Step [250/512]	 Loss: 0.48445141315460205 	 Time: 18.67817258834839 secs [685.2918795698865 ej/sec]]
Step [300/512]	 Loss: 0.40292614698410034 	 Time: 18.697928190231323 secs [684.567823224785 ej/sec]]
Step [350/512]	 Loss: 0.3928326368331909 	 Time: 18.71051263809204 secs [684.1073917953992 ej/sec]]
Step [400/512]	 Loss: 0.4390897750854492 	 Time: 18.72868037223816 secs [683.443774232682 ej/sec]]
Step [450/512]	 Loss: 0.4145894944667816 	 Time: 18.7358660697937 secs [683.181655564692 ej/sec]]
Step [500/512]	 Loss: 0.5420541763305664 	 Time: 18.681391954421997 secs [685.1737831543202 ej/sec]]
Train accuracy:  0.8558342620723348
Starting epoch 6 / 10
Step [0/512]	 Loss: 0.4273545444011688 	 Time