**Some imports**

In [1]:
import csv
import torch
import torch.nn as nn
from torch import optim
from torch.utils.data.dataloader import DataLoader
from torch.utils.data.dataset import Dataset
import torchvision
import torchvision.transforms as transforms
import numpy as np
import cv2
import matplotlib.pyplot as plt
from PIL import Image
import random
import time

# Global set-up tasks

Following classes or functions will be defined that will be needed for all datasets later

**Parameters that can be changed in the training methods**

*   Batch Size (when Dataloaders get initialized)
*   Epochs to train (paramater of train method)
*   Learning rate to train with (Defined when initializing optimizer)




## Siamese Network

In [2]:
class SiameseNetwork(nn.Module):

    def __init__(self):
        super(SiameseNetwork, self).__init__()

        # Load pre-trained VGG-19; structure: https://images.app.goo.gl/MtYeQkBbpEtGfvQE8
        self.model = torch.hub.load(
            'pytorch/vision:v0.6.0', 'vgg19', pretrained=True)

        # Remove last two FC layers
        self.model.classifier = self.model.classifier[:-6]
        # print(self.model)

    def forward_once(self, x):
        output = self.model(x)
        return output

    def forward(self, x1, x2):
        output1 = self.forward_once(x1)
        output2 = self.forward_once(x2)
        return output1, output2

## Constrastive Loss

In [3]:
class ContrastiveLoss(nn.Module):
    """
    Contrastive loss function; using cosine similarity instead of euclidian distance

    TODO: look if this is right, as the cosine similarity calculates the angel between the two vectors; what we need?

    Based on: http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf
    """

    def __init__(self, margin=0.5):
        super(ContrastiveLoss, self).__init__()
        self.cosine_similarity = nn.CosineSimilarity()
        self.margin = margin

    def forward(self, output1, output2, label):
        """
        Calculates contrastive loss using cosine similarity though.

        output1: Output 1
        output2: Output 2
        label: Similarity label (1 if genuine, 0 if imposter)
        """

        cos_sim = self.cosine_similarity(output1, output2)
        # euclidean_distance = F.pairwise_distance(output1, output2)
        loss_contrastive = torch.mean((label) * torch.pow(cos_sim, 2) +
                                      (1-label) * torch.pow(torch.clamp(self.margin - cos_sim, min=0.0), 2))
        return loss_contrastive

## Default Dataset

In [4]:
class DefaultDataset(Dataset):

    def __init__(self, root_dir, transform=None, should_invert=False):
        self.imageFolderDataset = datasets.ImageFolder(root=root_dir)
        # can be accomplished by self.imageFolderDataset.classes
        # self.categories = [directory for directory in os.listdir(root_dir)]
        self.transform = transform
        self.should_invert = should_invert

    def __getitem__(self, index):
        # TODO: maybe cut out squares of images before transforming them

        # tuple consists of imagepath and index of category/class
        img0_tuple = random.choice(self.imageFolderDataset.imgs)

        same_class = random.randint(0, 1)
        if same_class == 1:
            while True:
                img1_tuple = random.choice(self.imageFolderDataset.imgs)
                # Loop until another image of the same class has been found
                if img0_tuple[0] != img1_tuple[0] and img0_tuple[1] == img1_tuple[1]:
                    break
        else:
            while True:
                img1_tuple = random.choice(self.imageFolderDataset.imgs)
                # Loop until another image of another class has been found
                if img0_tuple[1] != img1_tuple[1]:
                    break

        img0 = Image.open(img0_tuple[0]).convert("RGB")
        img1 = Image.open(img1_tuple[0]).convert("RGB")

        if self.should_invert:
            img0 = PIL.ImageOps.invert(img0)
            img1 = PIL.ImageOps.invert(img1)

        if self.transform is not None:
            img0 = self.transform(img0)
            img1 = self.transform(img1)

        return img0, img1, torch.from_numpy(np.array([int(img1_tuple[1] == img0_tuple[1])], dtype=np.float32))

    def __len__(self):
        return len(self.imageFolderDataset.imgs)

## Transformations

Transformations that will be used for the train, validation and test dataloaders are defined here. These transforms are ought to be used on any dataset this network is trained with.

The `image_transforms` dict contains all needed transformations.

In [5]:
image_transforms = {
    'train': transforms.Compose([
              transforms.RandomResizedCrop(size=256, scale=(0.8, 1.0)),
              transforms.RandomRotation(degrees=15),
              transforms.RandomHorizontalFlip(),
              transforms.CenterCrop(size=224),
              transforms.ToTensor(),
              transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
    ]),
    'validation': transforms.Compose([
             transforms.Resize(size=256),
             transforms.CenterCrop(size=224),
             transforms.ToTensor(),
             transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])              
    ]),
    'test': transforms.Compose([
             transforms.Resize(size=256),
             transforms.CenterCrop(size=224),
             transforms.ToTensor(),
             transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])              
    ])
}

# not needed
default_transformations = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406],[0.229, 0.224, 0.225])
])

## Train and Validate method

The following method is to be used to train and validate a network with a single learning rate.

In [25]:
def hyperparameter_optimization(network, loss_criterion, lr_list, 
                                train_dataloader, validation_dataloader, epochs):
  for lr in lr_list:
    optimizer = optim.Adam(net.parameters(), lr=lr)
    final_network, history = train_and_validate(network, loss_criterion, optimizer, 
                                                train_dataloader, validation_dataloader, epochs)
    torch.save(final_network.state_dict(), f'/content/gdrive/MyDrive/Image Similarity/final_model_lr_{lr}.pth')

def train_and_validate(network, loss_criterion, optimizer,
                       train_dataloader, validation_dataloader, start_epoch, epochs):
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  print(f'Training on device: {device}')
  print('Every 50 mini-batches the runnin loss will be printed')

  lr = optimizer.param_groups[0]['lr']

  history = []
  best_acc = 0.0
  
  cos_similarity = nn.CosineSimilarity()

  for epoch in range(start_epoch, epochs):

    # start time for epoch-statistics
    epoch_start = time.time()

    # print current epoch
    print(f'Epoch : {epoch+1}/{epochs}')
    print(f'Learning Rate : {lr}')

    network.train()

    train_loss = 0.0
    train_acc = 0.0

    running_loss = 0.0

    # train-loop
    for i, data in enumerate(train_dataloader, 0):

      # Load data and move it to gpu if possible
      img0, img1, label = data
      img0 = img0.to(device)
      img1 = img1.to(device)
      label = label.to(device)

      # Clean existing gradients
      optimizer.zero_grad()

      # Forward both images through network
      output1, output2 = network(img0, img1)

      # Compute loss
      loss = loss_criterion(output1, output2, label)

      # Backpropagate
      loss.backward()

      # Update the parameters
      optimizer.step()

      # print stats every 50 mini-batches
      running_loss += loss.data
      if i % 50 == 49:
        # print the average loss of the last 50 predictions
        print(f'  Current loss: {running_loss / 50}')
        # add sum of losses of the last 50 predictions to train_loss
        train_loss += running_loss
        running_loss = 0.0

      # compute train-accuracy
      with torch.no_grad():
        network.eval()
        result = cos_similarity(output1, output2)
        current_acc = torch.zeros([32], dtype=torch.float)
        current_acc = current_acc.to(device)
        for i, label_val in enumerate(label.data):
              if label_val == 0:
                current_acc.data[i] += 1 - result.data[i] 
              else:
                current_acc.data[i] += result.data[i]
        train_acc += current_acc
      network.train()

    # calculate averages and append it to history
    avg_train_loss = train_loss / len(train_dataloader)
    avg_train_acc = train_acc / len(train_dataloader)

    # validation
    avg_valid_acc, avg_valid_loss = computeNetworkAccuracy(network, loss_criterion, validation_dataloader)

    # calculate mean of accuracy tensors
    avg_train_acc = torch.mean(avg_train_acc).item()
    avg_valid_acc = torch.mean(avg_valid_acc).item()

    history.append([avg_train_loss, avg_valid_loss, avg_train_acc, avg_valid_acc])

    epoch_end = time.time()
    
    print(f'Epoch : {epoch}, Training: Loss: {avg_train_loss}, Accuracy: {avg_train_acc*100}%, \n\t\tValidation : Loss : {avg_valid_loss}, Accuracy: {avg_valid_acc*100}%, Time: {epoch_end-epoch_start}s')

    # save if the model has the best acc till now; the saved model with the highest epoch will be the most accurate
    if avg_valid_acc > best_acc:
      best_acc = avg_valid_acc
      torch.save(network.state_dict(), f'/content/gdrive/MyDrive/Image Similarity/model_lr_{lr}_epoch_{epoch}.pth')
    
    torch.save(network.state_dict(), f'/content/gdrive/MyDrive/Image Similarity/model_lr_{lr}_epoch_latest.pth')

  return network, history

def computeNetworkAccuracy(network, criterion, validation_dataloader):
    similarity = nn.CosineSimilarity()
    sum_accuracy = torch.zeros([32], dtype=torch.float)
    sum_loss = 0

    sum_accuracy = sum_accuracy.to(device)

    with torch.no_grad():
        network.eval()
        for i, data in enumerate(validation_dataloader):
            img0, img1, label = data
            img0 = img0.to(device)
            img1 = img1.to(device)
            label = label.to(device)
            x1, x2 = network(img0, img1)
            x1 = x1.to(device)
            x2 = x2.to(device)
            result = similarity(x1, x2)

            for i, label_val in enumerate(label.data):
              if label_val == 0:
                sum_accuracy.data[i] += 1 - result.data[i] 
              else:
                sum_accuracy.data[i] += result.data[i]
            
            sum_loss += criterion(x1, x2, label)

    return sum_accuracy / len(validation_dataloader), sum_loss.data / len(validation_dataloader)
        

# Mount Google Drive

Useful for working with large datasets or as a storage for trained networks



In [7]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


Datasets can now be downloaded to "/content/gdrive/MyDrive"



---



# Set-up Caltech256

Download the Caltech256 dataset

## Define Utility methods
Caltech256 needs to be split up into train, validation and test datasets. In the following methods will be defined, which may accomplish this. The ratio of the split up images will be (75%, 12.5%, 12,5%).

In [8]:
# imports, may not be needed, but doesn't make a difference 
import numpy as np
import math
from typing import List
import os
import argparse
import glob
import shutil

def get_sub_folder_names(folder):
  sub_folders = [name for name in os.listdir(folder) if os.path.isdir(os.path.join(folder, name))]
  return sub_folders

def list_files(path):
    files = os.listdir(path)
    return np.asarray(files)

# split files will split caltech256 into train, test and validation datasets
def split_files(oldpath, newpath):
    for name in get_sub_folder_names(oldpath):
        full_dir = os.path.join(os.getcwd(), f"{oldpath}/{name}")

        files = list_files(full_dir)
        total_file = np.size(files,0)
        # We split data set into 3: train, validation and test

        train_size = math.ceil(total_file * 3/4) # 75% for training 

        validation_size = train_size + math.ceil(total_file * 1/8) # 12.5% for validation
        test_size = validation_size + math.ceil(total_file * 1/8) # 12.5x% for testing 

        train = files[0:train_size]
        validation = files[train_size:validation_size]
        test = files[validation_size:]

        move_files(train, full_dir, f"{newpath}/train/{name}")
        move_files(validation, full_dir, f"{newpath}/validation/{name}")
        move_files(test, full_dir, f"{newpath}/test/{name}")

def move_files(files, old_dir, new_dir):
    new_dir = os.path.join(os.getcwd(), new_dir);
    if not os.path.exists(new_dir):
        os.makedirs(new_dir)

    for file in np.nditer(files):
        old_file_path = os.path.join(os.getcwd(), f"{old_dir}/{file}")
        new_file_path = os.path.join(os.getcwd(), f"{new_dir}/{file}")

        shutil.move(old_file_path, new_file_path)


## Download Caltech256 with Kaggle
Download the Caltech256 dataset using kaggle. It will be saved to `./data/caltech256/` or rather `./data/caltech256/256_ObjectCategories`.

### Kaggle Initialization


In [9]:
!pip install -q kaggle
from google.colab import files

# upload api json from kaggle
files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"simonreiterer","key":"06823cea5abaee1e5711eb235c406be9"}'}

In [10]:
!mkdir ~/.kaggle

!cp kaggle.json ~/.kaggle/

!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# verify that kaggle works
! kaggle datasets list

ref                                                         title                                              size  lastUpdated          downloadCount  
----------------------------------------------------------  ------------------------------------------------  -----  -------------------  -------------  
gpreda/reddit-vaccine-myths                                 Reddit Vaccine Myths                              223KB  2021-04-12 07:10:30           2488  
crowww/a-large-scale-fish-dataset                           A Large Scale Fish Dataset                          3GB  2021-02-17 16:10:44           1446  
dhruvildave/wikibooks-dataset                               Wikibooks Dataset                                   1GB  2021-02-18 10:08:27           1067  
promptcloud/careerbuilder-job-listing-2020                  Careerbuilder Job Listing 2020                     42MB  2021-03-05 06:59:52            246  
imsparsh/musicnet-dataset                                   MusicNet Dataset

### Download Caltech256 with Kaggle and split it up

In [11]:
! kaggle datasets download -d jessicali9530/caltech256

Downloading caltech256.zip to /content
100% 2.12G/2.12G [00:50<00:00, 37.6MB/s]
100% 2.12G/2.12G [00:50<00:00, 45.0MB/s]


In [12]:
! mkdir -p ./data/caltech256
! unzip caltech256.zip -d ./data/caltech256

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0077.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0078.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0079.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0080.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0081.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0082.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0083.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0084.jpg  
  inflating: ./data/caltech256/256_objectcategories/256_ObjectCategories/228.triceratops/228_0085.jpg  

In [13]:
# remove a directory as the kaggle source contains all images two times
! rm -rf ./data/caltech256/256_objectcategories

In [14]:
# split caltech256 up into training, test and validation data with a ratio of 75/12.5/12.5
split_files('./data/caltech256/256_ObjectCategories', './data/caltech256_split', )

## ~Download Caltech256 from Google Drive~

For some reasons the .tar that is downloaded this way is corrupted. Use Kaggle instead As of: 2021-04-09 12:17

In [None]:
# download caltech256 using torchvision
caltech256 = torchvision.datasets.Caltech256(
    root = './data/',
    download = True # toggle True/False accordingly
)

**Either the above or the below**

In [None]:
!wget -O './data/caltech256/256_ObjectCategories.tar' http://www.vision.caltech.edu/Image_Datasets/Caltech256/256_ObjectCategories.tar

!tar -xvf  './data/caltech256/256_ObjectCategories.tar' -C './data/caltech256/'

Use this to split the files up:

In [None]:
# split caltech256 up into training, test and validation data with a ratio of 75/12.5/12.5
split_files('./data/caltech256/256_ObjectCategories', './data/caltech256_split', )

Define siamese network, contrastive loss function

## Initialize Caltech256 Datasets

Creates the datasets for the train, validation and test Caltech256 datasets.

The default datasets can be found inside the `caltech_data` dict.


In [15]:
import torchvision.datasets as datasets
import torch.utils.data as loader

caltech_data = {
    'train': DefaultDataset(root_dir='./data/caltech256_split/train/', transform=image_transforms['train']),
    # 'train': DefaultDataset(root_dir='./data/caltech256_split/train/', transform=default_transformations),
    'validation': DefaultDataset(root_dir='./data/caltech256_split/validation/', transform=image_transforms['validation']),
    'test': DefaultDataset(root_dir='./data/caltech256_split/test/', transform=image_transforms['test'])
}

# Train with Caltech256

## Initialize objects

In [16]:
# Init network
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = SiameseNetwork()
net.to(device)

# Define epochs to train
num_epochs = 50

# Define learning rate to train with
lr = 0.001

# Init optimizer
optimizer = optim.Adam(net.parameters(), lr=lr)

# Init criterion (Contrastive Loss with Cos-Sim)
criterion = ContrastiveLoss()

# batch size of the dataloaders
batch_size = 32

# create a train dataloader 
train_data = DataLoader(caltech_data['train'], batch_size=batch_size, drop_last=True, shuffle=True)

# create a validation dataloader
validation_data = DataLoader(caltech_data['validation'], batch_size=batch_size, drop_last=True, shuffle=True)

# test datalader not needed here
# test_data = DataLoader(caltech_data['test'], batch_size=batch_size, shuffle=True)


Downloading: "https://github.com/pytorch/vision/archive/v0.6.0.zip" to /root/.cache/torch/hub/v0.6.0.zip
Downloading: "https://download.pytorch.org/models/vgg19-dcbb9e9d.pth" to /root/.cache/torch/hub/checkpoints/vgg19-dcbb9e9d.pth


HBox(children=(FloatProgress(value=0.0, max=574673361.0), HTML(value='')))




## Train/validate and save the network

In [26]:
#net.load_state_dict(torch.load(f'/content/gdrive/MyDrive/Image Similarity/<model>'))
start_epoch = 0 # necessary because colab keeps disconnecting
trained_model, history = train_and_validate(net, criterion, optimizer, train_data, validation_data, start_epoch, num_epochs)
torch.save(trained_model.state_dict(), '/content/gdrive/MyDrive/Image Similarity/final_model.pth')

Training on device: cuda
Every 50 mini-batches the runnin loss will be printed
Epoch : 1/50
Learning Rate : 0.001
  Current loss: 0.12718749046325684
  Current loss: 0.12656249105930328
  Current loss: 0.12671874463558197
  Current loss: 0.12687499821186066
  Current loss: 0.1224999949336052
  Current loss: 0.12578125298023224
  Current loss: 0.12859374284744263
  Current loss: 0.12484374642372131
  Current loss: 0.12296874821186066
  Current loss: 0.11999999731779099
  Current loss: 0.12562499940395355
  Current loss: 0.1237499937415123
  Current loss: 0.12296874821186066
  Current loss: 0.12578125298023224
Epoch : 0, Training: Loss: 0.12153863161802292, Accuracy: 50.0%, 
		Validation : Loss : 0.1225665882229805, Accuracy: 49.02663826942444%, Time: 1873.3613460063934s
Epoch : 2/50
Learning Rate : 0.001


KeyboardInterrupt: ignored

In [None]:
print(history)