# Task 3
This serves as a template which will guide you through the implementation of this task. It is advised to first read the whole template and get a sense of the overall structure of the code before trying to fill in any of the TODO gaps.
This is the jupyter notebook version of the template. For the python file version, please refer to the file `template_solution.py`.

First, we import necessary libraries:

In [39]:
# import libraries
from __future__ import print_function, division

# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from torch.optim import lr_scheduler
import torch.backends.cudnn as cudnn
import torch.nn.functional as F
# torchvision
from torchvision import datasets, transforms
from torchvision.models import resnet50, ResNet50_Weights
# data handling
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import time
import os
import copy
from tqdm import tqdm

In [40]:
# The device is automatically set to GPU if available, otherwise CPU
# If you want to force the device to CPU, you can change the line to
# device = torch.device("cpu")
# When using the GPU, it is important that your model and all data are on the 
# same device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [41]:
"""
Transform, resize and normalize the images and then use a pretrained model to extract 
the embeddings.
"""
if not os.path.exists("dataset/embeddings.npy"):
    # TODO: define a transform to pre-process the images
    # The required pre-processing depends on the pre-trained model you choose 
    # below. 
    # See https://pytorch.org/vision/stable/models.html#using-the-pre-trained-models
    data_transforms = {
        "Advanced": transforms.Compose([
            transforms.Resize((256, 256)),
            transforms.RandomResizedCrop(224),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
        ]),
        "Alternative": transforms.Compose([
            transforms.ToTensor(),
            transforms.Resize(256),
            transforms.CenterCrop(224),
            transforms.Normalize([0.6110, 0.5012, 0.3752], [0.2575, 0.2659, 0.2801])
        ]), 
        "Simple": transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor()
        ]),
    }

    train_dataset = datasets.ImageFolder(root="dataset/", transform=data_transforms["Advanced"])
    # Hint: adjust batch_size and num_workers to your PC configuration, so that you don't 
    # run out of memory (VRAM if on GPU, RAM if on CPU)
    train_loader = DataLoader(dataset=train_dataset,
                            batch_size=64,
                            shuffle=False,
                            pin_memory=True, 
                            num_workers=0)

    # TODO: define a model for extraction of the embeddings (Hint: load a pretrained model, 
    # more info here: https://pytorch.org/vision/stable/models.html)

    model = resnet50(weights="DEFAULT") # other weights to try: 
    # The dimensionality of a vector embedding is equivalent to the size of the second-to-last layer in the model and, 
    # thus, interchangeable with the vector’s size or length.
    embedding_size = model.fc.in_features
    num_images = len(train_dataset)

    # Remove the last layer (fully connected layer)
    model = nn.Sequential(*(list(model.children())[:-1]))

    # Freeze the parameters of the model
    for param in model.parameters(): param.requires_grad = False

    # Move the model to the device
    model = model.to(device)

    # Set the model to evaluation mode
    model.eval()

    embeddings = []

    # Extract embeddings
    with torch.no_grad():
        for img, _ in train_loader:
            img = img.to(device)
            output = model(img)
            output = output.view(output.size(0), -1)  # Flatten the output
            embeddings.append(output.cpu().numpy())

    # Concatenate all embeddings
    embeddings = np.concatenate(embeddings, axis=0)
    # TODO: Use the model to extract the embeddings. Hint: remove the last layers of the 
    # model to access the embeddings the model generates. 

    assert embeddings.shape == (num_images, embedding_size)

    np.save('dataset/embeddings.npy', embeddings)

In [42]:
def get_data(file, train=True):
    """
    Load the triplets from the file and generate the features and labels.

    input: file: string, the path to the file containing the triplets
          train: boolean, whether the data is for training or testing

    output: X: numpy array, the features
            y: numpy array, the labels
    """
    triplets = []
    with open(file) as f:
        for line in f:
            triplets.append(line.strip())


    # generate training data from triplets
    train_dataset = datasets.ImageFolder(root="dataset/",
                                        transform=None)
    filenames = [s[0].split('/')[-1].split("\\")[-1].replace('.jpg', '') for s in train_dataset.samples]
    embeddings = np.load('dataset/embeddings.npy')
    # Normalize the embeddings
    embeddings = (embeddings - embeddings.mean(axis = 1)[:, np.newaxis]) / embeddings.std(axis = 1)[:, np.newaxis]

    file_to_embedding = {}
    for i in range(len(filenames)):
        file_to_embedding[filenames[i]] = embeddings[i]
    X = []
    y = []
    # use the individual embeddings to generate the features and labels for triplets
    for t in triplets:
        emb = [file_to_embedding[a] for a in t.split()]
        X.append(np.hstack([emb[0], emb[1], emb[2]]))
        y.append(1)
        # Generating negative samples (data augmentation)
        if train:
            X.append(np.hstack([emb[0], emb[2], emb[1]]))
            y.append(0)
    X = np.vstack(X)
    y = np.hstack(y)

    return X, y

Hint: adjust batch_size and num_workers to your PC configuration, so that you don't run out of memory (VRAM if on GPU, RAM if on CPU)

In [43]:
def create_loader_from_np(X, y = None, train = True, batch_size=50, shuffle=True, num_workers = 4):
    """
    Create a torch.utils.data.DataLoader object from numpy arrays containing the data.

    input: X: numpy array, the features
           y: numpy array, the labels
    
    output: loader: torch.data.util.DataLoader, the object containing the data
    """
    if train:
        dataset = TensorDataset(torch.from_numpy(X).type(torch.float), 
                                torch.from_numpy(y).type(torch.float))
    else:
        dataset = TensorDataset(torch.from_numpy(X).type(torch.float))
    loader = DataLoader(dataset=dataset,
                        batch_size=batch_size,
                        shuffle=shuffle,
                        pin_memory=True, num_workers=num_workers)
    return loader

TODO: define a model. Here, the basic structure is defined, but you need to fill in the details

In [44]:
class BinaryTasteClassifier(nn.Module):
    """
    Classifier with 4 fully connected layers, activated by ReLu and including dropout
    for regularization.
    """
    def __init__(self, input_size, hidden_size1, hidden_size2, hidden_size3, dropout_proba):
        #inherit from Module
        super().__init__()

        # specify layer objects
        self.fc1 = nn.Linear(input_size, hidden_size1)
        self.bn1 = nn.BatchNorm1d(hidden_size1)
        self.dropout1 = nn.Dropout(dropout_proba)
        self.fc2 = nn.Linear(hidden_size1, hidden_size2)
        self.bn2 = nn.BatchNorm1d(hidden_size2)
        self.dropout2 = nn.Dropout(dropout_proba)
        self.fc3 = nn.Linear(hidden_size2, hidden_size3)
        self.bn3 = nn.BatchNorm1d(hidden_size3)
        self.dropout3 = nn.Dropout(dropout_proba)
        self.fc4 = nn.Linear(hidden_size3, 1)

        # initialize weights using Xavier initialization
        nn.init.xavier_uniform_(self.fc1.weight)
        nn.init.xavier_uniform_(self.fc2.weight)
        nn.init.xavier_uniform_(self.fc3.weight)
        nn.init.xavier_uniform_(self.fc4.weight)

    def forward(self, x):
        """
        The forward pass of the model.

        input: x: torch.Tensor, the input to the model

        output: x: torch.Tensor, the output of the model
        """
        # layer1
        x = self.fc1(x)
        # x = self.bn1(x)
        x = F.relu(x)
        # x = self.dropout1(x)
        
        # layer2
        x = self.fc2(x)
        # x = self.bn2(x)
        x = F.relu(x)
        # x = self.dropout2(x)
        
        # layer3
        x = self.fc3(x)
        # x = self.bn3(x)
        x = F.relu(x)
        # x = self.dropout3(x)
        
        # layer 4
        x = self.fc4(x)

        return x


In [45]:

TRAIN_TRIPLETS = 'train_triplets.txt'

# load the training data
X, y = get_data(TRAIN_TRIPLETS)

# simple train loader
train_loader_simple = create_loader_from_np(X, y, train = True, batch_size=64)
print("simpl loader created")
# create KFold object
n_splits = 10
kfold = KFold(n_splits=n_splits, shuffle = True, random_state=42)
# Create data loaders for the training data
train_loader = {fold: create_loader_from_np(X = X[train_indices], y = y[train_indices], train = True, batch_size=64)
                       for fold, (train_indices, _) in enumerate(kfold.split(X))}
print("train loader created")
val_loader = {fold: create_loader_from_np(X = X[val_indices], y = y[val_indices], train = True, batch_size=64)
                       for fold, (_, val_indices) in enumerate(kfold.split(X))}
print("valid loader created")
# create the data loader
data_loader = {"Train": train_loader,
               "Valid": val_loader}

dataset_size = X.shape[0]
# delete the loaded training data to save memory, as the data loader copies
del X, y


simpl loader created
train loader created
valid loader created


In [46]:
TEST_TRIPLETS = 'test_triplets.txt'

# repeat for testing data
X_test, y_test = get_data(TEST_TRIPLETS, train=False)
test_loader = create_loader_from_np(X_test, train = False, batch_size=2048, shuffle=False)
del X_test
del y_test

In [48]:
def train_model(model:nn.Sequential, 
                n_epochs:int,
                criterion:nn,
                optimizer:torch.optim,
                scheduler:lr_scheduler,
                data_loader:DataLoader,
                prediction_threshold:float = 0.5):
    """
    This f

    Args:
        model (nn.Sequential): CNN model class
        n_epochs (int): number of epochs to train
        criterion (nn): loss function
        optimizer (nn.optim): optimization  algorithm
        scheduler (lr_scheduler): learning rate adaption over epochs
        data_loader (DataLoader): data loader object holding training and validation data
        prediction_threshold (float, optional): decision boundary for binary classification. Defaults to 0.5.

    Returns:
        dict: best model weigths found
    """
    since = time.time()

    best_model_wts = copy.deepcopy(model.state_dict())
    best_accuracy = 0.0

    for epoch in range(n_epochs): 
        # info
        print(53 * "#")
        print(20 * "~", f' EPOCH {epoch + 1}/{n_epochs}', 20 * "~")
        print(53 * "#")
        # start time of epoch
        epoch_since = time.time()
        # decide which fold to use
        if epoch % n_epochs == 0: fold = 0

        # train and validation phase looping   
        for train in [True, False]:
            # set phase string
            phase = "Train" if train else "Valid"

            # set model mode
            if train: 
                model.train()
            else:
                model.eval()
            
            # init data loader
            data_loader_phase = copy.deepcopy(data_loader[phase][fold])

            # init epoch loss and corrects
            running_loss = 0.0
            running_corrects = 0
            total_labels = 0
            i = 1

            with tqdm(data_loader_phase, unit = "batch") as tepoch:
            # iterate data loader
                for inputs, labels in tepoch: # tepoch: 
                    # set progressbar description and postfix
                    tepoch.set_description(f"{phase} phase")
                    tepoch.set_postfix({"LOSS": running_loss/i, "ACC": running_corrects/ max(total_labels)})
                    # copy to device
                    inputs.to(device)
                    labels.to(device)

                    # set the gradient to zero
                    optimizer.zero_grad()

                    # step forward, track only if in train
                    with torch.set_grad_enabled(train):
                        outputs = model(inputs).squeeze(1)# .round().to(torch.long) # removing singleton dimension at axis 1
                        preds = np.where(F.sigmoid(outputs).detach().numpy() >= prediction_threshold, 1, 0)
                        loss = criterion(outputs, labels)

                        # propagate backwards if in train phase
                        if train:
                            loss.backward()
                            optimizer.step()
                

                    # calculate stats
                    running_loss += loss.item()
                    running_corrects += np.sum(preds == labels.numpy()).item()
                    total_labels += len(labels)
                    i += 1

                if train: 
                    scheduler.step()

                epoch_loss = running_loss / i
                epoch_accuracy = running_corrects / max(1, total_labels)
                # epoch_time = time.time() - epoch_since

                # print(f'Final {phase}: Loss = {epoch_loss:.4f}, Acc = {epoch_accuracy:.4f}, Time = {epoch_time:.0f}s')

                if not train and epoch_accuracy > best_accuracy:
                        best_accuracy = epoch_accuracy
                        best_model_wts = copy.deepcopy(model.state_dict())

        fold += 1

    time_elapsed = time.time() - since
    print(f'Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s')
    print(f'Best Valid Acc: {best_accuracy:4f}')

    # load best model weights
    # model.load_state_dict(best_model_wts)

    return (best_accuracy, best_model_wts)

In [55]:
"""
The training procedure of the model; it accepts the training data, defines the model 
and then trains it.

input: train_loader: torch.data.util.DataLoader, the object containing the training data
    
compute: model: torch.nn.Module, the trained model
"""

model = BinaryTasteClassifier(input_size=6144, 
                              hidden_size1=1024, 
                              hidden_size2=64,
                              hidden_size3=32,
                              dropout_proba=0.5)
model.train()
model.to(device)
n_epochs = 10
# TODO: define a loss function, optimizer and proceed with training. Hint: use the part 
# of the training data as a validation split. After each epoch, compute the loss on the 
# validation split and print it out. This enables you to see how your model is performing 
# on the validation data before submitting the results on the server. After choosing the 
# best model, train it on the whole training data.

# define the loss criterion
criterion_CE = nn.CrossEntropyLoss()
criterion_MSE = nn.MSELoss()

# define possible optimizers
optimizer_sgd = optim.SGD(model.parameters(), lr=0.001, momentum=0.3)
optimizer_adam = optim.Adam(model.parameters(), lr=0.001)

# scheduler to adjust learning rate over epoch iterations
sgd_lr_scheduler = lr_scheduler.StepLR(optimizer_sgd, step_size=2, gamma=0.1)
adam_lr_scheduler = lr_scheduler.StepLR(optimizer_adam, step_size=7, gamma=0.1)

FineTuningMode = True

if FineTuningMode:
    _ = train_model(model=model,
                    n_epochs=10,
                    criterion=criterion_CE, 
                    optimizer=optimizer_sgd, 
                    scheduler=sgd_lr_scheduler,
                    data_loader=data_loader,
                    prediction_threshold=0.5
                    )
    # MSE with sgd: 0.717851
    # CE with sgd: 
    # MSE with adam: 
    # MSE with adam: 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 1/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [01:14<00:00, 22.48batch/s, LOSS=129, ACC=0.808]
Valid phase: 100%|██████████| 186/186 [00:06<00:00, 30.98batch/s, LOSS=127, ACC=0.729]


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 2/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:54<00:00, 30.59batch/s, LOSS=126, ACC=0.889]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 35.34batch/s, LOSS=125, ACC=0.791] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 3/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:53<00:00, 31.24batch/s, LOSS=121, ACC=0.963]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 34.11batch/s, LOSS=119, ACC=0.851]


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 4/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:53<00:00, 31.31batch/s, LOSS=119, ACC=0.974]
Valid phase: 100%|██████████| 186/186 [00:04<00:00, 37.81batch/s, LOSS=118, ACC=0.856] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 5/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:50<00:00, 32.88batch/s, LOSS=117, ACC=0.971]
Valid phase: 100%|██████████| 186/186 [00:04<00:00, 37.28batch/s, LOSS=116, ACC=0.849] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 6/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:52<00:00, 31.82batch/s, LOSS=117, ACC=0.965]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 35.70batch/s, LOSS=116, ACC=0.845] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 7/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:52<00:00, 31.92batch/s, LOSS=117, ACC=0.966]
Valid phase: 100%|██████████| 186/186 [00:04<00:00, 37.33batch/s, LOSS=115, ACC=0.845] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 8/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:51<00:00, 32.62batch/s, LOSS=116, ACC=0.965]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 36.89batch/s, LOSS=117, ACC=0.847] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 9/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [00:51<00:00, 32.22batch/s, LOSS=117, ACC=0.964]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 31.30batch/s, LOSS=116, ACC=0.846] 


#####################################################
~~~~~~~~~~~~~~~~~~~~  EPOCH 10/10 ~~~~~~~~~~~~~~~~~~~~
#####################################################


Train phase: 100%|██████████| 1674/1674 [01:08<00:00, 24.27batch/s, LOSS=117, ACC=0.964]
Valid phase: 100%|██████████| 186/186 [00:05<00:00, 31.23batch/s, LOSS=115, ACC=0.852]

Training complete in 55m 5s
Best Valid Acc: 0.855785





In [79]:
# finally train the model on all the data without validation
model = BinaryTasteClassifier(input_size=6144, 
                              hidden_size1=1024, 
                              hidden_size2=64,
                              hidden_size3=32,
                              dropout_proba=0.5)

model.train()
model.to(device)
n_epochs = 6

optimizer = optim.SGD(model.parameters(), lr=0.0001, momentum=0.3)
criterion = nn.CrossEntropyLoss()
scheduler = lr_scheduler.StepLR(optimizer, step_size=2, gamma=0.1)
prediction_threshold = 0.5


for epoch in range(n_epochs):
    print(f'Epoch={epoch + 1}/{n_epochs}')
    running_loss = 0.0     
    running_corrects = 0
    total_labels = 0

    for i, [features, labels] in enumerate(train_loader_simple):
        optimizer.zero_grad()
        outputs = model(features).squeeze(1)
        preds = np.where(F.sigmoid(outputs).detach().numpy() >= prediction_threshold, 1, 0)
        running_corrects += np.sum(preds == labels.numpy()).item()
        total_labels += len(labels)
        # if np.sum(preds == labels.numpy()).item() > 54:
        #     print(len(labels), len(preds))
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    
    print(f"Loss: {running_loss / i:.3f}, Acc:{running_corrects / total_labels:.3f}")
    

    scheduler.step()


Epoch=1/6
Loss: 130.181, Acc:0.636
Epoch=2/6
Loss: 127.722, Acc:0.721
Epoch=3/6
Loss: 126.308, Acc:0.750
Epoch=4/6
Loss: 126.047, Acc:0.757
Epoch=5/6
Loss: 125.833, Acc:0.762
Epoch=6/6
Loss: 125.817, Acc:0.763


In [78]:
"""
The testing procedure of the model; it accepts the testing data and the trained model and 
then tests the model on it.

input: model: torch.nn.Module, the trained model
       loader: torch.data.util.DataLoader, the object containing the testing data
        
compute: None, the function saves the predictions to a results.txt file
"""
model.eval()
predictions = []
# Iterate over the test data
with torch.no_grad(): # We don't need to compute gradients for testing
    for [x_batch] in test_loader:
        x_batch= x_batch.to(device)
        predicted = model(x_batch)
        predicted = predicted.cpu().numpy()
        # Rounding the predictions to 0 or 1
        predicted[predicted >= 0.5] = 1
        predicted[predicted < 0.5] = 0
        predictions.append(predicted)
    predictions = np.vstack(predictions)
np.savetxt("results.txt", predictions, fmt='%i')
print("Results saved to results.txt")

Results saved to results.txt
