## **Install requirements**

In [0]:
# Colab link: https://colab.research.google.com/drive/1SdMsKQcasyyXohrBsaeTpSvHN-gFu8xJ 

# Install the notebook's dependencies through IPython shell escapes.
# torch and torchvision are pinned to exact versions; Pillow-SIMD and tqdm are
# installed at whatever version pip resolves.
!pip3 install 'torch==1.3.1'
!pip3 install 'torchvision==0.4.2'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

## **Import libraries**

In [0]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn
from torch.autograd import Function
import numpy as np
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import os
import math
import copy


import torchvision
from torchvision import transforms
from torchvision.models import alexnet
from torch.utils.model_zoo import load_url as load_state_dict_from_url

from PIL import Image
from tqdm import tqdm

## **AlexNet implementation with DANN**

In [0]:

# Download location of the pretrained AlexNet checkpoint, keyed by architecture name.
model_urls = dict(
    alexnet='https://download.pytorch.org/models/alexnet-owt-4df8aa71.pth',
)

class ReverseLayerF(Function):
    """Gradient reversal layer.

    The forward pass returns a view of its input with unchanged values; the
    backward pass negates the incoming gradient and scales it by ``alpha``.
    """

    @staticmethod
    def forward(ctx, x, alpha):
        # Remember the scaling factor so backward() can read it from the context.
        ctx.alpha = alpha
        identity = x.view_as(x)
        return identity

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per forward() input: the reversed, scaled gradient for
        # ``x`` and None for ``alpha``, which receives no gradient.
        reversed_grad = grad_output.neg() * ctx.alpha
        return reversed_grad, None


class RandomNetworkWithReverseGrad(nn.Module):
    """AlexNet-shaped network with an additional gradient-reversed head.

    ``features`` is the convolutional trunk. ``classifier`` and
    ``dann_classifier`` are two fully connected heads with the same layer
    layout; both read the flattened output of the trunk.
    """

    def __init__(self, **kwargs):
        # ``kwargs`` is accepted for call compatibility but not used here.
        super(RandomNetworkWithReverseGrad, self).__init__()

        def make_head():
            # Fully connected head: 256*6*6 -> 4096 -> 4096 -> 1000.
            return nn.Sequential(
                nn.Dropout(),
                nn.Linear(256 * 6 * 6, 4096),
                nn.ReLU(inplace=True),
                nn.Dropout(),
                nn.Linear(4096, 4096),
                nn.ReLU(inplace=True),
                nn.Linear(4096, 1000),
            )

        # Features extractor (five convolutions interleaved with ReLU / max-pooling)
        conv_stack = [
            nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(64, 192, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        ]
        self.features = nn.Sequential(*conv_stack)

        # Class classifier
        self.classifier = make_head()

        # Domain classifier
        self.dann_classifier = make_head()

    def forward(self, x, alpha=None):
        """Run the trunk and one of the two heads.

        Args:
            x: input batch fed to ``self.features``.
            alpha: when None, the flattened features go through
                ``self.classifier``. Otherwise they pass through
                ``ReverseLayerF`` with this factor and then through
                ``self.dann_classifier``.

        Returns:
            The output of whichever head was selected.
        """
        flat = self.features(x)
        # Collapse everything but the batch dimension.
        flat = flat.view(flat.size(0), -1)

        if alpha is None:
            # Supervised path: plain class prediction.
            return self.classifier(flat)

        # Discriminator path: gradients flowing back into the trunk are reversed.
        reversed_flat = ReverseLayerF.apply(flat, alpha)
        return self.dann_classifier(reversed_flat)



def alexnetdann(pretrained=False, progress=True, **kwargs):
    '''AlexNet model architecture from the
    `"One weird trick..." <https://arxiv.org/abs/1404.5997>`_ paper with dann implementation
    Args:
        pretrained (bool): If True, loads the checkpoint at ``model_urls['alexnet']``
            and initialises the first two linear layers of the domain head
            from the class head
        progress (bool): If True, displays a progress bar of the download to stderr
        **kwargs: forwarded to ``RandomNetworkWithReverseGrad``
    Returns:
        The constructed ``RandomNetworkWithReverseGrad`` instance.
    '''
    model = RandomNetworkWithReverseGrad(**kwargs)
    if pretrained:
        state_dict = load_state_dict_from_url(model_urls['alexnet'], progress=progress)
        # strict=False tolerates keys that are missing from the checkpoint
        # (presumably the dann_classifier.* entries -- TODO confirm).
        model.load_state_dict(state_dict, strict=False)

        # Copy the weights and biases of the two hidden Linear layers (indices 1
        # and 4 of the Sequential) from the class head to the domain head.
        # clone() gives the domain head its own storage; assigning `.data`
        # directly would make both heads point at the same memory, so an
        # in-place update of one would silently change the other.
        for i in [1, 4]:
            model.dann_classifier[i].weight.data = model.classifier[i].weight.data.clone()
            model.dann_classifier[i].bias.data = model.classifier[i].bias.data.clone()

    return model




## **Train without DANN**

**Set arguments**

In [0]:
# Device the network and every batch are moved to: 'cuda' or 'cpu'
DEVICE = 'cuda'

# Mini-batch size. Larger batches allow larger learning rates; as an empirical
# rule, scale the learning rate by the same factor as the batch size to get
# comparable results.
BATCH_SIZE = 256

# SGD settings
LR = 0.0001             # initial learning rate
MOMENTUM = 0.9          # keep at 0.9 when using SGD
WEIGHT_DECAY = 0.00005  # regularization strength

# Schedule
NUM_EPOCHS = 10         # passes over the training set
STEP_SIZE = 10          # epochs between learning-rate step-downs
GAMMA = 0.1             # multiplicative factor applied at each step-down

# Print the losses every LOG_FREQUENCY mini-batches
LOG_FREQUENCY = 5

**Define data preprocessing**

In [0]:
# Training-phase preprocessing
train_transform = transforms.Compose([
    transforms.Resize(256),       # short side of the PIL image becomes 256
    transforms.CenterCrop(224),   # central square patch; torchvision's AlexNet needs a 224x224 input
    transforms.ToTensor(),        # PIL Image -> torch.Tensor
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),  # per-channel mean / standard deviation
])

# Evaluation-phase preprocessing (same pipeline as training)
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

**Prepare dataset**

In [0]:
# Clone github repository with data
if not os.path.isdir('./Homework3-PACS'):
  !git clone https://github.com/MachineLearning2020/Homework3-PACS.git

DATA_DIR = 'Homework3-PACS/PACS'

# Prepare Pytorch train/test Datasets
train_dataset = torchvision.datasets.ImageFolder(DATA_DIR+'/photo', transform=train_transform)
test_dataset = torchvision.datasets.ImageFolder(DATA_DIR + '/art_painting', transform=eval_transform)


# Check dataset sizes
print('Train Dataset: {}'.format(len(train_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

**Prepare dataloaders**

In [0]:
# Loaders wrap the datasets and take care of batching, shuffling and worker processes.

# Shuffled source-domain batches for supervised training
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    drop_last=True,
)
# Shuffled batches drawn from the test (target-domain) dataset for use during training
train_dataloader_targetdomain = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    drop_last=True,
)
# Ordered pass over the full test dataset for evaluation
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

**Prepare network and training**

In [0]:
# Pretrained network; the last layer of each head is replaced so that the class
# head has 7 outputs and the domain head has 2.
net = alexnetdann(pretrained=True)
net.classifier[6] = nn.Linear(4096, 7)
net.dann_classifier[6] = nn.Linear(4096, 2)

# Loss functions: one for the class head, one for the domain head
criterion = nn.CrossEntropyLoss()
criterion_dann = nn.CrossEntropyLoss()

# Every parameter of the network is optimized
parameters_to_optimize = net.parameters()

# SGD optimizer (alternative: optim.Adam(parameters_to_optimize, lr=LR))
optimizer = optim.SGD(
    parameters_to_optimize,
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Step-down learning-rate policy
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)


**Train**

In [0]:

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# Ask cuDNN to auto-tune its algorithms. The flag has to be assigned: merely
# evaluating `cudnn.benchmark` leaves it unchanged.
cudnn.benchmark = True

current_step = 0
# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  # Read the learning rate(s) actually in use straight from the optimizer
  print('Starting epoch {}/{}, LR = {}'.format(
      epoch+1, NUM_EPOCHS, [group['lr'] for group in optimizer.param_groups]))
  len_dataloader = min(len(train_dataloader), len(train_dataloader_targetdomain))

  net.train() # Training mode; nothing inside the batch loop changes it

  # Iterate over the dataset
  for i, (images, labels) in enumerate(train_dataloader):
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    optimizer.zero_grad() # Zero-ing the gradients

    # Forward pass through the class head (no alpha -> supervised path)
    outputs = net(images)
    # Compute loss based on output and ground truth
    loss_source = criterion(outputs, labels)
    loss_source.backward()
    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('1 Step {}, Loss {}'.format(current_step, loss_source.item()))

    optimizer.step() # update weights based on accumulated gradients

    current_step += 1

  # Step the scheduler
  scheduler.step()


**Test**

In [0]:
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
net.train(False) # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation, so skip building the autograd graph
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward pass through the class head
    outputs = net(images)

    # Predicted class = index of the largest output
    _, preds = torch.max(outputs, 1)

    # Update corrects
    running_corrects += torch.sum(preds == labels).item()

# Calculate accuracy over the whole test dataset
accuracy = running_corrects / float(len(test_dataset))

print('Test accuracy on art_painting without DANN without hyperparameters tuning: {}'.format(accuracy))

## **Train with DANN**

**Set arguments**

In [0]:
# Device the network and every batch are moved to: 'cuda' or 'cpu'
DEVICE = 'cuda'

# Mini-batch size. Larger batches allow larger learning rates; as an empirical
# rule, scale the learning rate by the same factor as the batch size to get
# comparable results.
BATCH_SIZE = 256

# SGD settings
LR = 0.0001             # initial learning rate
MOMENTUM = 0.9          # keep at 0.9 when using SGD
WEIGHT_DECAY = 0.00005  # regularization strength

# Schedule
NUM_EPOCHS = 10         # passes over the training set
STEP_SIZE = 10          # epochs between learning-rate step-downs
GAMMA = 0.1             # multiplicative factor applied at each step-down

# Print the losses every LOG_FREQUENCY mini-batches
LOG_FREQUENCY = 5

**Define data preprocessing**

In [0]:
# Training-phase preprocessing
train_transform = transforms.Compose([
    transforms.Resize(256),       # short side of the PIL image becomes 256
    transforms.CenterCrop(224),   # central square patch; torchvision's AlexNet needs a 224x224 input
    transforms.ToTensor(),        # PIL Image -> torch.Tensor
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),  # per-channel mean / standard deviation
])

# Evaluation-phase preprocessing (same pipeline as training)
eval_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.485, 0.456, 0.406),
                         std=(0.229, 0.224, 0.225)),
])

**Prepare dataset**

In [0]:
# Clone github repository with data
if not os.path.isdir('./Homework3-PACS'):
  !git clone https://github.com/MachineLearning2020/Homework3-PACS.git

DATA_DIR = 'Homework3-PACS/PACS'

# Prepare Pytorch train/test Datasets
train_dataset = torchvision.datasets.ImageFolder(DATA_DIR+'/photo', transform=train_transform)
test_dataset = torchvision.datasets.ImageFolder(DATA_DIR + '/art_painting', transform=eval_transform)


# Check dataset sizes
print('Train Dataset: {}'.format(len(train_dataset)))
print('Test Dataset: {}'.format(len(test_dataset)))

**Prepare dataloaders**

In [0]:
# Loaders wrap the datasets and take care of batching, shuffling and worker processes.

# Shuffled source-domain batches for supervised training
train_dataloader = DataLoader(
    train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    drop_last=True,
)
# Shuffled target-domain batches fed to the domain head during training
train_dataloader_targetdomain = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=1,
    pin_memory=True,
    drop_last=True,
)
# Ordered pass over the full test dataset for evaluation
test_dataloader = DataLoader(
    test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
    num_workers=4,
)

**Prepare network and training**

In [0]:
# Fresh pretrained network; the last layer of each head is replaced so that the
# class head has 7 outputs and the domain head has 2.
net = alexnetdann(pretrained=True)
net.classifier[6] = nn.Linear(4096, 7)
net.dann_classifier[6] = nn.Linear(4096, 2)

# Loss functions: one for the class head, one for the domain head
criterion = nn.CrossEntropyLoss()
criterion_dann = nn.CrossEntropyLoss()

# Every parameter of the network is optimized
parameters_to_optimize = net.parameters()

# SGD optimizer (alternative: optim.Adam(parameters_to_optimize, lr=LR))
optimizer = optim.SGD(
    parameters_to_optimize,
    lr=LR,
    momentum=MOMENTUM,
    weight_decay=WEIGHT_DECAY,
)

# Step-down learning-rate policy
scheduler = optim.lr_scheduler.StepLR(
    optimizer,
    step_size=STEP_SIZE,
    gamma=GAMMA,
)


**Train**

In [0]:

# Adaptive alpha schedule, following the paper
def adjust_alpha(i, epoch, min_len, nepochs):
    """Return the gradient-reversal factor for the current training step.

    Args:
        i: index of the current mini-batch inside the epoch.
        epoch: index of the current epoch.
        min_len: number of mini-batches counted per epoch.
        nepochs: total number of epochs.

    Returns:
        ``2 / (1 + exp(-10 * p)) - 1`` where
        ``p = (i + epoch * min_len) / nepochs / min_len``.
    """
    progress = float(i + epoch * min_len) / nepochs / min_len
    steepness = -10
    return 2. / (1. + math.exp(steepness * progress)) - 1

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# Ask cuDNN to auto-tune its algorithms. The flag has to be assigned: merely
# evaluating `cudnn.benchmark` leaves it unchanged.
cudnn.benchmark = True

# Per-epoch mean of each of the three losses, stored as plain Python floats so
# they can be plotted regardless of which device the losses were computed on
points_first = []
points_second = []
points_third = []

current_step = 0
alpha = 0.0
# The target-domain loader is consumed in lock-step with the source loader and
# restarted whenever it runs out, so its iterator lives across epochs.
dataloader_iterator = iter(train_dataloader_targetdomain)
# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  # Read the learning rate(s) actually in use straight from the optimizer
  print('Starting epoch {}/{}, LR = {}'.format(
      epoch+1, NUM_EPOCHS, [group['lr'] for group in optimizer.param_groups]))
  len_dataloader = min(len(train_dataloader), len(train_dataloader_targetdomain))
  mean_first = 0.0
  mean_second = 0.0
  mean_third = 0.0

  net.train() # Training mode; nothing inside the batch loop changes it

  # Iterate over the dataset
  for i, (images, labels) in enumerate(train_dataloader):
    alpha = adjust_alpha(i, epoch, len_dataloader, NUM_EPOCHS)
    if i % LOG_FREQUENCY == 0:
      print('alpha: ' + str(alpha))
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # The three backward() calls below accumulate into the same gradients,
    # which are applied by the single optimizer.step() at the end.
    optimizer.zero_grad() # Zero-ing the gradients


    # TRAIN ON SOURCE LABELS

    # Forward pass through the class head
    outputs = net(images)
    # Compute loss based on output and ground truth
    loss_source = criterion(outputs, labels)
    mean_first += loss_source.item()
    loss_source.backward()
    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('1 Step {}, Loss {}'.format(current_step, loss_source.item()))


    # TRAIN THE DISCRIMINATOR ON SOURCE LABELS

    # Forward pass through the domain head (alpha given -> reversed gradients)
    outputs = net(images, alpha=alpha)

    # Source-domain samples carry domain label 0
    labels_zeros = torch.zeros(images.size(0), dtype=torch.long, device=DEVICE)
    loss_source_dann = criterion_dann(outputs, labels_zeros)
    mean_second += loss_source_dann.item()

    loss_source_dann.backward()

    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('2 Step {}, Loss {}'.format(current_step, loss_source_dann.item()))


    # TRAIN THE DISCRIMINATOR ON TARGET LABELS

    try:
      images2, _ = next(dataloader_iterator)
    except StopIteration:
      print("Target dataset reset")
      dataloader_iterator = iter(train_dataloader_targetdomain)
      images2, _ = next(dataloader_iterator)

    images2 = images2.to(DEVICE)
    # Target-domain samples carry domain label 1; size the labels from the
    # target batch itself rather than from the source batch
    labels_ones = torch.ones(images2.size(0), dtype=torch.long, device=DEVICE)
    # Forward pass through the domain head
    outputs = net(images2, alpha=alpha)

    # Compute loss based on output and ones-label
    loss_target = criterion_dann(outputs, labels_ones)
    mean_third += loss_target.item()
    loss_target.backward()

    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('3 Step {}, Loss {}'.format(current_step, loss_target.item()))

    optimizer.step() # update weights based on accumulated gradients

    current_step += 1

  # Save the mean losses
  points_first.append(mean_first/len(train_dataloader))
  points_second.append(mean_second/len(train_dataloader))
  points_third.append(mean_third/len(train_dataloader))

  # Step the scheduler
  scheduler.step()

plt.plot(range(1, len(points_first)+1), points_first)
plt.plot(range(1, len(points_second)+1), points_second)
plt.plot(range(1, len(points_third)+1), points_third)
print(len(points_first))
plt.xlabel('Epoch')
plt.ylabel('Mean loss')
plt.legend(['Loss on source labels', 'Loss on source discriminator', 'Loss on target discriminator'], loc='upper right')

plt.show()


**Test**

In [0]:
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda
net.train(False) # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation, so skip building the autograd graph
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward pass through the class head
    outputs = net(images)

    # Predicted class = index of the largest output
    _, preds = torch.max(outputs, 1)

    # Update corrects
    running_corrects += torch.sum(preds == labels).item()

# Calculate accuracy over the whole test dataset
accuracy = running_corrects / float(len(test_dataset))

print('Test accuracy on art_painting with DANN without hyperparameters tuning: {}'.format(accuracy))



## **Cross domain validation**

### **Implementation**

In [0]:
def _cdval_accuracy(net, loader):
  """Return the fraction of samples yielded by ``loader`` that ``net`` classifies correctly.

  The network is put in evaluation mode and run on DEVICE without gradient
  tracking. Raises ValueError if the loader yields no samples.
  """
  net.train(mode=False)
  correct = 0
  seen = 0
  with torch.no_grad():
    for data, target in loader:
      data, target = data.to(DEVICE), target.to(DEVICE)
      _, preds = torch.max(net(data), 1)
      correct += torch.sum(preds == target).item()
      seen += target.size(0)
  if seen == 0:
    raise ValueError('validation loader produced no samples')
  return correct / float(seen)


def cdval(validationset1, validationset2, train_dataset):
  """Grid-search batch size and learning rate for the classifier trained without domain adaptation.

  For every (batch size, learning rate) pair a fresh pretrained network is
  trained for 5 epochs on ``train_dataset`` and evaluated after each epoch on
  both validation sets.

  Args:
    validationset1: first validation dataset (reported as "Cartoon").
    validationset2: second validation dataset (reported as "Sketch").
    train_dataset: dataset used for supervised training.

  Returns:
    ``(results, resultsB, results_mean)``: nested dicts indexed as
    ``[batch_size][lr]`` holding the best per-epoch accuracy on
    ``validationset1``, the best on ``validationset2``, and the mean of the two.
  """
  # Imported here so the function does not depend on a later cell's import
  from collections import defaultdict

  nested_dict = lambda: defaultdict(nested_dict)
  batch_sizes = [128, 256, 512]
  learning_rates = [0.005, 0.001, 0.01]

  results = nested_dict()
  resultsB = nested_dict()

  current_step = 0
  for batch_size in batch_sizes:
    trainloaderA = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
    valloaderA = DataLoader(validationset1, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
    valloaderB = DataLoader(validationset2, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
    for lr in learning_rates:
      # Fresh pretrained network and optimizer for every configuration
      net = alexnetdann(pretrained=True)
      net.classifier[6] = nn.Linear(4096, 7)
      net.dann_classifier[6] = nn.Linear(4096, 2)
      optimizer = optim.SGD(net.parameters(), lr=lr, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
      scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
      print('***** LR: {}, BATCH: {} ******'.format(lr, batch_size))
      net = net.to(DEVICE)

      best_a = None
      best_b = None
      for epoch in range(5):
        net.train() # The evaluation below switches to eval mode, so re-enable training each epoch
        for images, labels in trainloaderA:
          images = images.to(DEVICE)
          labels = labels.to(DEVICE)

          optimizer.zero_grad() # Zero-ing the gradients
          outputs = net(images)
          loss_source = criterion(outputs, labels)
          loss_source.backward()
          # Log loss
          if current_step % LOG_FREQUENCY == 0:
            print('Step {}, Loss {}'.format(current_step, loss_source.item()))

          optimizer.step() # update weights based on accumulated gradients

          current_step += 1

        # Keep the best accuracy seen so far on the first validation set
        accuracy = _cdval_accuracy(net, valloaderA)
        best_a = accuracy if best_a is None else max(best_a, accuracy)
        results[batch_size][lr] = best_a
        print('\nCartoon set accuracy: {:.3f}%\n'.format(accuracy*100))

        # Same for the second validation set
        accuracy = _cdval_accuracy(net, valloaderB)
        print('\nSketch set accuracy: {:.3f}%\n'.format(accuracy*100))
        best_b = accuracy if best_b is None else max(best_b, accuracy)
        resultsB[batch_size][lr] = best_b

        # Step the scheduler
        scheduler.step()

  print(results)
  print(resultsB)

  results_mean = nested_dict()

  for batch in batch_sizes:
    for lr in learning_rates:
      results_mean[batch][lr] = (results[batch][lr] + resultsB[batch][lr])/2

  return results, resultsB, results_mean




def _cdvaldann_accuracy(net, loader):
  """Return the fraction of samples yielded by ``loader`` that the class head of ``net`` gets right.

  The network is put in evaluation mode and run on DEVICE without gradient
  tracking. Raises ValueError if the loader yields no samples.
  """
  net.train(mode=False)
  correct = 0
  seen = 0
  with torch.no_grad():
    for data, target in loader:
      data, target = data.to(DEVICE), target.to(DEVICE)
      _, preds = torch.max(net(data), 1)
      correct += torch.sum(preds == target).item()
      seen += target.size(0)
  if seen == 0:
    raise ValueError('validation loader produced no samples')
  return correct / float(seen)


def _cdvaldann_search(train_dataset, target_set, target_name, batch_sizes, learning_rates, alphas, num_epochs):
  """Run the DANN grid search against a single target/validation dataset.

  For every (batch size, learning rate, alpha) triple a fresh pretrained
  network is trained with the class loss on ``train_dataset`` plus the domain
  loss on ``train_dataset`` (label 0) and ``target_set`` (label 1), and is
  evaluated on ``target_set`` after each epoch.

  Returns a nested dict ``[batch_size][lr][alpha]`` with the best per-epoch accuracy.
  """
  from collections import defaultdict

  nested_dict = lambda: defaultdict(nested_dict)
  results = nested_dict()

  current_step = 0
  for batch_size in batch_sizes:
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
    target_loader = DataLoader(target_set, batch_size=batch_size, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
    # Target batches are drawn in lock-step with source batches; the iterator
    # is restarted whenever it is exhausted.
    target_iterator = iter(target_loader)
    for lr in learning_rates:
      for alpha in alphas:
        # Fresh pretrained network and optimizer for every configuration
        net = alexnetdann(pretrained=True)
        net.classifier[6] = nn.Linear(4096, 7)
        net.dann_classifier[6] = nn.Linear(4096, 2)
        optimizer = optim.SGD(net.parameters(), lr=lr, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)
        print('***** LR: {}, BATCH: {}, ALPHA: {} ******'.format(lr, batch_size, alpha))
        net = net.to(DEVICE)

        best = None
        for epoch in range(num_epochs):
          net.train() # The evaluation below switches to eval mode, so re-enable training each epoch
          for i, (images, labels) in enumerate(train_loader):
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            # The three backward() calls accumulate into the same gradients,
            # applied by the single optimizer.step() at the end.
            optimizer.zero_grad()

            # TRAIN ON SOURCE LABELS
            outputs = net(images)
            loss_source = criterion(outputs, labels)
            loss_source.backward()
            if i % LOG_FREQUENCY == 0:
              print('1 Step {}, Loss {}'.format(current_step, loss_source.item()))

            # TRAIN THE DISCRIMINATOR ON SOURCE SAMPLES (domain label 0)
            outputs = net(images, alpha=alpha)
            labels_zeros = torch.zeros(images.size(0), dtype=torch.long, device=DEVICE)
            loss_source_dann = criterion_dann(outputs, labels_zeros)
            loss_source_dann.backward()
            if i % LOG_FREQUENCY == 0:
              print('2 Step {}, Loss {}'.format(current_step, loss_source_dann.item()))

            # TRAIN THE DISCRIMINATOR ON TARGET SAMPLES (domain label 1)
            try:
              images2, _ = next(target_iterator)
            except StopIteration:
              target_iterator = iter(target_loader)
              images2, _ = next(target_iterator)

            images2 = images2.to(DEVICE)
            # Size the labels from the target batch itself
            labels_ones = torch.ones(images2.size(0), dtype=torch.long, device=DEVICE)
            outputs = net(images2, alpha=alpha)
            loss_target = criterion_dann(outputs, labels_ones)
            loss_target.backward()
            if i % LOG_FREQUENCY == 0:
              print('3 Step {}, Loss {}'.format(current_step, loss_target.item()))

            optimizer.step() # update weights based on accumulated gradients

            current_step += 1

          # Keep the best accuracy seen so far for this configuration
          accuracy = _cdvaldann_accuracy(net, target_loader)
          best = accuracy if best is None else max(best, accuracy)
          results[batch_size][lr][alpha] = best

          print('\n{} set accuracy: {:.3f}%\n'.format(target_name, accuracy*100))

          # Step the scheduler
          scheduler.step()

  return results


def cdvaldann(validationset1, validationset2, train_dataset):
  """Grid-search batch size, learning rate and alpha for training with domain adaptation.

  The search is run twice: once with ``validationset1`` as the target domain
  (reported as "Cartoon") and once with ``validationset2`` (reported as
  "Sketch"). In each run the same dataset feeds the domain head during
  training and is used for evaluation.

  Args:
    validationset1: first target/validation dataset.
    validationset2: second target/validation dataset.
    train_dataset: labelled source dataset.

  Returns:
    ``(results, resultsB, results_mean)``: nested dicts indexed as
    ``[batch_size][lr][alpha]`` holding the best per-epoch accuracy on
    ``validationset1``, the best on ``validationset2``, and the mean of the two.
  """
  # Imported here so the function does not depend on a later cell's import
  from collections import defaultdict

  nested_dict = lambda: defaultdict(nested_dict)
  batch_sizes = [128, 256, 512]
  learning_rates = [0.005, 0.001, 0.01]
  alphas = [0.03, 0.1]
  num_epochs = 5

  results = _cdvaldann_search(train_dataset, validationset1, 'Cartoon',
                              batch_sizes, learning_rates, alphas, num_epochs)
  resultsB = _cdvaldann_search(train_dataset, validationset2, 'Sketch',
                               batch_sizes, learning_rates, alphas, num_epochs)

  results_mean = nested_dict()

  for batch in batch_sizes:
    for lr in learning_rates:
      for alpha in alphas:
        results_mean[batch][lr][alpha] = (results[batch][lr][alpha] + resultsB[batch][lr][alpha])/2

  return results, resultsB, results_mean


### **Running the grid search without and with domain adaptation**

In [0]:
%%time
from collections import defaultdict

validationset1 = torchvision.datasets.ImageFolder(DATA_DIR+'/cartoon', transform=train_transform)
validationset2 = torchvision.datasets.ImageFolder(DATA_DIR+'/sketch', transform=train_transform)

results1a, results2a, results_meanA = cdval(validationset1, validationset2, train_dataset)

results2a, results2b, results_meanB = cdvaldann(validationset1, validationset2, train_dataset)


**Retrieve the two best hyperparameter sets**

In [0]:
# Pick the hyperparameters with the highest mean validation accuracy,
# separately for the run without domain adaptation (suffix A) and the
# run with domain adaptation (suffix B).

# Best configuration without domain adaptation
best_accA = 0
best_lrA = 0
best_batchA = 0

# Best configuration with domain adaptation (also tunes alpha)
best_accB = 0.0
best_lrB = 0
best_batchB = 0
best_alphaB = 0

for batch in [128, 256, 512]:
    for lr in [0.005, 0.001, 0.01]:
        # Without DA the mean accuracy is indexed by (batch, lr) only
        if results_meanA[batch][lr] > best_accA:
            best_accA = results_meanA[batch][lr]
            best_lrA = lr
            best_batchA = batch
        # With DA the mean accuracy is additionally indexed by alpha
        for alpha in [0.03, 0.1]:
            if results_meanB[batch][lr][alpha] > best_accB:
                best_accB = results_meanB[batch][lr][alpha]
                best_lrB = lr
                best_batchB = batch
                best_alphaB = alpha

# The best alpha used to be stored only under the misspelled name `best_aphaB`,
# while the DANN setup cell reads `best_alphaB` (NameError). The old spelling is
# kept as an alias so any cell that still refers to it keeps working.
best_aphaB = best_alphaB
     

### **Test on art_painting after grid-search**

#### **Without domain adaptation**

**Preparing net**

In [0]:
# Hyperparameters for the final run WITHOUT domain adaptation.
# Batch size and learning rate come from the grid search above.
BATCH_SIZE = best_batchA  # Higher batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                          # the batch size, learning rate should change by the same factor to have comparable results

LR = best_lrA             # The initial Learning Rate
MOMENTUM = 0.9            # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 5e-5       # Regularization, you can keep this at the default

NUM_EPOCHS = 10           # Total number of training epochs (iterations over dataset)
STEP_SIZE = 10            # How many epochs before decreasing learning rate (if using a step-down policy)
GAMMA = 0.1               # Multiplicative factor for learning rate step-down

LOG_FREQUENCY = 5         # Print the loss every LOG_FREQUENCY batches

# drop_last=True keeps every training batch at exactly BATCH_SIZE samples
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Fresh pretrained network with both heads resized:
# 7 outputs for the label classifier, 2 outputs for the domain classifier.
net = alexnetdann(pretrained=True)
net.classifier[6] = nn.Linear(4096, 7)
net.dann_classifier[6] = nn.Linear(4096, 2)

# Move the model to the GPU before building the optimizer, as the torch.optim docs recommend
net = net.to('cuda')
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# The flag has to be assigned; merely reading `cudnn.benchmark` has no effect
cudnn.benchmark = True

current_step = 0

**Train and test**

In [0]:
# Train on the source domain only (no domain adaptation), then evaluate on the test set.

# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

  net.train() # Sets module in training mode (once per epoch is enough)

  # Iterate over the dataset
  for i, (images, labels) in enumerate(train_dataloader):
    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    optimizer.zero_grad() # Zero-ing the gradients

    # TRAIN ON SOURCE LABELS

    # Forward pass to the network
    outputs = net(images)
    # Compute loss based on output and ground truth
    loss_source = criterion(outputs, labels)
    loss_source.backward()

    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('1 Step {}, Loss {}'.format(current_step, loss_source.item()))

    optimizer.step() # update weights based on accumulated gradients

    current_step += 1

  # Step the scheduler
  scheduler.step()

net.train(False) # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation; skipping graph construction saves memory
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass
    outputs = net(images)

    # Get predictions (index of the highest score per sample)
    _, preds = torch.max(outputs.data, 1)

    # Update Corrects
    running_corrects += torch.sum(preds == labels.data).data.item()

# Calculate Accuracy over the whole test set
accuracy = running_corrects / float(len(test_dataset))

print('Test accuracy on art_painting without DANN using hyperparameters tuning: {}'.format(accuracy))

#### **With domain adaptation**

**Preparing net**

In [0]:
# Hyperparameters for the final run WITH domain adaptation.
# Batch size, learning rate and alpha come from the grid search above.
BATCH_SIZE = best_batchB  # Higher batch sizes allow for larger learning rates. An empirical heuristic suggests that, when changing
                          # the batch size, learning rate should change by the same factor to have comparable results

LR = best_lrB             # The initial Learning Rate
MOMENTUM = 0.9            # Hyperparameter for SGD, keep this at 0.9 when using SGD
WEIGHT_DECAY = 5e-5       # Regularization, you can keep this at the default

NUM_EPOCHS = 10           # Total number of training epochs (iterations over dataset)
STEP_SIZE = 10            # How many epochs before decreasing learning rate (if using a step-down policy)
GAMMA = 0.1               # Multiplicative factor for learning rate step-down
# Weight passed to the network as `alpha` on the domain-classifier passes.
# The selection cell stores the best alpha under the (misspelled) name
# `best_aphaB`; reading the undefined `best_alphaB` raised a NameError.
ALPHA = best_aphaB

LOG_FREQUENCY = 5         # Print the losses every LOG_FREQUENCY batches

# drop_last=True keeps every training batch at exactly BATCH_SIZE samples
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
# The target-domain images are drawn from test_dataset; their labels are ignored during training
train_dataloader_targetdomain = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=1, pin_memory=True, drop_last=True)
test_dataloader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=4)

# Fresh pretrained network with both heads resized:
# 7 outputs for the label classifier, 2 outputs for the domain classifier.
net = alexnetdann(pretrained=True)
net.classifier[6] = nn.Linear(4096, 7)
net.dann_classifier[6] = nn.Linear(4096, 2)

# Move the model to the GPU before building the optimizer, as the torch.optim docs recommend
net = net.to('cuda')
optimizer = optim.SGD(net.parameters(), lr=LR, momentum=MOMENTUM, weight_decay=WEIGHT_DECAY)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=STEP_SIZE, gamma=GAMMA)

# The flag has to be assigned; merely reading `cudnn.benchmark` has no effect
cudnn.benchmark = True

current_step = 0

**Train and test**

In [0]:
# Adaptive alpha schedule (described by the original author as the paper's implementation)
def adjust_alpha(i, epoch, min_len, nepochs):
    """Return the adaptive alpha for the current training step.

    The training progress is ``i + epoch * min_len`` divided by ``nepochs``
    and then by ``min_len``. It is squashed with
    ``2 / (1 + exp(-10 * progress)) - 1``, which is 0 at progress 0 and
    approaches 1 as the progress grows.

    Args:
        i: index of the current batch inside the epoch.
        epoch: index of the current epoch.
        min_len: number of batches counted per epoch.
        nepochs: total number of epochs.

    Returns:
        The alpha value as a float.
    """
    steps_done = i + epoch * min_len
    progress = float(steps_done) / nepochs / min_len
    steepness = -10
    return 2. / (1. + math.exp(steepness * progress)) - 1

# Train with domain adaptation (DANN), then evaluate on the test set.
# Every step accumulates three gradients before a single optimizer update:
#   1. label-classification loss on the source batch,
#   2. domain-classification loss on the source batch (domain label 0),
#   3. domain-classification loss on a target batch (domain label 1).

# By default, everything is loaded to cpu
net = net.to(DEVICE) # this will bring the network to GPU if DEVICE is cuda

# The flag has to be assigned; merely reading `cudnn.benchmark` has no effect
cudnn.benchmark = True
points_first = []
points_second = []
points_third = []

current_step = 0
# The target loader is consumed through its own iterator and restarted when exhausted
dataloader_iterator = iter(train_dataloader_targetdomain)
len_dataloader = min(len(train_dataloader), len(train_dataloader_targetdomain))

# Start iterating over the epochs
for epoch in range(NUM_EPOCHS):
  print('Starting epoch {}/{}, LR = {}'.format(epoch+1, NUM_EPOCHS, scheduler.get_lr()))

  net.train() # Sets module in training mode (once per epoch is enough)

  # Iterate over the dataset
  for i, (images, labels) in enumerate(train_dataloader):

    # Bring data over the device of choice
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    optimizer.zero_grad() # Zero-ing the gradients


    # TRAIN ON SOURCE LABELS

    # Forward pass to the network
    outputs = net(images)
    # Compute loss based on output and ground truth
    loss_source = criterion(outputs, labels)
    loss_source.backward()
    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('1 Step {}, Loss {}'.format(current_step, loss_source.item()))


    # TRAIN THE DISCRIMINATOR ON SOURCE LABELS

    # Forward pass to the network (passing alpha selects the domain-classifier pass)
    outputs = net(images, alpha=ALPHA)

    # Source samples get domain label 0
    labels_zeros = torch.zeros(images.shape[0], dtype=torch.long, device=DEVICE)
    # Compute loss based on output and zeros-label
    loss_source_dann = criterion_dann(outputs, labels_zeros)
    loss_source_dann.backward()

    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('2 Step {}, Loss {}'.format(current_step, loss_source_dann.item()))


    # TRAIN THE DISCRIMINATOR ON TARGET LABELS

    try:
      images2, _ = next(dataloader_iterator)
    except StopIteration:
      print("Target dataset reset")
      dataloader_iterator = iter(train_dataloader_targetdomain)
      images2, _ = next(dataloader_iterator)

    images2 = images2.to(DEVICE)
    # Target samples get domain label 1. The label tensor is sized from the
    # target batch itself (it used to be sized from the source batch).
    labels_ones = torch.ones(images2.shape[0], dtype=torch.long, device=DEVICE)
    # Forward pass to the network
    outputs = net(images2, alpha=ALPHA)

    # Compute loss based on output and ones-label
    loss_target = criterion_dann(outputs, labels_ones)
    loss_target.backward()

    # Log loss
    if i % LOG_FREQUENCY == 0:
      print('3 Step {}, Loss {}'.format(current_step, loss_target.item()))

    optimizer.step() # update weights based on accumulated gradients

    current_step += 1

  # Step the scheduler
  scheduler.step()

# Switch to evaluation mode once, after all epochs
# (this call used to sit inside the epoch loop by mistake)
net.train(False) # Set Network to evaluation mode

running_corrects = 0
# No gradients are needed for evaluation; skipping graph construction saves memory
with torch.no_grad():
  for images, labels in tqdm(test_dataloader):
    images = images.to(DEVICE)
    labels = labels.to(DEVICE)

    # Forward Pass
    outputs = net(images)

    # Get predictions (index of the highest score per sample)
    _, preds = torch.max(outputs.data, 1)

    # Update Corrects
    running_corrects += torch.sum(preds == labels.data).data.item()

# Calculate Accuracy over the whole test set
accuracy = running_corrects / float(len(test_dataset))

print('Test accuracy on art_painting with DANN using hyperparameters tuning: {}'.format(accuracy))