**Installs**

In [0]:
!pip3 install 'torch==1.3.1'
!pip3 install 'torchvision==0.5.0'
!pip3 install 'Pillow-SIMD'
!pip3 install 'tqdm'

**Imports**

In [0]:
import os
import logging

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Subset, DataLoader
from torch.backends import cudnn

import torchvision
from torchvision import transforms
from torchvision.models import vgg19

from PIL import Image
from tqdm import tqdm

import numpy as np
import matplotlib.pyplot as plt
import copy

# Earlier setting kept for reference (presumably for the Caltech-101 data set
# mentioned in get_datasets -- TODO confirm).
#NUM_CLASSES = 102
# Output size of the replaced final classifier layer, nn.Linear(4096, NUM_CLASSES).
NUM_CLASSES = 5
# Device that models and batches are moved to; the notebook assumes a CUDA GPU.
DEVICE = 'cuda'
# Momentum passed to optim.SGD inside train_network.
MOMENTUM = 0.9

**Model definition**

In [0]:
def get_datasets(train_data_dir, test_data_dir, compose=[transforms.Resize(224),
                                                         transforms.CenterCrop(224),
                                                         transforms.ToTensor()
                                                         ]):
    train_transform = transforms.Compose(compose)
    eval_transform = transforms.Compose([
          #transforms.Resize(224),
          transforms.CenterCrop(224),
          transforms.ToTensor()
          ])

    '''
    if not os.path.isdir('./Homework2-Caltech101'):
        !git clone https://github.com/MachineLearning2020/Homework2-Caltech101.git

    '''
    if not os.path.isdir('./AIML_project'):
        !git clone https://github.com/anphetamina/AIML_project.git
    
    train_dataset = torchvision.datasets.ImageFolder(train_data_dir, transform=train_transform)
    test_dataset = torchvision.datasets.ImageFolder(test_data_dir, transform=eval_transform)

    return train_dataset, test_dataset

def test_network(net, test_dataset, batch_size):
    """Evaluate ``net`` on ``test_dataset``.

    The network is switched to evaluation mode and left in that mode; it is
    expected to already live on DEVICE.

    Args:
        net: the model to evaluate.
        test_dataset: dataset yielding (image, label) pairs.
        batch_size: number of samples per evaluation batch.

    Returns:
        (accuracy, mean_loss), both averaged over every sample of the dataset.
    """
    # Both metrics are sums over all samples, so the order is irrelevant and
    # shuffling is not needed.
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    net.eval()

    criterion = nn.CrossEntropyLoss()

    sum_test_losses = 0.0
    running_corrects = 0
    # No gradients are needed for evaluation; disabling autograd avoids
    # building the graph and keeps memory usage down.
    with torch.no_grad():
        for images, labels in test_dataloader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)

            outputs = net(images)
            _, preds = torch.max(outputs, 1)

            # The criterion returns the batch mean; weight it by the batch
            # size so that a smaller last batch is accounted for correctly.
            batch_loss = criterion(outputs, labels)
            sum_test_losses += batch_loss.item() * images.size(0)
            running_corrects += torch.sum(preds == labels).item()

    accuracy = running_corrects / float(len(test_dataset))
    test_loss = sum_test_losses / float(len(test_dataset))

    return accuracy, test_loss

def train_network(net, parameters_to_optimize, learning_rate, num_epochs, batch_size, weight_decay, step_size, gamma, train_dataset, val_dataset=None, verbosity=False, plot=False):
    """Train ``net`` with SGD + StepLR and keep the best snapshot seen on validation.

    Args:
        net: model to train; it is moved to DEVICE.
        parameters_to_optimize: parameters handed to the SGD optimizer.
        learning_rate: initial learning rate.
        num_epochs: maximum number of epochs.
        batch_size: batch size for training and validation.
        weight_decay: weight decay passed to SGD.
        step_size: StepLR period, in epochs.
        gamma: StepLR multiplicative decay factor.
        train_dataset: dataset yielding (image, label) pairs.
        val_dataset: optional validation dataset, evaluated after every epoch.
        verbosity: when True, print the metrics of every epoch.
        plot: when True, plot the collected curves after a complete run.

    Returns:
        (best_net, best_val_accuracy, best_val_loss). ``best_net`` holds the
        weights of the epoch with the highest validation accuracy. Without a
        validation set it holds the weights at the end of training, and the
        two metrics stay at 0.0.

    Training stops early (without plotting) when the training accuracy is
    below 0.25 after more than 10% of the epochs, or below 0.35 after more
    than 50% of them.
    """
    train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4, drop_last=False)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.SGD(parameters_to_optimize, lr=learning_rate, momentum=MOMENTUM, weight_decay=weight_decay)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    net = net.to(DEVICE)
    # Snapshot holder: a deep copy has the same architecture and device as
    # ``net`` whatever the model is, instead of assuming a VGG19 head.
    best_net = copy.deepcopy(net)

    # Let cuDNN pick its fastest kernels (a bare attribute access does nothing).
    cudnn.benchmark = True

    train_accuracies = []
    train_losses = []
    val_accuracies = []
    val_losses = []

    best_val_accuracy = 0.0
    best_val_loss = 0.0
    underfit = False
    for epoch in range(num_epochs):

        train_running_corrects = 0
        sum_train_losses = 0.0

        # test_network leaves the model in eval mode, so switch back each epoch.
        net.train()
        for images, labels in train_dataloader:
            images = images.to(DEVICE)
            labels = labels.to(DEVICE)
            optimizer.zero_grad()

            outputs = net(images)
            _, preds = torch.max(outputs.data, 1)
            train_running_corrects += torch.sum(preds == labels.data).item()
            loss = criterion(outputs, labels)
            sum_train_losses += loss.item() * images.size(0)
            loss.backward()

            optimizer.step()

        # Per-sample averages over the whole training set for this epoch.
        train_accuracy = train_running_corrects / float(len(train_dataset))
        train_accuracies.append(train_accuracy)
        train_loss = sum_train_losses / float(len(train_dataset))
        # Store the epoch mean as a float, not the last batch's loss tensor.
        train_losses.append(train_loss)

        if val_dataset is not None:
            val_accuracy, val_loss = test_network(net, val_dataset, batch_size)
            if val_accuracy > best_val_accuracy:
                best_val_accuracy = val_accuracy
                best_val_loss = val_loss
                best_net.load_state_dict(net.state_dict())
            val_accuracies.append(val_accuracy)
            val_losses.append(val_loss)

        if verbosity:
            if val_dataset is not None:
                print("train_acc: {}, val_acc: {}, train_loss: {}, val_loss: {} ({} / {})".format(train_accuracy, val_accuracy, train_loss, val_loss, epoch+1, num_epochs))
            else:
                print("train_acc: {}, train_loss: {} ({} / {})".format(train_accuracy, train_loss, epoch+1, num_epochs))

        # Give up on runs that are clearly not learning.
        if (train_accuracy < 0.25 and epoch > num_epochs*0.1) or (train_accuracy < 0.35 and epoch > num_epochs*0.5):
            print("underfit -> train_accuracy = {}".format(train_accuracy))
            underfit = True
            break

        scheduler.step()

    if val_dataset is None:
        # No validation signal to select on: return the weights as trained.
        best_net.load_state_dict(net.state_dict())

    if plot and not underfit:

        fig, ax = plt.subplots()
        ax.plot(train_losses, label='Loss on training set')
        ax.plot(train_accuracies, label='Accuracy on training set')
        ax.legend()
        plt.xlabel("Epochs")
        plt.show()

        if val_dataset is not None:
            fig, ax = plt.subplots()
            ax.plot(val_accuracies, label='Accuracy on validation set', color='C2')
            ax.plot(train_accuracies, label='Accuracy on training set', color='C3')
            ax.legend()
            plt.xlabel("Epochs")
            plt.show()

            fig, ax = plt.subplots()
            ax.plot(val_losses, label='Loss on validation set', color='C1')
            ax.plot(train_losses, label='Loss on training set', color='C7')
            ax.legend()
            plt.xlabel("Epochs")
            plt.show()

    return best_net, best_val_accuracy, best_val_loss


**Train + validation**

In [0]:
# ({'lr': 0.0005, 'batch_size': 8, 'weight_decay': 5e-05, 'gamma': 0.1}), val accuracy 0.7881773399014779, val loss 1.4585814757887365
# lr 0.0017290126152359597, batch 11, decay 1.0506062245241487e-06, gamma 0.10515207194838522, val accuracy 0.6108374384236454, val loss 1.1022712842290625 [3 / 50]
# Hyper-parameters of this single training run.
BATCH_SIZE = 11
LR = 0.0017290126152359597
MOMENTUM = 0.9
WEIGHT_DECAY = 1.0506062245241487e-06
NUM_EPOCHS = 100
STEP_SIZE = 60
GAMMA = 0.10515207194838522

TRAIN_DATA_DIR = 'AIML_project/ravdess-emotional-song-spec-224'
#TRAIN_DATA_DIR = 'Homework2-Caltech101/101_ObjectCategories'

# Training-time augmentation; no resize step is applied here.
compose = [
    transforms.CenterCrop(224),
    transforms.RandomGrayscale(),
    transforms.ColorJitter(brightness=0.5, contrast=0.5),
    transforms.ToTensor(),
]

# Both datasets read the same folder; they differ only in their transforms.
train_dataset, val_dataset = get_datasets(TRAIN_DATA_DIR, TRAIN_DATA_DIR, compose)

# Interleaved 50/50 split: odd indices train, even indices validate.
num_images = len(train_dataset)
train_indexes = list(range(1, num_images, 2))
val_indexes = list(range(0, num_images, 2))
val_dataset = Subset(val_dataset, val_indexes)
train_dataset = Subset(train_dataset, train_indexes)
print('training set {}'.format(len(train_dataset)))
print('validation set {}'.format(len(val_dataset)))

# VGG19 with its last classifier layer resized to NUM_CLASSES outputs.
net = vgg19()
net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
best_net, val_accuracy, val_loss = train_network(
    net,
    net.parameters(),
    learning_rate=LR,
    num_epochs=NUM_EPOCHS,
    batch_size=BATCH_SIZE,
    weight_decay=WEIGHT_DECAY,
    step_size=STEP_SIZE,
    gamma=GAMMA,
    train_dataset=train_dataset,
    val_dataset=val_dataset,
    verbosity=True,
    plot=True,
)

print('val accuracy {}'.format(val_accuracy))
print('val loss {}'.format(val_loss))

training set 414
validation set 414
train_acc: 0.2318840579710145, val_acc: 0.2222222222222222, train_loss: 1.5968541238043044, val_loss: 1.5734839088099015 (1 / 100)
train_acc: 0.2246376811594203, val_acc: 0.2222222222222222, train_loss: 1.5851994980360575, val_loss: 1.5602227325600702 (2 / 100)
train_acc: 0.22946859903381642, val_acc: 0.24879227053140096, train_loss: 1.5821979924100609, val_loss: 1.564067614539234 (3 / 100)
train_acc: 0.2463768115942029, val_acc: 0.2391304347826087, train_loss: 1.5706412527296278, val_loss: 1.5450032229584771 (4 / 100)
train_acc: 0.26811594202898553, val_acc: 0.40096618357487923, train_loss: 1.5593176086743672, val_loss: 1.4996689350708672 (5 / 100)
train_acc: 0.30676328502415456, val_acc: 0.2463768115942029, train_loss: 1.5293991703918015, val_loss: 1.5016534314063437 (6 / 100)
train_acc: 0.28502415458937197, val_acc: 0.3888888888888889, train_loss: 1.5305622562694088, val_loss: 1.4488196583185795 (7 / 100)
train_acc: 0.3502415458937198, val_acc: 0.

**Testing**

In [0]:
from torchvision import datasets
# Maps each label name found in the per-song test labels file to the class
# index the network uses for that label.
LABELS_FROM_FILE = {'angry' : 0, 'calming' : 1, 'happy' : 2, 'normal' : 3, 'sad' : 4}
# Length of the per-song vote histogram built in test_network_with_songs_data_return.
NUM_CLASSES = 5
class ImageFolderWithPaths(datasets.ImageFolder):
    """ImageFolder variant whose samples also carry their source file path.

    Every item is the tuple ImageFolder would normally return, with the path
    of the image file appended as the last element.
    """

    def __getitem__(self, index):
        """Return the regular ImageFolder sample for ``index`` plus its file path."""
        sample = super().__getitem__(index)
        file_path = self.imgs[index][0]
        return sample + (file_path,)

def test_network_with_songs_data_return(net, test_dataset, batch_size):
    """Classify every image and collect the predictions per song.

    The song identifier is the part of the image file name that precedes the
    first underscore.

    Args:
        net: trained model; it is moved to DEVICE and put in evaluation mode.
        test_dataset: dataset yielding (image, label, path) triples, e.g. an
            ImageFolderWithPaths instance. The labels are not used.
        batch_size: number of images per batch.

    Returns:
        dict mapping song identifier -> {"preds": vote_counts}, where
        ``vote_counts`` is an integer numpy array of length NUM_CLASSES
        counting how many images of the song were assigned to each class.
    """
    test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
    net = net.to(DEVICE)
    net.eval()

    test_songs_data = dict()
    # Inference only: without autograd no graph is kept alive between batches,
    # so per-batch cache flushing and manual `del`s are not needed.
    with torch.no_grad():
        for images, _labels, paths in test_dataloader:
            outputs = net(images.to(DEVICE))
            _, preds = torch.max(outputs, 1)

            # tolist() yields plain Python ints, safe to use as numpy indices.
            for path, pred in zip(paths, preds.tolist()):
                image_name = os.path.basename(path)
                song_idx = image_name.split("_")[0]

                if song_idx not in test_songs_data:
                    test_songs_data[song_idx] = {"preds": np.zeros(NUM_CLASSES, dtype=int)}

                test_songs_data[song_idx]["preds"][pred] += 1

    return test_songs_data

def read_songs_labels(path):
  """Read the per-song labels file and return {song name: [class indices]}.

  Each non-blank line is split on ":" into a song name and a comma-separated
  list of label names; spaces inside the label list are removed. Label names
  are translated to class indices through LABELS_FROM_FILE. A song that
  appears on several lines accumulates the labels of all of them.

  Raises:
    KeyError: when a label name is not a key of LABELS_FROM_FILE.
    IndexError: when a non-blank line contains no ":".
  """
  song_labels = dict()

  # The context manager closes the file even if parsing fails.
  with open(path, "r") as f:
    for line in f:
      if not line.strip():
        continue  # tolerate blank lines, e.g. at the end of the file

      names = line.split(":")
      labels = names[1].replace(" ", "").split(",")
      # Drops the final two characters of the last label -- presumably a
      # trailing delimiter plus the newline; TODO confirm against the file.
      labels[-1] = labels[-1][:-2]

      song_labels.setdefault(names[0], []).extend(LABELS_FROM_FILE[label] for label in labels)

  return song_labels

def major_voting_analyze(songs_data, songs_labels):
  """Print a majority-vote prediction per song and the resulting accuracy.

  For every song the predicted class is the index with the most votes in
  ``songs_data[song]["preds"]`` (the lowest index wins a tie). A prediction
  counts as correct when it is one of the song's labels.

  Songs are matched to labels by position: the i-th song identifier (sorted
  numerically) is compared with the i-th song name of ``songs_labels``
  (sorted alphabetically).
  NOTE(review): this assumes both orderings describe the same songs -- confirm.

  Args:
    songs_data: {song identifier: {"preds": per-class vote counts}}; the
      identifiers must be convertible with int().
    songs_labels: {song name: collection of accepted class indices}.

  Prints the sorted keys, one line per song and the test accuracy; returns
  nothing. With no songs at all the accuracy is reported as 0.0.
  """
  ordered_keys = sorted(songs_labels.keys())
  print(ordered_keys)

  prediction = dict()
  for key, data in songs_data.items():
    votes = list(data["preds"])
    # Index of the first maximum; an empty histogram falls back to class 0.
    prediction[key] = max(range(len(votes)), key=votes.__getitem__) if votes else 0

  keys = sorted(prediction.keys(), key=int)
  print(list(keys))

  corrects = 0
  for idx, song_idx in enumerate(keys):
    label_key = ordered_keys[idx]
    if prediction[song_idx] in songs_labels[label_key]:
      corrects += 1
    print("Prediction for song {} - {}: {}; labels: {}".format(song_idx, label_key, prediction[song_idx], list(songs_labels[label_key])))

  # Guard against an empty prediction set instead of dividing by zero.
  test_accuracy = corrects / len(keys) if keys else 0.0
  print("Test accuracy: {}".format(test_accuracy))

def get_test_dataset(test_data_dir):
    eval_transform = transforms.Compose([
          #transforms.Resize(224),
          transforms.CenterCrop(224),
          transforms.ToTensor()
          ])
    
    if not os.path.isdir('./AIML_project'):
        !git clone https://github.com/anphetamina/AIML_project.git
    
    test_dataset = ImageFolderWithPaths(test_data_dir, transform=eval_transform)

    return test_dataset

# Release cached GPU memory left over from the previous cells before testing.
torch.cuda.empty_cache()

# Folder with the test spectrogram slices, one image per slice.
TEST_DATA_DIR = 'AIML_project/CAL500_test_sliced_spectrograms'
test_dataset = get_test_dataset(TEST_DATA_DIR)
print('test set {}'.format(len(test_dataset)))

# net extracted by training
# NOTE: `best_net` and `BATCH_SIZE` are not defined in this cell; they must
# come from a previously executed training / search cell.
songs_data = test_network_with_songs_data_return(best_net, test_dataset, BATCH_SIZE)

# Per-song vote histograms.
print(songs_data)

# Ground-truth class indices for every song.
songs_labels = read_songs_labels("AIML_project/songs_filtered_with_labels.txt")

print(songs_labels)

# Majority vote per song, printed together with the overall accuracy.
major_voting_analyze(songs_data, songs_labels)

**Random search**

In [4]:
import random

# Random search over learning rate, batch size, weight decay and LR decay
# factor. Every trial trains a fresh VGG19; the trial with the highest
# validation accuracy is kept in best_net / best_set.
best_accuracy = 0.0
best_loss = 0.0
val_accuracies = []
val_losses = []

TRAIN_DATA_DIR = 'AIML_project/ravdess-emotional-song-spec-672'
#TRAIN_DATA_DIR = 'Homework2-Caltech101/101_ObjectCategories'
compose = [transforms.Resize(224),
           transforms.CenterCrop(224),
           transforms.RandomGrayscale(),
           transforms.ColorJitter(brightness=0.5, contrast=0.5),
           transforms.ToTensor()
           ]
# NOTE(review): training images are resized to 224 before cropping, while the
# default evaluation transform of get_datasets only center-crops. If the images
# in this folder are larger than 224 px, validation sees a different view of
# the data than training does -- confirm this is intended.
train_dataset, val_dataset = get_datasets(TRAIN_DATA_DIR, TRAIN_DATA_DIR, compose)

# 90/10 split: every tenth image goes to validation.
train_indexes = [idx for idx in range(len(train_dataset)) if idx % 10]
val_indexes = [idx for idx in range(len(train_dataset)) if not idx % 10]
val_dataset = Subset(val_dataset, val_indexes)
train_dataset = Subset(train_dataset, train_indexes)
print('training set {}'.format(len(train_dataset)))
print('validation set {}'.format(len(val_dataset)))

# Placeholder, overwritten by the first trial that beats best_accuracy.
best_net = vgg19()
best_net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
best_net = best_net.to(DEVICE)
best_set = {}

# Settings shared by every trial.
MOMENTUM = 0.9
NUM_EPOCHS = 100
STEP_SIZE = 60
N = 50
for i in range(N):
  # int() truncates, so the batch size is drawn from 8..15.
  BATCH_SIZE = int(random.uniform(8, 16))
  LR = random.uniform(0.0008, 0.003)
  # Log-uniform sampling for the quantities that span orders of magnitude.
  WEIGHT_DECAY = 10**random.uniform(-5, -3)
  GAMMA = 10**random.uniform(-2, -1)
  # Named `hyperparams` rather than `set` so the builtin is not shadowed for
  # the rest of the notebook.
  hyperparams = {"lr": LR, "batch_size": BATCH_SIZE, "weight_decay": WEIGHT_DECAY, "gamma": GAMMA}
  print("-------------------------------------")
  print(hyperparams)
  net = vgg19()
  net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
  current_net, val_accuracy, val_loss = train_network(net, net.parameters(), LR, NUM_EPOCHS, BATCH_SIZE, WEIGHT_DECAY, STEP_SIZE, GAMMA, train_dataset, val_dataset=val_dataset, verbosity=True)

  val_accuracies.append(val_accuracy)
  val_losses.append(val_loss)

  if val_accuracy > best_accuracy:
    best_accuracy = val_accuracy
    best_loss = val_loss
    best_net = copy.deepcopy(current_net)
    best_set = copy.deepcopy(hyperparams)

  print("lr {}, batch {}, decay {}, gamma {}, val accuracy {}, val loss {} [{} / {}]".format(LR, BATCH_SIZE, WEIGHT_DECAY, GAMMA, val_accuracy, val_loss, i+1, N))

print("--------------------------------------------")
print("\n{}, best val accuracy {}, best val loss {}".format(best_set, best_accuracy, best_loss))
print("val accuracies\n{}".format(val_accuracies))
print("val losses\n{}".format(val_losses))

training set 910
validation set 102
-------------------------------------
{'lr': 0.00271621936500431, 'batch_size': 13, 'weight_decay': 0.00018308707592117277, 'gamma': 0.06393470426207985}
train_acc: 0.17582417582417584, val_acc: 0.17647058823529413, train_loss: 1.776629032407488, val_loss: 1.7655110300755967 (1 / 100)
train_acc: 0.2, val_acc: 0.27450980392156865, train_loss: 1.7512230464390346, val_loss: 1.6718381549797805 (2 / 100)
train_acc: 0.1835164835164835, val_acc: 0.22549019607843138, train_loss: 1.759641420841217, val_loss: 1.756030163344215 (3 / 100)
train_acc: 0.23076923076923078, val_acc: 0.3235294117647059, train_loss: 1.7647455028125218, val_loss: 1.7353674956396514 (4 / 100)
train_acc: 0.23736263736263735, val_acc: 0.20588235294117646, train_loss: 1.7537101064409528, val_loss: 1.7741931232751584 (5 / 100)
train_acc: 0.26483516483516484, val_acc: 0.22549019607843138, train_loss: 1.7133156946727208, val_loss: 1.6680528311168445 (6 / 100)
train_acc: 0.3142857142857143, va

KeyboardInterrupt: ignored

**Grid search**

In [0]:
# Exhaustive grid search over learning rate, batch size, weight decay and LR
# decay factor. Every combination trains a fresh VGG19; the one with the
# highest validation accuracy is kept in best_net / best_set.

# NOTE(review): 6 differs from the value 5 used by the other cells -- confirm
# it matches the number of class folders in TRAIN_DATA_DIR.
NUM_CLASSES = 6
DEVICE = 'cuda'
#BATCH_SIZE = 16
#LR = 0.001
MOMENTUM = 0.9
#WEIGHT_DECAY = 5e-5
NUM_EPOCHS = 100
STEP_SIZE = 60
#GAMMA = 0.1

lr_range = [0.005, 0.001, 0.0005]
batch_size_range = [16, 8]
weight_decay_range = [5e-5, 5e-3]
gamma_range = [0.1, 0.01]

# Cartesian product of the four ranges; gamma varies fastest, lr slowest.
hyperparameters_sets = [
    {'lr': lr, 'batch_size': batch_size, 'weight_decay': weight_decay, 'gamma': gamma}
    for lr in lr_range
    for batch_size in batch_size_range
    for weight_decay in weight_decay_range
    for gamma in gamma_range
]

# Named `hyperparams` rather than `set` so the builtin is not shadowed for the
# rest of the notebook.
for hyperparams in hyperparameters_sets:
  print(hyperparams)


TRAIN_DATA_DIR = 'AIML_project/ravdess-emotional-song-spec-224'
compose=[#transforms.Resize(224),
         transforms.CenterCrop(224),
         transforms.RandomGrayscale(),
         transforms.ColorJitter(brightness=0.5, contrast=0.5),
         transforms.ToTensor()
         ]
train_dataset, val_dataset = get_datasets(TRAIN_DATA_DIR, TRAIN_DATA_DIR, compose)

# 80/20 split: every fifth image goes to validation.
train_indexes = [idx for idx in range(len(train_dataset)) if idx % 5]
val_indexes = [idx for idx in range(len(train_dataset)) if not idx % 5]
val_dataset = Subset(val_dataset, val_indexes)
train_dataset = Subset(train_dataset, train_indexes)
print('training set {}'.format(len(train_dataset)))
print('validation set {}'.format(len(val_dataset)))

# Placeholder, overwritten by the first run that beats best_accuracy. The head
# is replaced before the move so that the whole model ends up on DEVICE.
best_net = vgg19()
best_net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
best_net = best_net.to(DEVICE)
best_set = {}
best_accuracy = 0.0
best_loss = 0.0
val_accuracies = []
val_losses = []

for hyperparams in hyperparameters_sets:

  net = vgg19()
  net.classifier[6] = nn.Linear(4096, NUM_CLASSES)
  current_net, val_accuracy, val_loss = train_network(net, net.parameters(), hyperparams['lr'], NUM_EPOCHS, hyperparams['batch_size'], hyperparams['weight_decay'], STEP_SIZE, hyperparams['gamma'], train_dataset, val_dataset=val_dataset)
  val_accuracies.append(val_accuracy)
  val_losses.append(val_loss)

  if val_accuracy > best_accuracy:
    best_accuracy = val_accuracy
    best_loss = val_loss
    best_net = copy.deepcopy(current_net)
    best_set = copy.deepcopy(hyperparams)

  print("({}), val accuracy {}, val loss {}".format(hyperparams, val_accuracy, val_loss))

print("\n\n({}), best val accuracy {}, best val loss {}\n".format(best_set, best_accuracy, best_loss))
print("\nval_accuracies")
print(val_accuracies)

{'lr': 0.005, 'batch_size': 16, 'weight_decay': 5e-05, 'gamma': 0.1}
{'lr': 0.005, 'batch_size': 16, 'weight_decay': 5e-05, 'gamma': 0.01}
{'lr': 0.005, 'batch_size': 16, 'weight_decay': 0.005, 'gamma': 0.1}
{'lr': 0.005, 'batch_size': 16, 'weight_decay': 0.005, 'gamma': 0.01}
{'lr': 0.005, 'batch_size': 8, 'weight_decay': 5e-05, 'gamma': 0.1}
{'lr': 0.005, 'batch_size': 8, 'weight_decay': 5e-05, 'gamma': 0.01}
{'lr': 0.005, 'batch_size': 8, 'weight_decay': 0.005, 'gamma': 0.1}
{'lr': 0.005, 'batch_size': 8, 'weight_decay': 0.005, 'gamma': 0.01}
{'lr': 0.001, 'batch_size': 16, 'weight_decay': 5e-05, 'gamma': 0.1}
{'lr': 0.001, 'batch_size': 16, 'weight_decay': 5e-05, 'gamma': 0.01}
{'lr': 0.001, 'batch_size': 16, 'weight_decay': 0.005, 'gamma': 0.1}
{'lr': 0.001, 'batch_size': 16, 'weight_decay': 0.005, 'gamma': 0.01}
{'lr': 0.001, 'batch_size': 8, 'weight_decay': 5e-05, 'gamma': 0.1}
{'lr': 0.001, 'batch_size': 8, 'weight_decay': 5e-05, 'gamma': 0.01}
{'lr': 0.001, 'batch_size': 8, 'w

**Testing**

In [0]:
# todo

**Mean / std computation**

In [0]:
# Per-channel mean and sample standard deviation over every pixel of the
# data set, on the raw pixel scale (no ToTensor is applied because the
# transform list passed to get_datasets is empty).
TRAIN_DATA_DIR = 'AIML_project/ravdess-emotional-song-mel'
pixel_mean = np.zeros(3)
pixel_m2 = np.zeros(3)   # running sum of squared deviations from the mean
num_pixels = 0
dataset, _ = get_datasets(TRAIN_DATA_DIR, TRAIN_DATA_DIR, [])
for image, _ in tqdm(dataset, "Computing mean/std", len(dataset), unit="samples"):
    image = np.array(image)
    pixels = image.reshape((-1, image.shape[2])).astype(np.float64)

    # Statistics of this image alone, computed with vectorised numpy calls
    # instead of a Python loop over individual pixels.
    image_count = pixels.shape[0]
    image_mean = pixels.mean(axis=0)
    image_m2 = ((pixels - image_mean) ** 2).sum(axis=0)

    # Merge them into the running statistics (pairwise update of mean and
    # sum of squared deviations); mathematically equal to updating pixel by
    # pixel, up to floating-point rounding.
    total = num_pixels + image_count
    delta = image_mean - pixel_mean
    pixel_mean = pixel_mean + delta * image_count / total
    pixel_m2 = pixel_m2 + image_m2 + delta ** 2 * num_pixels * image_count / total
    num_pixels = total

# Sample standard deviation (n - 1 in the denominator).
pixel_std = np.sqrt(pixel_m2 / (num_pixels - 1))
print(pixel_mean)
print(pixel_std)