##Setup

In [1]:
import numpy as np
import librosa
import torch 
import torch.nn as nn
import torch.optim as optim
import json
import pandas as pd
from datetime import datetime
import pdb 
from google.colab import drive
import pandas as pd
from psutil import virtual_memory
import os
from zipfile import ZipFile
import torchvision
from collections import defaultdict
!pip install wandb --upgrade
import wandb
from sklearn import metrics
from torchvision import transforms

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
wandb.login()

[34m[1mwandb[0m: Currently logged in as: [33mnicholasmagal[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

## Model

In [3]:
#hyperparameters
config = {"learning rate": .001,
          "batch size": 64 *2,
          "epochs": 12,
          'patience': 3}

In [4]:
#Gpu support
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Device is on",device)

#model
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=False)
#Modifying Resnet to work on our audio dataset, changing the class output to that of our audio size
model.fc = nn.Linear(512, 50)
#model.fc = nn.Linear(2048, 50)
model.to(device)

#optimizer, loss, scheduler 
optimizer = torch.optim.Adam(model.parameters(), lr=config['learning rate'])
loss_function = torch.nn.BCEWithLogitsLoss()
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'max',
                                                       patience = config['patience'])



Device is on cuda:0


Using cache found in /root/.cache/torch/hub/pytorch_vision_v0.10.0


## Data

In [5]:
#Creating new directory for data
!mkdir data
%cd data
!mkdir mel_spectograms
%cd mel_spectograms

/content/data
/content/data/mel_spectograms


In [6]:
#unzipping
!unzip /content/drive/MyDrive/transfer_learning_audio_to_image/data/mel_spectograms.zip

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: og_dataset_mel_spectogram/b/solar_cycle-sunlight-03-traveling-30-59.npy  
  inflating: og_dataset_mel_spectogram/b/music_of_the_spheres-the_fantastics-01-pandolfi_la_berbabea_op_4_no_1_1660-291-320.npy  
  inflating: og_dataset_mel_spectogram/b/la_primavera-english_renaissance_music-14-wolseys_wilde_byrd-30-59.npy  
  inflating: og_dataset_mel_spectogram/b/elizabeth_wolff-moments_musicaux-12-moritz_moszkowski__4_moments_musicaux_op__84__maestoso-59-88.npy  
  inflating: og_dataset_mel_spectogram/b/falik-the_ballad_of_el_efe-11-the_ballad_of_el_efe-146-175.npy  
  inflating: og_dataset_mel_spectogram/b/jacob_heringman-jane_pickeringes_lute_book-10-my_lord_willoughbies_welcom_home_by_mr_byrde-0-29.npy  
  inflating: og_dataset_mel_spectogram/b/cargo_cult-alchemy-10-rain-30-59.npy  
  inflating: og_dataset_mel_spectogram/b/jacob_heringman-jane_pickeringes_lute_book-29-the_madlay-30-59.npy  
  inflating: og_datas

In [5]:
import pandas as pd
import torchaudio

class SongDataset(torch.utils.data.Dataset):
    def __init__(self, path_labels_df, transform = None):
        self.path_labels_df = path_labels_df
        self.transform = transform
    
    def __len__(self):
        return len(self.path_labels_df)
    
    def __getitem__(self, idx):
        #getting base track
        track_path = self.path_labels_df.iloc[idx]['mp3_path']
        loaded_track = torch.from_numpy(np.load(track_path, allow_pickle=True) )
        
        #adding transformations to create 3 channel tensor
        masking = torchaudio.transforms.FrequencyMasking(freq_mask_param=40)
        freq_masked = masking(loaded_track)
        masking = torchaudio.transforms.TimeMasking(time_mask_param=80)
        time_masked = torch.squeeze(masking(torch.unsqueeze(loaded_track, 0)))
        
        
        #Concatanating augmentations
        loaded_track = torch.stack([loaded_track, freq_masked, time_masked], dim = 0)
        loded_track = self.transform(loaded_track)
        #label
        label = self.path_labels_df.iloc[idx].drop('mp3_path').to_numpy(dtype = 'int')
        
        return loaded_track, torch.from_numpy(label).long()
        

In [6]:
#Preprocessing csv file for use in dataset

#file containing meta data
labels_path = "/content/annotations_final_proces.csv"
labels_df = pd.read_csv(labels_path, sep='\t')

#modifying melspectogram paths
mel_data_path = '/content/data/mel_spectograms/og_dataset_mel_spectogram/'
labels_df.mp3_path = labels_df.mp3_path.apply(lambda x: mel_data_path + x.split(".")[0]+'.npy')

#Splitting data off of https://github.com/jordipons/musicnn/issues/6
# splits = {"training": ['0','1','2','3','4','5','6','7','8','9','a','b'],
#           "validation": ['c'],'testing': ['d','e','f']}

splits = {"training": ['0','1','2','3','4','5','6','7','8','9','a','b'],
          "validation": ['c', 'd'],'testing': ['e','f']}

labels_df['folder'] = labels_df.mp3_path.apply(lambda x: x.split('/')[-2])
train_labels_df = labels_df[labels_df.folder.isin(splits['training'])].drop(columns=['folder'])
valid_labels_df = labels_df[labels_df.folder.isin(splits['validation'])].drop(columns=['folder'])
test_labels_df = labels_df[labels_df.folder.isin(splits['testing'])].drop(columns=['folder'])


In [10]:
print(len(train_labels_df))
print(len(valid_labels_df))
print(len(test_labels_df))

18706
3502
3652


In [7]:
#Creating Dataloaders
transform = transforms.Compose([
     transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

training_data = SongDataset(train_labels_df, transform)
validation_data = SongDataset(valid_labels_df,transform)
test_data = SongDataset(test_labels_df,transform)

training_data_loader = torch.utils.data.DataLoader(training_data,
                                                   batch_size = config['batch size'],
                                                   shuffle = True,
                                                   num_workers = 1,
                                                   pin_memory = True,
                                                   )

validation_data_loader = torch.utils.data.DataLoader(validation_data,
                                              batch_size = config['batch size'],
                                              shuffle = False,
                                              num_workers = 1,
                                              pin_memory = True,
                                              )

test_data_loader = torch.utils.data.DataLoader(test_data,
                                                   batch_size = config['batch size'],
                                                   shuffle = False,
                                                   num_workers = 1,
                                                   pin_memory = True)

## Training/Testing

In [8]:
#train/validate network
def train_validate_test_audio_network(model, train_loader, val_loader, test_loader, optimizer, loss_function, total_epochs, device, scheduler, config, model_save_location):
    
  wandb.init(project="resnet_audio_run_new_split", config=config)
    
  best_validation_roc = -float('inf')
  for epoch_num in range(total_epochs):
    starting_time = datetime.now()
    #summary stats
    train_loss_tracker = 0
    validation_loss_tracker = 0

    model.train()
        
    for x, y in train_loader:
      optimizer.zero_grad()
    
      x, y = x.to(device), y.to(device) 
      train_output = model(x)

      train_loss = loss_function(train_output, y.float())
      train_loss.backward()
      optimizer.step()
      
      #storing loss
      train_loss_tracker += train_loss.item()
    
    with torch.no_grad():
      model.eval()
      
      #storage for evaluation of performance
      ground_truth = []
      predictions = []

      for x, y in val_loader:
        x, y = x.to(device), y.to(device)
        
        #Calculating Loss
        val_output = model(x)
        val_loss = loss_function(val_output, y.float())

        validation_loss_tracker+= val_loss.item()

        #updating for ROC function 
        val_output = torch.nn.functional.sigmoid(val_output)
        ground_truth.append(y.cpu().detach().numpy())
        predictions.append(val_output.cpu().detach().numpy())

        
    #computing summary stats 
    avg_train_loss = train_loss_tracker/len(train_loader.dataset)
    avg_val_loss = (validation_loss_tracker/len(val_loader.dataset))

    ground_truth = np.concatenate((ground_truth), axis = 0)
    predictions = np.concatenate((predictions), axis = 0)
    roc_auc = metrics.roc_auc_score(ground_truth, predictions, average='macro')
    
    scheduler.step(avg_val_loss)
    
    epoch_runtime = datetime.now() - starting_time 
    print(f'Epoch: {epoch_num} Train Loss: {avg_train_loss:.4f} Val Loss: {avg_val_loss:.4f} ROC AUC {roc_auc:.4f}')
    print(f'Epoch Runtime {epoch_runtime}')
    wandb.log({"Training Average Loss":avg_train_loss, "Val Avg Loss":avg_val_loss, "ROC AUC": roc_auc})

    #if our loss is the new best metric save the model to load later
    if roc_auc > best_validation_roc:
      best_validation_roc = roc_auc
      torch.save(model.state_dict(), model_save_location)
      print("saving model")

  def test_audio_model(model, test_loader, device, config):
    with torch.no_grad():
      model.eval()

      #storage for evaluation of performance
      ground_truth = []
      predictions = []

      for x, y in test_loader:
        x, y = x.to(device), y.to(device)
        
        #Feedforward
        output = model(x)

        #updating for ROC function 
        output = torch.nn.functional.sigmoid(output)
        ground_truth.append(y.cpu().detach().numpy())
        predictions.append(output.cpu().detach().numpy())

      ground_truth = np.concatenate((ground_truth), axis = 0)
      predictions = np.concatenate((predictions), axis = 0)
      roc_auc = metrics.roc_auc_score(ground_truth, predictions, average='macro')
      return roc_auc

  model.load_state_dict(torch.load(model_save_location))
  roc_auc_test = test_audio_model(model, test_loader, device, config)
  wandb.log({"Test ROC AUC": roc_auc_test})
  wandb.finish()




In [9]:
model_save_location = '/content/drive/MyDrive/transfer_learning_audio_to_image/saves/resent18_audio_pretrained_normalized_bigger_split'
train_validate_test_audio_network(model,training_data_loader, validation_data_loader, test_data_loader, optimizer, loss_function, config['epochs'], device,scheduler, config, model_save_location)





Epoch: 0 Train Loss: 0.0015 Val Loss: 0.0014 ROC AUC 0.7629
Epoch Runtime 0:04:13.071683
saving model




Epoch: 1 Train Loss: 0.0013 Val Loss: 0.0017 ROC AUC 0.7523
Epoch Runtime 0:03:58.052611




Epoch: 2 Train Loss: 0.0012 Val Loss: 0.0022 ROC AUC 0.6834
Epoch Runtime 0:04:00.099990




Epoch: 3 Train Loss: 0.0011 Val Loss: 0.0029 ROC AUC 0.6584
Epoch Runtime 0:04:01.718136




Epoch: 4 Train Loss: 0.0011 Val Loss: 0.0019 ROC AUC 0.7776
Epoch Runtime 0:03:58.951838
saving model




Epoch: 5 Train Loss: 0.0011 Val Loss: 0.0014 ROC AUC 0.8239
Epoch Runtime 0:03:59.910839
saving model




Epoch: 6 Train Loss: 0.0010 Val Loss: 0.0016 ROC AUC 0.7951
Epoch Runtime 0:04:01.508676




Epoch: 7 Train Loss: 0.0010 Val Loss: 0.0037 ROC AUC 0.6630
Epoch Runtime 0:04:01.462781




Epoch: 8 Train Loss: 0.0010 Val Loss: 0.0016 ROC AUC 0.7945
Epoch Runtime 0:04:02.149730




Epoch: 9 Train Loss: 0.0010 Val Loss: 0.0018 ROC AUC 0.7770
Epoch Runtime 0:04:01.087026




0,1
ROC AUC,▅▅▂▁▆█▇▁▇▆
Test ROC AUC,▁
Training Average Loss,█▅▄▃▂▂▂▁▁▁
Val Avg Loss,▁▂▃▆▂▁▂█▂▂

0,1
ROC AUC,0.77704
Test ROC AUC,0.79895
Training Average Loss,0.00099
Val Avg Loss,0.00182
