In [1]:
import sys
sys.path.append('..')
from utils import compute_melspectrogram, load_parameters, AssembleModel

import torch
import numpy as np
import os
import glob
import numpy as np
# Number GPUs by PCI bus id and restrict the CUDA devices visible to this
# process to index 3.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "3",
})

from tqdm.notebook import tqdm

In [2]:
# Model hyper-parameters; this dict is what gets passed to AssembleModel.
main_dict = {
    "frontend_dict": {
        # Each list below has six entries (presumably one per frontend layer).
        "list_out_channels": [128, 128, 256, 256, 256, 256],
        "list_kernel_sizes": [(3, 3)] * 6,
        "list_pool_sizes": [(3, 2), (2, 2), (2, 2), (2, 1), (2, 1), (2, 1)],
        # Only the last of the six flags is set.
        "list_avgpool_flags": [False] * 5 + [True],
    },
    "backend_dict": {
        "n_class": 50,
        "recurrent_units": None,
        "bidirectional": False,
    },
}

In [None]:
def first_10_tags(filename, model, tagList, device):
    """Print the ten highest-scoring tags for an audio file, with their scores.

    The spectrogram returned by ``compute_melspectrogram(filename)`` is cut
    into consecutive, non-overlapping chunks of ``input_length`` frames (a
    trailing partial chunk is dropped). The chunks are passed to ``model`` as
    one batch and the per-chunk outputs are averaged into one score per tag.

    Args:
        filename: audio file path, forwarded to ``compute_melspectrogram``.
        model: callable taking a float tensor of shape
            ``(n_chunks, 1, n_bins, input_length)`` and returning one row of
            tag scores per chunk.
        tagList: sequence of tag names, indexed like the model's output columns.
        device: torch device (or device string) the input batch is moved to.

    Returns:
        None. Two lists are printed: the tag names, then the matching scores,
        both ordered from highest to lowest score.

    Raises:
        ValueError: if the spectrogram is shorter than one chunk, in which
            case there is nothing to average.
    """
    input_length = 5*16000//256  # 5 seconds of audio at 16000 Hz // 256 hop length

    whole_spec = np.asarray(compute_melspectrogram(filename))

    n_chunks = whole_spec.shape[1] // input_length
    if n_chunks == 0:
        # An empty batch would make the mean below NaN for every tag.
        raise ValueError(
            f"{filename!r} yields {whole_spec.shape[1]} spectrogram frames, "
            f"fewer than the {input_length} needed for one chunk"
        )

    # Drop the trailing partial chunk, then split the time axis into equal
    # pieces and stack them: (n_bins, n_frames) -> (n_chunks, n_bins, input_length).
    usable = whole_spec[:, :n_chunks * input_length]
    chunks = np.stack(np.split(usable, n_chunks, axis=1))

    # Add the singleton channel axis expected by the model.
    batch = torch.as_tensor(chunks[:, np.newaxis], dtype=torch.float32).to(device)

    # Pure inference: no autograd graph is needed.
    with torch.no_grad():
        out = model(batch)

    y_pred = out.detach().cpu().numpy().mean(axis=0)

    # Indices of the ten best tags, best first; reused for names and scores.
    ranking = np.argsort(y_pred)[::-1][:10]
    print(np.array(tagList)[ranking].tolist())
    print(y_pred[ranking].tolist())

In [None]:
# Read the tag vocabulary: one tag per line, with the newline characters removed.
with open('MSD_50tagList.txt') as tag_file:
    tagList = [raw_line.replace('\n', '') for raw_line in tag_file]

In [3]:
# Assemble the model from main_dict and pass it to load_parameters together
# with the checkpoint file name (presumably this restores the trained weights).
model = load_parameters(
    AssembleModel(main_dict),
    "MSD_transformer_autotagger.pth",
)

#### Setup

Set the model to eval mode and move it to the desired device.

In [4]:
# Use the GPU when one is usable, otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA. Set `device` by hand to override.
device = "cuda" if torch.cuda.is_available() else "cpu"
# eval() puts the model in inference mode; eval() and to() both return the
# module, so the two calls can be chained.
model = model.eval().to(device)

Test

In [None]:
# Smoke test: print the top tags for a single annotated commercial.
first_10_tags(
    filename="/homes/lm004/commercials/annotated_commercials/_ob7euNGFxw_trimmed.mp3",
    model=model,
    tagList=tagList,
    device=device,
)

#### Get embeddings

In [5]:
def get_features(name):
    """Build a forward hook that records a module's output under ``name``.

    The returned callable takes ``(module, input, output)`` positionally, as
    used with ``register_forward_hook``. Every time it fires it stores
    ``output.detach()`` in the module-level ``features`` dict under the key
    ``name``; that dict must already exist when the hook runs.

    Args:
        name: key under which the captured output is stored.

    Returns:
        The hook function. It returns ``None``.
    """
    def _record_output(_module, _inputs, output):
        detached = output.detach()
        features[name] = detached  # type: ignore  (global dict, defined outside this function)

    return _record_output

##### REGISTER HOOKS

In [6]:
# Capture intermediate activations on every forward pass: the hooks built by
# get_features store the output of the frontend and of the backend's dense1
# layer in the global `features` dict under the keys 'frontend' and 'backend'.
# NOTE(review): `model.module` suggests the model is wrapped (e.g. DataParallel) — confirm.
# The returned handles are not kept, so the hooks stay registered; re-running
# this cell registers them again.
model.module.frontend.register_forward_hook(get_features('frontend')) # type: ignore
model.module.backend.dense1.register_forward_hook(get_features('backend')) # type: ignore

<torch.utils.hooks.RemovableHandle at 0x7f72fea264d0>

    # Disabled variant of the extraction loop that works on the mp3 files themselves:
    # it runs the forward pass for every file, but both np.save calls below are commented out.
    for audio_fn in glob.glob("/homes/lm004/commercials/annotated_commercials/*.mp3"):
        # Reset the dict the forward hooks (see get_features) write into.
        features = {}
        
        input_length = 5*16000//256 # 5 seconds of audio at 16000 Hz // 256 hop length
        whole_spec = compute_melspectrogram(audio_fn)

        # Number of full chunks; a trailing partial chunk is ignored.
        n_chunks = whole_spec.shape[1] // input_length
        spec = np.zeros((n_chunks,whole_spec.shape[0],input_length)) # stack of chunks
        for i in range(n_chunks):
            spec[i]=whole_spec[:,i*input_length:(i+1)*input_length]

        # Insert a singleton axis after the chunk axis and move the batch to `device`.
        spec = torch.Tensor(spec[:,np.newaxis,:,:]).to(device)
        
        # The return value is discarded; the pass is run for the hooks' side effect.
        _ = model(spec)

        # File name without its directory and without the '_trimmed.mp3' suffix.
        stimulus_id = audio_fn.split('/')[-1].replace('_trimmed.mp3','')

        # NB: not saving for safety, uncomment to save

        # np.save(open(f"embeddings_msd/{stimulus_id}_frontend.npy", 'wb'), features['frontend'].cpu().numpy())
        # np.save(open(f"embeddings_msd/{stimulus_id}_backend.npy", 'wb'), features['backend'].cpu().numpy())

##### Now for the separated accompaniment (no voices)

In [7]:
# Extract and SAVE embeddings for the separated accompaniment of every stimulus.
# For each "<dir>/<stem>.mp3" the audio analysed is "<dir>/<stem>/accompaniment.wav".
# NB: this cell writes .npy files into `output_dir`, overwriting existing ones.
audio_dir = "/homes/lm004/commercials/annotated_commercials"
output_dir = "embeddings_msd_novoice"
input_length = 5*16000//256 # 5 seconds of audio at 16000 Hz // 256 hop length

os.makedirs(output_dir, exist_ok=True)  # np.save does not create missing directories

for audio_fn in tqdm(glob.glob(os.path.join(audio_dir, "*.mp3"))):
    # The forward hooks built by get_features write into this global dict, so
    # it must be rebound under exactly this name before every forward pass.
    features = {}

    stimulus_id = os.path.basename(audio_fn).replace('_trimmed.mp3','')

    # Strip only the extension, so a '.mp3' elsewhere in the path is left alone.
    accompaniment_fn = os.path.join(os.path.splitext(audio_fn)[0], 'accompaniment.wav')

    whole_spec = np.asarray(compute_melspectrogram(accompaniment_fn))

    # Number of full chunks; a trailing partial chunk is dropped.
    n_chunks = whole_spec.shape[1] // input_length
    if n_chunks == 0:
        # Nothing to feed the model; skip rather than save empty embeddings.
        print(f"Skipping {stimulus_id}: fewer than {input_length} spectrogram frames")
        continue

    # (n_bins, n_frames) -> (n_chunks, n_bins, input_length), then add the
    # singleton channel axis and move the batch to the target device.
    spec = np.stack(np.split(whole_spec[:, :n_chunks*input_length], n_chunks, axis=1))
    spec = torch.as_tensor(spec[:, np.newaxis], dtype=torch.float32).to(device)

    # The forward pass is run only for the hooks' side effect; no gradients needed.
    with torch.no_grad():
        model(spec)

    # Passing a path lets np.save open and close the file itself.
    for hook_name in ('frontend', 'backend'):
        np.save(
            os.path.join(output_dir, f"{stimulus_id}_{hook_name}.npy"),
            features[hook_name].cpu().numpy(),
        )

  0%|          | 0/606 [00:00<?, ?it/s]