In [1]:
import torch
import torch.nn as nn
from frontend import Frontend_mine
from backend import Backend
import warnings
import librosa
import numpy as np
import os
os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "3"

In [2]:
# define here all the parameters
main_dict = {"frontend_dict":
             {"list_out_channels":[128,128,256,256,256,256], 
              "list_kernel_sizes":[(3,3),(3,3),(3,3),(3,3),(3,3),(3,3)],
              "list_pool_sizes":  [(3,2),(2,2),(2,2),(2,1),(2,1),(2,1)], 
              "list_avgpool_flags":[False,False,False,False,False,True]},
             
             "backend_dict":
             {"n_class":50,
              "recurrent_units": None, 
              "bidirectional":False}, #  pass recurrent_units = None to deactivate
            }

In [3]:
def compute_melspectrogram(audio_fn):
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        x, _ = librosa.core.load(audio_fn, sr=16000, res_type='kaiser_fast')
        spec = librosa.core.amplitude_to_db(librosa.feature.melspectrogram(x, 
                                                                           sr=16000, 
                                                                           n_fft=512, 
                                                                           hop_length=256, 
                                                                           n_mels=96))
    return spec

def load_parameters(model, filename): 
    model = torch.nn.DataParallel(model)
    S = torch.load(filename)
    model.load_state_dict(S)
    return model

class AssembleModel(nn.Module):
    
    def __init__(self, main_dict):
        super(AssembleModel, self).__init__()
        

        self.frontend = Frontend_mine(main_dict["frontend_dict"])
        self.backend = Backend(main_dict)


    def forward(self, spec):
        
        x = self.backend(self.frontend(spec))
        
        return x

In [4]:
autotagger = load_parameters(AssembleModel(main_dict),"MSD_transformer_autotagger.pth")

In [5]:
with open('MSD_50tagList.txt') as f:
    tagList = f.readlines()
    
tagList = [line.replace('\n','') for line in tagList]

In [6]:
def first_10_tags(filename):

    input_length = 5*16000//256

    whole_spec = compute_melspectrogram(filename)

    n_chunks = whole_spec.shape[1] // input_length
    spec = np.zeros((n_chunks,whole_spec.shape[0],input_length)) # stack of chunks
    for i in range(n_chunks):
        spec[i]=whole_spec[:,i*input_length:(i+1)*input_length]

    spec = spec[:,np.newaxis,:,:]
    
    out = autotagger(torch.Tensor(spec))
    
    out = out.detach().cpu().numpy()
    
    y_pred = out.mean(axis=0)
    
    print(np.array(tagList)[np.argsort(y_pred)[::-1]][:10].tolist())
    print(np.sort(y_pred)[::-1][:10].tolist())

In [11]:
first_10_tags("/homes/lm004/commercials/annotated_commercials/_ob7euNGFxw_trimmed.mp3")

['rock', 'female vocalists', 'pop', 'indie', 'alternative', 'indie rock', 'alternative rock', 'punk', 'classic rock', 'indie pop']
[0.41414475440979004, 0.21096672117710114, 0.19580526649951935, 0.17829637229442596, 0.14380823075771332, 0.06705977767705917, 0.052302002906799316, 0.04887886345386505, 0.046104371547698975, 0.04410814121365547]
