In [1]:
from multitask_dataset import SingerMultiTaskDataset
from torch.utils.data import DataLoader
from torch.optim import Adam
from models import HuBERTLateFeatureFusion
import pandas as pd

import numpy as np
import sys
# Raise NumPy's summarisation threshold to sys.maxsize so that printed arrays
# are shown in full instead of being abbreviated with "...".
# NOTE(review): this only affects NumPy arrays, not torch tensors, and can
# produce very large cell outputs.
np.set_printoptions(threshold=sys.maxsize)

from torchview import draw_graph
import torchvision
# from torchviz import make_dot
from torchsummary import summary

# import torch
# torch.autograd.detect_anomaly(True)

In [2]:
# Locations of the training audio folder and of the CSV with the multitask targets.
# NOTE(review): both are absolute, machine-specific mount points; they must be
# adjusted before the notebook can run on another host.
# Alternative location (disabled):
# train_audio_folder = '/media/maindisk/maximos/data/melos_singers/Rebetika_vowels/train/'
# csv_path = '/media/maindisk/maximos/data/melos_singers/features/multitask_targets.csv'
train_audio_folder = '/media/maximos/9C33-6BBD/data/melos_singers/Rebetika_vowels/train/'
csv_path = '/media/maximos/9C33-6BBD/data/melos_singers/features/multitask_targets.csv'

In [3]:
# Read the multitask target table and collect its column names, leaving out
# the first two columns.
feats = pd.read_csv(csv_path, sep=',')
features_list = list(feats.columns[2:])
print(features_list)

['singer_id', 'Pitch', 'SpectralCentroid', 'SpectralSpread', 'SpectralSkewness', 'SpectralKurtosis', 'SpectralFlatness', 'SpectralCrest', 'SpectralSlope', 'SpectralDecrease', 'SpectralRollOff', 'SpectralVariation', 'SpectralFlux', 'HarmonicSpectralDeviation', 'Tristimulus_1', 'Tristimulus_2', 'Tristimulus_3', 'HarmonicOddToEvenRatio', 'Inharmonicity', 'HarmonicEnergy', 'NoiseEnergy', 'Noisiness', 'HarmonicToNoiseEnergy', 'PartialsToNoiseEnergy', 'F1_Hz', 'F2_Hz', 'F3_HZ', 'F4_Hz', 'Rate', 'Depth', 'Regularity']


In [4]:
# Map every task name to the number of outputs its head must produce.
# Each entry of features_list except the first one and the last three gets a
# single output (per the printed column list these skipped entries are
# 'singer_id' and 'Rate', 'Depth', 'Regularity').
task_labels_num_out = {name: 1 for name in features_list[1:-3]}
# Singer identification: ids start at zero, so the head needs max id + 1 outputs.
num_singers = feats['singer_id'].max() + 1
task_labels_num_out['singer_id'] = num_singers
print(task_labels_num_out)

{'Pitch': 1, 'SpectralCentroid': 1, 'SpectralSpread': 1, 'SpectralSkewness': 1, 'SpectralKurtosis': 1, 'SpectralFlatness': 1, 'SpectralCrest': 1, 'SpectralSlope': 1, 'SpectralDecrease': 1, 'SpectralRollOff': 1, 'SpectralVariation': 1, 'SpectralFlux': 1, 'HarmonicSpectralDeviation': 1, 'Tristimulus_1': 1, 'Tristimulus_2': 1, 'Tristimulus_3': 1, 'HarmonicOddToEvenRatio': 1, 'Inharmonicity': 1, 'HarmonicEnergy': 1, 'NoiseEnergy': 1, 'Noisiness': 1, 'HarmonicToNoiseEnergy': 1, 'PartialsToNoiseEnergy': 1, 'F1_Hz': 1, 'F2_Hz': 1, 'F3_HZ': 1, 'F4_Hz': 1, 'singer_id': 6}


In [5]:
# Build the multitask model with one head per entry of task_labels_num_out.
# NOTE(review): the load warning printed below this cell reports that a
# checkpoint of type hubert is instantiated as a wav2vec2 model and that the
# encoder.pos_conv_embed weights are newly initialised — confirm in models.py
# that this is intended.
model = HuBERTLateFeatureFusion(
    task_labels_num_out=task_labels_num_out,
    gpu_index=0,
)

You are using a model of type hubert to instantiate a model of type wav2vec2. This is not supported for all configurations of models and can yield errors.
Some weights of Wav2Vec2Model were not initialized from the model checkpoint at facebook/hubert-base-ls960 and are newly initialized: ['encoder.pos_conv_embed.conv.parametrizations.weight.original0', 'encoder.pos_conv_embed.conv.parametrizations.weight.original1']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
# Create the training dataset from the audio folder and the targets CSV
# configured above.
training_data = SingerMultiTaskDataset(train_audio_folder, csv_path)

In [7]:
# Shuffled mini-batches of 4 items; batching is delegated to the model's own
# collate function.
dataloader = DataLoader(
    dataset=training_data,
    batch_size=4,
    shuffle=True,
    collate_fn=model.collate_fn,
)

In [8]:
# Keep the HuBERT backbone frozen: turn off gradient tracking on each of its
# parameters.
for p in model.hubert.parameters():
    p.requires_grad = False
# Train the projectors and classifiers.
# `requires_grad_(True)` is used instead of assigning `.requires_grad = True`
# on the sub-module: on an nn.Module that assignment only creates a plain
# Python attribute and leaves the parameters untouched, whereas
# `requires_grad_` updates every parameter of the module.
# NOTE(review): assumes model.projectors[k] and model.classifiers[k] are
# nn.Module (or tensor) objects, both of which expose `requires_grad_` —
# confirm against models.py.
for k in model.projectors.keys():
    model.projectors[k].requires_grad_(True)
    model.classifiers[k].requires_grad_(True)

In [9]:
# Pull a single batch from the dataloader to use for the manual forward /
# backward experiment in the cells below. Later cells index it as b[0]
# (model inputs) and b[1] (labels).
b = next(iter(dataloader))

In [10]:
# Adam over all model parameters with a learning rate of 1e-3.
learning_rate = 1e-3
optimizer = Adam(model.parameters(), lr=learning_rate)

In [52]:
# Forward pass on the sampled batch: b[0] supplies the input values and the
# attention mask, b[1] supplies the labels.
batch_inputs, batch_labels = b[0], b[1]
y = model(
    audio_normalized=batch_inputs['input_values'],
    attention_mask=batch_inputs['attention_mask'],
    labels=batch_labels,
    output_attentions=False,
    output_hidden_states=False,
    return_dict=True,
)

In [53]:
# Show the losses returned by the forward pass; per the output below this is a
# dict with a 'regressors' entry and a 'classifier' entry.
print(y.loss)

{'regressors': tensor(1.1165, device='cuda:0', grad_fn=<AddBackward0>), 'classifier': tensor(1.7618, device='cuda:0', grad_fn=<NllLossBackward0>)}


In [51]:
# Inspect the gradient stored on the weight of the 'Pitch' intermediate layer.
# NOTE(review): the only backward() call in this notebook sits in a later
# cell, so on a fresh top-to-bottom run this likely prints None; the tensor
# shown below comes from out-of-order execution (this cell is In [51], run
# after the training-step cell In [50]).
print(model.intermediates['Pitch'].weight.grad)

tensor([[-7.0141e-12,  7.8632e-10,  1.3504e-10,  ...,  1.5434e-08,
          5.8117e-10, -3.8207e-08],
        [ 2.9763e-07,  9.2447e-07,  1.5113e-07,  ...,  1.7522e-07,
          3.0023e-07, -5.4433e-05],
        [-5.0201e-10, -1.9900e-08, -7.6184e-09,  ..., -4.2044e-09,
          5.1895e-07,  1.5639e-06],
        ...,
        [ 9.5809e-07,  2.7300e-06,  4.7807e-07,  ..., -2.6590e-06,
          8.2890e-07, -1.6958e-04],
        [ 1.2446e-06,  3.9087e-06,  7.8115e-07,  ..., -4.1634e-06,
         -1.0613e-05, -2.5155e-04],
        [-5.7263e-09, -1.4922e-08, -2.7208e-09,  ...,  3.8041e-08,
         -4.0654e-09,  9.6481e-07]], device='cuda:0')


In [44]:
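# Disabled alternative to the training step: backpropagate the two losses
# one after the other instead of summing them first (retain_graph=True keeps
# the graph alive for the second backward call).
# optimizer.zero_grad()
# y.loss['regressors'].backward(retain_graph=True)# retain_graph=True
# y.loss['classifier'].backward()# retain_graph=True
# optimizer.step()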

In [50]:
# Single optimisation step on the combined objective: clear old gradients,
# backpropagate the sum of the regression and classification losses, then
# update the parameters.
optimizer.zero_grad()
total_loss = y.loss['regressors'] + y.loss['classifier']
total_loss.backward()
optimizer.step()