In [1]:
# imports
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from dill.source import getname

from speaker_recognition.datasets import PodcastAudioDataset
from speaker_recognition.models import SpeechRecognitionNet
from speaker_recognition.utils import train_model, plot_train_history

In [2]:
# parameters

num_divs     = 1 # number of sub-samples per 5s sample
mel_spec     = True # convert waveform to MelSpectrogram
augments     = ["time_shift", "spec_augment"]
batch_size   = 16 * num_divs
lr           = 0.001
weight_decay = 0
num_epochs   = 20

In [3]:
# datasets and dataloaders

train_set = PodcastAudioDataset(split="train", num_divs=num_divs, mel_spec=mel_spec, augments=augments)
val_set   = PodcastAudioDataset(split="validation", num_divs=num_divs, mel_spec=mel_spec)
test_set  = PodcastAudioDataset(split="test", num_divs=num_divs, mel_spec=mel_spec)

train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16, shuffle=False)

batch_shape = next(iter(train_loader))[0].size() # batch shape, not size
print(batch_shape)

torch.Size([16, 1, 64, 157])


In [4]:
# model (network)

model = SpeechRecognitionNet(batch_shape=batch_shape)
print(f"Using model: {getname(model)}")

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# criterion = nn.CrossEntropyLoss()
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

conv_output_shape: torch.Size([16, 32, 2, 5]) 320
Using model: SpeechRecognitionNet(C3, FC3)


In [5]:
# train

stats = train_model(
	model=model,
	dataloaders=[train_loader, val_loader],
	num_epochs=num_epochs,
	criterion=criterion,
	optimizer=optimizer,
	print_progress=True
)

ValueError: Target size (torch.Size([16])) must be the same as input size (torch.Size([16, 2]))

In [None]:
# plot

plot_train_history(stats)