In [1]:
import os
import torch
import torchaudio
from torch.utils.data import Dataset

class SpeakerDataset(Dataset):
    """Audio dataset laid out as one sub-folder of ``.wav`` files per speaker.

    Attributes:
        root_dir: folder that was scanned.
        transform: optional callable applied to each loaded waveform.
        speakers: sorted speaker folder names; ``speakers[label]`` is the
            folder a numeric label refers to.
        file_paths: path of every ``.wav`` file found.
        labels: numeric speaker label for each entry of ``file_paths``.
    """

    def __init__(self, root_dir, transform=None):
        """
        root_dir (str): path to the folder containing subfolders of speakers.
        transform (callable, optional): called as ``transform(waveform, sample_rate)``;
            its return value replaces the waveform.
        """
        self.root_dir = root_dir
        self.transform = transform

        self.file_paths = []
        self.labels = []

        # Keep only real directories *before* numbering them. Enumerating the
        # raw listing would let stray files in root_dir consume label ids and
        # leave gaps, so the largest label could exceed the number of speakers.
        self.speakers = sorted(
            entry
            for entry in os.listdir(root_dir)
            if os.path.isdir(os.path.join(root_dir, entry))
        )

        # Go through each speaker folder
        for label_idx, speaker_folder in enumerate(self.speakers):
            speaker_path = os.path.join(root_dir, speaker_folder)

            # Collect all wav files in this speaker folder. os.listdir order is
            # arbitrary, so sort to make sample indices stable between runs.
            for fname in sorted(os.listdir(speaker_path)):
                if fname.lower().endswith(".wav"):
                    self.file_paths.append(os.path.join(speaker_path, fname))
                    self.labels.append(label_idx)  # numeric label for each speaker

    def __len__(self):
        """Number of ``.wav`` files discovered under ``root_dir``."""
        return len(self.file_paths)

    def __getitem__(self, idx):
        """Load sample ``idx`` and return ``(waveform, label)``."""
        file_path = self.file_paths[idx]
        label = self.labels[idx]

        # Load audio: returns waveform (tensor) and sample rate
        waveform, sr = torchaudio.load(file_path)

        # Compare against None rather than truthiness: a callable container
        # that defines __len__ could be falsy and would be silently skipped.
        if self.transform is not None:
            waveform = self.transform(waveform, sr)

        return waveform, label

In [3]:
from torch.utils.data import random_split

# Dataset location. Set the SPEAKER_DATA_DIR environment variable to run this
# notebook on another machine; the fallback is the original local path.
DATA_ROOT = os.environ.get(
    "SPEAKER_DATA_DIR",
    r"C:\Users\VICTUS\Voice\A Dataset for Voice-Based Human Identity Recognition\output",
)
# Fixed seed so the train/val/test membership is the same after every
# Restart Kernel -> Run All.
SPLIT_SEED = 42

# Suppose you have a dataset with total N items
# NOTE(review): no transform is passed, so items are the raw loaded waveforms.
# Batching them with the default collate presumably requires equal shapes — confirm.
dataset = SpeakerDataset(root_dir=DATA_ROOT)

# Let's do a 80/10/10 split; the test set absorbs the rounding remainder
train_size = int(0.8 * len(dataset))
val_size   = int(0.1 * len(dataset))
test_size  = len(dataset) - train_size - val_size

split_rng = torch.Generator().manual_seed(SPLIT_SEED)
train_set, val_set, test_set = random_split(
    dataset, [train_size, val_size, test_size], generator=split_rng
)

# Only the training loader shuffles; evaluation order stays fixed.
train_loader = torch.utils.data.DataLoader(train_set, batch_size=8, shuffle=True)
val_loader   = torch.utils.data.DataLoader(val_set, batch_size=8, shuffle=False)
test_loader  = torch.utils.data.DataLoader(test_set, batch_size=8, shuffle=False)

In [4]:
import torch.nn as nn
import torch.nn.functional as F

class SimpleAudioClassifier(nn.Module):
    """Small 2-D CNN that maps a single-channel spectrogram to speaker logits.

    Args:
        num_speakers (int): number of output classes.

    Input:  ``(batch, 1, freq, time)`` — any ``freq`` / ``time`` size.
    Output: ``(batch, num_speakers)`` unnormalised logits.
    """

    def __init__(self, num_speakers=10):
        super().__init__()

        # Two 3x3 convolutions; padding=1 keeps the spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=16, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(16, 32, 3, stride=1, padding=1)
        self.pool  = nn.MaxPool2d(kernel_size=2, stride=2)

        # fc1 expects exactly 32 channels x 32 x 32 features. Without this layer
        # only 64x64 inputs would fit and any other spectrogram length would
        # fail inside fc1. Adaptive pooling forces the feature map to 32x32; it
        # is the identity when the map is already 32x32 and has no parameters,
        # so existing checkpoints still load.
        self.adaptive_pool = nn.AdaptiveAvgPool2d((32, 32))

        self.fc1 = nn.Linear(32 * 32 * 32, 128)
        self.fc2 = nn.Linear(128, num_speakers)

    def forward(self, x):
        """Return logits of shape ``(batch, num_speakers)`` for spectrogram batch ``x``."""
        # x shape: (batch, channels=1, freq, time) if we convert waveforms to spectrograms
        x = F.relu(self.conv1(x))
        x = self.pool(F.relu(self.conv2(x)))
        # Normalise the spatial size so the flattened width always matches fc1.
        x = self.adaptive_pool(x)
        x = x.flatten(start_dim=1)

        x = F.relu(self.fc1(x))
        x = self.fc2(x)  # logits for num_speakers
        return x

In [5]:
import torchaudio.transforms as T

class MelSpectrogramTransform:
    """Turn a waveform into a single-channel mel spectrogram for a 2-D CNN.

    Args:
        sample_rate (int): rate the mel filterbank is built for; input at a
            different rate is resampled to it first.
        n_mels (int): number of mel bands.
    """

    def __init__(self, sample_rate=16000, n_mels=64):
        self.sample_rate = sample_rate
        self.mel_transform = T.MelSpectrogram(sample_rate=sample_rate, n_mels=n_mels)
        # One resampler per source rate, built on first use.
        self._resamplers = {}

    def _resampler_for(self, sr):
        """Return (building if needed) the resampler from ``sr`` to ``self.sample_rate``."""
        if sr not in self._resamplers:
            self._resamplers[sr] = T.Resample(orig_freq=sr, new_freq=self.sample_rate)
        return self._resamplers[sr]

    def __call__(self, waveform, sr=None):
        """Return a ``(1, n_mels, time)`` mel spectrogram.

        Args:
            waveform (Tensor): ``(channels, time)`` or 1-D ``(time,)`` audio.
            sr (int, optional): sample rate of ``waveform``. ``None`` means the
                audio is assumed to already be at ``self.sample_rate``.
        """
        # Treat a bare 1-D signal as one channel; otherwise the mono averaging
        # below would average over time instead of over channels.
        if waveform.dim() == 1:
            waveform = waveform.unsqueeze(0)

        # If the waveform has multiple channels, average to mono
        if waveform.shape[0] > 1:
            waveform = torch.mean(waveform, dim=0, keepdim=True)

        # The mel filterbank is only valid at self.sample_rate.
        if sr is not None and sr != self.sample_rate:
            waveform = self._resampler_for(sr)(waveform)

        # The (1, time) input keeps its leading dim, so the result is already
        # (1, n_mels, time). Adding another dim here would give a 5-D batch
        # that Conv2d cannot consume.
        mel_spec = self.mel_transform(waveform)
        return mel_spec

In [None]:
import torch
# Environment sanity check: first line printed is the installed PyTorch
# version, second line is whether torch reports CUDA as available.
for env_fact in (torch.__version__, torch.cuda.is_available()):
    print(env_fact)