In [1]:
import os
import sys
import glob
import random
from collections import defaultdict
from scipy.io import wavfile
import numpy as np

In [2]:
def get_speaker_roots_in_data_path(datapath='accents'):
    """Collect the per-speaker paths found under ``datapath``.

    The directory layout walked is ``<datapath>/<accent>/<gender>/<speaker>``
    where ``<gender>`` is ``female`` or ``male``.

    Args:
        datapath: Root directory whose immediate sub-directories are the
            accent folders.

    Returns:
        A list of ``<accent>/<gender>/<speaker>`` paths (each prefixed with
        ``datapath``). Entries whose name starts with ``.`` are skipped.
        Accents and speakers are visited in sorted order so the result does
        not depend on the order the filesystem lists entries in.
    """
    speaker_list = []
    # os.scandir yields entries in arbitrary order; sort for reproducibility
    # and close the directory handle deterministically.
    with os.scandir(datapath) as entries:
        accent_subfolders = sorted(f.path for f in entries if f.is_dir())
    for accent in accent_subfolders:
        for gender in ['female', 'male']:
            gender_dir = os.path.join(accent, gender)
            # An accent folder may be missing one of the gender folders;
            # skip it instead of failing inside os.listdir.
            if not os.path.isdir(gender_dir):
                continue
            for speaker in sorted(os.listdir(gender_dir)):
                if not speaker.startswith('.'):
                    speaker_list.append(os.path.join(gender_dir, speaker))
    return speaker_list

In [3]:
def get_wav_files_in_path(datapath):
    """Return the names of the WAV files directly inside ``datapath``.

    Args:
        datapath: Directory to list (not searched recursively).

    Returns:
        A sorted list of entry names (not full paths) whose extension is
        ``.wav`` in any letter case, so ``clip.WAV`` is included as well.
        Sorting makes the result independent of the order in which the
        filesystem lists the directory.
    """
    return sorted(
        name for name in os.listdir(datapath)
        if name.lower().endswith('.wav')
    )

In [4]:
def split_data(data, train_ratio, val_ratio, seed=42):
    """Split utterances into train/val/test sets, speaker by speaker.

    Items are grouped by speaker id, taken as the text before the first
    ``_`` in each item. Each speaker's utterances are shuffled and then cut
    into three consecutive slices, so every speaker contributes to each
    split in (roughly) the requested proportions.

    Args:
        data: Iterable of strings (e.g. file names) of the form
            ``<speaker>_<rest>``.
        train_ratio: Fraction of each speaker's utterances for training.
        val_ratio: Fraction of each speaker's utterances for validation.
        seed: Seed for the shuffle; the same seed and input order give the
            same split.

    Returns:
        ``(train_data, val_data, test_data)`` lists. Per-speaker counts are
        truncated with ``int()``, so any remainder ends up in the test set.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    # A private generator yields the same shuffle sequence as seeding the
    # module-level one, without resetting the global RNG state that other
    # code in the notebook may rely on.
    rng = random.Random(seed)

    speaker_data = defaultdict(list)
    for item in data:
        speaker_id = item.split('_')[0]
        speaker_data[speaker_id].append(item)

    train_data = []
    val_data = []
    test_data = []

    for speaker_utterances in speaker_data.values():
        n_utterances = len(speaker_utterances)
        n_train = int(n_utterances * train_ratio)
        n_val = int(n_utterances * val_ratio)

        rng.shuffle(speaker_utterances)
        train_data.extend(speaker_utterances[:n_train])
        val_data.extend(speaker_utterances[n_train:n_train + n_val])
        test_data.extend(speaker_utterances[n_train + n_val:])

    return train_data, val_data, test_data

In [5]:
def segment_audio(filepath, chunk_length, sr=None):
    """Cut one WAV file into half-overlapping chunks.

    Args:
        filepath: Path of the WAV file, read with ``scipy.io.wavfile.read``.
        chunk_length: Chunk duration in seconds; may be fractional.
        sr: Sample rate used to convert ``chunk_length`` to samples. When
            ``None`` the rate stored in the file is used.

    Returns:
        A list of array slices of the audio. Consecutive chunks start half a
        chunk apart; chunks that reach the end of the signal are shorter
        than ``sr * chunk_length`` samples. An empty signal gives ``[]``.

    Raises:
        ValueError: If ``sr * chunk_length`` is less than one sample.
    """
    rate, audio = wavfile.read(filepath)
    if sr is None:
        sr = rate

    # Slice bounds must be integers; a fractional chunk_length would
    # otherwise turn them into floats and fail when indexing.
    chunk_samples = int(sr * chunk_length)
    if chunk_samples <= 0:
        raise ValueError("chunk_length * sr must be at least one sample")

    # A one-sample chunk would give a stride of 0 and never advance.
    chunk_stride = max(1, chunk_samples // 2)

    # Slicing clamps at the end of the array, so the last chunks are simply
    # shorter rather than out of range.
    return [
        audio[start:start + chunk_samples]
        for start in range(0, len(audio), chunk_stride)
    ]

def segment_audios(wav_files, datapath, chunk_length, sr):
    """Segment several WAV files and pool all their chunks into one list.

    Args:
        wav_files: File names, each joined onto ``datapath``.
        datapath: Directory that contains the files.
        chunk_length: Forwarded to ``segment_audio``.
        sr: Forwarded to ``segment_audio``.

    Returns:
        A flat list with the chunks of every file, in the order of
        ``wav_files``.
    """
    return [
        piece
        for name in wav_files
        for piece in segment_audio(os.path.join(datapath, name), chunk_length, sr)
    ]

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim

# Define the model architecture
class SpeechClassifier(nn.Module):
    """Two Conv1d stages, then an LSTM, then a linear layer producing class scores.

    Args:
        input_dim: Number of channels of the input fed to the first Conv1d.
        hidden_dim: Hidden size of the LSTM.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, input_dim, hidden_dim, num_classes):
        super().__init__()
        # Each stage: convolution -> batch norm -> ReLU -> pooling by 2.
        stage_one = [
            nn.Conv1d(input_dim, 32, kernel_size=3, padding=1),
            nn.BatchNorm1d(32),
            nn.ReLU(),
            nn.MaxPool1d(2),
        ]
        stage_two = [
            nn.Conv1d(32, 64, kernel_size=3, padding=1),
            nn.BatchNorm1d(64),
            nn.ReLU(),
            nn.MaxPool1d(2),
        ]
        self.conv = nn.Sequential(*stage_one, *stage_two)
        self.lstm = nn.LSTM(64, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return class scores for a batch ``x`` laid out as (batch, input_dim, steps)."""
        features = self.conv(x)
        # The LSTM is batch_first, so move the 64 conv channels to the last axis.
        sequence = features.permute(0, 2, 1)
        lstm_out, _ = self.lstm(sequence)
        # Only the output at the final step is classified.
        final_step = lstm_out[:, -1]
        return self.fc(final_step)

# Define the training loop
def train(model, train_loader, val_loader, criterion, optimizer, num_epochs):
    """Train ``model`` and evaluate it on the validation data after every epoch.

    Args:
        model: Module called as ``model(audio)``.
        train_loader: Iterable yielding ``(audio, labels)`` training batches.
        val_loader: Iterable yielding ``(audio, labels)`` validation batches;
            may be empty.
        criterion: Loss called as ``criterion(output, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of passes over ``train_loader``.

    Returns:
        A dict with the per-epoch lists ``'train_loss'``, ``'val_loss'``
        (mean loss per batch) and ``'val_accuracy'`` (percent). A metric is
        NaN for an epoch in which its loader produced no batches. The same
        numbers are printed each epoch.
    """
    nan = float('nan')
    history = {'train_loss': [], 'val_loss': [], 'val_accuracy': []}

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        train_batches = 0
        for audio, labels in train_loader:
            optimizer.zero_grad()
            output = model(audio)
            loss = criterion(output, labels)
            loss.backward()
            optimizer.step()
            train_loss += loss.item()
            train_batches += 1
        # Count batches instead of calling len() so an empty loader reports
        # NaN rather than raising ZeroDivisionError.
        epoch_train_loss = train_loss / train_batches if train_batches else nan
        print("Epoch {}: Train Loss: {:.4f}".format(epoch+1, epoch_train_loss))

        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct = 0
        total = 0
        with torch.no_grad():
            for audio, labels in val_loader:
                output = model(audio)
                loss = criterion(output, labels)
                val_loss += loss.item()
                val_batches += 1
                # Predicted class = index of the largest score per sample.
                predicted = output.argmax(dim=1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
        epoch_val_loss = val_loss / val_batches if val_batches else nan
        epoch_val_acc = 100 * correct / total if total else nan
        print("Epoch {}: Validation Loss: {:.4f} Accuracy: {:.2f}%".format(
            epoch+1, epoch_val_loss, epoch_val_acc))

        history['train_loss'].append(epoch_train_loss)
        history['val_loss'].append(epoch_val_loss)
        history['val_accuracy'].append(epoch_val_acc)

    return history

# Hyperparameters, followed by the model, loss function and optimizer built from them
input_dim, hidden_dim, num_classes = 128, 128, 5
model = SpeechClassifier(
    input_dim=input_dim,
    hidden_dim=hidden_dim,
    num_classes=num_classes,
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)
