In [1]:
import os
import sys
import glob
import random
from collections import defaultdict
from scipy.io import wavfile
import numpy as np

In [2]:
def get_speaker_roots_in_data_path (datapath ='accents'):
    speaker_list = []
    accent_subfolders = [f.path for f in os.scandir(datapath) if  f.is_dir()]
    for accent in accent_subfolders:
        for gender in ['female','male']:
            speaker_folders = os.listdir(os.path.join(accent,gender))
            for speaker in speaker_folders:
                if not speaker.startswith('.'):
                    speaker_list.append(os.path.join(accent,gender,speaker))
    return speaker_list

In [3]:
def get_wav_files_in_path(datapath):
    files = os.listdir(datapath)
    files_wav = [i for i in files if i.endswith('.wav')]
    return files_wav

In [4]:
def split_data(data, train_ratio, val_ratio, seed=42):
    random.seed(seed)
    speaker_data = defaultdict(list)
    for item in data:
        speaker_id = item.split('_')[0]
        speaker_data[speaker_id].append(item)
    
    train_data = []
    val_data = []
    test_data = []
    
    for speaker_id, speaker_utterances in speaker_data.items():
        n_utterances = len(speaker_utterances)
        n_train = int(n_utterances * train_ratio)
        n_val = int(n_utterances * val_ratio)
        
        random.shuffle(speaker_utterances)
        train_data.extend(speaker_utterances[:n_train])
        val_data.extend(speaker_utterances[n_train:n_train+n_val])
        test_data.extend(speaker_utterances[n_train+n_val:])
    
    return train_data, val_data, test_data

In [5]:
def segment_audio(filepath, chunk_length, sr):
    rate, audio = wavfile.read(filepath)
    chunk_samples = sr * chunk_length
    chunk_stride = chunk_samples // 2
    audio_length = len(audio)
    chunks = []
    start = 0
    while start < audio_length:
        end = start + chunk_samples
        if end >= audio_length:
            end = audio_length
        chunk = audio[start:end]
        chunks.append(chunk)
        start += chunk_stride
    return chunks

def segment_audios(wav_files, datapath, chunk_length, sr):
    all_chunks = []
    for file in wav_files:
        file_path = os.path.join(datapath, file)
        chunks = segment_audio(file_path, chunk_length, sr)
        all_chunks.extend(chunks)
    return all_chunks

In [6]:
# #first root path of speaker_list is passed, chunks of 3 seconds 
# all_chunks = segment_audios(wav_files, speaker_list[0], 3, 16000)

In [7]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Reshape, Conv1D, MaxPooling1D, Flatten, Dense, LSTM
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical

def create_model(input_shape, num_classes):
    inputs = Input(shape=input_shape)
    x = Reshape((input_shape[0], input_shape[1], 1))(inputs)
    x = Conv1D(filters=32, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(filters=64, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = Conv1D(filters=128, kernel_size=3, activation='relu')(x)
    x = MaxPooling1D(pool_size=2)(x)
    x = LSTM(units=128, return_sequences=False)(x)
    x = Dense(units=64, activation='relu')(x)
    outputs = Dense(units=num_classes, activation='softmax')(x)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

In [8]:
# Collect all speaker paths
speaker_list = get_speaker_roots_in_data_path()

# Get all wav files
wav_files = []
for currPath in speaker_list:
    curr_wavFiles = get_wav_files_in_path(currPath)
    for i in curr_wavFiles:
        wav_files.append(i)

# Split data into training, validation, and test sets
train_data, val_data, test_data = split_data(wav_files, 0.6, 0.2)

# Segment the audio files into chunks
train_chunks = segment_audios(train_data, speaker_list[0], 3, 16000)
val_chunks = segment_audios(val_data, speaker_list[0], 3, 16000)

max_len = max([len(chunk) for chunk in train_chunks])
for i, chunk in enumerate(train_chunks):
    if len(chunk) < max_len:
        padding = np.zeros(max_len - len(chunk), dtype=np.int16)
        train_chunks[i] = np.concatenate((chunk, padding))
    elif len(chunk) > max_len:
        train_chunks[i] = chunk[:max_len]
        
max_len = max([len(chunk) for chunk in val_chunks])
for i, chunk in enumerate(val_chunks):
    if len(chunk) < max_len:
        padding = np.zeros(max_len - len(chunk), dtype=np.int16)
        val_chunks[i] = np.concatenate((chunk, padding))
    elif len(chunk) > max_len:
        val_chunks[i] = chunk[:max_len]

# Convert the chunks into appropriate input data and labels
x_train = np.stack(train_chunks, axis=0)
x_val = np.stack(val_chunks, axis=0)
# x_train = np.array(train_chunks) # replace with appropriate input data, potentially using train_chunks
# y_train = np.array(train_data) # replace with appropriate labels, potentially using train_data
# x_val = np.array(val_chunks) # replace with appropriate input data, potentially using val_chunks
# y_val = np.array(val_data) # replace with appropriate labels, potentially using val_data

#Create labels for each accent
accent_subfolders = [f.path for f in os.scandir('accents') if f.is_dir()]
num_classes = len(accent_subfolders)
labels_dict = {}
for i, folder in enumerate(accent_subfolders):
    labels_dict[folder] = i

#Create labels for each audio chunk
y_train = [labels_dict[os.path.dirname(file_path)] if os.path.dirname(file_path) in labels_dict else -1 for file_path in train_data]
y_val = [labels_dict[os.path.dirname(file_path)] if os.path.dirname(file_path) in labels_dict else -1 for file_path in val_data]

#One-hot encode the labels
y_train = to_categorical(y_train, num_classes=num_classes)
y_val = to_categorical(y_val, num_classes=num_classes)

input_shape = (16000 * 3,)  # Based on the segment length of 3 seconds and the sample rate of 16000
#num_classes = len(accent_subfolders)  # number of accent subfolders

model = create_model(input_shape, num_classes)
model.fit(x_train, y_train, epochs=10, batch_size=32, validation_data=(x_val, y_val))

IndexError: tuple index out of range