# **Data preprocessing**

In [1]:
import os
import librosa
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

REFRESH_TIME = 0.25  # seconds
BATCH_SIZE = 32

# Directories with data
exhale_dir = 'small-data/exhale'
inhale_dir = 'small-data/inhale'
silence_dir = 'small-data/silence'

# Creating list of files
exhale_files = [os.path.join(exhale_dir, file) for file in os.listdir(exhale_dir)]
inhale_files = [os.path.join(inhale_dir, file) for file in os.listdir(inhale_dir)]
silence_files = [os.path.join(silence_dir, file) for file in os.listdir(silence_dir)]
train_data = []
files_list = [exhale_files, inhale_files, silence_files]
files_names = ['exhale', 'inhale', 'silence']

# Amount of sequences of every class
exhale_frames_size = 0
inhale_frames_size = 0
silence_frames_size = 0

# Main loop to preprocess data into MFCCs
for label, files in enumerate(files_list):

    # Iterate through all files (potentially longer audio recording with different classes)
    for file in files:  # file - wav file path

        # Load vaw file and keep file's sampling rate
        y, sr = librosa.load(file, mono=True)  # y - frames, sr - wav file's sampling rate

        # Calculate chunk size
        chunk_size = int(sr * 0.25)  # for example 48000 * 0.25 = 12000 frames per chunk

        # List of MFCCs for every data sequence (it will be a list of lists of tuples (mfcc coefficients, label))
        mfcc_sequence = []

        # Iterate trough every 0.25s audio chunk
        for i in range(0, len(y), chunk_size):

            # Get frames of current chunk
            frame = y[i:i + chunk_size]  # list of frames

            if len(frame) == chunk_size:  # Ignore the last frame if it's shorter

                # Calculate MFCCs (it will be a vector of MFCC coefficients - a vector of vectors)
                mfcc = librosa.feature.mfcc(y=frame, sr=sr)  # Default n_mfcc = 20 (20 coefficients per subframe)

                # Because we have a list of MFCC vectors, we can calculate the mean of every coefficient so we get just one set of coefficients for every 0.25s chunk
                mfcc_mean = mfcc.mean(axis=1)  # list of 20 MFCC coefficients

                # Append the mean of MFCCs to the list of MFCCs for the current data sequence
                mfcc_sequence.append((mfcc_mean, label))

        print(file)

        if mfcc_sequence:

            # Append the list of MFCCs for the current data sequence to the list of all data sequences
            train_data.append(mfcc_sequence)

    # Print the amount of sequences for every class
    if label == 0:
        exhale_frames_size = len(train_data)
        print("Exhale frames size: ", exhale_frames_size)
    elif label == 1:
        inhale_frames_size = len(train_data) - exhale_frames_size
        print("Inhale frames size: ", inhale_frames_size)
    else:
        silence_frames_size = len(train_data) - exhale_frames_size - inhale_frames_size
        print("Silence frames size: ", silence_frames_size)

small-data/exhale/master_exhale3.wav
small-data/exhale/master_exhale6.wav
small-data/exhale/master_exhale5.wav
small-data/exhale/master_exhale15.wav
small-data/exhale/master_exhale37.wav
small-data/exhale/master_exhale21.wav
small-data/exhale/master_exhale41.wav
small-data/exhale/master_exhale25.wav
small-data/exhale/master_exhale43.wav
small-data/exhale/master_exhale39.wav
small-data/exhale/master_exhale1.wav
small-data/exhale/master_exhale22.wav
small-data/exhale/master_exhale11.wav
small-data/exhale/master_exhale49.wav
small-data/exhale/master_exhale19.wav
small-data/exhale/master_exhale23.wav
small-data/exhale/master_exhale10.wav
small-data/exhale/master_exhale14.wav
small-data/exhale/master_exhale12.wav
small-data/exhale/master_exhale36.wav
small-data/exhale/master_exhale44.wav
small-data/exhale/master_exhale7.wav
small-data/exhale/master_exhale40.wav
small-data/exhale/master_exhale42.wav
small-data/exhale/master_exhale29.wav
small-data/exhale/master_exhale13.wav
small-data/exhale

# **Data Loader**

In [3]:
from model_classes import AudioDataset
import torch

# train_data is a list of sequences
# every sequence is a list of tuples (mfcc_mean, label)
# mfcc_mean is a list of 20 MFCC coefficients
# label is a class label (0, 1, 2)

# Split data into train and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2)

# We need to create a DataLoader object for training and validation sets
train_dataset = AudioDataset(train_data)
val_dataset = AudioDataset(val_data)

# Padding for sequence (necessary for DataLoader)
def collate_fn(batch):

    # Get the sequences and labels from the batch
    sequences, labels = zip(*batch)

    # Get the lengths of sequences
    lengths = [seq.size(0) for seq in sequences]

    # Get the maximum length
    max_length = max(lengths)

    # Create padded sequences and labels
    padded_sequences = torch.zeros(len(sequences), max_length, 20)
    padded_labels = torch.zeros(len(sequences), max_length, dtype=torch.long)

    # Fill the padded sequences and labels
    for i, seq in enumerate(sequences):
        padded_sequences[i, :seq.size(0), :] = seq
        padded_labels[i, :len(labels[i])] = labels[i]

    # Return padded sequences and labels
    return padded_sequences, padded_labels

# And then we can create DataLoader objects, that we can use in training
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# **Training**

In [4]:
import time
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from model_classes import AudioClassifierLSTM as AudioClassifier
import torch.nn as nn

REFRESH_TIME = 0.25  # Refresh time in seconds in future realtime
NUM_EPOCHS = 100  # Number of epochs (the more epoch the better model, but it takes more time)
PATIENCE_TIME = 10  # Number of epochs without improvement in validation accuracy that will stop training
LEARNING_RATE = 0.001  # Learning rate
BATCH_SIZE = 32  # Batch size (amount of sequences in one batch)

# Check if CUDA is available (learning on GPU is much faster)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

total_time = time.time()
start_time = time.time()

# Create model object
print("Creating model...")
model = AudioClassifier()
model = model.to(device)
print("Model created, time: ", time.time() - start_time)

# Define loss function and optimizer (network parameters)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# These are just for early stopping
best_val_accuracy = 0.0
early_stopping_counter = 0

print("Training model...")
start_time = time.time()

# Iterate through epochs
for epoch in range(NUM_EPOCHS):

    # Enable training on model object
    model.train()

    # Initialize running loss and accuracy
    running_loss = 0.0
    running_accuracy = 0.0
    # It's just a fancy progress bar in console
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', unit='batch')

    # Iterate through batches
    for inputs, labels in progress_bar:

        # Move inputs and labels to the device (GPU or CPU)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # Flattening outputs and labels from [batch_size, max_length, num_classes]
        outputs = outputs.view(-1, outputs.size(-1))  # Flattening to [batch_size * max_length, num_classes]
        labels = labels.view(-1)  # Flattening to [batch_size * max_length]

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass (calculate gradients)
        loss.backward()

        # Update weights according to the calculated gradients
        optimizer.step()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

        # Update progress bar
        progress_bar.set_postfix(loss=running_loss / len(progress_bar),
                                  accuracy=running_accuracy / len(progress_bar))

    # Print the loss and accuracy for the epoch
    print('Train Loss: {:.4f}, Train Accuracy: {:.4f}'.format(running_loss / len(train_loader),
                                                              running_accuracy / len(train_loader)))

    # After training on the whole training set, we can evaluate the model on the validation set
    model.eval()
    val_running_loss = 0.0
    val_running_accuracy = 0.0

    # We don't need to calculate gradients during validation
    with torch.no_grad():

        # Iterate through validation set
        for inputs, labels in val_loader:

            # Move inputs and labels to the device
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # As previous, we need to flatten outputs and labels
            outputs = outputs.view(-1, outputs.size(-1)) # Flattening to [batch_size * max_length, num_classes]
            labels = labels.view(-1) # Flattening to [batch_size * max_length]

            # Calculate loss
            loss = criterion(outputs, labels)

            # Calculate running loss (cumulative loss over batches) and add current epoch's accuracy to the running (cumulative) accuracy
            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

    # Calculate cumulative loss and accuracy for the validation set
    avg_val_loss = val_running_loss / len(val_loader)
    avg_val_accuracy = val_running_accuracy / len(val_loader)

    # And print it
    print('Val Loss: {:.4f}, Val Accuracy: {:.4f}'.format(avg_val_loss, avg_val_accuracy))

    # Learning rate scheduler (changing learning rate during training)
    scheduler.step()

    # Early stopping (if there is no improvement in validation accuracy for PATIENCE_TIME epochs, we stop training)
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= PATIENCE_TIME:
            print("Early stopping triggered. No improvement in validation accuracy.")
            break

# And print final results
print('Finished Training, time: ', time.time() - start_time)
print('Saving model...')
start_time = time.time()
#TODO
torch.save(model.state_dict(), 'audio_rnn_classifier.pth')
print("Model saved, time: ", time.time() - start_time)
print("Finished, Total time: ", time.time() - total_time)

Device:  cpu
Creating model...
Model created, time:  0.016856670379638672
Training model...


  mfcc_sequence = torch.tensor(mfcc_sequence, dtype=torch.float32)
Epoch 1/100:   0%|          | 0/3 [00:00<?, ?batch/s]


AttributeError: 'tuple' object has no attribute 'view'