In [1]:
import numpy as np

def normalize_data(train_data, val_data):
    """Standardize MFCC features with statistics computed on the training set only.

    Parameters
    ----------
    train_data, val_data : list
        Lists of sequences. Every sequence is a list of ``(mfcc, label)`` pairs,
        where ``mfcc`` is an array-like of coefficients for one frame.

    Returns
    -------
    tuple
        ``(normalized_train_data, normalized_val_data, mean, std)``. The two data
        lists keep the input structure with each ``mfcc`` replaced by
        ``(mfcc - mean) / std``. ``mean`` and ``std`` are per-coefficient
        statistics (computed along axis 0 of the stacked training frames).
        Entries of ``std`` that were exactly zero are returned as 1.0, i.e. the
        returned ``std`` is the divisor that was actually applied.

    Raises
    ------
    ValueError
        If ``train_data`` contains no frames to compute statistics from.
    """
    # Stack every training frame into a single array (one row per frame).
    # Empty sequences are skipped so they cannot break the concatenation.
    per_sequence = [
        np.array([item[0] for item in sequence])
        for sequence in train_data
        if len(sequence) > 0
    ]
    if not per_sequence:
        raise ValueError("train_data contains no frames; cannot compute normalization statistics")
    all_mfccs = np.concatenate(per_sequence)

    # `.shape` is an attribute (a tuple), not a method
    print(all_mfccs.shape)

    # Per-coefficient mean and standard deviation
    mean = np.mean(all_mfccs, axis=0)
    std = np.std(all_mfccs, axis=0)
    # A constant coefficient has zero deviation; divide by 1 instead of producing inf/nan
    std = np.where(std == 0, 1.0, std)

    print(mean.shape)
    print(std.shape)

    # Normalize a single sequence, leaving the labels untouched
    def normalize_sequence(sequence):
        return [((np.array(mfcc) - mean) / std, label) for mfcc, label in sequence]

    # Validation data is scaled with the training statistics
    normalized_train_data = [normalize_sequence(sequence) for sequence in train_data]
    normalized_val_data = [normalize_sequence(sequence) for sequence in val_data]

    return normalized_train_data, normalized_val_data, mean, std


## Audio data normalization function

# **Data preprocessing**

In [13]:
import os
import librosa
import csv
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Length of one audio chunk in seconds; the preprocessing loop below turns every
# chunk of this duration into one feature vector with one label
REFRESH_TIME = 0.25  # seconds
# Number of sequences per batch, passed to the DataLoaders as batch_size
BATCH_SIZE = 16

# Directory scanned for .wav recordings and their same-named .csv label files
data_dir = '../train-sequences'

# Function to load labels from csv file
def load_labels(csv_file_v):
    """Read labelled segments from a CSV annotation file.

    The first row is treated as a header and skipped. Every following row is
    expected to contain a class name and two integer columns, which are returned
    as the segment start and end (units are whatever the file uses —
    NOTE(review): confirm they match the indices passed to get_label_for_time).

    Parameters
    ----------
    csv_file_v : str
        Path to the CSV file.

    Returns
    -------
    list of tuple
        ``(class_id, start, end)`` per recognized row, with class ids
        0 = exhale, 1 = inhale, 2 = silence. Rows with any other class name and
        blank rows are ignored. An empty file yields an empty list.
    """
    label_ids = {'exhale': 0, 'inhale': 1, 'silence': 2}
    labels_v = []
    # newline='' lets the csv module do its own line-ending handling (per the csv docs)
    with open(csv_file_v, 'r', newline='') as file:
        reader = csv.reader(file)
        next(reader, None)  # Skip the header; the default avoids StopIteration on an empty file
        for row in reader:
            if not row:
                # csv.reader yields [] for blank lines (e.g. a trailing newline)
                continue
            class_id = label_ids.get(row[0])
            if class_id is not None:
                labels_v.append((class_id, int(row[1]), int(row[2])))
    return labels_v

# Function to get the label for a given time
def get_label_for_time(labels_v, start_frame, end_frame):
    """Return the class that covers the largest part of a window.

    Parameters
    ----------
    labels_v : iterable of (class_id, start, end)
        Labelled segments, e.g. as produced by ``load_labels``.
    start_frame, end_frame : int
        Bounds of the window, in the same units as the segment bounds.

    Returns
    -------
    int
        Class id (0 = exhale, 1 = inhale, 2 = silence) with the greatest total
        overlap. On a tie the lowest id wins, so a window that no segment
        overlaps yields 0.
    """
    totals = [0] * 3  # accumulated overlap per class id

    for class_id, seg_start, seg_end in labels_v:
        # Skip segments that lie entirely outside the window
        if seg_start >= end_frame or seg_end <= start_frame:
            continue
        totals[class_id] += min(seg_end, end_frame) - max(seg_start, start_frame)

    # First class holding the maximum (strict '>' keeps the earliest on ties)
    best = 0
    for class_id in (1, 2):
        if totals[class_id] > totals[best]:
            best = class_id
    return best

# Creating list of files.
# Sorted because os.listdir returns entries in arbitrary order, which would make the
# order of sequences differ between machines and runs.
wav_files = sorted(
    os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.wav')
)
train_data = []

# Main loop to preprocess data into MFCCs
for wav_file in wav_files:
    # Swap only the extension; str.replace would also rewrite a '.wav' appearing
    # anywhere else in the path (e.g. in a directory name)
    csv_file = os.path.splitext(wav_file)[0] + '.csv'
    if not os.path.exists(csv_file):
        # Recordings without an annotation file are skipped
        continue

    # Load audio and labels
    # NOTE(review): librosa.load is called without sr=, so librosa's default sample-rate
    # handling applies — confirm the CSV start/end columns use the same sample indices as `i` below.
    y, sr = librosa.load(wav_file, mono=True)
    labels = load_labels(csv_file)

    # Number of samples in one REFRESH_TIME-second chunk
    chunk_size = int(sr * REFRESH_TIME)

    # One recording becomes one sequence: a list of (mfcc coefficients, label) tuples
    mfcc_sequence = []

    # Step through full-length chunks only; a shorter trailing remainder is dropped
    for i in range(0, len(y) - chunk_size + 1, chunk_size):
        frame = y[i:i + chunk_size]
        mfcc = librosa.feature.mfcc(y=frame, sr=sr)
        # Average along axis 1 so each chunk is described by a single vector
        mfcc_mean = mfcc.mean(axis=1)
        label = get_label_for_time(labels, i, i + chunk_size)
        mfcc_sequence.append((mfcc_mean, label))

    # Recordings shorter than one chunk produce nothing and are left out
    if mfcc_sequence:
        train_data.append(mfcc_sequence)

In [14]:
# Sanity check: number of chunks obtained from each recording

lengths = list(map(len, train_data))
print("Min length: ", min(lengths))
print("Max length: ", max(lengths))
print(lengths)

Min length:  120
Max length:  131
[128, 126, 127, 126, 126, 125, 120, 123, 121, 122, 124, 128, 121, 131, 121, 127, 120, 122, 130, 123, 130, 120, 121, 128, 128, 125, 123, 123, 130, 131, 120, 130, 121, 124, 122, 121, 131, 122, 130, 125, 123, 128, 129, 128, 120, 120, 122, 124, 120, 122, 121, 121, 122, 121, 125, 128, 126, 123, 128, 129, 126, 121, 126, 124, 121, 124, 131, 123, 123, 121, 125, 126, 128, 122, 126, 126, 120, 121, 128, 122, 124, 128, 120, 123, 127, 122, 129, 124, 124, 125, 125, 124, 123, 125, 120, 125, 121, 127, 122, 124, 120, 121, 128, 124, 121, 122, 124, 123, 120, 126, 122, 121, 123, 128, 126, 124, 120, 124, 123, 121, 121, 128, 121, 120, 126, 127, 126, 125, 130, 123, 125, 127, 121, 121, 124, 127, 127, 125, 127, 123, 126, 124, 120, 126, 120, 120, 125, 129, 123, 125, 127, 122, 128, 127, 122, 127]


# **Data Loader**

In [15]:
# Split data into train and validation sets.
# A fixed seed keeps the split — and therefore the reported metrics — repeatable across runs.
# NOTE: train_data is overwritten with its training part, so re-running this cell on its own
# would split the already reduced set again; re-run the preprocessing cell first.
SPLIT_SEED = 42
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=SPLIT_SEED)

# DataLoader and collate function
from model_classes import AudioDataset
import torch

# Wrap both splits in the project's dataset class
train_dataset = AudioDataset(train_data)
val_dataset = AudioDataset(val_data)

def collate_fn(batch, label_pad_value=0):
    """Pad a batch of variable-length sequences to the longest one in the batch.

    Parameters
    ----------
    batch : iterable of (sequence, labels) pairs
        ``sequence`` is a 2-D tensor indexed as (time step, feature); ``labels``
        holds one class id per time step.
    label_pad_value : int, optional
        Value written into label positions past the end of a sequence. The
        default 0 keeps the previous behaviour, which means padded steps carry
        the same id as class 0. Passing -100 (the default ``ignore_index`` of
        ``nn.CrossEntropyLoss``) excludes them from the loss; accuracy code must
        then mask those positions as well.

    Returns
    -------
    tuple of torch.Tensor
        ``padded_sequences`` of shape (batch, max_length, feature_dim), zero
        padded, and ``padded_labels`` of shape (batch, max_length), dtype long.
    """
    sequences, labels_t = zip(*batch)
    max_length = max(seq.size(0) for seq in sequences)
    # Take the feature size from the data instead of hard-coding 20 coefficients
    feature_dim = sequences[0].size(-1)

    padded_sequences = torch.zeros(len(sequences), max_length, feature_dim)
    padded_labels = torch.full((len(sequences), max_length), label_pad_value, dtype=torch.long)
    for j, (seq, seq_labels) in enumerate(zip(sequences, labels_t)):
        padded_sequences[j, :seq.size(0), :] = seq
        padded_labels[j, :len(seq_labels)] = seq_labels
    return padded_sequences, padded_labels

# Both loaders share batch size and padding logic; only the training loader shuffles
loader_kwargs = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

# **Training**

In [16]:
import time
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from model_classes import AudioClassifierLSTM as AudioClassifier
import torch.nn as nn

# NOTE: REFRESH_TIME and BATCH_SIZE repeat the values set in the preprocessing cell. The
# DataLoaders were already built above, so changing BATCH_SIZE here does not affect them.
REFRESH_TIME = 0.25  # Refresh time in seconds in future realtime
NUM_EPOCHS = 100  # Number of epochs (the more epoch the better model, but it takes more time)
PATIENCE_TIME = 10  # Number of epochs without improvement in validation accuracy that will stop training
LEARNING_RATE = 0.001  # Learning rate
BATCH_SIZE = 16  # Batch size (amount of sequences in one batch)

# Check if CUDA is available (learning on GPU is much faster)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

total_time = time.time()
start_time = time.time()

# Create model object
print("Creating model...")
model = AudioClassifier()
model = model.to(device)
print("Model created, time: ", time.time() - start_time)

# Define loss function and optimizer (network parameters)
# NOTE: collate_fn pads labels with 0 by default, so padded time steps take part in the
# loss and in the accuracy below as if they were class 0.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate every 5 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Early-stopping bookkeeping: best validation accuracy so far, the weights that
# achieved it, and the number of epochs since the last improvement
best_val_accuracy = 0.0
best_model_state = None
early_stopping_counter = 0

print("Training model...")
start_time = time.time()

# Iterate through epochs
for epoch in range(NUM_EPOCHS):

    # Enable training on model object
    model.train()

    # Initialize running loss and accuracy
    running_loss = 0.0
    running_accuracy = 0.0
    # It's just a fancy progress bar in console
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', unit='batch')

    # Iterate through batches (batch_idx counts the batches processed so far)
    for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1):

        # Move inputs and labels to the device (GPU or CPU)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # If the model returns more than one value (a tuple), use its first element as the outputs
        if isinstance(outputs, tuple):
            outputs = outputs[0]

        # Flatten outputs to [batch_size * max_length, num_classes] and labels to
        # [batch_size * max_length]; reshape also works when the tensor is not contiguous
        outputs = outputs.reshape(-1, outputs.size(-1))
        labels = labels.reshape(-1)

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass (calculate gradients)
        loss.backward()

        # Update weights according to the calculated gradients
        optimizer.step()

        # Accumulate per-batch loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

        # Show the running mean over the batches seen so far; dividing by the total
        # number of batches would understate both values until the last batch
        progress_bar.set_postfix(loss=running_loss / batch_idx,
                                 accuracy=running_accuracy / batch_idx)

    # Print the loss and accuracy for the epoch (mean of the per-batch values)
    print('Train Loss: {:.4f}, Train Accuracy: {:.4f}'.format(running_loss / len(train_loader),
                                                              running_accuracy / len(train_loader)))

    # After training on the whole training set, we can evaluate the model on the validation set
    model.eval()
    val_running_loss = 0.0
    val_running_accuracy = 0.0

    # We don't need to calculate gradients during validation
    with torch.no_grad():

        # Iterate through validation set
        for inputs, labels in val_loader:

            # Move inputs and labels to the device
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # If the model returns more than one value (a tuple), use its first element as the outputs
            if isinstance(outputs, tuple):
                outputs = outputs[0]

            # As before, flatten outputs and labels
            outputs = outputs.reshape(-1, outputs.size(-1))  # [batch_size * max_length, num_classes]
            labels = labels.reshape(-1)  # [batch_size * max_length]

            # Calculate loss
            loss = criterion(outputs, labels)

            # Accumulate per-batch loss and accuracy
            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

    # Mean loss and accuracy over the validation batches
    avg_val_loss = val_running_loss / len(val_loader)
    avg_val_accuracy = val_running_accuracy / len(val_loader)

    # And print it
    print('Val Loss: {:.4f}, Val Accuracy: {:.4f}'.format(avg_val_loss, avg_val_accuracy))

    # Learning rate scheduler (changing learning rate during training)
    scheduler.step()

    # Early stopping (if there is no improvement in validation accuracy for PATIENCE_TIME epochs, we stop training)
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
        # Snapshot the weights of the best epoch; clone so later updates do not alter the copy
        best_model_state = {name: tensor.detach().cpu().clone()
                            for name, tensor in model.state_dict().items()}
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= PATIENCE_TIME:
            print("Early stopping triggered. No improvement in validation accuracy.")
            break

# Restore the best-scoring weights: by the time early stopping fires the model has
# trained for PATIENCE_TIME further epochs without improving on them
if best_model_state is not None:
    model.load_state_dict(best_model_state)

# And print final results
print('Finished Training, time: ', time.time() - start_time)
print('Saving model...')
start_time = time.time()
#TODO
torch.save(model.state_dict(), 'audio_rnn_classifier_only_our_data.pth')
print("Model saved, time: ", time.time() - start_time)
print("Finished, Total time: ", time.time() - total_time)

Device:  cpu
Creating model...
Model created, time:  0.015095710754394531
Training model...


  mfcc_sequence = torch.tensor(mfcc_sequence, dtype=torch.float32)
Epoch 1/100: 100%|██████████| 8/8 [00:02<00:00,  2.69batch/s, accuracy=0.38, loss=1.09]   


Train Loss: 1.0923, Train Accuracy: 0.3803
Val Loss: 1.0869, Val Accuracy: 0.4071


Epoch 2/100: 100%|██████████| 8/8 [00:02<00:00,  3.50batch/s, accuracy=0.388, loss=1.09]  


Train Loss: 1.0881, Train Accuracy: 0.3877
Val Loss: 1.0743, Val Accuracy: 0.4042


Epoch 3/100: 100%|██████████| 8/8 [00:02<00:00,  3.39batch/s, accuracy=0.382, loss=1.07]  


Train Loss: 1.0667, Train Accuracy: 0.3822
Val Loss: 1.0489, Val Accuracy: 0.4042


Epoch 4/100: 100%|██████████| 8/8 [00:02<00:00,  3.40batch/s, accuracy=0.391, loss=1.05]  


Train Loss: 1.0524, Train Accuracy: 0.3909
Val Loss: 1.0434, Val Accuracy: 0.3962


Epoch 5/100: 100%|██████████| 8/8 [00:02<00:00,  3.35batch/s, accuracy=0.418, loss=1.04]  


Train Loss: 1.0422, Train Accuracy: 0.4185
Val Loss: 1.0316, Val Accuracy: 0.4482


Epoch 6/100: 100%|██████████| 8/8 [00:02<00:00,  3.42batch/s, accuracy=0.444, loss=1.02]  


Train Loss: 1.0234, Train Accuracy: 0.4437
Val Loss: 1.0150, Val Accuracy: 0.4528


Epoch 7/100: 100%|██████████| 8/8 [00:02<00:00,  3.54batch/s, accuracy=0.448, loss=1.01]  


Train Loss: 1.0150, Train Accuracy: 0.4481
Val Loss: 1.0296, Val Accuracy: 0.4238


Epoch 8/100: 100%|██████████| 8/8 [00:02<00:00,  3.68batch/s, accuracy=0.477, loss=0.983] 


Train Loss: 0.9829, Train Accuracy: 0.4771
Val Loss: 0.9996, Val Accuracy: 0.4712


Epoch 9/100: 100%|██████████| 8/8 [00:02<00:00,  3.51batch/s, accuracy=0.526, loss=0.953] 


Train Loss: 0.9531, Train Accuracy: 0.5258
Val Loss: 0.9740, Val Accuracy: 0.5030


Epoch 10/100: 100%|██████████| 8/8 [00:02<00:00,  3.15batch/s, accuracy=0.571, loss=0.93]  


Train Loss: 0.9302, Train Accuracy: 0.5706
Val Loss: 0.9803, Val Accuracy: 0.5068


Epoch 11/100: 100%|██████████| 8/8 [00:02<00:00,  3.39batch/s, accuracy=0.571, loss=0.89] 


Train Loss: 0.8899, Train Accuracy: 0.5710
Val Loss: 0.9566, Val Accuracy: 0.5247


Epoch 12/100: 100%|██████████| 8/8 [00:02<00:00,  3.38batch/s, accuracy=0.595, loss=0.87]  


Train Loss: 0.8703, Train Accuracy: 0.5955
Val Loss: 1.0060, Val Accuracy: 0.5223


Epoch 13/100: 100%|██████████| 8/8 [00:02<00:00,  3.22batch/s, accuracy=0.602, loss=0.858] 


Train Loss: 0.8583, Train Accuracy: 0.6020
Val Loss: 0.9793, Val Accuracy: 0.5303


Epoch 14/100: 100%|██████████| 8/8 [00:02<00:00,  3.30batch/s, accuracy=0.606, loss=0.849] 


Train Loss: 0.8491, Train Accuracy: 0.6057
Val Loss: 0.9820, Val Accuracy: 0.5340


Epoch 15/100: 100%|██████████| 8/8 [00:02<00:00,  3.22batch/s, accuracy=0.619, loss=0.832]  


Train Loss: 0.8318, Train Accuracy: 0.6194
Val Loss: 0.9527, Val Accuracy: 0.5247


Epoch 16/100: 100%|██████████| 8/8 [00:02<00:00,  3.13batch/s, accuracy=0.618, loss=0.82]  


Train Loss: 0.8200, Train Accuracy: 0.6185
Val Loss: 0.9625, Val Accuracy: 0.5417


Epoch 17/100: 100%|██████████| 8/8 [00:02<00:00,  3.15batch/s, accuracy=0.624, loss=0.817] 


Train Loss: 0.8167, Train Accuracy: 0.6242
Val Loss: 0.9853, Val Accuracy: 0.5378


Epoch 18/100: 100%|██████████| 8/8 [00:02<00:00,  3.06batch/s, accuracy=0.634, loss=0.805]  


Train Loss: 0.8052, Train Accuracy: 0.6341
Val Loss: 0.9914, Val Accuracy: 0.5315


Epoch 19/100: 100%|██████████| 8/8 [00:02<00:00,  3.12batch/s, accuracy=0.643, loss=0.797]  


Train Loss: 0.7969, Train Accuracy: 0.6431
Val Loss: 0.9806, Val Accuracy: 0.5357


Epoch 20/100: 100%|██████████| 8/8 [00:02<00:00,  2.94batch/s, accuracy=0.633, loss=0.803]  


Train Loss: 0.8033, Train Accuracy: 0.6334
Val Loss: 0.9921, Val Accuracy: 0.5390


Epoch 21/100: 100%|██████████| 8/8 [00:03<00:00,  2.38batch/s, accuracy=0.641, loss=0.788]  


Train Loss: 0.7883, Train Accuracy: 0.6408
Val Loss: 0.9790, Val Accuracy: 0.5374


Epoch 22/100: 100%|██████████| 8/8 [00:03<00:00,  2.03batch/s, accuracy=0.648, loss=0.789]  


Train Loss: 0.7895, Train Accuracy: 0.6481
Val Loss: 0.9875, Val Accuracy: 0.5386


Epoch 23/100: 100%|██████████| 8/8 [00:05<00:00,  1.47batch/s, accuracy=0.644, loss=0.782] 


Train Loss: 0.7817, Train Accuracy: 0.6436
Val Loss: 0.9868, Val Accuracy: 0.5407


Epoch 24/100: 100%|██████████| 8/8 [00:02<00:00,  2.77batch/s, accuracy=0.653, loss=0.775] 


Train Loss: 0.7749, Train Accuracy: 0.6533
Val Loss: 0.9969, Val Accuracy: 0.5368


Epoch 25/100: 100%|██████████| 8/8 [00:03<00:00,  2.64batch/s, accuracy=0.659, loss=0.767] 


Train Loss: 0.7666, Train Accuracy: 0.6587
Val Loss: 1.0081, Val Accuracy: 0.5366


Epoch 26/100: 100%|██████████| 8/8 [00:02<00:00,  3.15batch/s, accuracy=0.655, loss=0.773]  

Train Loss: 0.7733, Train Accuracy: 0.6545
Val Loss: 1.0016, Val Accuracy: 0.5383
Early stopping triggered. No improvement in validation accuracy.
Finished Training, time:  74.32534551620483
Saving model...
Model saved, time:  0.008065223693847656
Finished, Total time:  75.63036322593689



