# **Data preprocessing**

In [15]:
import os
import librosa
import csv
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

# Location of the training recordings (.wav files are listed from this directory)
data_dir = '../data/train'

# Preprocessing / batching settings
BATCH_SIZE = 16      # number of sequences handed to each DataLoader batch
REFRESH_TIME = 0.25  # length of a single audio chunk, in seconds

# Function to load labels from csv file
def load_labels(csv_filee):
    """Read (label, duration) pairs from a CSV file.

    Each non-empty row contributes one tuple: the first column parsed as ``int``
    (the label) and the second column parsed as ``float`` (the duration).
    Rows that are empty or contain only whitespace are skipped, so a trailing
    blank line in the file no longer raises ``IndexError``.

    Args:
        csv_filee: Path of the CSV file to read.

    Returns:
        list[tuple[int, float]]: The parsed rows, in file order.

    Raises:
        ValueError: If a non-empty row holds a value that cannot be parsed.
        IndexError: If a non-empty row has fewer than two columns.
    """
    labels_c = []
    # newline='' lets the csv module handle line endings itself (recommended by the csv docs)
    with open(csv_filee, 'r', newline='') as file:
        for row in csv.reader(file):
            # csv.reader yields [] for blank lines; also ignore whitespace-only rows
            if not any(cell.strip() for cell in row):
                continue
            labels_c.append((int(row[0]), float(row[1])))
    return labels_c

# Look up which labelled segment a timestamp falls into
def get_label_for_time(labels_t, time_t):
    """Return the label of the segment that contains ``time_t``.

    ``labels_t`` is a sequence of ``(label, duration)`` pairs laid end to end
    starting at zero. ``time_t`` must be expressed in the same unit as the
    durations. When ``time_t`` lies at or beyond the end of the last segment,
    the last segment's label is returned.
    """
    segment_end = 0
    for segment_label, segment_length in labels_t:
        segment_end = segment_end + segment_length
        if not time_t < segment_end:
            # Timestamp is past this segment; keep walking
            continue
        return segment_label
    # Ran off the end of the annotated timeline: fall back to the final label
    return labels_t[-1][0]

# Creating list of files.
# Sorted so the order of `train_data` does not depend on the (arbitrary) order
# in which the operating system lists the directory.
wav_files = sorted(
    os.path.join(data_dir, file) for file in os.listdir(data_dir) if file.endswith('.wav')
)
train_data = []

# Main loop to preprocess data into MFCCs
for wav_file in wav_files:
    # Swap only the extension; str.replace would also rewrite a '.wav' that
    # happens to appear earlier in the path (e.g. in a directory name).
    csv_file = os.path.splitext(wav_file)[0] + '.csv'
    if not os.path.exists(csv_file):
        # Recordings without a label file are silently skipped
        continue

    # Load audio (down-mixed to mono) and the (label, duration) annotations
    y, sr = librosa.load(wav_file, mono=True)
    labels = load_labels(csv_file)

    # Number of samples in one REFRESH_TIME-long chunk
    chunk_size = int(sr * REFRESH_TIME)

    # MFCCs for this recording: a list of (mean mfcc coefficients, label) tuples,
    # one per chunk. `train_data` ends up as a list of such lists.
    mfcc_sequence = []

    # Iterate through every REFRESH_TIME-long audio chunk
    for i in range(0, len(y), chunk_size):
        frame = y[i:i + chunk_size]
        # Drop the trailing chunk when it is shorter than a full chunk
        if len(frame) == chunk_size:
            # librosa's default number of coefficients is used here
            mfcc = librosa.feature.mfcc(y=frame, sr=sr)
            # Average over the frame axis -> one coefficient vector per chunk
            mfcc_mean = mfcc.mean(axis=1)
            # Chunk start in milliseconds. Named `chunk_start_ms` rather than
            # `time` so it cannot shadow the `time` module used by the training cell.
            # NOTE(review): assumes the CSV durations are in milliseconds - confirm.
            chunk_start_ms = i / sr * 1000
            label = get_label_for_time(labels, chunk_start_ms)
            mfcc_sequence.append((mfcc_mean, label))

    # Recordings shorter than one chunk produce no features and are left out
    if mfcc_sequence:
        train_data.append(mfcc_sequence)

In [16]:
# Report how many chunks each preprocessed sequence contains

lengths = list(map(len, train_data))
shortest, longest = min(lengths), max(lengths)
print("Min length: ", shortest)
print("Max length: ", longest)
print(lengths)

Min length:  50
Max length:  140
[122, 132, 120, 123, 124, 124, 124, 123, 120, 50, 121, 128, 125, 121, 127, 124, 140, 125, 124, 121, 122, 124, 123, 126, 125, 120, 123, 122, 123, 124, 125, 124, 125, 126, 120, 121, 121, 122, 120, 128, 120, 127, 126, 125, 120, 126, 121, 121, 127, 124, 124, 120, 121, 126, 122, 125, 120, 121, 125, 121, 120, 120, 120, 125, 122, 122, 124, 123, 123, 124, 124, 131, 120, 129, 121, 125, 126, 120, 121, 122, 125, 123, 126, 121, 120, 120, 122, 122, 120, 121, 125, 121, 123, 123, 120, 122, 121, 123, 121, 135, 123, 121, 121, 123, 122, 120, 124, 120, 122, 121, 128, 121, 123, 121, 129, 136, 131, 125, 122, 122, 121, 130, 125, 124, 121, 123, 126, 123, 121, 120, 123, 121, 122, 124, 131, 125, 120, 122, 120, 123, 121, 120, 123, 120, 120, 123, 120, 120, 124, 121, 120, 124, 122, 123, 130, 120, 121, 120, 125, 124, 122, 123, 124, 122, 122, 126, 125, 123, 121, 121, 126, 120, 124, 126, 127, 124, 129, 122, 125, 120, 121, 124, 122, 121, 129, 124, 131, 123, 120, 121, 125, 121, 122, 12

# **Data Loader**

In [18]:
# Split data into train and validation sets.
# A fixed seed keeps the split - and therefore the reported validation metrics -
# reproducible from run to run.
# NOTE: this cell rebinds `train_data`; re-running it without re-running the
# preprocessing cell first would split the already-reduced training set again.
SPLIT_SEED = 42
train_data, val_data = train_test_split(train_data, test_size=0.2, random_state=SPLIT_SEED)

# Dataset wrappers used by the DataLoaders below
from model_classes import AudioDataset
import torch

train_dataset = AudioDataset(train_data)
val_dataset = AudioDataset(val_data)

def collate_fn(batch):
    """Pad a batch of variable-length sequences to a common length.

    Args:
        batch: Iterable of ``(sequence, labels)`` pairs. Each ``sequence`` is a
            tensor indexed as (time, features); each ``labels`` holds one label
            per time step. All sequences in a batch must share the same
            feature width.

    Returns:
        tuple: ``(padded_sequences, padded_labels)`` where
        ``padded_sequences`` has shape (batch, max_length, features) and
        ``padded_labels`` has shape (batch, max_length) with dtype ``torch.long``.
        Positions past a sequence's own length are filled with zeros.
    """
    sequences, labels_t = zip(*batch)
    max_length = max(seq.size(0) for seq in sequences)
    # Take the feature width from the data instead of hard-coding 20, so the
    # function keeps working if the number of MFCC coefficients changes.
    feature_dim = sequences[0].size(-1)
    padded_sequences = torch.zeros(len(sequences), max_length, feature_dim)
    # NOTE(review): padded label positions are 0; if 0 is also a real class id,
    # padding is indistinguishable from that class in the loss and accuracy.
    padded_labels = torch.zeros(len(sequences), max_length, dtype=torch.long)
    for j, (seq, seq_labels) in enumerate(zip(sequences, labels_t)):
        padded_sequences[j, :seq.size(0), :] = seq
        padded_labels[j, :len(seq_labels)] = seq_labels
    return padded_sequences, padded_labels

# Both loaders share batch size and padding logic; only the training loader shuffles
loader_options = dict(batch_size=BATCH_SIZE, collate_fn=collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_options)

# **Training**

In [19]:
import time
import torch.optim as optim
from tqdm import tqdm
from sklearn.metrics import accuracy_score
from model_classes import AudioClassifierLSTM as AudioClassifier
import torch.nn as nn

REFRESH_TIME = 0.25  # Refresh time in seconds in future realtime
NUM_EPOCHS = 100  # Number of epochs (the more epoch the better model, but it takes more time)
PATIENCE_TIME = 10  # Number of epochs without improvement in validation accuracy that will stop training
LEARNING_RATE = 0.001  # Learning rate
# Batch size (amount of sequences in one batch).
# NOTE: train_loader / val_loader are built before this cell, so changing the
# value here does not change the batch size they actually use.
BATCH_SIZE = 16

# Check if CUDA is available (learning on GPU is much faster)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

total_time = time.time()
start_time = time.time()

# Create model object
print("Creating model...")
model = AudioClassifier()
model = model.to(device)
print("Model created, time: ", time.time() - start_time)

# Define loss function and optimizer (network parameters)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
# Halve the learning rate every 5 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)

# Early-stopping bookkeeping
best_val_accuracy = 0.0
early_stopping_counter = 0
# Copy of the weights from the epoch with the best validation accuracy so far.
# Without it, the weights saved at the end would be the ones from PATIENCE_TIME
# epochs *after* the best epoch.
best_state_dict = None

print("Training model...")
start_time = time.time()

# Iterate through epochs
for epoch in range(NUM_EPOCHS):

    # Enable training on model object
    model.train()

    # Initialize running loss and accuracy
    running_loss = 0.0
    running_accuracy = 0.0
    # It's just a fancy progress bar in console
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', unit='batch')

    # Iterate through batches (batch_idx counts the batches processed so far, from 1)
    for batch_idx, (inputs, labels) in enumerate(progress_bar, start=1):

        # Move inputs and labels to the device (GPU or CPU)
        inputs = inputs.to(device)
        labels = labels.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(inputs)

        # If the model returns more than one value, use the first one as the outputs
        if isinstance(outputs, tuple):
            outputs = outputs[0]

        # Flattening outputs and labels from [batch_size, max_length, num_classes]
        # NOTE(review): positions padded by collate_fn carry label 0 and are
        # included in both the loss and the accuracy below.
        outputs = outputs.view(-1, outputs.size(-1))  # Flattening to [batch_size * max_length, num_classes]
        labels = labels.view(-1)  # Flattening to [batch_size * max_length]

        # Calculate loss
        loss = criterion(outputs, labels)

        # Backward pass (calculate gradients)
        loss.backward()

        # Update weights according to the calculated gradients
        optimizer.step()

        # Calculate running loss and accuracy
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

        # Update progress bar with the mean over the batches seen so far.
        # Dividing by the total number of batches would under-report both
        # values until the very last batch of the epoch.
        progress_bar.set_postfix(loss=running_loss / batch_idx,
                                  accuracy=running_accuracy / batch_idx)

    # Print the loss and accuracy for the epoch
    print('Train Loss: {:.4f}, Train Accuracy: {:.4f}'.format(running_loss / len(train_loader),
                                                              running_accuracy / len(train_loader)))

    # After training on the whole training set, we can evaluate the model on the validation set
    model.eval()
    val_running_loss = 0.0
    val_running_accuracy = 0.0

    # We don't need to calculate gradients during validation
    with torch.no_grad():

        # Iterate through validation set
        for inputs, labels in val_loader:

            # Move inputs and labels to the device
            inputs = inputs.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(inputs)

            # If the model returns more than one value, use the first one as the outputs
            if isinstance(outputs, tuple):
                outputs = outputs[0]

            # As previous, we need to flatten outputs and labels
            outputs = outputs.view(-1, outputs.size(-1)) # Flattening to [batch_size * max_length, num_classes]
            labels = labels.view(-1) # Flattening to [batch_size * max_length]

            # Calculate loss
            loss = criterion(outputs, labels)

            # Accumulate the per-batch loss and per-batch accuracy
            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

    # Mean of the per-batch loss and accuracy over the validation set
    avg_val_loss = val_running_loss / len(val_loader)
    avg_val_accuracy = val_running_accuracy / len(val_loader)

    # And print it
    print('Val Loss: {:.4f}, Val Accuracy: {:.4f}'.format(avg_val_loss, avg_val_accuracy))

    # Learning rate scheduler (changing learning rate during training)
    scheduler.step()

    # Early stopping (if there is no improvement in validation accuracy for PATIENCE_TIME epochs, we stop training)
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
        # state_dict() returns references to the live parameters, so clone them
        # to get a snapshot that later optimizer steps cannot change
        best_state_dict = {name: tensor.detach().clone()
                           for name, tensor in model.state_dict().items()}
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= PATIENCE_TIME:
            print("Early stopping triggered. No improvement in validation accuracy.")
            break

# And print final results
print('Finished Training, time: ', time.time() - start_time)
print('Saving model...')
start_time = time.time()
# Put the best-validation weights back into the model so that the in-memory
# model and the saved file agree. If no epoch ever improved on the initial
# best_val_accuracy, the current weights are saved unchanged.
if best_state_dict is not None:
    model.load_state_dict(best_state_dict)
#TODO
torch.save(model.state_dict(), 'audio_rnn_classifier.pth')
print("Model saved, time: ", time.time() - start_time)
print("Finished, Total time: ", time.time() - total_time)

Device:  cpu
Creating model...
Model created, time:  0.0136566162109375
Training model...


Epoch 1/100: 100%|██████████| 13/13 [00:03<00:00,  3.78batch/s, accuracy=0.393, loss=1.08] 


Train Loss: 1.0827, Train Accuracy: 0.3930
Val Loss: 1.0525, Val Accuracy: 0.4217


Epoch 2/100: 100%|██████████| 13/13 [00:03<00:00,  4.30batch/s, accuracy=0.525, loss=0.945] 


Train Loss: 0.9446, Train Accuracy: 0.5253
Val Loss: 0.7924, Val Accuracy: 0.6855


Epoch 3/100: 100%|██████████| 13/13 [00:02<00:00,  4.35batch/s, accuracy=0.685, loss=0.723]


Train Loss: 0.7235, Train Accuracy: 0.6849
Val Loss: 0.7009, Val Accuracy: 0.6701


Epoch 4/100: 100%|██████████| 13/13 [00:03<00:00,  4.25batch/s, accuracy=0.727, loss=0.629] 


Train Loss: 0.6293, Train Accuracy: 0.7267
Val Loss: 0.5109, Val Accuracy: 0.7988


Epoch 5/100: 100%|██████████| 13/13 [00:03<00:00,  3.58batch/s, accuracy=0.801, loss=0.492] 


Train Loss: 0.4916, Train Accuracy: 0.8015
Val Loss: 0.4248, Val Accuracy: 0.8322


Epoch 6/100: 100%|██████████| 13/13 [00:04<00:00,  2.77batch/s, accuracy=0.837, loss=0.421] 


Train Loss: 0.4214, Train Accuracy: 0.8373
Val Loss: 0.4098, Val Accuracy: 0.8411


Epoch 8/100: 100%|██████████| 13/13 [00:03<00:00,  3.26batch/s, accuracy=0.866, loss=0.355]]


Train Loss: 0.3552, Train Accuracy: 0.8657
Val Loss: 0.3558, Val Accuracy: 0.8612


Epoch 7/100: 100%|██████████| 13/13 [00:04<00:00,  2.98batch/s, accuracy=0.857, loss=0.379] 


Train Loss: 0.3792, Train Accuracy: 0.8571
Val Loss: 0.3603, Val Accuracy: 0.8689


Epoch 9/100: 100%|██████████| 13/13 [00:04<00:00,  3.05batch/s, accuracy=0.874, loss=0.335] 


Train Loss: 0.3354, Train Accuracy: 0.8744
Val Loss: 0.3500, Val Accuracy: 0.8632


Epoch 10/100: 100%|██████████| 13/13 [00:04<00:00,  2.83batch/s, accuracy=0.878, loss=0.323] 


Train Loss: 0.3231, Train Accuracy: 0.8779
Val Loss: 0.3212, Val Accuracy: 0.8773


Epoch 11/100: 100%|██████████| 13/13 [00:04<00:00,  3.14batch/s, accuracy=0.885, loss=0.301] 


Train Loss: 0.3013, Train Accuracy: 0.8849
Val Loss: 0.3137, Val Accuracy: 0.8779


Epoch 12/100: 100%|██████████| 13/13 [00:04<00:00,  2.77batch/s, accuracy=0.89, loss=0.299]  


Train Loss: 0.2990, Train Accuracy: 0.8895
Val Loss: 0.3170, Val Accuracy: 0.8778


Epoch 13/100: 100%|██████████| 13/13 [00:04<00:00,  2.83batch/s, accuracy=0.889, loss=0.29]  


Train Loss: 0.2904, Train Accuracy: 0.8894
Val Loss: 0.3094, Val Accuracy: 0.8826


Epoch 14/100: 100%|██████████| 13/13 [00:04<00:00,  2.81batch/s, accuracy=0.896, loss=0.282] 


Train Loss: 0.2818, Train Accuracy: 0.8961
Val Loss: 0.3151, Val Accuracy: 0.8799


Epoch 15/100: 100%|██████████| 13/13 [00:04<00:00,  2.98batch/s, accuracy=0.895, loss=0.278] 


Train Loss: 0.2775, Train Accuracy: 0.8949
Val Loss: 0.2913, Val Accuracy: 0.8947


Epoch 16/100: 100%|██████████| 13/13 [00:04<00:00,  3.11batch/s, accuracy=0.902, loss=0.265]


Train Loss: 0.2651, Train Accuracy: 0.9017
Val Loss: 0.2963, Val Accuracy: 0.8928


Epoch 17/100: 100%|██████████| 13/13 [00:04<00:00,  3.12batch/s, accuracy=0.9, loss=0.265]   


Train Loss: 0.2652, Train Accuracy: 0.8999
Val Loss: 0.2950, Val Accuracy: 0.8923


Epoch 18/100: 100%|██████████| 13/13 [00:03<00:00,  3.43batch/s, accuracy=0.903, loss=0.263] 


Train Loss: 0.2628, Train Accuracy: 0.9026
Val Loss: 0.3094, Val Accuracy: 0.8845


Epoch 19/100: 100%|██████████| 13/13 [00:04<00:00,  2.77batch/s, accuracy=0.9, loss=0.26]   


Train Loss: 0.2602, Train Accuracy: 0.8998
Val Loss: 0.3045, Val Accuracy: 0.8903


Epoch 20/100: 100%|██████████| 13/13 [00:04<00:00,  2.79batch/s, accuracy=0.905, loss=0.253] 


Train Loss: 0.2526, Train Accuracy: 0.9048
Val Loss: 0.2942, Val Accuracy: 0.8957


Epoch 21/100: 100%|██████████| 13/13 [00:04<00:00,  2.80batch/s, accuracy=0.905, loss=0.248] 


Train Loss: 0.2482, Train Accuracy: 0.9055
Val Loss: 0.2962, Val Accuracy: 0.8956


Epoch 22/100: 100%|██████████| 13/13 [00:04<00:00,  3.00batch/s, accuracy=0.906, loss=0.247] 


Train Loss: 0.2470, Train Accuracy: 0.9060
Val Loss: 0.2940, Val Accuracy: 0.8972


Epoch 23/100: 100%|██████████| 13/13 [00:04<00:00,  3.16batch/s, accuracy=0.907, loss=0.245] 


Train Loss: 0.2450, Train Accuracy: 0.9071
Val Loss: 0.2918, Val Accuracy: 0.8987


Epoch 24/100: 100%|██████████| 13/13 [00:04<00:00,  3.08batch/s, accuracy=0.907, loss=0.241] 


Train Loss: 0.2412, Train Accuracy: 0.9075
Val Loss: 0.3010, Val Accuracy: 0.8928


Epoch 25/100: 100%|██████████| 13/13 [00:04<00:00,  3.06batch/s, accuracy=0.908, loss=0.24]  


Train Loss: 0.2405, Train Accuracy: 0.9084
Val Loss: 0.2914, Val Accuracy: 0.8984


Epoch 26/100: 100%|██████████| 13/13 [00:04<00:00,  3.07batch/s, accuracy=0.909, loss=0.24]  


Train Loss: 0.2405, Train Accuracy: 0.9086
Val Loss: 0.2997, Val Accuracy: 0.8935


Epoch 27/100: 100%|██████████| 13/13 [00:04<00:00,  3.11batch/s, accuracy=0.91, loss=0.237]  


Train Loss: 0.2374, Train Accuracy: 0.9103
Val Loss: 0.2939, Val Accuracy: 0.8989


Epoch 28/100: 100%|██████████| 13/13 [00:03<00:00,  3.25batch/s, accuracy=0.911, loss=0.234] 


Train Loss: 0.2339, Train Accuracy: 0.9115
Val Loss: 0.2982, Val Accuracy: 0.8967


Epoch 29/100: 100%|██████████| 13/13 [00:04<00:00,  3.22batch/s, accuracy=0.911, loss=0.236] 


Train Loss: 0.2360, Train Accuracy: 0.9111
Val Loss: 0.2964, Val Accuracy: 0.8981


Epoch 30/100: 100%|██████████| 13/13 [00:04<00:00,  3.17batch/s, accuracy=0.913, loss=0.234] 


Train Loss: 0.2344, Train Accuracy: 0.9126
Val Loss: 0.2961, Val Accuracy: 0.8977


Epoch 31/100: 100%|██████████| 13/13 [00:04<00:00,  3.10batch/s, accuracy=0.912, loss=0.234] 


Train Loss: 0.2339, Train Accuracy: 0.9125
Val Loss: 0.2973, Val Accuracy: 0.8972


Epoch 32/100: 100%|██████████| 13/13 [00:04<00:00,  3.08batch/s, accuracy=0.911, loss=0.231] 


Train Loss: 0.2306, Train Accuracy: 0.9112
Val Loss: 0.2980, Val Accuracy: 0.8969


Epoch 33/100: 100%|██████████| 13/13 [00:04<00:00,  3.16batch/s, accuracy=0.911, loss=0.237] 


Train Loss: 0.2367, Train Accuracy: 0.9112
Val Loss: 0.2967, Val Accuracy: 0.8972


Epoch 34/100: 100%|██████████| 13/13 [00:04<00:00,  3.11batch/s, accuracy=0.913, loss=0.229] 


Train Loss: 0.2294, Train Accuracy: 0.9130
Val Loss: 0.2983, Val Accuracy: 0.8971


Epoch 35/100: 100%|██████████| 13/13 [00:04<00:00,  3.08batch/s, accuracy=0.912, loss=0.228] 


Train Loss: 0.2284, Train Accuracy: 0.9120
Val Loss: 0.2971, Val Accuracy: 0.8978


Epoch 36/100: 100%|██████████| 13/13 [00:03<00:00,  3.33batch/s, accuracy=0.913, loss=0.228] 


Train Loss: 0.2276, Train Accuracy: 0.9131
Val Loss: 0.2981, Val Accuracy: 0.8964


Epoch 37/100: 100%|██████████| 13/13 [00:04<00:00,  2.89batch/s, accuracy=0.911, loss=0.229] 


Train Loss: 0.2293, Train Accuracy: 0.9115
Val Loss: 0.2988, Val Accuracy: 0.8968
Early stopping triggered. No improvement in validation accuracy.
Finished Training, time:  161.5369589328766
Saving model...
Model saved, time:  0.010174989700317383
Finished, Total time:  161.56196188926697
