# **Model classes**

In [72]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset

# Model that uses LSTM network
class AudioClassifier(nn.Module):
    def __init__(self):
        super(AudioClassifier, self).__init__()

        # input_size - number of features in the input (20 MFCC coefficients)
        self.lstm = nn.LSTM(input_size=20, hidden_size=256, num_layers=3, batch_first=True, dropout=0.2)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)

    def forward(self, x):
        x, _ = self.lstm(x)  # Zwraca wszystkie ukryte stany
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)  # Zwraca etykiety dla każdej klatki
        return x

class AudioDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):

        # Get the sequence consisting of MFCCs and labels
        sequence = self.data[idx]

        # Extract MFCCs and labels from the sequence
        mfcc_sequence = [item[0] for item in sequence]
        labels = [item[1] for item in sequence]

        # Convert the lists to PyTorch tensors
        mfcc_sequence = torch.tensor(mfcc_sequence, dtype=torch.float32)
        labels = torch.tensor(labels, dtype=torch.long)

        # Return lists of MFCC ndarrays and labels
        return mfcc_sequence, labels

# Model that use GRU network
class AudioClassifierGRU(nn.Module):
    def __init__(self):
        super(AudioClassifierGRU, self).__init__()
        self.gru = nn.GRU(input_size=20, hidden_size=256, num_layers=3, batch_first=True, dropout=0.2)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 3)

    def forward(self, x):
        x = x.permute(0, 2, 1)  # Swap the dimensions for GRU (batch, seq, feature)
        _, hn = self.gru(x)  # hn contains the last hidden state
        x = hn[-1]  # Take the last hidden state
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Padding for sequence
def collate_fn(batch):

    # Get the sequences and labels from the batch
    sequences, labels = zip(*batch)

    # Get the lengths of sequences
    lengths = [seq.size(0) for seq in sequences]

    # Get the maximum length
    max_length = max(lengths)

    # Create padded sequences and labels
    padded_sequences = torch.zeros(len(sequences), max_length, 20)
    padded_labels = torch.zeros(len(sequences), max_length, dtype=torch.long)

    # Fill the padded sequences and labels
    for i, seq in enumerate(sequences):
        padded_sequences[i, :seq.size(0), :] = seq
        padded_labels[i, :len(labels[i])] = labels[i]

    # Return padded sequences and labels
    return padded_sequences, padded_labels

# **Data preprocessing**

In [67]:
import os
import time
import librosa
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

REFRESH_TIME = 0.25  # seconds
BATCH_SIZE = 32

# Directories with data
exhale_dir = 'small-data/exhale'
inhale_dir = 'small-data/inhale'
silence_dir = 'small-data/silence'

# Creating list of files
exhale_files = [os.path.join(exhale_dir, file) for file in os.listdir(exhale_dir)]
inhale_files = [os.path.join(inhale_dir, file) for file in os.listdir(inhale_dir)]
silence_files = [os.path.join(silence_dir, file) for file in os.listdir(silence_dir)]
train_data = []
files_list = [exhale_files, inhale_files, silence_files]
files_names = ['exhale', 'inhale', 'silence']

# Amount of sequences of every class
exhale_frames_size = 0
inhale_frames_size = 0
silence_frames_size = 0

# Main loop to preprocess data into MFCCs
for label, files in enumerate(files_list):

    # Iterate through all files (potentially longer audio recording with different classes)
    for file in files:

        # Load vaw file and keep file's sampling rate
        y, sr = librosa.load(file, mono=True)

        # Calculate chunk size
        chunk_size = int(sr * 0.05)  # 0.25 seconds for example

        # List of MFCCs for every data sequence (it will be a list of MFCC coefficients lists)
        mfcc_sequence = []

        # Iterate trough every 0.25s audio chunk
        for i in range(0, len(y), chunk_size):

            # Get frames of current chunk
            frame = y[i:i + chunk_size]

            if len(frame) == chunk_size:  # Ignore the last frame if it's shorter

                # Calculate MFCCs (it will be a vector of MFCC coefficients - a vector of vectors)
                mfcc = librosa.feature.mfcc(y=frame, sr=sr)  # Default n_mfcc = 20 (20 coefficients per subframe)

                # Because we have a list of MFCC vectors, we can calculate the mean of every coefficient so we get just one set of coefficients for every 0.25s chunk
                mfcc_mean = mfcc.mean(axis=1)

                # Append the mean of MFCCs to the list of MFCCs for the current data sequence
                mfcc_sequence.append((mfcc_mean, label))

        print(file)

        if mfcc_sequence:

            # Append the list of MFCCs for the current data sequence to the list of all data sequences
            train_data.append(mfcc_sequence)

    # Print the amount of sequences for every class
    if label == 0:
        exhale_frames_size = len(train_data)
        print("Exhale frames size: ", exhale_frames_size)
    elif label == 1:
        inhale_frames_size = len(train_data) - exhale_frames_size
        print("Inhale frames size: ", inhale_frames_size)
    else:
        silence_frames_size = len(train_data) - exhale_frames_size - inhale_frames_size
        print("Silence frames size: ", silence_frames_size)



small-data/exhale/master_exhale3.wav
small-data/exhale/master_exhale6.wav
small-data/exhale/master_exhale5.wav
small-data/exhale/master_exhale15.wav
small-data/exhale/master_exhale37.wav
small-data/exhale/master_exhale21.wav
small-data/exhale/master_exhale41.wav
small-data/exhale/master_exhale25.wav
small-data/exhale/master_exhale43.wav
small-data/exhale/master_exhale39.wav
small-data/exhale/master_exhale1.wav
small-data/exhale/master_exhale22.wav
small-data/exhale/master_exhale11.wav
small-data/exhale/master_exhale49.wav
small-data/exhale/master_exhale19.wav
small-data/exhale/master_exhale23.wav
small-data/exhale/master_exhale10.wav
small-data/exhale/master_exhale14.wav
small-data/exhale/master_exhale12.wav
small-data/exhale/master_exhale36.wav
small-data/exhale/master_exhale44.wav
small-data/exhale/master_exhale7.wav
small-data/exhale/master_exhale40.wav
small-data/exhale/master_exhale42.wav
small-data/exhale/master_exhale29.wav
small-data/exhale/master_exhale13.wav
small-data/exhale

# **Data Loader**

In [74]:
# train_data is a list of sequences
# every sequence is a list of tuples (mfcc_mean, label)
# mfcc_mean is a list of 20 MFCC coefficients
# label is a class label (0, 1, 2)

# Split data into train and validation sets
train_data, val_data = train_test_split(train_data, test_size=0.2)

# Utwórz instancje AudioDataset
train_dataset = AudioDataset(train_data)
val_dataset = AudioDataset(val_data)

# Utwórz DataLoaders
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_fn)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_fn)

# **Training**

In [75]:
import time
import torch.optim as optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from sklearn.metrics import accuracy_score

REFRESH_TIME = 0.25
NUM_EPOCHS = 100
PATIENCE_TIME = 10
LEARNING_RATE = 0.001
BATCH_SIZE = 32

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Device: ", device)

total_time = time.time()
start_time = time.time()

print("Creating model...")
model = AudioClassifier()
model = model.to(device)
print("Model created, time: ", time.time() - start_time)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.5)


best_val_accuracy = 0.0
early_stopping_counter = 0

print("Training model...")
start_time = time.time()
for epoch in range(NUM_EPOCHS):
    model.train()
    running_loss = 0.0
    running_accuracy = 0.0
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch + 1}/{NUM_EPOCHS}', unit='batch')
    for inputs, labels in progress_bar:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(inputs)

        outputs = outputs.view(-1, outputs.size(-1))  # Spłaszczenie do [batch_size * max_length, num_classes]
        labels = labels.view(-1)  # Spłaszczenie do [batch_size * max_length]
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        _, predicted = torch.max(outputs, 1)
        running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())

        progress_bar.set_postfix(loss=running_loss / len(progress_bar),
                                  accuracy=running_accuracy / len(progress_bar))
    print('Train Loss: {:.4f}, Train Accuracy: {:.4f}'.format(running_loss / len(train_loader),
                                                              running_accuracy / len(train_loader)))

    model.eval()
    val_running_loss = 0.0
    val_running_accuracy = 0.0
    with torch.no_grad():
        for inputs, labels in val_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            outputs = model(inputs)

            outputs = outputs.view(-1, outputs.size(-1))  # Spłaszczenie do [batch_size * max_length, num_classes]
            labels = labels.view(-1)  # Spłaszczenie do [batch_size * max_length]
            loss = criterion(outputs, labels)

            val_running_loss += loss.item()
            _, predicted = torch.max(outputs, 1)
            val_running_accuracy += accuracy_score(labels.cpu(), predicted.cpu())
    avg_val_loss = val_running_loss / len(val_loader)
    avg_val_accuracy = val_running_accuracy / len(val_loader)
    print('Val Loss: {:.4f}, Val Accuracy: {:.4f}'.format(avg_val_loss, avg_val_accuracy))

    scheduler.step()

    # Early stopping
    if avg_val_accuracy > best_val_accuracy:
        best_val_accuracy = avg_val_accuracy
        early_stopping_counter = 0
    else:
        early_stopping_counter += 1
        if early_stopping_counter >= PATIENCE_TIME:
            print("Early stopping triggered. No improvement in validation accuracy.")
            break

print('Finished Training, time: ', time.time() - start_time)
print('Saving model...')
start_time = time.time()
torch.save(model.state_dict(), 'audio_rnn_classifier.pth')
print("Model saved, time: ", time.time() - start_time)
print("Finished, Total time: ", time.time() - total_time)

Device:  cpu
Creating model...
Model created, time:  0.0974419116973877
Training model...


Epoch 1/100: 100%|██████████| 3/3 [00:06<00:00,  2.18s/batch, accuracy=0.832, loss=1.04] 


Train Loss: 1.0425, Train Accuracy: 0.8315
Val Loss: 1.0242, Val Accuracy: 0.7793


Epoch 2/100: 100%|██████████| 3/3 [00:05<00:00,  1.94s/batch, accuracy=0.813, loss=0.96] 


Train Loss: 0.9597, Train Accuracy: 0.8131
Val Loss: 0.8672, Val Accuracy: 0.7793


Epoch 3/100: 100%|██████████| 3/3 [00:06<00:00,  2.06s/batch, accuracy=0.831, loss=0.642]


Train Loss: 0.6420, Train Accuracy: 0.8313
Val Loss: 0.6379, Val Accuracy: 0.7793


Epoch 4/100: 100%|██████████| 3/3 [00:06<00:00,  2.09s/batch, accuracy=0.827, loss=0.486]


Train Loss: 0.4863, Train Accuracy: 0.8274
Val Loss: 0.3003, Val Accuracy: 0.7793


Epoch 5/100: 100%|██████████| 3/3 [00:06<00:00,  2.28s/batch, accuracy=0.822, loss=0.28] 


Train Loss: 0.2805, Train Accuracy: 0.8217
Val Loss: 0.3025, Val Accuracy: 0.7793


Epoch 6/100: 100%|██████████| 3/3 [00:07<00:00,  2.55s/batch, accuracy=0.845, loss=0.259] 


Train Loss: 0.2593, Train Accuracy: 0.8449
Val Loss: 0.2813, Val Accuracy: 0.8373


Epoch 7/100: 100%|██████████| 3/3 [00:06<00:00,  2.01s/batch, accuracy=0.895, loss=0.231] 


Train Loss: 0.2312, Train Accuracy: 0.8952
Val Loss: 0.2701, Val Accuracy: 0.8387


Epoch 8/100: 100%|██████████| 3/3 [00:05<00:00,  1.94s/batch, accuracy=0.896, loss=0.235]


Train Loss: 0.2353, Train Accuracy: 0.8964
Val Loss: 0.2612, Val Accuracy: 0.8387


Epoch 9/100: 100%|██████████| 3/3 [00:06<00:00,  2.33s/batch, accuracy=0.9, loss=0.215]   


Train Loss: 0.2151, Train Accuracy: 0.9002
Val Loss: 0.2546, Val Accuracy: 0.8407


Epoch 10/100: 100%|██████████| 3/3 [00:06<00:00,  2.27s/batch, accuracy=0.906, loss=0.199] 


Train Loss: 0.1990, Train Accuracy: 0.9057
Val Loss: 0.2479, Val Accuracy: 0.8400


Epoch 11/100: 100%|██████████| 3/3 [00:07<00:00,  2.46s/batch, accuracy=0.909, loss=0.191]


Train Loss: 0.1907, Train Accuracy: 0.9086
Val Loss: 0.2460, Val Accuracy: 0.8353


Epoch 12/100: 100%|██████████| 3/3 [00:06<00:00,  2.25s/batch, accuracy=0.911, loss=0.185] 


Train Loss: 0.1847, Train Accuracy: 0.9109
Val Loss: 0.2706, Val Accuracy: 0.8303


Epoch 13/100: 100%|██████████| 3/3 [00:07<00:00,  2.43s/batch, accuracy=0.907, loss=0.187] 


Train Loss: 0.1875, Train Accuracy: 0.9067
Val Loss: 0.2941, Val Accuracy: 0.8303


Epoch 14/100: 100%|██████████| 3/3 [00:06<00:00,  2.21s/batch, accuracy=0.905, loss=0.179] 


Train Loss: 0.1791, Train Accuracy: 0.9053
Val Loss: 0.2999, Val Accuracy: 0.8303


Epoch 15/100: 100%|██████████| 3/3 [00:06<00:00,  2.11s/batch, accuracy=0.948, loss=0.165] 


Train Loss: 0.1653, Train Accuracy: 0.9480
Val Loss: 0.2963, Val Accuracy: 0.9587


Epoch 16/100: 100%|██████████| 3/3 [00:06<00:00,  2.13s/batch, accuracy=0.972, loss=0.171] 


Train Loss: 0.1714, Train Accuracy: 0.9717
Val Loss: 0.2924, Val Accuracy: 0.9640


Epoch 17/100: 100%|██████████| 3/3 [00:07<00:00,  2.41s/batch, accuracy=0.975, loss=0.155] 


Train Loss: 0.1547, Train Accuracy: 0.9748
Val Loss: 0.2868, Val Accuracy: 0.9670


Epoch 18/100: 100%|██████████| 3/3 [00:06<00:00,  2.31s/batch, accuracy=0.979, loss=0.161] 


Train Loss: 0.1607, Train Accuracy: 0.9788
Val Loss: 0.2809, Val Accuracy: 0.9680


Epoch 19/100: 100%|██████████| 3/3 [00:07<00:00,  2.51s/batch, accuracy=0.981, loss=0.147] 


Train Loss: 0.1468, Train Accuracy: 0.9812
Val Loss: 0.2748, Val Accuracy: 0.9723


Epoch 20/100: 100%|██████████| 3/3 [00:07<00:00,  2.41s/batch, accuracy=0.983, loss=0.155] 


Train Loss: 0.1549, Train Accuracy: 0.9830
Val Loss: 0.2677, Val Accuracy: 0.9733


Epoch 21/100: 100%|██████████| 3/3 [00:07<00:00,  2.45s/batch, accuracy=0.985, loss=0.13]  


Train Loss: 0.1303, Train Accuracy: 0.9855
Val Loss: 0.2642, Val Accuracy: 0.9733


Epoch 22/100: 100%|██████████| 3/3 [00:07<00:00,  2.49s/batch, accuracy=0.985, loss=0.134] 


Train Loss: 0.1340, Train Accuracy: 0.9849
Val Loss: 0.2605, Val Accuracy: 0.9737


Epoch 23/100: 100%|██████████| 3/3 [00:07<00:00,  2.39s/batch, accuracy=0.987, loss=0.127]


Train Loss: 0.1272, Train Accuracy: 0.9871
Val Loss: 0.2566, Val Accuracy: 0.9740


Epoch 24/100: 100%|██████████| 3/3 [00:07<00:00,  2.41s/batch, accuracy=0.986, loss=0.134]


Train Loss: 0.1337, Train Accuracy: 0.9861
Val Loss: 0.2528, Val Accuracy: 0.9740


Epoch 25/100: 100%|██████████| 3/3 [00:07<00:00,  2.35s/batch, accuracy=0.986, loss=0.132] 


Train Loss: 0.1320, Train Accuracy: 0.9865
Val Loss: 0.2493, Val Accuracy: 0.9737


Epoch 26/100: 100%|██████████| 3/3 [00:07<00:00,  2.57s/batch, accuracy=0.988, loss=0.118] 


Train Loss: 0.1183, Train Accuracy: 0.9881
Val Loss: 0.2474, Val Accuracy: 0.9737


Epoch 27/100: 100%|██████████| 3/3 [00:07<00:00,  2.50s/batch, accuracy=0.987, loss=0.118] 


Train Loss: 0.1177, Train Accuracy: 0.9874
Val Loss: 0.2452, Val Accuracy: 0.9737


Epoch 28/100: 100%|██████████| 3/3 [00:06<00:00,  2.26s/batch, accuracy=0.987, loss=0.119] 


Train Loss: 0.1187, Train Accuracy: 0.9873
Val Loss: 0.2432, Val Accuracy: 0.9737


Epoch 29/100: 100%|██████████| 3/3 [00:07<00:00,  2.61s/batch, accuracy=0.988, loss=0.114] 


Train Loss: 0.1139, Train Accuracy: 0.9878
Val Loss: 0.2413, Val Accuracy: 0.9740


Epoch 30/100: 100%|██████████| 3/3 [00:06<00:00,  2.33s/batch, accuracy=0.986, loss=0.118] 


Train Loss: 0.1179, Train Accuracy: 0.9863
Val Loss: 0.2393, Val Accuracy: 0.9740


Epoch 31/100: 100%|██████████| 3/3 [00:07<00:00,  2.43s/batch, accuracy=0.988, loss=0.11]  


Train Loss: 0.1100, Train Accuracy: 0.9884
Val Loss: 0.2383, Val Accuracy: 0.9740


Epoch 32/100: 100%|██████████| 3/3 [00:07<00:00,  2.47s/batch, accuracy=0.988, loss=0.124] 


Train Loss: 0.1236, Train Accuracy: 0.9883
Val Loss: 0.2371, Val Accuracy: 0.9740


Epoch 33/100: 100%|██████████| 3/3 [00:06<00:00,  2.20s/batch, accuracy=0.99, loss=0.105] 


Train Loss: 0.1052, Train Accuracy: 0.9903
Val Loss: 0.2360, Val Accuracy: 0.9740
Early stopping triggered. No improvement in validation accuracy.
Finished Training, time:  251.04413747787476
Saving model...
Model saved, time:  0.05625510215759277
Finished, Total time:  251.2057912349701
