In [116]:
# ----------------------------
# Prepare training data from Metadata file
# ----------------------------
import pandas as pd
from pathlib import Path

data_path = r'D:\0_Workplace\Project_Python\audio_process\data\train'

# Read metadata file
metadata_file = r"D:\0_Workplace\Project_Python\audio_process\data\train\audio_labels1.csv"
df = pd.read_csv(metadata_file)
df.head()

# Construct file path by concatenating fold and file name
df['Relative_path'] = '/'+df['Label'].astype(str) + '/' + df['Filename'].astype(str)

# Take relevant columns
# df = df[['relative_path', 'classID']]
df.head()

Unnamed: 0,Filename,Label,ClassID,Relative_path
0,enoutput_02024_10_07_22_49_51_552246.wav,en,2,/en/enoutput_02024_10_07_22_49_51_552246.wav
1,enoutput_10002024_10_07_22_49_51_552246.wav,en,2,/en/enoutput_10002024_10_07_22_49_51_552246.wav
2,enoutput_1002024_10_07_22_49_51_552246.wav,en,2,/en/enoutput_1002024_10_07_22_49_51_552246.wav
3,enoutput_10052024_10_07_22_49_51_552246.wav,en,2,/en/enoutput_10052024_10_07_22_49_51_552246.wav
4,enoutput_10102024_10_07_22_49_51_552246.wav,en,2,/en/enoutput_10102024_10_07_22_49_51_552246.wav


In [117]:
from torch.utils.data import Dataset, random_split
from pre_processing_data import AudioUtil

# ----------------------------
# Sound Dataset
# ----------------------------
class SoundDS(Dataset):
   def __init__(self, df, data_path):
      self.df = df
      self.data_path = str(data_path)
      self.duration = 5000
      self.sr = 44100
      self.channel = 2
      self.shift_pct = 0.4
            
   # ----------------------------
   # Number of items in dataset
   # ----------------------------
   def __len__(self):
      return len(self.df)    
      
   # ----------------------------
   # Get i'th item in dataset
   # ----------------------------
   def __getitem__(self, idx):
      # Absolute file path of the audio file - concatenate the audio directory with
      # the relative path
      audio_file = self.data_path + self.df.loc[idx, 'Relative_path']
      # Get the Class ID
      class_id = self.df.loc[idx, 'ClassID']

      aud = AudioUtil.open(audio_file)
      # Some sounds have a higher sample rate, or fewer channels compared to the
      # majority. So make all sounds have the same number of channels and same 
      # sample rate. Unless the sample rate is the same, the pad_trunc will still
      # result in arrays of different lengths, even though the sound duration is
      # the same.
      reaud = AudioUtil.resample(aud, self.sr)
      rechan = AudioUtil.rechannel(reaud, self.channel)

      dur_aud = AudioUtil.pad_trunc(rechan, self.duration)
      shift_aud = AudioUtil.time_shift(dur_aud, self.shift_pct)
      # sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None) #mel spetrogram
      sgram = AudioUtil.spectro_gram(shift_aud) # mfcc
      aug_sgram = AudioUtil.spectro_augment(sgram, max_mask_pct=0.1, n_freq_masks=2, n_time_masks=2)

      return aug_sgram, class_id

In [118]:
from torch.utils.data import random_split
import torch

myds = SoundDS(df, data_path)

# Random split of 80:20 between training and validation
num_items = len(myds)
num_train = round(num_items * 0.8)
num_val = num_items - num_train
train_ds, val_ds = random_split(myds, [num_train, num_val])

# Create training and validation data loaders
train_dl = torch.utils.data.DataLoader(train_ds, batch_size=16, shuffle=True)
val_dl = torch.utils.data.DataLoader(val_ds, batch_size=16, shuffle=False)

In [122]:
from torch import nn
from audioclassifier import AudioClassifier
from audioclassifierRNN import AudioClassifierRNN

# Create the model and put it on the GPU if available
# model = nn.DataParallel(AudioClassifier())
model = nn.DataParallel(AudioClassifierRNN(40, 128, 2,3))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# Check that it is on Cuda
next(model.parameters()).device

device(type='cpu')

In [120]:
# # ----------------------------
# # Training Loop
# # ----------------------------
# import torch.utils
# import torch.utils.tensorboard


# def training(model, train_dl, num_epochs):
#     # Tensorboard
#     writer = torch.utils.tensorboard.SummaryWriter()
#     # Loss Function, Optimizer and Scheduler
#     criterion = nn.CrossEntropyLoss()
#     optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
#     scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
#                                                 steps_per_epoch=int(len(train_dl)),
#                                                 epochs=num_epochs,
#                                                 anneal_strategy='linear')
#     # Repeat for each epoch
#     for epoch in range(num_epochs):
#         running_loss = 0.0
#         correct_prediction = 0
#         total_prediction = 0

#         # Repeat for each batch in the training set
#         for i, data in enumerate(train_dl):
#             # Get the input features and target labels, and put them on the GPU
#             # print(data)
#             inputs = data[0].to(device)
#             labels = data[1].to(device)

#             # Normalize the inputs
#             inputs_m, inputs_s = inputs.mean(), inputs.std()
#             inputs = (inputs - inputs_m) / inputs_s

#             # Zero the parameter gradients
#             optimizer.zero_grad()

#             # forward + backward + optimize
#             outputs = model(inputs)
#             loss = criterion(outputs, labels)
#             loss.backward()
#             optimizer.step()
#             scheduler.step()

#             # Keep stats for Loss and Accuracy
#             running_loss += loss.item()

#             # Get the predicted class with the highest score
#             _, prediction = torch.max(outputs,1)
#             # Count of predictions that matched the target label
#             correct_prediction += (prediction == labels).sum().item()
#             total_prediction += prediction.shape[0]

#             #if i % 10 == 0:    # print every 10 mini-batches
#             #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
        
#         # Print stats at the end of the epoch
#         num_batches = len(train_dl)
#         avg_loss = running_loss / num_batches
#         avg_acc = correct_prediction/total_prediction
#         writer.add_scalar("Loss/train", avg_loss, epoch)
#         writer.add_scalar("Acc/train", avg_acc, epoch)
#         print(f'Epoch: {epoch}, Loss: {avg_loss:.3f}, Accuracy: {avg_acc:.3f}')
#         # Save model
#         torch.save(model.state_dict(), 'model_1310_2.pt')
#     print('Finished Training')
    

In [123]:
# ----------------------------
# Training Loop
# ----------------------------
import torch.utils
import torch.utils.tensorboard


def training(model, train_dl, num_epochs):
    # Tensorboard
    writer = torch.utils.tensorboard.SummaryWriter()
    # Loss Function, Optimizer and Scheduler
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(),lr=0.001)
    scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer, max_lr=0.001,
                                                steps_per_epoch=int(len(train_dl)),
                                                epochs=num_epochs,
                                                anneal_strategy='linear')
    # Repeat for each epoch
    for epoch in range(num_epochs):
        running_loss = 0.0
        correct_prediction = 0
        total_prediction = 0

        # Repeat for each batch in the training set
        for i, data in enumerate(train_dl):
            # Get the input features and target labels, and put them on the GPU
            inputs = data[0].to(device)
            labels = data[1].to(device)

            # inputs shape: (batch_size, num_channels, height, width)
            # You need to reshape it to (batch_size, sequence_length, input_size)
            
            batch_size, num_channels, height, width = inputs.shape

            # Reshape inputs to (batch_size, height, num_channels * width)
            inputs = inputs.view(batch_size, height, num_channels * width)

            # Zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = model(inputs)  # Now inputs have correct shape for LSTM
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            # Keep stats for Loss and Accuracy
            running_loss += loss.item()

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs, 1)
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]


            #if i % 10 == 0:    # print every 10 mini-batches
            #    print('[%d, %5d] loss: %.3f' % (epoch + 1, i + 1, running_loss / 10))
        
        # Print stats at the end of the epoch
        num_batches = len(train_dl)
        avg_loss = running_loss / num_batches
        avg_acc = correct_prediction/total_prediction
        writer.add_scalar("Loss/train", avg_loss, epoch)
        writer.add_scalar("Acc/train", avg_acc, epoch)
        print(f'Epoch: {epoch}, Loss: {avg_loss:.3f}, Accuracy: {avg_acc:.3f}')
        # Save model
        torch.save(model.state_dict(), 'model_1310_2.pt')
    print('Finished Training')
    

In [124]:
num_epochs=20
training(model, train_dl, num_epochs)

RuntimeError: input.size(-1) must be equal to input_size. Expected 40, got 860

In [101]:
# ----------------------------
# Inference
# ----------------------------
def inference (model, test_dl):
    correct_prediction = 0
    total_prediction = 0

    # Disable gradient updates
    with torch.no_grad():
        for data in test_dl:
            # Get the input features and target labels, and put them on the GPU
            inputs, labels = data[0].to(device), data[1].to(device)

            # Normalize the inputs
            inputs_m, inputs_s = inputs.mean(), inputs.std()
            inputs = (inputs - inputs_m) / inputs_s

            # Get predictions
            outputs = model(inputs)

            # Get the predicted class with the highest score
            _, prediction = torch.max(outputs,1)
            # Count of predictions that matched the target label
            correct_prediction += (prediction == labels).sum().item()
            total_prediction += prediction.shape[0]
        
    acc = correct_prediction/total_prediction
    print(f'Accuracy: {acc:.2f}, Total items: {total_prediction}')

# Run inference on trained model with the validation set load best model weights
model_inf = nn.DataParallel(AudioClassifier())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_inf = model_inf.to(device)
model_inf.load_state_dict(torch.load('model_1310_2.pt'))
model_inf.eval()

inference(model_inf, val_dl)

  model_inf.load_state_dict(torch.load('model_1310_2.pt'))


Accuracy: 1.00, Total items: 694


In [102]:
# ----------------------------
# Predict with a single audio file
# ----------------------------
def predict(model, audio_file):
    # Set model to evaluation mode
    model.eval()

    # Load the audio file
    aud = AudioUtil.open(audio_file)
    reaud = AudioUtil.resample(aud, 44100)  # Ensure sampling rate is 44100 Hz
    rechan = AudioUtil.rechannel(reaud, 2)  # Ensure stereo audio (2 channels)

    # Pad or truncate the audio to 4 seconds (4000 ms)
    dur_aud = AudioUtil.pad_trunc(rechan, 5000)

    # Shift the audio for augmentation (can be skipped if not needed)
    shift_aud = AudioUtil.time_shift(dur_aud, 0.4)

    # Generate a Mel-Spectrogram
    # sgram = AudioUtil.spectro_gram(shift_aud, n_mels=64, n_fft=1024, hop_len=None)
    sgram = AudioUtil.spectro_gram(shift_aud)


    # Normalize the spectrogram
    sgram_m, sgram_s = sgram.mean(), sgram.std()
    sgram = (sgram - sgram_m) / sgram_s

    # Add batch dimension since model expects a batch of inputs
    sgram = sgram.unsqueeze(0)

    # Move to the same device as the model
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    sgram = sgram.to(device)

    # Disable gradient calculation (since we're only doing inference)
    with torch.no_grad():
        outputs = model(sgram)

    # Get the predicted class with the highest score
    _, prediction = torch.max(outputs, 1)

    # Return the predicted class ID
    return prediction.item()



In [103]:
# Load the trained model
model_inf = nn.DataParallel(AudioClassifier())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_inf = model_inf.to(device)
model_inf.load_state_dict(torch.load('model.pt'))

audio_file = r"D:\0_Workplace\data_audio\tu_thu\vi_5_2024_10_13_22_33_00_847800.wav"
# Predict for a single audio file
prediction = predict(model_inf, audio_file)
print(f'Predicted class ID: {prediction}')

Predicted class ID: 2


  model_inf.load_state_dict(torch.load('model.pt'))


In [108]:
import os

folder_path = r"D:\0_Workplace\data_audio\test_en"
model_inf = nn.DataParallel(AudioClassifier())
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_inf = model_inf.to(device)
model_inf.load_state_dict(torch.load('model_1310.pt'))
count_vi = 0
count_en = 0
count_ko = 0

for audio_file in os.listdir(folder_path):
    if audio_file.endswith(".wav"):  # Chỉ xử lý file .wav
        audio_file_path = os.path.join(folder_path, audio_file)

        prediction = predict(model_inf, audio_file_path)
        if prediction == 1:
            count_vi = count_vi + 1
        if prediction == 2:
            count_en = count_en + 1
        if prediction == 3:
            count_ko = count_ko + 1
        print(f'{audio_file} ==> Predicted class ID: {prediction}')
print(f"count_vi:  {count_vi}")
print(f"count_en:  {count_en}")
print(f"count_ko:  {count_ko}")



  model_inf.load_state_dict(torch.load('model_1310.pt'))


enoutput_17252024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17302024_10_07_22_49_51_552246.wav ==> Predicted class ID: 3
enoutput_17352024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17402024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17452024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17502024_10_07_22_49_51_552246.wav ==> Predicted class ID: 3
enoutput_17552024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17602024_10_07_22_49_51_552246.wav ==> Predicted class ID: 1
enoutput_17652024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17702024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17752024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17802024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17852024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17902024_10_07_22_49_51_552246.wav ==> Predicted class ID: 2
enoutput_17952024_10