In [1]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, MaxPooling1D, Flatten, GRU, Reshape, Dense, Dropout, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import TensorBoard, ModelCheckpoint, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from datetime import datetime
from concurrent.futures import ThreadPoolExecutor
import matplotlib.pyplot as plt

In [2]:
#  for reproducability
tf.random.set_seed(42)
np.random.seed(42)

In [3]:
def load_npy_file(file_path):
    return np.load(file_path)


In [4]:
def load_data_from_csv(csv_path, feature_dir):
    # loading the path to the mel-spec and gcc-phat csv files
    df = pd.read_csv(csv_path)
    display(df)

    gcc_paths = df['fea_gcc'].apply(lambda x: os.path.join(feature_dir, x)).to_numpy()
    spec_paths = df['fea_spec'].apply(lambda x: os.path.join(feature_dir, x)).to_numpy()
    y_train_loc_x = df['loc_x'].values
    y_train_loc_y = df['loc_y'].values

    def load_files(paths):
        with ThreadPoolExecutor(max_workers=8) as executor:
            return list(executor.map(np.load, paths))

    gcc_features = np.array(load_files(gcc_paths))
    spec_features = np.array(load_files(spec_paths))

    labels = df['subject_label'].to_numpy()

    return gcc_features, spec_features, labels, y_train_loc_x, y_train_loc_y

In [5]:
def encode_labels(labels):
    # encoding labels
    unique_labels = np.unique(labels)
    label_to_index = {label: idx for idx, label in enumerate(unique_labels)}
    encoded_labels = np.array([label_to_index[label] for label in labels])
    return encoded_labels, label_to_index

In [6]:
def build_crnn_model(input_shape_gcc, input_shape_spec, hidden_units, output_units, dropout_rate):
    # Input layers
    input_gcc = Input(shape=input_shape_gcc, name='input_gcc')
    input_spec = Input(shape=input_shape_spec, name='input_spec')

    # Convolutional layers for GCC features
    conv_gcc = Conv1D(filters=64, kernel_size=3, activation='relu')(input_gcc)
    conv_gcc = MaxPooling1D(pool_size=2)(conv_gcc)
    conv_gcc = Conv1D(filters=128, kernel_size=3, activation='relu')(conv_gcc)
    conv_gcc = MaxPooling1D(pool_size=2)(conv_gcc)

    # Convolutional layers for Mel-spectrogram features
    conv_spec = Conv1D(filters=64, kernel_size=3, activation='relu')(input_spec)
    conv_spec = MaxPooling1D(pool_size=2)(conv_spec)
    conv_spec = Conv1D(filters=128, kernel_size=3, activation='relu')(conv_spec)
    conv_spec = MaxPooling1D(pool_size=2)(conv_spec)

    # Concatenate the features along the feature axis
    concatenated = Concatenate(axis=-1)([conv_gcc, conv_spec])

    # Reshape the concatenated output to be compatible with GRU
    # Calculate the new timesteps and features after concatenation
    timesteps = concatenated.shape[1]  # Number of timesteps
    features = concatenated.shape[2]   # Number of features
    reshaped = Reshape((timesteps, features))(concatenated)

    # Recurrent layers
    rnn = GRU(hidden_units, return_sequences=False)(reshaped)
    rnn = Dropout(dropout_rate)(rnn)

    # Dense layers
    dense = Dense(hidden_units, activation='relu')(rnn)
    dense = Dropout(dropout_rate)(dense)

    # Output layers
    output_loc_x = Dense(1, name='output_loc_x')(dense)
    output_loc_y = Dense(1, name='output_loc_y')(dense)
    output_subject = Dense(output_units, activation='softmax', name='output_subject')(dense)

    model = Model(inputs=[input_gcc, input_spec], outputs=[output_loc_x, output_loc_y, output_subject])

    return model

In [7]:
feature_dir = "/content/drive/MyDrive/FYP/data/audio_feature"  # dir containing the .npy feature files
csv_train_path = "/content/drive/MyDrive/FYP/data/audio_feature/AFPILD_FE1_rd_train.csv"  # path to the training CSV file
csv_test_path = "/content/drive/MyDrive/FYP/data/audio_feature/AFPILD_FE1_rd_test.csv"  # path to the test CSV file

In [8]:
batch_size=10
epochs=5
lr=0.001

In [9]:
log_dir = os.path.join("logs", datetime.now().strftime("%Y%m%d-%H%M%S"))
os.makedirs(log_dir, exist_ok=True)  # Ensure this directory is created
print("Log directory:", log_dir)

Log directory: logs/20250309-103314


In [10]:
# loading training data
train_gcc, train_spec, train_labels, y_train_loc_x, y_train_loc_y = load_data_from_csv(csv_train_path, feature_dir)
train_labels, label_to_index = encode_labels(train_labels)

Unnamed: 0.1,Unnamed: 0,fea_spec,fea_gcc,loc_azimuth,loc_x,loc_y,subject_label
0,0,spec/afpild_fe1_s10_3_melspec_14647.npy,gcc/afpild_fe1_s10_3_gccphat_14647.npy,-60.950540,1.220343,-2.197082,S10
1,1,spec/afpild_fe1_s03_4_melspec_4423.npy,gcc/afpild_fe1_s03_4_gccphat_4423.npy,-154.491737,-2.260692,-1.078695,S03
2,2,spec/afpild_fe1_s07_1_melspec_9453.npy,gcc/afpild_fe1_s07_1_gccphat_9453.npy,38.618597,2.106657,1.682842,S07
3,3,spec/afpild_fe1_s02_1_melspec_1864.npy,gcc/afpild_fe1_s02_1_gccphat_1864.npy,-85.188679,0.182055,-2.162911,S02
4,4,spec/afpild_fe1_s10_1_melspec_14058.npy,gcc/afpild_fe1_s10_1_gccphat_14058.npy,57.953284,1.468105,2.345202,S10
...,...,...,...,...,...,...,...
7640,7640,spec/afpild_fe1_s03_2_melspec_3671.npy,gcc/afpild_fe1_s03_2_gccphat_3671.npy,135.174471,-1.706765,1.696402,S03
7641,7641,spec/afpild_fe1_s02_4_melspec_3066.npy,gcc/afpild_fe1_s02_4_gccphat_3066.npy,104.219056,-0.618445,2.440658,S02
7642,7642,spec/afpild_fe1_s06_4_melspec_9148.npy,gcc/afpild_fe1_s06_4_gccphat_9148.npy,142.471422,-2.232066,1.714494,S06
7643,7643,spec/afpild_fe1_s05_3_melspec_7401.npy,gcc/afpild_fe1_s05_3_gccphat_7401.npy,18.412090,2.435140,0.810634,S05


In [11]:
# loading test data
test_gcc, test_spec, test_labels, y_test_loc_x, y_test_loc_y = load_data_from_csv(csv_test_path, feature_dir)
test_labels = np.array([label_to_index[label] for label in test_labels])

Unnamed: 0.1,Unnamed: 0,fea_spec,fea_gcc,loc_azimuth,loc_x,loc_y,subject_label
0,0,spec/afpild_fe1_s07_2_melspec_10032.npy,gcc/afpild_fe1_s07_2_gccphat_10032.npy,-163.534055,-2.488816,-0.735612,S07
1,1,spec/afpild_fe1_s01_2_melspec_694.npy,gcc/afpild_fe1_s01_2_gccphat_694.npy,-118.879613,-1.201399,-2.178162,S01
2,2,spec/afpild_fe1_s01_4_melspec_1353.npy,gcc/afpild_fe1_s01_4_gccphat_1353.npy,165.728483,-2.503814,0.636889,S01
3,3,spec/afpild_fe1_s02_2_melspec_2192.npy,gcc/afpild_fe1_s02_2_gccphat_2192.npy,140.908809,-2.245858,1.824586,S02
4,4,spec/afpild_fe1_s03_3_melspec_4320.npy,gcc/afpild_fe1_s03_3_gccphat_4320.npy,-135.872432,-1.913522,-1.856118,S03
...,...,...,...,...,...,...,...
7640,7640,spec/afpild_fe1_s05_1_melspec_6374.npy,gcc/afpild_fe1_s05_1_gccphat_6374.npy,-128.619019,-1.709304,-2.139751,S05
7641,7641,spec/afpild_fe1_s02_3_melspec_2313.npy,gcc/afpild_fe1_s02_3_gccphat_2313.npy,168.723426,-2.281209,0.454861,S02
7642,7642,spec/afpild_fe1_s02_4_melspec_3047.npy,gcc/afpild_fe1_s02_4_gccphat_3047.npy,-17.853105,2.482435,-0.799562,S02
7643,7643,spec/afpild_fe1_s04_4_melspec_6005.npy,gcc/afpild_fe1_s04_4_gccphat_6005.npy,22.512584,2.305999,0.955770,S04


In [12]:
train_gcc = train_gcc.reshape(train_gcc.shape[0], train_gcc.shape[1], -1)  # Shape: (None, 64, 384)
test_gcc = test_gcc.reshape(test_gcc.shape[0], test_gcc.shape[1], -1)  # Shape: (None, 64, 384)
train_spec = train_spec.reshape(train_spec.shape[0], train_spec.shape[1], -1)  # Shape: (None, 64, 256)
test_spec = test_spec.reshape(test_spec.shape[0], test_spec.shape[1], -1)  # Shape: (None, 64, 256)



In [13]:
# Split the data
X_train_gcc, X_val_gcc, X_train_spec, X_val_spec, y_train, y_val, y_train_loc_x_tr, y_train_loc_x_val, y_train_loc_y_tr, y_train_loc_y_val = train_test_split(
    train_gcc, train_spec, train_labels, y_train_loc_x, y_train_loc_y, test_size=0.2, random_state=42
)

In [14]:
# Check shapes of loaded data
print("Train GCC shape:", train_gcc.shape)
print("Train Spec shape:", train_spec.shape)
print("Train labels shape:", train_labels.shape)
print("Test GCC shape:", test_gcc.shape)
print("Test Spec shape:", test_spec.shape)
print("Test labels shape:", test_labels.shape)

Train GCC shape: (7645, 64, 384)
Train Spec shape: (7645, 64, 256)
Train labels shape: (7645,)
Test GCC shape: (7645, 64, 384)
Test Spec shape: (7645, 64, 256)
Test labels shape: (7645,)


In [15]:
# Check input shapes
print("Input shape for GCC:", (X_train_gcc.shape[1], X_train_gcc.shape[2]))
print("Input shape for Spec:", (X_train_spec.shape[1], X_train_spec.shape[2]))

Input shape for GCC: (64, 384)
Input shape for Spec: (64, 256)


In [16]:
output_units = len(label_to_index)

In [17]:
model = build_crnn_model(
    input_shape_gcc=(X_train_gcc.shape[1], X_train_gcc.shape[2]),
    input_shape_spec=(X_train_spec.shape[1], X_train_spec.shape[2]),
    hidden_units=128,
    output_units=output_units,
    dropout_rate=0.2
)

# Print model summary
model.summary()

In [18]:
# Compile the model
model.compile(
    optimizer='adam',
    loss={
        'output_loc_x': 'mse',
        'output_loc_y': 'mse',
        'output_subject': 'categorical_crossentropy'
    },
    metrics={
        'output_loc_x': 'mae',
        'output_loc_y': 'mae',
        'output_subject': 'accuracy'
    }
)


In [24]:
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)
model_checkpoint_callback = ModelCheckpoint(
    filepath=os.path.join(log_dir, 'best_model.h5'),
    monitor='val_output_subject_accuracy',  # Monitor validation accuracy
    save_best_only=True,  # Save only the best model
    mode='max',  # Maximize validation accuracy
    verbose=1
)
early_stopping_callback = EarlyStopping(
    monitor='val_output_subject_accuracy',  # Monitor validation accuracy
    patience=3,  # Stop if validation accuracy doesn't improve for 3 epochs
    mode='max',  # Maximize validation accuracy
    verbose=1
)

In [25]:
history = model.fit(
    [X_train_gcc, X_train_spec],
    {
        'output_loc_x': y_train_loc_x_tr,
        'output_loc_y': y_train_loc_y_tr,
        'output_subject': tf.keras.utils.to_categorical(y_train, num_classes=output_units)
    },
    epochs=epochs,
    batch_size=batch_size,
    validation_split=0.2,
    verbose=1,
    callbacks=[tensorboard_callback, model_checkpoint_callback, early_stopping_callback]
)

Epoch 1/5
[1m489/490[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - loss: 0.7748 - output_loc_x_loss: 0.2404 - output_loc_x_mae: 0.3833 - output_loc_y_loss: 0.2469 - output_loc_y_mae: 0.3975 - output_subject_accuracy: 0.8954 - output_subject_loss: 0.2875
Epoch 1: val_output_subject_accuracy improved from -inf to 0.80474, saving model to logs/20250309-103314/best_model.h5




[1m490/490[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 37ms/step - loss: 0.7746 - output_loc_x_loss: 0.2404 - output_loc_x_mae: 0.3833 - output_loc_y_loss: 0.2468 - output_loc_y_mae: 0.3975 - output_subject_accuracy: 0.8954 - output_subject_loss: 0.2874 - val_loss: 1.2323 - val_output_loc_x_loss: 0.3952 - val_output_loc_x_mae: 0.4307 - val_output_loc_y_loss: 0.2556 - val_output_loc_y_mae: 0.3490 - val_output_subject_accuracy: 0.8047 - val_output_subject_loss: 0.5791
Epoch 2/5
[1m488/490[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 32ms/step - loss: 0.6439 - output_loc_x_loss: 0.2269 - output_loc_x_mae: 0.3726 - output_loc_y_loss: 0.2064 - output_loc_y_mae: 0.3564 - output_subject_accuracy: 0.9253 - output_subject_loss: 0.2105
Epoch 2: va

In [26]:
# Evaluate the model on the test set
_, _, y_pred_subject = model.predict([test_gcc, test_spec])
y_pred_subject_labels = np.argmax(y_pred_subject, axis=1)

# Calculate and print the accuracy
test_accuracy = accuracy_score(test_labels, y_pred_subject_labels)
print(f"Test Accuracy: {test_accuracy}")

[1m239/239[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 36ms/step
Test Accuracy: 0.787573577501635


In [27]:
# Evaluate the model
test_results = model.evaluate(
    [test_gcc, test_spec],
    {
        'output_loc_x': y_test_loc_x,
        'output_loc_y': y_test_loc_y,
        'output_subject': tf.keras.utils.to_categorical(test_labels, num_classes=output_units)
    },
    verbose=2
)

# Get the accuracy for the 'output_subject' output
subject_accuracy = test_results[3]  # Accuracy is usually the 4th element (index 3)

239/239 - 9s - 36ms/step - loss: 1.5075 - output_loc_x_loss: 0.5026 - output_loc_x_mae: 0.4784 - output_loc_y_loss: 0.2644 - output_loc_y_mae: 0.3824 - output_subject_accuracy: 0.7876 - output_subject_loss: 0.7405
