In [None]:
import tensorflow as tf
import tensorflow_hub as hub
import librosa
import os
import numpy as np
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import Dropout, Dense, InputLayer, Flatten
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the VGGish model published at this TensorFlow Hub handle.
# get_vggish_embeddings() calls it directly on a waveform.
model = hub.load("https://tfhub.dev/google/vggish/1")

In [8]:
# Dataset location and preprocessing settings shared by the cells below.
# The path is assembled with os.path.join so the same relative location
# resolves on Windows and on POSIX hosts (a literal backslash only works
# as a separator on Windows).
dataset_dir = os.path.join('VocalSet1-2', 'data_by_technique')
fixed_sample_rate = 16000  # Hz; passed to librosa.load as the resampling rate
fixed_duration = 5         # seconds; clips are padded/truncated to this length
classes = 17               # width of the one-hot label encoding
x = []  # one embedding per audio file, filled by the dataset loop
y = []  # integer class label for the matching entry of x

In [9]:
def preprocess_vocalset_data(file_path):
    """Load one audio file as a fixed-length, mono, float32 tensor.

    The file is resampled to ``fixed_sample_rate`` while loading, then
    zero-padded at the end or truncated so that it holds exactly
    ``fixed_sample_rate * fixed_duration`` samples.

    Args:
        file_path: Path of the audio file to load.

    Returns:
        A 1-D ``tf.float32`` tensor of ``fixed_sample_rate * fixed_duration``
        samples. The return type is a tensor regardless of whether the clip
        was padded, truncated, or already the right length.
    """
    audio, sample_rate = librosa.load(file_path, sr=fixed_sample_rate)

    # Defensive downmix. librosa returns multi-channel audio shaped
    # (channels, samples), so the channels are averaged along axis 0.
    if audio.ndim > 1:
        audio = np.mean(audio, axis=0)

    # Fix the length while the data is still a NumPy array: np.pad on a
    # tensor would silently hand back an ndarray and make the return type
    # depend on the clip length.
    target_length = int(sample_rate * fixed_duration)
    if len(audio) < target_length:
        audio = np.pad(audio, (0, target_length - len(audio)))
    else:
        audio = audio[:target_length]

    return tf.convert_to_tensor(audio, dtype=tf.float32)

In [10]:
def get_vggish_embeddings(waveform):
    """Run the module-level VGGish ``model`` on a waveform.

    Args:
        waveform: Audio samples to embed, passed to ``model`` unchanged.

    Returns:
        The model output, after checking that its static shape is
        compatible with ``[None, 128]``.
    """
    vggish_output = model(waveform)
    # Fail fast if the model ever yields something other than 128-wide rows.
    vggish_output.shape.assert_is_compatible_with([None, 128])
    return vggish_output

In [11]:
# Build the dataset: one VGGish embedding and one integer label per .wav file.

# Start from empty lists so re-running this cell does not append every
# sample a second time.
x.clear()
y.clear()

# Only sub-directories are classes. Filtering before enumerate() keeps the
# labels contiguous from 0 even when stray files sit next to the class
# folders, so no "-1" correction (which gave the first class label -1) is needed.
class_dirs = sorted(
    entry for entry in os.listdir(dataset_dir)
    if os.path.isdir(os.path.join(dataset_dir, entry))
)

for label, class_dir in enumerate(class_dirs):
    class_path = os.path.join(dataset_dir, class_dir)
    # Sorted so the sample order (and therefore the seeded train/test split)
    # does not depend on the file system's listing order.
    for file_name in sorted(os.listdir(class_path)):
        if not file_name.endswith('.wav'):
            continue
        file_path = os.path.join(class_path, file_name)
        audio_tensor = preprocess_vocalset_data(file_path)
        embeddings = get_vggish_embeddings(audio_tensor)
        x.append(embeddings)
        y.append(label)

In [12]:
# Stack the collected embeddings and labels into NumPy arrays.
x1, y1 = np.array(x), np.array(y)

In [13]:
# One-hot encode the integer labels into `classes` columns.
# NOTE: y1 is overwritten with its own encoding, so re-running only this cell
# would feed the already-encoded array back into to_categorical.
y1 = to_categorical(y1, num_classes=classes)

In [14]:
# Hold out 20% of the samples; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x1,
    y1,
    test_size=0.2,
    random_state=1,
)

In [None]:
# Layer instances for the classifier stacked on top of the VGGish embeddings.
# They are assembled into a model in the Sequential cell below.
dense1 = Dense(1024, activation='relu')
dense2 = Dense(512, activation='relu')
dense3 = Dense(128, activation='relu')
# A single Dropout instance; the Sequential cell places it at two positions.
dropout_layer = Dropout(0.5)
# Output width follows `classes` so it always matches the one-hot label
# width produced by to_categorical(..., num_classes=classes).
output_layer = Dense(classes, activation='softmax')
# Per-sample input shape is read from the training array (axes 1 and 2).
input_layer = InputLayer(shape=(x_train.shape[1], x_train.shape[2]))
flatten_layer = Flatten()


In [16]:
# Assemble the classifier: input -> flatten -> dense stack -> softmax.
# The same dropout_layer instance appears after dense1 and after dense3.
model1 = tf.keras.Sequential(
    [
        input_layer,
        flatten_layer,
        dense1,
        dropout_layer,
        dense2,
        dense3,
        dropout_layer,
        output_layer,
    ]
)

In [17]:
# Show the model's layers so the architecture can be checked before training.
model1.summary()

In [18]:
# Training setup: Adam optimizer, categorical cross-entropy to match the
# one-hot labels, and accuracy reported as the metric.
model1.compile(
    optimizer=Adam(learning_rate=1e-4),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [19]:
# Train for 50 epochs.
# NOTE(review): the held-out test split is also passed as validation data,
# so the val_* numbers in the log are computed on the test set.
model1.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
)

Epoch 1/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 8ms/step - accuracy: 0.1493 - loss: 2.6752 - val_accuracy: 0.3444 - val_loss: 2.0761
Epoch 2/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.2941 - loss: 2.1822 - val_accuracy: 0.3956 - val_loss: 1.7996
Epoch 3/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.3744 - loss: 1.8992 - val_accuracy: 0.4398 - val_loss: 1.6650
Epoch 4/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4030 - loss: 1.7659 - val_accuracy: 0.4564 - val_loss: 1.5467
Epoch 5/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4389 - loss: 1.7033 - val_accuracy: 0.5076 - val_loss: 1.4513
Epoch 6/50
[1m91/91[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.4651 - loss: 1.5528 - val_accuracy: 0.5353 - val_loss: 1.3796
Epoch 7/50
[1m91/91[0m [32m━━━━━━━━━━

<keras.src.callbacks.history.History at 0x24fe7fdc550>