In [1]:
# Mount Google Drive inside the Colab VM so files under /content/drive are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Make the dataset folder on the mounted Drive the current working directory.
%cd /content/drive/MyDrive/Animal-Soundprepros

/content/drive/MyDrive/Animal-Soundprepros


In [3]:
import librosa
import numpy as np
import warnings
warnings.filterwarnings("ignore")

def stats(matrix):
    """Summarise an array with five numbers.

    Args:
        matrix: numpy array of any shape (only ``.size`` and the numpy
            reductions are used; every reduction runs over all elements).

    Returns:
        numpy.ndarray: ``[mean, std, max, min, median]`` of ``matrix``, or five
        zeros when ``matrix`` has no elements.
    """
    # An empty input has no meaningful statistics; fall back to a zero vector
    # so callers always get five values.
    if not matrix.size:
        return np.zeros(5)

    # Order matters: downstream feature vectors rely on this exact layout.
    reducers = (np.mean, np.std, np.max, np.min, np.median)
    return np.array([reduce_fn(matrix) for reduce_fn in reducers])

def librosa_featurize(filename):
    """Turn one audio file into a fixed-order 1-D vector of summary statistics.

    The returned vector is the concatenation, in this order, of:

    * onset features    - number of detected onsets, ``stats`` of the onset
      frame indices, the estimated tempo, ``stats`` of the onset-strength
      envelope;
    * rhythm features   - ``stats`` of each of the first (up to) 13 tempogram
      rows;
    * spectral features - ``stats`` of each of the first (up to) 13 MFCC rows,
      of the first two ``poly_features`` rows, and of the spectral centroid,
      bandwidth, contrast (first row only), flatness and roll-off;
    * power features    - ``stats`` of the zero-crossing rate and of the RMS
      energy.

    Args:
        filename: Path of the audio file; passed to ``librosa.load`` with its
            default settings.

    Returns:
        numpy.ndarray: 1-D feature vector.
    """
    y, sr = librosa.load(filename)
    # Keep every third sample to shrink the signal.
    # NOTE(review): `sr` is not divided by 3, so every call below is told the
    # pre-decimation rate, and plain slicing applies no anti-alias filter.
    # Left unchanged on purpose: correcting it would alter every sr-dependent
    # feature value and needs the band/bin settings of the spectral features
    # revisited for the lower Nyquist frequency.
    y = y[::3]

    # Magnitude spectrogram, needed only by poly_features.
    S = np.abs(librosa.stft(y))

    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    poly_features = librosa.feature.poly_features(S=S, sr=sr)
    tempogram = librosa.feature.tempogram(y=y, sr=sr)

    # Each call returns a 2-D array; only its first row is summarised.
    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=y, sr=sr)[0]
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)[0]
    spectral_flatness = librosa.feature.spectral_flatness(y=y)[0]
    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr)[0]

    onset = librosa.onset.onset_detect(y=y, sr=sr)

    # `librosa.beat.tempo` is the deprecated name of `librosa.feature.tempo`;
    # prefer the new location and fall back for older librosa releases.
    estimate_tempo = getattr(librosa.feature, "tempo", None) or librosa.beat.tempo

    onset_features = np.concatenate([
        np.array([len(onset)]),
        stats(onset),  # stats() already yields five zeros when no onset was found
        np.array([estimate_tempo(y=y, sr=sr)[0]]),
        stats(librosa.onset.onset_strength(y=y, sr=sr))
    ])

    rhythm_features = np.concatenate([stats(tempogram[i]) for i in range(min(13, tempogram.shape[0]))])

    spectral_features = np.concatenate([
        *[stats(mfcc[i]) for i in range(min(13, mfcc.shape[0]))],
        stats(poly_features[0]),
        stats(poly_features[1]),
        stats(spectral_centroid),
        stats(spectral_bandwidth),
        stats(spectral_contrast),
        stats(spectral_flatness),
        stats(spectral_rolloff)
    ])

    power_features = np.concatenate([
        stats(librosa.feature.zero_crossing_rate(y=y)[0]),
        stats(librosa.feature.rms(y=y)[0])
    ])

    return np.concatenate([
        onset_features,
        rhythm_features,
        spectral_features,
        power_features
    ])


In [14]:
import librosa
import os
from sklearn.model_selection import train_test_split
from keras.utils import to_categorical
import numpy as np
from tqdm import tqdm

# Default dataset root used by the helpers below; get_labels() treats each
# sub-directory of this folder as one class label.
DATA_PATH = "/content/drive/MyDrive/Animal-Soundprepros"


def get_labels(path=DATA_PATH):
    """List the class labels found under ``path``.

    Every immediate sub-directory of ``path`` counts as one label.  The names
    are sorted because ``os.listdir`` returns entries in arbitrary order, and a
    label's position in this list is what callers use as its class index.

    Args:
        path: Dataset root folder.

    Returns:
        tuple: ``(labels, label_indices, one_hot)`` - the sorted label names,
        ``np.arange(len(labels))``, and ``to_categorical`` of those indices.
    """
    labels = sorted(
        entry for entry in os.listdir(path)
        if os.path.isdir(os.path.join(path, entry))
    )
    label_indices = np.arange(0, len(labels))
    return labels, label_indices, to_categorical(label_indices)


def wav2mfcc(file_path, max_len=13):
    """Load an audio file and return its MFCC matrix with exactly ``max_len`` frames.

    The file is read as mono at its native sampling rate (``sr=None``).  Along
    the frame axis (axis 1) the matrix is right-padded with zeros when it has
    fewer than ``max_len`` columns and cut to the first ``max_len`` columns
    otherwise.

    Args:
        file_path: Path of the audio file.
        max_len: Number of frames (columns) in the returned matrix.

    Returns:
        numpy.ndarray: MFCC matrix whose second dimension equals ``max_len``.
    """
    signal, rate = librosa.load(file_path, mono=True, sr=None)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate)

    # Positive when the clip is too short and needs zero-padding on the right.
    missing = max_len - coeffs.shape[1]
    if missing > 0:
        return np.pad(coeffs, pad_width=((0, 0), (0, missing)), mode='constant')

    return coeffs[:, :max_len]


def save_mfcc_data_to_array(path=DATA_PATH, max_len=13):
    """Compute MFCC matrices for every label folder and save one array per label.

    For each label returned by ``get_labels(path)``, every ``.wav`` file in
    ``<path>/<label>/`` is converted with ``wav2mfcc`` and the stacked result is
    written to ``<path>/<label>.npy``.

    NOTE: ``save_feature_data_to_array`` uses the same ``<label>.npy`` file
    names, so when both write into the same folder the later call overwrites
    the earlier one's output.

    Args:
        path: Dataset root folder containing one sub-directory per label.
        max_len: Number of MFCC frames kept per file (forwarded to ``wav2mfcc``).
    """
    labels, _, _ = get_labels(path)

    for label in labels:
        label_dir = os.path.join(path, label)
        # Only audio files, in a deterministic order: os.listdir order is
        # arbitrary and a stray non-audio file would make librosa.load fail.
        wavfiles = [
            os.path.join(label_dir, name)
            for name in sorted(os.listdir(label_dir))
            if name.lower().endswith('.wav')
        ]

        mfcc_vectors = [
            wav2mfcc(wavfile, max_len=max_len)
            for wavfile in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label))
        ]
        np.save(os.path.join(path, label + '.npy'), mfcc_vectors)


def save_feature_data_to_array(path=DATA_PATH):
    """Compute librosa feature vectors for every label folder and save one array per label.

    For each label returned by ``get_labels(path)``, every ``.wav`` file in
    ``<path>/<label>/`` is passed through ``librosa_featurize`` and the stacked
    result is written to ``<path>/<label>.npy`` - the location
    ``get_train_test`` reads from - instead of whatever the current working
    directory happens to be.

    NOTE: ``save_mfcc_data_to_array`` writes files with the same names, so the
    later of the two calls overwrites the earlier one's output.

    Args:
        path: Dataset root folder containing one sub-directory per label.
    """
    labels, _, _ = get_labels(path)

    for label in labels:
        label_dir = os.path.join(path, label)
        # Deterministic order; os.listdir itself guarantees none.
        wavfiles = [
            os.path.join(label_dir, name)
            for name in sorted(os.listdir(label_dir))
            if name.lower().endswith('.wav')
        ]

        feature_vectors = [
            librosa_featurize(wavfile)
            for wavfile in tqdm(wavfiles, "Saving vectors of label - '{}'".format(label))
        ]
        np.save(os.path.join(path, label + '.npy'), feature_vectors)


def get_train_test(split_ratio=0.8, random_state=42, path=DATA_PATH):
    """Load the saved per-label arrays and split them into train and test sets.

    ``<path>/<label>.npy`` is loaded for every label from ``get_labels(path)``.
    The arrays are stacked along the first axis into ``X``; ``y`` holds, for
    each row, the position of its label in the label list (as floats).

    Args:
        split_ratio: Fraction of samples assigned to the training set.
        random_state: Seed forwarded to ``train_test_split``.
        path: Folder holding the ``.npy`` files and the label sub-directories.

    Returns:
        list: ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.

    Raises:
        ValueError: If no label directory is found under ``path``.
    """
    labels, _, _ = get_labels(path)
    if not labels:
        raise ValueError("no label directories found under {!r}".format(path))

    # Load everything first and stack once, rather than re-copying a growing
    # array on every loop iteration.
    per_label = [np.load(os.path.join(path, label + '.npy')) for label in labels]

    X = np.vstack(per_label)
    y = np.concatenate([
        np.full(block.shape[0], fill_value=float(class_idx))
        for class_idx, block in enumerate(per_label)
    ])

    assert X.shape[0] == len(y)

    return train_test_split(X, y, test_size=(1 - split_ratio), random_state=random_state, shuffle=True)


def prepare_dataset(path=DATA_PATH):
    """Load every clip of every label and compute its (variable-length) MFCC matrix.

    Args:
        path: Dataset root folder containing one sub-directory per label.

    Returns:
        dict: ``{label: {'path': [file paths], 'mfcc': [MFCC matrices]}}`` with
        the two lists aligned index by index.
    """
    labels, _, _ = get_labels(path)
    data = {}
    for label in labels:
        label_dir = os.path.join(path, label)
        # os.path.join supplies the separator between the root and the label
        # folder; plain string concatenation of the two yields a wrong path.
        wav_paths = [
            os.path.join(label_dir, name)
            for name in sorted(os.listdir(label_dir))
            if name.lower().endswith('.wav')
        ]

        vectors = []
        for wavfile in wav_paths:
            wave, sr = librosa.load(wavfile, mono=True, sr=None)
            # Downsampling: keep every third sample, so the effective sampling
            # rate is a third of the file's native rate.
            wave = wave[::3]
            # The signal must be passed by keyword (`y=`); the rate is derived
            # from the file instead of assuming a fixed 16 kHz.
            mfcc = librosa.feature.mfcc(y=wave, sr=sr / 3)
            vectors.append(mfcc)

        data[label] = {'path': wav_paths, 'mfcc': vectors}

    return data


def load_dataset(path=DATA_PATH, limit=100):
    """Flatten ``prepare_dataset`` output into a list of ``(label, mfcc)`` pairs.

    Pairs are emitted label by label, in the order ``prepare_dataset`` returns
    them, so a small ``limit`` only covers the first label(s).

    Args:
        path: Dataset root folder, forwarded to ``prepare_dataset``.
        limit: Maximum number of pairs to return; ``None`` returns all of them.
            Defaults to 100.

    Returns:
        list: Up to ``limit`` ``(label, mfcc)`` tuples.
    """
    data = prepare_dataset(path)

    dataset = [
        (label, mfcc)
        for label, entry in data.items()
        for mfcc in entry['mfcc']
    ]

    return dataset[:limit]


In [11]:
# Extract the librosa feature vector of every clip and write one `<label>.npy`
# file per class (must run before the arrays can be loaded below).
save_feature_data_to_array()

# Load the per-class arrays back and split them into train and test sets
# (get_train_test defaults: split_ratio=0.8, random_state=42).
X_train, X_test, y_train, y_test = get_train_test()

Saving vectors of label - 'Aslan': 100%|██████████| 50/50 [00:12<00:00,  3.89it/s]
Saving vectors of label - 'Bear': 100%|██████████| 50/50 [00:11<00:00,  4.29it/s]
Saving vectors of label - 'Cat': 100%|██████████| 50/50 [00:11<00:00,  4.50it/s]
Saving vectors of label - 'Chicken': 100%|██████████| 50/50 [00:12<00:00,  4.06it/s]
Saving vectors of label - 'Cow': 100%|██████████| 50/50 [00:10<00:00,  4.84it/s]
Saving vectors of label - 'Dog': 100%|██████████| 50/50 [00:09<00:00,  5.06it/s]
Saving vectors of label - 'Dolphin': 100%|██████████| 50/50 [00:11<00:00,  4.40it/s]
Saving vectors of label - 'Donkey': 100%|██████████| 50/50 [00:12<00:00,  3.97it/s]
Saving vectors of label - 'Elephant': 100%|██████████| 50/50 [00:12<00:00,  4.05it/s]
Saving vectors of label - 'Frog': 100%|██████████| 50/50 [00:11<00:00,  4.46it/s]
Saving vectors of label - 'Horse': 100%|██████████| 50/50 [00:09<00:00,  5.25it/s]
Saving vectors of label - 'Monkey': 100%|██████████| 50/50 [00:12<00:00,  4.00it/s]
Sav

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fit a 100-tree random forest (fixed seed) on the training split.
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out split.
y_pred = rf_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

# Report: headline accuracy first, then the confusion matrix and per-class table.
print("Random Forest Classifier Evaluation")
print(f"Accuracy: {accuracy * 100:.2f}%")
print("Confusion Matrix:", conf_matrix, sep="\n")
print("Classification Report:", class_report, sep="\n")

Random Forest Classifier Evaluation
Accuracy: 74.62%
Confusion Matrix:
[[ 9  1  0  0  0  0  0  0  0  0  0  0  0]
 [ 1 13  0  0  1  0  0  0  0  0  1  0  0]
 [ 0  0  8  0  1  0  1  0  1  0  0  0  0]
 [ 0  0  0  5  1  1  0  0  1  0  0  0  0]
 [ 0  0  0  0 11  0  0  0  0  0  0  0  0]
 [ 0  0  0  1  0  8  0  0  0  0  0  0  0]
 [ 0  0  1  1  0  0  7  0  1  0  2  0  1]
 [ 0  0  0  0  0  0  0  8  0  0  0  0  0]
 [ 1  0  0  1  0  0  0  0  7  1  0  0  1]
 [ 0  0  0  0  0  0  0  0  0  7  0  0  0]
 [ 0  1  0  0  0  0  0  0  0  0  6  2  0]
 [ 0  0  0  0  0  0  0  0  1  0  0  4  1]
 [ 0  1  0  0  0  1  1  0  0  0  1  3  4]]
Classification Report:
              precision    recall  f1-score   support

         0.0       0.82      0.90      0.86        10
         1.0       0.81      0.81      0.81        16
         2.0       0.89      0.73      0.80        11
         3.0       0.62      0.62      0.62         8
         4.0       0.79      1.00      0.88        11
         5.0       0.80      0.89 

In [15]:
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D
from keras.utils import to_categorical

# Number of MFCC frames kept per clip (second dimension of each sample).
feature_dim_2 = 13

# Write one `<label>.npy` of MFCC matrices per class; this has to run before
# the arrays are loaded below.
save_mfcc_data_to_array(max_len=feature_dim_2)

# Load the per-class arrays back and split them into train and test sets.
X_train, X_test, y_train, y_test = get_train_test()

# First dimension of each sample: rows of the MFCC matrix returned by wav2mfcc
# (librosa's default number of coefficients).
feature_dim_1 = 20
channel = 1
epochs = 50
batch_size = 64  # originally 100
verbose = 1
num_classes = 13

# Add a trailing channel axis so the samples fit Conv2D's expected input.
X_train = X_train.reshape(X_train.shape[0], feature_dim_1, feature_dim_2, channel)
X_test = X_test.reshape(X_test.shape[0], feature_dim_1, feature_dim_2, channel)

# Pin the one-hot width: without num_classes, to_categorical infers it from the
# largest label present in each split, so a split that happens to miss the last
# class would get a narrower matrix than the model's output layer.
y_train_hot = to_categorical(y_train, num_classes=num_classes)
y_test_hot = to_categorical(y_test, num_classes=num_classes)

Saving vectors of label - 'Aslan': 100%|██████████| 50/50 [00:00<00:00, 80.77it/s]
Saving vectors of label - 'Bear': 100%|██████████| 50/50 [00:00<00:00, 74.71it/s]
Saving vectors of label - 'Cat': 100%|██████████| 50/50 [00:01<00:00, 28.80it/s]
Saving vectors of label - 'Chicken': 100%|██████████| 50/50 [00:01<00:00, 35.08it/s]
Saving vectors of label - 'Cow': 100%|██████████| 50/50 [00:00<00:00, 102.38it/s]
Saving vectors of label - 'Dog': 100%|██████████| 50/50 [00:00<00:00, 128.23it/s]
Saving vectors of label - 'Dolphin': 100%|██████████| 50/50 [00:00<00:00, 132.12it/s]
Saving vectors of label - 'Donkey': 100%|██████████| 50/50 [00:00<00:00, 96.67it/s]
Saving vectors of label - 'Elephant': 100%|██████████| 50/50 [00:00<00:00, 90.99it/s]
Saving vectors of label - 'Frog': 100%|██████████| 50/50 [00:00<00:00, 88.93it/s]
Saving vectors of label - 'Horse': 100%|██████████| 50/50 [00:00<00:00, 120.48it/s]
Saving vectors of label - 'Monkey': 100%|██████████| 50/50 [00:00<00:00, 71.91it/s]

In [16]:
def get_model():
    """Build and compile the CNN classifier.

    The input shape and the size of the softmax output come from the notebook
    globals ``feature_dim_1``, ``feature_dim_2``, ``channel`` and
    ``num_classes``.

    Returns:
        A compiled ``Sequential`` model: three 2x2 convolutions (32, 48, 120
        filters), one 2x2 max-pool, then dense layers of 128 and 64 units and a
        softmax output, using categorical cross-entropy and Adam.
    """
    input_dims = (feature_dim_1, feature_dim_2, channel)

    network = Sequential([
        Conv2D(32, kernel_size=(2, 2), activation='relu', input_shape=input_dims),
        Conv2D(48, kernel_size=(2, 2), activation='relu'),
        Conv2D(120, kernel_size=(2, 2), activation='relu'),
        MaxPooling2D(pool_size=(2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dense(64, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

    network.compile(
        loss=keras.losses.categorical_crossentropy,
        optimizer=keras.optimizers.Adam(),
        metrics=['accuracy'],
    )
    return network

def predict(filepath, model):
    """Classify a single audio file and return the predicted label name.

    Args:
        filepath: Path of the audio file to classify.
        model: Trained model whose ``predict`` accepts a batch shaped
            ``(1, feature_dim_1, feature_dim_2, channel)``.

    Returns:
        str: The entry of ``get_labels()[0]`` at the arg-max of the model output.
    """
    # Request exactly feature_dim_2 frames so the reshape below stays valid
    # even if that global is changed away from wav2mfcc's own default.
    sample = wav2mfcc(filepath, max_len=feature_dim_2)
    sample_reshaped = sample.reshape(1, feature_dim_1, feature_dim_2, channel)

    labels = get_labels()[0]
    return labels[np.argmax(model.predict(sample_reshaped))]

# Building The Model Then Training it

In [17]:
# Build a fresh CNN and train it.
# NOTE: the test split (X_test, y_test_hot) is passed as validation data, so the
# val_accuracy / val_loss values reported per epoch are measured on the test set.
model = get_model()
model.fit(X_train, y_train_hot, batch_size=batch_size, epochs=epochs, verbose=verbose, validation_data=(X_test, y_test_hot))

Epoch 1/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 382ms/step - accuracy: 0.0757 - loss: 4.0900 - val_accuracy: 0.1385 - val_loss: 2.5440
Epoch 2/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 11ms/step - accuracy: 0.1501 - loss: 2.4831 - val_accuracy: 0.2769 - val_loss: 2.3036
Epoch 3/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.2651 - loss: 2.2270 - val_accuracy: 0.2615 - val_loss: 2.1638
Epoch 4/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4086 - loss: 1.9190 - val_accuracy: 0.3615 - val_loss: 1.9385
Epoch 5/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.4528 - loss: 1.7478 - val_accuracy: 0.4615 - val_loss: 1.7529
Epoch 6/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.5323 - loss: 1.4545 - val_accuracy: 0.4692 - val_loss: 1.6731
Epoch 7/50
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7e22e3629910>

## Prediction

In [22]:
# Classify one clip with the trained CNN and print the predicted label.
# NOTE(review): this file lives inside the dataset folder, so it may have been
# part of the training split - confirm before treating it as an unseen example.
print(predict('/content/drive/MyDrive/Animal-Soundprepros/Dolphin/Dolphin_37.wav', model=model))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Dolphin
