In [16]:
pip install librosa

[0mNote: you may need to restart the kernel to use updated packages.


In [43]:
import librosa
import numpy as np
import json
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv2D, MaxPooling2D, Flatten, Dense, Dropout

# MAEC - Multilabel Audio Event Classification

# Function to chunk audio
def chunk_audio(audio_path, chunk_duration=2.0, sr=22050):
    y, sr = librosa.load(audio_path, sr=sr)
    chunk_length = int(chunk_duration * sr)
    chunks = [y[i:i + chunk_length] for i in range(0, len(y), chunk_length) if len(y[i:i + chunk_length]) == chunk_length]
    return chunks

# Function to convert chunks to spectrograms
def chunks_to_spectrograms(chunks, sr=22050):
    spectrograms = []
    for chunk in chunks:
        S = librosa.feature.melspectrogram(y=chunk, sr=sr, n_mels=128)
        S_DB = librosa.power_to_db(S, ref=np.max)
        spectrograms.append(S_DB)
    return np.array(spectrograms)

# Function to create the model
def create_model(input_shape, num_types, num_sub_categories):
    inputs = Input(shape=input_shape)
    x = Conv2D(32, (3, 3), activation='relu')(inputs)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Flatten()(x)
    x = Dense(256, activation='relu')(x)
    x = Dropout(0.5)(x)

    type_output = Dense(num_types, activation='sigmoid', name='type_output')(x)
    sub_category_output = Dense(num_sub_categories, activation='sigmoid', name='sub_category_output')(x)

    model = Model(inputs=inputs, outputs=[type_output, sub_category_output])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics={'type_output': 'accuracy', 'sub_category_output': 'accuracy'})
    return model

# Example labels for each chunk
labels = [
    [('dog', 'maltese'), ('bird', 'cockatiel')],
    [('cat', 'siamese'), ('dog', 'maltese')],
    [('bird', 'sparrow')],
    [('dog', 'labrador')],
    [('cat', 'persian')]
]

# Separate types and sub-categories
types = [[label[0] for label in chunk] for chunk in labels]
sub_categories = [[label[1] for label in chunk] for chunk in labels]

# Use MultiLabelBinarizer for one-hot encoding
mlb_types = MultiLabelBinarizer()
mlb_sub_categories = MultiLabelBinarizer()

y_types = mlb_types.fit_transform(types)
y_sub_categories = mlb_sub_categories.fit_transform(sub_categories)

# Create and train the model
input_shape = (128, 128, 1)  # Example shape; adjust according to your spectrogram dimensions
num_types = len(mlb_types.classes_)
num_sub_categories = len(mlb_sub_categories.classes_)
model = create_model(input_shape, num_types, num_sub_categories)

# Train the model
history = model.fit(
    X_train, 
    {'type_output': y_train_types, 'sub_category_output': y_train_sub_categories},
    epochs=20, 
    batch_size=32,
    validation_data=(X_val, {'type_output': y_val_types, 'sub_category_output': y_val_sub_categories})
)

# Predict on validation data
type_predictions, sub_category_predictions = model.predict(X_val)

threshold = 0.3  # Lower the threshold to capture more predictions

def interpret_predictions(type_preds, sub_category_preds, type_labels, sub_category_labels, threshold=0.3):
    results = []
    for t_pred, s_pred in zip(type_preds, sub_category_preds):
        chunk_results = []
        for i, type_prob in enumerate(t_pred):
            if type_prob > threshold:
                chunk_results.append({"label": type_labels[i], "type": "type", "confidence": f"{type_prob:.2f}"})
        for i, sub_category_prob in enumerate(s_pred):
            if sub_category_prob > threshold:
                chunk_results.append({"label": sub_category_labels[i], "type": "sub_category", "confidence": f"{sub_category_prob:.2f}"})
        results.append(chunk_results)
    return results

def process_audio_file(audio_path, model, mlb_types, mlb_sub_categories, chunk_duration=2.0, sr=22050, threshold=0.3):
    chunks = chunk_audio(audio_path, chunk_duration, sr)
    spectrograms = chunks_to_spectrograms(chunks, sr)
    spectrograms = np.array([np.pad(s, ((0, 0), (0, max(128 - s.shape[1], 0))), mode='constant') for s in spectrograms])
    spectrograms = np.expand_dims(spectrograms, axis=-1)  # Add channel dimension for CNN
    
    type_predictions, sub_category_predictions = model.predict(spectrograms)
    
    results = interpret_predictions(type_predictions, sub_category_predictions, mlb_types.classes_, mlb_sub_categories.classes_, threshold)
    
    formatted_results = []
    for i, chunk_results in enumerate(results):
        chunk_formatted = []
        for result in chunk_results:
            chunk_formatted.append(result)
        formatted_results.append({"chunk": i + 1, "results": chunk_formatted})
    
    return json.dumps(formatted_results, indent=4)

# Example usage
audio_path = 'birds.wav'
json_output = process_audio_file(audio_path, model, mlb_types, mlb_sub_categories)
print(json_output)


Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2s/step - loss: 1.6176 - sub_category_output_accuracy: 0.0000e+00 - type_output_accuracy: 0.5000 - val_loss: 21.6845 - val_sub_category_output_accuracy: 0.0000e+00 - val_type_output_accuracy: 0.0000e+00
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - loss: 15.4909 - sub_category_output_accuracy: 0.5000 - type_output_accuracy: 0.2500 - val_loss: 12.0404 - val_sub_category_output_accuracy: 0.0000e+00 - val_type_output_accuracy: 0.0000e+00
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - loss: 9.4431 - sub_category_output_accuracy: 0.2500 - type_output_accuracy: 0.2500 - val_loss: 6.6561 - val_sub_category_output_accuracy: 0.0000e+00 - val_type_output_accuracy: 0.0000e+00
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - loss: 5.2874 - sub_category_output_accuracy: 0.5000 - type_output_accuracy: 0.0000e+00 - va