In [1]:
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import tensorflow as tf

In [2]:
# Function to load audio and compute MFCCs
def load_and_preprocess_audio(file_path, n_mfcc=80):
    y, sr = librosa.load(file_path, sr=None)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=n_mfcc)
    mean_mfccs = np.mean(mfccs, axis=1, keepdims=True)
    return mean_mfccs

In [3]:
# Paths to your data
data_dir = 'data_set'
categories = ['ambulance', 'firetruck', 'traffic']
X = []
y = []

# Load and preprocess data
for label, category in enumerate(categories):
    folder_path = os.path.join(data_dir, category)
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)
        # Check if the file is a .wav file
        if file_name.lower().endswith('.wav'):
            file_path = os.path.join(folder_path, file_name)
            mfcc_features = load_and_preprocess_audio(file_path)
            X.append(mfcc_features)
            y.append(label)

X = np.array(X)
y = np.array(y)

# Label Encoding
num_classes = len(categories)
y = to_categorical(y, num_classes=num_classes)

In [4]:
def build_model(input_shape, num_classes):
    inputs = tf.keras.Input(shape=input_shape)
    
    # Example: Use a Conv1D network
    x = tf.keras.layers.Conv1D(16, 3, activation='relu', padding='same')(inputs)
    x = tf.keras.layers.MaxPooling1D(2)(x)
    x = tf.keras.layers.Conv1D(32, 3, activation='relu', padding='same')(x)
    x = tf.keras.layers.MaxPooling1D(2)(x)
    x = tf.keras.layers.GlobalMaxPooling1D()(x)
    x = tf.keras.layers.Dense(64, activation='relu')(x)
    outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(x)

    model = tf.keras.Model(inputs, outputs)
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    
    return model

In [5]:
# Build and compile the model
input_shape = (X.shape[1], 1)  # Assuming X has shape (num_samples, n_mfcc, 1)
model = build_model(input_shape, num_classes)

# Train the model
history = model.fit(X, y, epochs=10, batch_size=32, validation_split=0.2)

Epoch 1/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 37ms/step - accuracy: 0.4065 - loss: 4.6155 - val_accuracy: 1.0000 - val_loss: 0.0411
Epoch 2/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7491 - loss: 0.6166 - val_accuracy: 0.9333 - val_loss: 0.3518
Epoch 3/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8495 - loss: 0.3888 - val_accuracy: 0.9833 - val_loss: 0.1083
Epoch 4/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8246 - loss: 0.4608 - val_accuracy: 1.0000 - val_loss: 0.0271
Epoch 5/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8657 - loss: 0.3423 - val_accuracy: 0.9833 - val_loss: 0.0392
Epoch 6/10
[1m15/15[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8815 - loss: 0.2896 - val_accuracy: 0.9833 - val_loss: 0.0350
Epoch 7/10
[1m15/15[0m [32m━━━━━━━━━

In [6]:
# Evaluate the model
loss, accuracy = model.evaluate(X, y)
print(f'Accuracy: {accuracy:.2f}')

# Save the model
model.save('sound_classification_model.h5')

[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7171 - loss: 0.5846




Accuracy: 0.87
