# Installs and Imports

In [1]:
!pip install soundata
!pip install google-cloud-storage

Collecting soundata
  Downloading soundata-1.0.1-py3-none-any.whl.metadata (7.4 kB)
Collecting jams>=0.3.4 (from soundata)
  Downloading jams-0.3.4.tar.gz (51 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m51.3/51.3 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr>=0.16.0 (from soundata)
  Downloading py7zr-0.22.0-py3-none-any.whl.metadata (16 kB)
Collecting mir_eval>=0.5 (from jams>=0.3.4->soundata)
  Downloading mir_eval-0.8.2-py3-none-any.whl.metadata (3.0 kB)
Collecting texttable (from py7zr>=0.16.0->soundata)
  Downloading texttable-1.7.0-py2.py3-none-any.whl.metadata (9.8 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr>=0.16.0->soundata)
  Downloading pycryptodomex-3.22.0-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.4 kB)
Collecting pyzstd>=0.15.9 (from py7zr>=0.16.0->soundata)
  Downloading pyzstd-0.16.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl

In [2]:
import soundata
import os
import numpy as np
import pandas as pd
import librosa
import tensorflow as tf
import matplotlib.pyplot as plt
import librosa.display
import time
import tempfile
import pickle

from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from google.colab import drive
from tensorflow.keras import layers, models
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.utils import to_categorical
from google.cloud import storage
from google.colab import auth
from sklearn.metrics import classification_report, confusion_matrix

In [3]:
# Initialize the client and get the bucket
auth.authenticate_user()
client = storage.Client()
bucket = client.get_bucket("urbansound")

## Data download and set up (one-time)

Get the data set of sounds

In [4]:
# Mount Google Drive
# drive.mount('/content/drive')
# data_home = '/content/drive/MyDrive/datasets/urbansound8k/'

# dataset = soundata.initialize('urbansound8k', data_home=data_home)
# dataset.download(extras_only=True)

# uncomment if you need to download the data
# dataset.download()

# dataset.validate()

# Functions to extract features and process the datas

## Feature Generation Functions

In [5]:
def parse_data(metadata_csv, audio_dir, feature_extractor, folds=[1,2,3,4,5,6,7,8,9,10],
               sr=22050, max_pad_len=174, **kwargs):

    # Read the data in, and filter to the folds we want
    # metadata = pd.read_csv(metadata_csv)
    filtered_metadata = metadata_csv[metadata_csv['fold'].isin(folds)]
    total_files = len(filtered_metadata)
    progress_bar = tqdm(total=total_files, desc="Extracting features")

    # This is where we will store the features and labels
    features = []
    labels = []
    errors = 0

    # For each row in the meta data, pull that audio file and generate a feature
    for _, row in filtered_metadata.iterrows():

      file_path = os.path.join(audio_dir, f"fold{row['fold']}", row['slice_file_name'])
      # print(file_path)
      # I kept getting bugs in feature gen
      # So wrap in try/except so that we don't stop feature gen at the very end
      # Sometimes the audio clips are very small
      try:
        feature = feature_extractor(file_path, sr=sr, max_pad_len=max_pad_len, **kwargs)
        features.append(feature)
        labels.append(row['classID'])

      except Exception as e:
        print(f"Error processing {file_path}: {e}")
        errors += 1

      progress_bar.update(1)

    progress_bar.close()

    X = np.array(features)
    y = to_categorical(np.array(labels))

    print(f"Total errors: {errors}")

    return X, y


In [6]:
def generate_mfcc_features(file_path, sr=22050, n_mfcc=40, max_pad_len=174):

    # Load the audio data in from GCP
    audioblob = bucket.blob(file_path)

    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
      audioblob.download_to_filename(tmp.name)
      signal, sr = librosa.load(tmp.name, sr=sr)

    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)

    # Pad or truncate to ensure consistent shape along time axis
    if mfcc.shape[1] < max_pad_len:
        pad_width = max_pad_len - mfcc.shape[1]
        mfcc = np.pad(mfcc, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        mfcc = mfcc[:, :max_pad_len]

    # Transpose so that shape becomes (time, n_mfcc)
    return mfcc.T

def generate_spectrogram_features(file_path, sr=22050, n_fft=2048, hop_length=512, max_pad_len=174):

    audioblob = bucket.blob(file_path)

    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
      audioblob.download_to_filename(tmp.name)
      signal, sr = librosa.load(tmp.name, sr=sr)

    melspec = librosa.feature.melspectrogram(y=signal, sr=sr, n_fft=n_fft, hop_length=hop_length)
    melspec = librosa.power_to_db(melspec, ref=np.max)

    # Pad or truncate to a fixed number of time frames
    if melspec.shape[1] < max_pad_len:
        pad_width = max_pad_len - melspec.shape[1]
        melspec = np.pad(melspec, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        melspec = melspec[:, :max_pad_len]
    # Transpose so that shape becomes (time, n_mels)
    return melspec.T

def generate_delta_features(file_path, n_fft=2048, sr=22050, max_pad_len=174, hop_length=515):
    # Load the audio data from GCP
    audioblob = bucket.blob(file_path)
    with tempfile.NamedTemporaryFile(suffix=".wav") as tmp:
        audioblob.download_to_filename(tmp.name)
        signal, sr = librosa.load(tmp.name, sr=sr)

    # Compute MFCC features (using a default of 40 coefficients)
    mfcc = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=40, n_fft=n_fft, hop_length=hop_length)

    # Compute the delta (first derivative) of the MFCCs
    delta = librosa.feature.delta(mfcc)

    # Pad or truncate along the time axis to ensure consistent shape
    if delta.shape[1] < max_pad_len:
        pad_width = max_pad_len - delta.shape[1]
        delta = np.pad(delta, pad_width=((0, 0), (0, pad_width)), mode='constant')
    else:
        delta = delta[:, :max_pad_len]

    # Transpose so that the final shape is (time, n_mfcc)
    return delta.T

## Connect to GCS and Extract Features

### One-time feature creation and storage

In [7]:
!gsutil ls gs://urbansound/features/

gs://urbansound/features/
gs://urbansound/features/mfcc_features.pkl
gs://urbansound/features/delta/
gs://urbansound/features/mfcc/
gs://urbansound/features/mfcc_old/
gs://urbansound/features/spectogram/


In [8]:
# Load the meta data in from GCP
metablob = bucket.blob("datasets/urbansound8k/metadata/UrbanSound8K.csv")

with tempfile.NamedTemporaryFile(suffix=".csv") as tmp:
  metablob.download_to_filename(tmp.name)
  metadata_csv = tmp.name
  metadata_csv = pd.read_csv(metadata_csv)

audio_dir = "datasets/urbansound8k/audio"

gen_mfcc_features = False
gen_spectrogram_features = False
gen_delta_features = False

In [None]:
# Generate MFCC features for each fold and store in GCS
mfcc_features = {}

if gen_mfcc_features:
  for fold in metadata_csv['fold'].unique():

    print(f"Fold {fold}: {len(metadata_csv[metadata_csv['fold'] == fold])} samples")

    fold_features = {}

    for n_mfcc in [60]:
      print(f"Generating features for fold {fold} with {n_mfcc} MFCC coefficients")

      X, y = parse_data(metadata_csv, audio_dir, feature_extractor=generate_mfcc_features,
                        folds=[fold], sr=22050, max_pad_len=174, n_mfcc=n_mfcc)

      X = X[..., np.newaxis]
      X = np.expand_dims(X, axis=-1)

      fold_features[n_mfcc] = {'X': X, 'y': y}

    mfcc_features[fold] = fold_features

    # Save the fold features to GCS
    with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp:
      pickle.dump(fold_features, tmp)
      tmp.flush()            # Ensure data is written to disk
      tmp.seek(0)            # Optional: move pointer to start
      blob = bucket.blob(f"features/mfcc/fold_{fold}_features.pkl")
      blob.upload_from_filename(tmp.name)


In [None]:
# Generate spectogram features for each fold and store in GCD
if gen_spectrogram_features:
  for fold in metadata_csv['fold'].unique():


    print(f"Fold {fold}: {len(metadata_csv[metadata_csv['fold'] == fold])} samples")

    X, y = parse_data(metadata_csv, audio_dir, feature_extractor=generate_spectrogram_features,
                        folds=[fold], sr=22050, n_fft=2048, max_pad_len=174, hop_length=512)

    spect_features = {'X': X, 'y': y}

    # Save the fold features to GCS

    with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp:
      pickle.dump(spect_features, tmp)
      tmp.flush()             # Chec
      tmp.seek(0)            # Optional: move pointer to start
      blob = bucket.blob(f"features/spectogram/fold_{fold}_spect_features.pkl")
      blob.upload_from_filename(tmp.name)

Fold 5: 936 samples


Extracting features:   0%|          | 0/936 [00:00<?, ?it/s]

Total errors: 0
Fold 10: 837 samples


Extracting features:   0%|          | 0/837 [00:00<?, ?it/s]

Total errors: 0
Fold 2: 888 samples


Extracting features:   0%|          | 0/888 [00:00<?, ?it/s]



Total errors: 0
Fold 6: 823 samples


Extracting features:   0%|          | 0/823 [00:00<?, ?it/s]

Total errors: 0
Fold 1: 873 samples


Extracting features:   0%|          | 0/873 [00:00<?, ?it/s]



Total errors: 0
Fold 9: 816 samples


Extracting features:   0%|          | 0/816 [00:00<?, ?it/s]

Total errors: 0
Fold 7: 838 samples


Extracting features:   0%|          | 0/838 [00:00<?, ?it/s]

Total errors: 0
Fold 4: 990 samples


Extracting features:   0%|          | 0/990 [00:00<?, ?it/s]

Total errors: 0
Fold 3: 925 samples


Extracting features:   0%|          | 0/925 [00:00<?, ?it/s]

Total errors: 0
Fold 8: 806 samples


Extracting features:   0%|          | 0/806 [00:00<?, ?it/s]

Total errors: 0


In [None]:
# Generate delta features for each fold and store in GCD
if gen_delta_features:
  for fold in metadata_csv['fold'].unique():
    print(f"Fold {fold}: {len(metadata_csv[metadata_csv['fold'] == fold])} samples")

    X, y = parse_data(metadata_csv, audio_dir, feature_extractor=generate_delta_features,
                        folds=[fold], sr=22050, n_fft=2048, max_pad_len=174, hop_length=512)

    fold_features = {'X': X, 'y': y}

    # Save the fold features to GCS
    with tempfile.NamedTemporaryFile(suffix=".pkl") as tmp:
      pickle.dump(fold_features, tmp)
      blob = bucket.blob(f"features/delta/fold_{fold}_delta_features.pkl")
      blob.upload_from_filename(tmp.name)

## Load features from GCS

In [None]:
fold_features = {}
for fold in metadata_csv['fold'].unique():
    blob = bucket.blob(f"features/mfcc/fold_{fold}_features.pkl")
    local_filename = f"fold_{fold}_features.pkl"
    blob.download_to_filename(local_filename)

    file_size = os.path.getsize(local_filename)
    print(f"Downloaded file size for fold {fold}: {file_size} bytes")

    if file_size == 0:
        print(f"Error: File fold_{fold}_features.pkl is empty!")
    else:
        with open(local_filename, "rb") as f:
            fold_features[fold] = pickle.load(f)

Downloaded file size for fold 5: 39162505 bytes
Downloaded file size for fold 10: 35020345 bytes
Downloaded file size for fold 2: 37154185 bytes
Downloaded file size for fold 6: 34434585 bytes
Downloaded file size for fold 1: 36526585 bytes
Downloaded file size for fold 9: 34141696 bytes
Downloaded file size for fold 7: 35062185 bytes
Downloaded file size for fold 4: 41421865 bytes
Downloaded file size for fold 3: 38702265 bytes
Downloaded file size for fold 8: 33723296 bytes


In [9]:
spect_features = {}
for fold in metadata_csv['fold'].unique():
    blob = bucket.blob(f"features/spectogram/fold_{fold}_spect_features.pkl")
    local_filename = f"fold_{fold}_features.pkl"
    blob.download_to_filename(local_filename)

    file_size = os.path.getsize(local_filename)
    print(f"Downloaded file size for fold {fold}: {file_size} bytes")

    if file_size == 0:
        print(f"Error: File fold_{fold}_features.pkl is empty!")
    else:
        with open(local_filename, "rb") as f:
            spect_features[fold] = pickle.load(f)

Downloaded file size for fold 5: 83461503 bytes
Downloaded file size for fold 10: 74633871 bytes
Downloaded file size for fold 2: 79181439 bytes
Downloaded file size for fold 6: 73385519 bytes
Downloaded file size for fold 1: 77843919 bytes
Downloaded file size for fold 9: 72761334 bytes
Downloaded file size for fold 7: 74723039 bytes
Downloaded file size for fold 4: 88276575 bytes
Downloaded file size for fold 3: 82480655 bytes
Downloaded file size for fold 8: 71869654 bytes


# Model Build and Training Functions


In [10]:
def build_cnn_lstm_model(input_shape, num_classes, cnn_config, lstm_config, dense_config, dropout_rate=0.3):

    input_layer = layers.Input(shape=input_shape)
    x = input_layer
    print('input shape:', input_shape)

    # Determine if we're using CNN layers
    if cnn_config is not None:
        # CNN layers applied time-distributed (assumes input shape: (time, features, channels))
        for block in cnn_config:
            x = layers.TimeDistributed(
                layers.Conv2D(filters=block['filters'],
                             kernel_size=block['kernel_size'],
                             activation='relu',
                             padding='same')
            )(x)
            x = layers.TimeDistributed(
                layers.MaxPooling2D(pool_size=block.get('pool_size', (2, 2)))
            )(x)
            if dropout_rate:
                x = layers.TimeDistributed(layers.Dropout(dropout_rate))(x)

        # Flatten each time step's CNN output
        x = layers.TimeDistributed(layers.Flatten())(x)
    else:
        # For LSTM-only models
        # Handle MFCC or spectrogram data shape (timesteps, features, 1, 1) or similar

        # Handle 4D or 5D inputs (common for MFCC or spectrograms with extra dimensions)
        if len(input_shape) >= 3:
            # Calculate the total feature dimension by multiplying all dimensions except time steps
            feature_dim = 1
            for dim in input_shape[1:]:
                feature_dim *= dim

            # Reshape to (timesteps, flattened_features)
            x = layers.Reshape((input_shape[0], feature_dim))(x)

    # LSTM section
    if lstm_config is None:
        # If no LSTM layers are desired, flatten across all dimensions
        x = layers.Flatten()(x)
    else:
        # Support both a single LSTM layer (dict) or multiple LSTM layers (list)
        if isinstance(lstm_config, dict):
            x = layers.LSTM(lstm_config['units'],
                           dropout=lstm_config.get('dropout', 0),
                           return_sequences=lstm_config.get('return_sequences', False))(x)
        elif isinstance(lstm_config, list):
            for i, config in enumerate(lstm_config):
                # For all but the last LSTM layer, ensure return_sequences=True
                if i < len(lstm_config) - 1:
                    return_seq = config.get('return_sequences', True)
                else:
                    return_seq = config.get('return_sequences', False)
                x = layers.LSTM(config['units'],
                               dropout=config.get('dropout', 0),
                               return_sequences=return_seq)(x)

    # Dense layers section
    for dense in dense_config:
        x = layers.Dense(dense['units'], activation=dense.get('activation', 'relu'))(x)
        if dropout_rate:
            x = layers.Dropout(dropout_rate)(x)

    # Output classification layer
    output_layer = layers.Dense(num_classes, activation='softmax')(x)
    model = models.Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    return model
def train_model(model, X_train, y_train, X_val, y_val, batch_size=32, epochs=50, callbacks_list=None):
    history = model.fit(X_train, y_train,
                        validation_data=(X_val, y_val),
                        batch_size=batch_size,
                        epochs=epochs,
                        callbacks=callbacks_list)
    return history

# Model configurations and training

Model Config

In [11]:
fold_features[1].keys()

NameError: name 'fold_features' is not defined

In [12]:
spect_features[1].keys()

dict_keys(['X', 'y'])

Split the folds into Train / Test / Validate

In [30]:
# Decide which sets you want as train / validation
test_set = 3
val_set = 2


# Allocate the folds appropriately
all_folds = set([1,2,3,4,5,6,7,8,9,10])
train_set = all_folds - set([test_set, val_set])


# ###################
# For Loading MFCCs
# ###################

# X = np.concatenate([fold_features[fold][60]['X'] for fold in train_set])
# y = np.concatenate([fold_features[fold][60]['y'] for fold in train_set])

# X_test = fold_features[test_set][60]['X']
# y_test = fold_features[test_set][60]['y']

# X_val = fold_features[val_set][60]['X']
# y_val = fold_features[val_set][60]['y']


# ###########################
# For Loading Spectrograms
# ###########################

X = np.concatenate([spect_features[fold]['X'] for fold in train_set])
y = np.concatenate([spect_features[fold]['y'] for fold in train_set])

X_test = spect_features[test_set]['X']
y_test = spect_features[test_set]['y']

X_val = spect_features[val_set]['X']
y_val = spect_features[val_set]['y']

# Confirmed data shape: (samples, time_steps, features)
# X_test shape: (925, 174, 128)

# We need to reshape to: (samples, time_steps, height, width, channels)
# where height × width = features (128)

# Get the original shapes
print("Original shapes:")
print(f"X shape: {X.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"X_val shape: {X_val.shape}")

# Extract dimensions
samples = X.shape[0]
time_steps = X.shape[1]  # 174
features = X.shape[2]    # 128

# Find suitable factors for the features dimension (height × width = features)
height = 16
width = features // height

X = X.reshape(samples, time_steps, features, 1, 1)
X_test = X_test.reshape(X_test.shape[0], time_steps, features, 1, 1)
X_val = X_val.reshape(X_val.shape[0], time_steps, features, 1, 1)

print("\nReshaped to (samples, time_steps, height, width, channels):")
print(f"X reshaped shape: {X.shape}")
print(f"X_test reshaped shape: {X_test.shape}")
print(f"X_val reshaped shape: {X_val.shape}")

Original shapes:
X shape: (6919, 174, 128)
X_test shape: (925, 174, 128)
X_val shape: (888, 174, 128)

Reshaped to (samples, time_steps, height, width, channels):
X reshaped shape: (6919, 174, 128, 1, 1)
X_test reshaped shape: (925, 174, 128, 1, 1)
X_val reshaped shape: (888, 174, 128, 1, 1)


In [31]:
# Config 3
cnn_config = [
    {'filters': 64, 'kernel_size': (3, 3), 'pool_size': (2, 1)},
    {'filters': 32, 'kernel_size': (3, 3), 'pool_size': (2, 1)},
    {'filters': 32, 'kernel_size': (3, 3), 'pool_size': (2, 1)},
    {'filters': 16, 'kernel_size': (3, 3), 'pool_size': (2, 1)}
]


lstm_config =  [
    # {'units': 64, 'dropout': 0.3, 'return_sequences': True},
    {'units': 32, 'dropout': 0.3, 'return_sequences': False}
]

dense_config = [
    {'units': 32, 'activation': 'relu'}
]


# Config 1
# cnn_config = [
#     {'filters': 64, 'kernel_size': (3, 3), 'pool_size': (2, 1)},
#     {'filters': 32, 'kernel_size': (3, 3), 'pool_size': (3, 1)},
#     {'filters': 8, 'kernel_size': (3, 3), 'pool_size': (3, 1)}
# ]

# lstm_config =  None #{'units': 32, 'dropout': 0.3, 'return_sequences': False}

# dense_config = [
#     {'units': 24, 'activation': 'relu'}
# ]

# Determine input shape from X: (time_steps, feature_dim, channels)
input_shape = X.shape[1:]
num_classes = y.shape[1]
print(f'Input shape is {input_shape}.')
model = build_cnn_lstm_model(input_shape, num_classes, cnn_config, lstm_config, dense_config, dropout_rate=0.2)
model.summary()

Input shape is (174, 128, 1, 1).
input shape: (174, 128, 1, 1)


In [32]:
# Use early stopping to reduce overfitting... Maybe excessive
early_stop = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
history = train_model(model, X, y, X_val, y_val,
                      batch_size=32,
                      epochs=50,
                      callbacks_list=[early_stop])

Epoch 1/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m250s[0m 469ms/step - accuracy: 0.1329 - loss: 2.2115 - val_accuracy: 0.1115 - val_loss: 2.3239
Epoch 2/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 370ms/step - accuracy: 0.1984 - loss: 2.0197 - val_accuracy: 0.2162 - val_loss: 2.4498
Epoch 3/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 370ms/step - accuracy: 0.2735 - loss: 1.8459 - val_accuracy: 0.3007 - val_loss: 1.8269
Epoch 4/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 370ms/step - accuracy: 0.3363 - loss: 1.7499 - val_accuracy: 0.3288 - val_loss: 1.7160
Epoch 5/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 371ms/step - accuracy: 0.3653 - loss: 1.6940 - val_accuracy: 0.3953 - val_loss: 1.5499
Epoch 6/50
[1m217/217[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 370ms/step - accuracy: 0.3989 - loss: 1.6021 - val_accuracy: 0.3964 - val_loss: 1.5338
Epoch 7/5

In [None]:
# spect_features['X'].shape

In [None]:
X.shape

(6919, 174, 60, 1, 1)

In [33]:
# import numpy as np

# Generate predictions
y_pred = model.predict(X_test)

# Convert predictions and true labels to class indices
y_pred_classes = np.argmax(y_pred, axis=1)
y_true = np.argmax(y_test, axis=1)

# Display detailed classification report
print(classification_report(y_true, y_pred_classes))

# Display confusion matrix
print(confusion_matrix(y_true, y_pred_classes))

[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 606ms/step
              precision    recall  f1-score   support

           0       0.35      0.22      0.27       100
           1       0.76      0.86      0.80        43
           2       0.60      0.49      0.54       100
           3       0.58      0.77      0.66       100
           4       0.50      0.51      0.50       100
           5       0.24      0.31      0.27       107
           6       0.58      0.39      0.47        36
           7       0.33      0.37      0.35       120
           8       0.81      0.69      0.75       119
           9       0.46      0.46      0.46       100

    accuracy                           0.49       925
   macro avg       0.52      0.51      0.51       925
weighted avg       0.50      0.49      0.49       925

[[22  0  2  0  4 36  0 21  2 13]
 [ 0 37  0  3  0  0  3  0  0  0]
 [ 2  1 49 21  4  6  0  0  5 12]
 [ 1  4  8 77  2  0  0  0  5  3]
 [ 5  1  2  7 51  9  3  8  3 11]


In [34]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print("Test Loss:", loss)
print("Test Accuracy:", accuracy)


[1m29/29[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 56ms/step - accuracy: 0.5274 - loss: 1.4002
Test Loss: 1.4512518644332886
Test Accuracy: 0.5167567729949951


In [36]:
(confusion_matrix(y_true, y_pred_classes))

array([[22,  0,  2,  0,  4, 36,  0, 21,  2, 13],
       [ 0, 37,  0,  3,  0,  0,  3,  0,  0,  0],
       [ 2,  1, 49, 21,  4,  6,  0,  0,  5, 12],
       [ 1,  4,  8, 77,  2,  0,  0,  0,  5,  3],
       [ 5,  1,  2,  7, 51,  9,  3,  8,  3, 11],
       [14,  0,  0,  2,  5, 33,  2, 51,  0,  0],
       [ 0,  2,  0, 17,  0,  0, 14,  3,  0,  0],
       [ 7,  0,  0,  0, 30, 38,  1, 44,  0,  0],
       [ 6,  0,  5,  1,  0,  5,  1,  4, 82, 15],
       [ 6,  4, 15,  5,  7, 10,  0,  3,  4, 46]])

In [37]:
y_true

array([3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
       4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 5, 5, 5, 3, 3, 3, 3, 3, 3, 3, 3,
       3, 3, 3, 3, 3, 1, 1, 1, 3, 8, 8, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3,
       3, 3, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 9, 9, 9, 9, 9, 9, 4, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 6, 6, 6, 6, 2, 2, 2, 2, 2, 2, 2, 2, 1,
       1, 1, 1, 0, 0, 0, 0, 2, 2, 2, 2, 2, 9, 9, 9, 9, 9, 9, 2, 2, 2, 2,
       2, 2, 2, 2, 3, 3, 3, 3, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 3, 6, 6, 6,
       6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7,
       7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 4, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       2, 2, 2, 2, 2, 2, 2, 3, 9, 9, 9, 9, 9, 9, 1,

In [39]:
conf_matrix = np.array([
    [22,  0,  2,  0,  4, 36,  0, 21,  2, 13],
    [ 0, 37,  0,  3,  0,  0,  3,  0,  0,  0],
    [ 2,  1, 49, 21,  4,  6,  0,  0,  5, 12],
    [ 1,  4,  8, 77,  2,  0,  0,  0,  5,  3],
    [ 5,  1,  2,  7, 51,  9,  3,  8,  3, 11],
    [14,  0,  0,  2,  5, 33,  2, 51,  0,  0],
    [ 0,  2,  0, 17,  0,  0, 14,  3,  0,  0],
    [ 7,  0,  0,  0, 30, 38,  1, 44,  0,  0],
    [ 6,  0,  5,  1,  0,  5,  1,  4, 82, 15],
    [ 6,  4, 15,  5,  7, 10,  0,  3,  4, 46]]
)

class_labels = [
    'air_conditioner',
    'car_horn',
    'children_playing',
    'dog_bark',
    'drilling',
    'engine_idling',
    'gun_shot',
    'jackhammer',
    'siren',
    'street_music'
]

# Create a DataFrame with proper labels
conf_df = pd.DataFrame(conf_matrix,
                      index=class_labels,
                      columns=class_labels)

conf_df


Unnamed: 0,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music
air_conditioner,22,0,2,0,4,36,0,21,2,13
car_horn,0,37,0,3,0,0,3,0,0,0
children_playing,2,1,49,21,4,6,0,0,5,12
dog_bark,1,4,8,77,2,0,0,0,5,3
drilling,5,1,2,7,51,9,3,8,3,11
engine_idling,14,0,0,2,5,33,2,51,0,0
gun_shot,0,2,0,17,0,0,14,3,0,0
jackhammer,7,0,0,0,30,38,1,44,0,0
siren,6,0,5,1,0,5,1,4,82,15
street_music,6,4,15,5,7,10,0,3,4,46
