In [None]:
# Mount Google Drive inside the Colab VM at /content/drive so that files
# stored on Drive can be accessed through the regular filesystem.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Create the project folder on the mounted Drive if it does not exist yet
# (-p makes this a no-op when it is already there), then make it the working
# directory so the relative paths used in the following cells resolve inside it.
!mkdir -p /content/drive/MyDrive/EE641/Project/
%cd /content/drive/MyDrive/EE641/Project/

/content/drive/MyDrive/EE641/Project


In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm

In [None]:
# Load the annotation table (relative to the project directory) and preview it.
# The displayed columns are clip_id, class, start_time and end_time; each row
# appears to describe one sound event inside a composite clip.
file_path = 'metadata.csv'
data = pd.read_csv(file_path)
data

Unnamed: 0,clip_id,class,start_time,end_time
0,1,children_playing,0.000000,2.686621
1,1,air_conditioner,28.028980,31.269161
2,1,jackhammer,30.401678,32.624762
3,1,jackhammer,6.082177,8.604671
4,1,jackhammer,33.926712,36.357279
...,...,...,...,...
149995,5000,gun_shot,44.720816,45.640544
149996,5000,engine_idling,1.394875,4.343900
149997,5000,gun_shot,17.649161,18.643356
149998,5000,jackhammer,53.062494,53.826485


In [None]:
# --- Inputs -----------------------------------------------------------------
# Annotation table and the folder holding the composite clips, both relative
# to the current working directory.
file_path = 'metadata.csv'
audio_dir = 'UrbanSound8K_Composite/'
data = pd.read_csv(file_path)

# --- Spectrogram settings ---------------------------------------------------
SAMPLE_RATE = 22050     # sampling rate passed to librosa.load
AUDIO_DURATION = 60     # maximum duration loaded per clip (librosa `duration`)
HOP_LENGTH = 512        # samples between consecutive spectrogram frames
N_MELS = 256            # number of mel bands
# Frame count for a full-length clip: one frame per whole hop, plus one.
NUM_MEL_FRAMES = (SAMPLE_RATE * AUDIO_DURATION) // HOP_LENGTH + 1

# --- Label vocabulary -------------------------------------------------------
# Alphabetically ordered class names; a class's position in this list is its
# column index in the label matrices.
class_labels = sorted(pd.unique(data['class']))

In [None]:
def extract_mel_spectrogram(file_path, sample_rate=SAMPLE_RATE, n_mels=N_MELS, duration=AUDIO_DURATION, hop_length=HOP_LENGTH):
    """Load an audio file and return its mel spectrogram on a decibel scale.

    Args:
        file_path: Path of the audio file to read.
        sample_rate: Rate the audio is loaded at (passed to ``librosa.load``).
        n_mels: Number of mel bands in the spectrogram.
        duration: Maximum length of audio to load.
        hop_length: Number of samples between successive frames.

    Returns:
        The decibel-scaled mel spectrogram produced by librosa, or ``None``
        when anything goes wrong while loading or transforming the file (the
        error is printed rather than raised, so a batch run can continue).
    """
    try:
        waveform, loaded_rate = librosa.load(file_path, sr=sample_rate, duration=duration)
        mel_power = librosa.feature.melspectrogram(
            y=waveform,
            sr=loaded_rate,
            n_mels=n_mels,
            hop_length=hop_length,
        )
        # ref=np.max expresses every value relative to this clip's own peak.
        return librosa.power_to_db(mel_power, ref=np.max)
    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

def build_label_matrix(clip_id, num_mel_frames, hop_length, sample_rate, df, classes=None):
    """Build the frame-level multi-hot label matrix for one clip.

    Every event of ``clip_id`` found in ``df`` marks the frames from its start
    frame to its end frame (inclusive) with 1 in the column of its class.
    Times are converted to frames as ``floor(time * sample_rate / hop_length)``.

    Args:
        clip_id: Value matched against the ``clip_id`` column of ``df``.
        num_mel_frames: Number of rows (time frames) of the returned matrix.
        hop_length: Samples between consecutive frames.
        sample_rate: Samples per second of the audio the frames come from.
        df: DataFrame with ``clip_id``, ``class``, ``start_time`` and
            ``end_time`` columns.
        classes: Optional ordered sequence of class names defining the
            columns. Defaults to the notebook-level ``class_labels``.

    Returns:
        Integer array of shape ``(num_mel_frames, len(classes))`` holding 0/1.

    Raises:
        ValueError: If an event's class is not present in ``classes``.
    """
    if classes is None:
        classes = class_labels

    # Name -> column lookup built once; setdefault keeps the first position of
    # a repeated name, which is what list.index() would return.
    class_to_idx = {}
    for idx, name in enumerate(classes):
        class_to_idx.setdefault(name, idx)

    label_matrix = np.zeros((num_mel_frames, len(classes)), dtype=int)
    events = df[df['clip_id'] == clip_id]

    # Walk the three needed columns directly instead of materialising a
    # Series per row with iterrows().
    for start_time, end_time, class_name in zip(events['start_time'], events['end_time'], events['class']):
        start_frame = int((start_time * sample_rate) // hop_length)
        end_frame = int((end_time * sample_rate) // hop_length)
        if class_name not in class_to_idx:
            raise ValueError(f"Unknown class {class_name!r} for clip {clip_id}")

        # Clamp at 0: a negative bound would otherwise be interpreted by numpy
        # as counting from the end of the clip and mark the wrong frames (or
        # none). The upper bound needs no clamp because slices stop at the
        # array edge.
        first = max(start_frame, 0)
        stop = max(end_frame + 1, 0)
        label_matrix[first:stop, class_to_idx[class_name]] = 1

    return label_matrix

In [None]:
features = []
labels = []
# Clip ids in the same order as the entries of `features` / `labels`, plus the
# ids that could not be processed, so skipped clips stay traceable after the
# lists are stacked into arrays.
processed_clip_ids = []
failed_clip_ids = []

# Group the metadata once instead of re-filtering the whole table for every
# clip. sort=False keeps groups in order of first appearance, i.e. the same
# order as data['clip_id'].unique().
events_by_clip = data.groupby('clip_id', sort=False)

for clip_id, clip_events in tqdm(events_by_clip, total=events_by_clip.ngroups):
    audio_file = os.path.join(audio_dir, f"composite_clip_{clip_id}.wav")
    mel_spectrogram = extract_mel_spectrogram(audio_file)

    if mel_spectrogram is None:
        # The extractor has already printed the error; just remember the id.
        failed_clip_ids.append(clip_id)
        continue

    # Size the label matrix from the spectrogram actually produced so both
    # always have the same number of frames.
    num_mel_frames = mel_spectrogram.shape[1]
    label_matrix = build_label_matrix(clip_id, num_mel_frames, HOP_LENGTH, SAMPLE_RATE, clip_events)

    features.append(mel_spectrogram)
    labels.append(label_matrix)
    processed_clip_ids.append(clip_id)

if failed_clip_ids:
    print(f"Skipped {len(failed_clip_ids)} clip(s) that failed to load: {failed_clip_ids}")


100%|██████████| 5000/5000 [34:15<00:00,  2.43it/s]


In [None]:
# Stack the per-clip results into single arrays. The names are rebound one
# after the other, so the features list can be released before the labels are
# converted.
features = np.array(features)
labels = np.array(labels)

# Report the resulting shapes; the recorded run printed
# (5000, 256, 2584) for features and (5000, 2584, 10) for labels.
print(f"Features shape: {features.shape}, Labels shape: {labels.shape}")

Features shape: (5000, 256, 2584), Labels shape: (5000, 2584, 10)


In [None]:
# Persist both arrays as .npy files in the current working directory.
np.save(file='features_full.npy', arr=features)
np.save(file='labels_full.npy', arr=labels)