In [1]:
import os
import pandas as pd
import librosa
import numpy as np
from tqdm import tqdm
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

In [3]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("chrisfilo/urbansound8k")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\user\.cache\kagglehub\datasets\chrisfilo\urbansound8k\versions\1


In [None]:
CSV_PATH = os.path.join(path, "UrbanSound8K.csv")

TARGET_DURATION = 4.0
SAMPLE_RATE = 22050

In [6]:
metadata = pd.read_csv(CSV_PATH)
metadata.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [7]:
label_dict = metadata.groupby("classID")["class"].first().to_dict()
label_dict

{0: 'air_conditioner',
 1: 'car_horn',
 2: 'children_playing',
 3: 'dog_bark',
 4: 'drilling',
 5: 'engine_idling',
 6: 'gun_shot',
 7: 'jackhammer',
 8: 'siren',
 9: 'street_music'}

In [None]:
folds = pickle.load(open('../data/folds.pkl', 'rb'))
folds[0]

{'101415-3-0-2.wav': {'data': array([-0.00011485, -0.00017083, -0.00017736, ..., -0.04609928,
         -0.04609471, -0.05099387], dtype=float32),
  'label': 3},
 '101415-3-0-3.wav': {'data': array([-0.00058609, -0.00104031, -0.00083608, ..., -0.00025773,
         -0.00023362, -0.00033846], dtype=float32),
  'label': 3},
 '101415-3-0-8.wav': {'data': array([ 0.23264292,  0.3517679 ,  0.2967721 , ..., -0.002873  ,
         -0.00120905, -0.00118477], dtype=float32),
  'label': 3},
 '102106-3-0-0.wav': {'data': array([ 0.00677376,  0.00218388, -0.00135232, ...,  0.        ,
          0.        ,  0.        ], dtype=float32),
  'label': 3},
 '102305-6-0-0.wav': {'data': array([-1.5769154e-05,  5.2932650e-04,  9.9615753e-04, ...,
          0.0000000e+00,  0.0000000e+00,  0.0000000e+00], dtype=float32),
  'label': 6},
 '102842-3-0-1.wav': {'data': array([ 0.00681507,  0.01001569,  0.00837398, ..., -0.00628984,
         -0.00583232, -0.00689586], dtype=float32),
  'label': 3},
 '102842-3-1-0.w

In [12]:
def extract_features(y, sr = SAMPLE_RATE):
    mfcc = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfcc_mean = np.mean(mfcc, axis=1)

    zcr = np.mean(librosa.feature.zero_crossing_rate(y))
    rmse = np.mean(librosa.feature.rms(y=y))
    contrast = np.mean(librosa.feature.spectral_contrast(y=y, sr=sr), axis=1)

    return np.hstack([mfcc_mean, zcr, rmse, contrast])

In [13]:
sample = folds[0]['101415-3-0-2.wav']

sample_features = extract_features(y = sample['data'])
print(sample_features.shape)
sample_features

(22,)


array([-4.02430267e+02,  9.22634430e+01,  1.93035812e+01, -1.31941862e+01,
       -5.80657101e+00,  2.40858722e+00, -1.48856459e+01, -8.07093048e+00,
       -9.41730309e+00, -6.22067833e+00,  9.73379850e-01, -5.76208973e+00,
        1.65633976e+00,  4.09704660e-02,  6.41270205e-02,  1.87682684e+01,
        1.35025141e+01,  1.65413976e+01,  1.66304977e+01,  1.96487530e+01,
        1.73547718e+01,  4.61629666e+01])

In [18]:
feature_columns = (
    [f'mfcc_{i}' for i in range(1, 14)] +  # 13 MFCC
    ['rms', 'zcr'] +                      # 2 基本統計特徵
    [                                     # 7 Spectral features
        'spectral_centroid',
        'spectral_bandwidth',
        'spectral_contrast',
        'spectral_flatness',
        'spectral_rolloff',
        'chroma_stft',
        'tonnetz'
    ]
)

feature_columns

['mfcc_1',
 'mfcc_2',
 'mfcc_3',
 'mfcc_4',
 'mfcc_5',
 'mfcc_6',
 'mfcc_7',
 'mfcc_8',
 'mfcc_9',
 'mfcc_10',
 'mfcc_11',
 'mfcc_12',
 'mfcc_13',
 'rms',
 'zcr',
 'spectral_centroid',
 'spectral_bandwidth',
 'spectral_contrast',
 'spectral_flatness',
 'spectral_rolloff',
 'chroma_stft',
 'tonnetz']

In [None]:
for i in range(10):
    names, labels = [], []
    features = []

    # Extract features, tqdm
    for filename, info in tqdm(folds[i].items(), desc=f'Fold {i}'):
        names.append(filename)
        labels.append(info['label'])
        extracted_features = extract_features(info['data'])
        features.append(extracted_features)

    # Save data into dataframe
    df = pd.DataFrame(features, columns=feature_columns)
    df.insert(loc = 0, column = 'audio', value = names)
    df['label'] = labels
    df.to_csv(f'../data/orig/fold{i}.csv', index=False)

Fold 0: 100%|██████████| 873/873 [00:09<00:00, 89.10it/s] 
Fold 1: 100%|██████████| 888/888 [00:09<00:00, 93.50it/s] 
Fold 2: 100%|██████████| 925/925 [00:10<00:00, 90.67it/s] 
Fold 3: 100%|██████████| 990/990 [00:10<00:00, 92.93it/s] 
Fold 4: 100%|██████████| 936/936 [00:10<00:00, 93.52it/s] 
Fold 5: 100%|██████████| 823/823 [00:09<00:00, 90.89it/s] 
Fold 6: 100%|██████████| 838/838 [00:09<00:00, 88.82it/s]
Fold 7: 100%|██████████| 806/806 [00:09<00:00, 88.90it/s]
Fold 8: 100%|██████████| 816/816 [00:09<00:00, 88.23it/s]
Fold 9: 100%|██████████| 837/837 [00:09<00:00, 89.16it/s]
