## Feature extraction

In [2]:
!pip install pydub
import os
from pydub import AudioSegment
from glob import glob

from google.colab import drive
# Mount Google Drive so the paths under /content/drive used below exist.
drive.mount('/content/drive')
# Set up input and output directories
input_base_dir = '/content/drive/MyDrive/speech'
output_base_dir = '/content/drive/MyDrive/speech_wav'  # Output to this new directory

# Create output base directory if not exists
os.makedirs(output_base_dir, exist_ok=True)

# Loop over all folders in the base directory; the folder layout of the
# input tree is mirrored one-to-one in the output tree.
for folder_name in os.listdir(input_base_dir):
    folder_path = os.path.join(input_base_dir, folder_name)
    if os.path.isdir(folder_path):
        # Create corresponding output folder
        output_folder = os.path.join(output_base_dir, folder_name)
        os.makedirs(output_folder, exist_ok=True)

        # Process all .m4a files in the folder
        for m4a_file in glob(os.path.join(folder_path, "*.m4a")):
            # Swap only the trailing extension. str.replace would rewrite
            # every ".m4a" occurrence inside the name (e.g. "a.m4a.v2.m4a").
            filename = os.path.splitext(os.path.basename(m4a_file))[0] + ".wav"
            output_file = os.path.join(output_folder, filename)

            # Convert using pydub
            audio = AudioSegment.from_file(m4a_file, format="m4a")
            audio.export(output_file, format="wav")

            print(f"Converted: {m4a_file} → {output_file}")

Collecting pydub
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pydub-0.25.1-py2.py3-none-any.whl (32 kB)
Installing collected packages: pydub
Successfully installed pydub-0.25.1
Mounted at /content/drive
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_1.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_1.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_7.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_7.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_8.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_8.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_4.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_4.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_2.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_2.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_3.m4a → /content/drive/MyDrive/speech_wav/IWTGH/IWTGH_3.wav
Converted: /content/drive/MyDrive/speech/IWTGH/IWTGH_5.m4a → /content/drive/MyDrive/sp

In [3]:
# importing libraries
import librosa as lb
import os
import pandas as pd
import numpy as np
from tqdm import tqdm

In [9]:
import os
import librosa as lb
import numpy as np
from tqdm import tqdm

# Directory whose sub-folders hold the .wav clips to be summarised.
# NOTE(review): a later cell rebuilds `features_data` from scratch, so the
# list produced here is replaced when the notebook is run top to bottom.
root_path = "/content/drive/MyDrive/speech_wav"

features_data = []


def _frame_mean(feature_matrix):
    """Collapse a librosa feature matrix to one averaged value per feature row."""
    return np.mean(feature_matrix.T, axis=0)


for folder_name in os.listdir(root_path):
    folder_path = os.path.join(root_path, folder_name)

    # Only directories contain clips; anything else at this level is ignored.
    if os.path.isdir(folder_path):
        # The progress bar counts every directory entry, not only .wav files.
        for item in tqdm(os.listdir(folder_path), desc=f"Processing {folder_name}"):
            if item.endswith(".wav"):
                item_path = os.path.join(folder_path, item)
                # Prefix the clip name with its folder so names stay unique.
                fileName = f"{folder_name}/{item.split('.wav')[0]}"

                # Load at the file's native sampling rate (no resampling).
                y, sr = lb.load(item_path, sr=None)

                n_fft = 1024  # FFT window used for STFT, MFCC, chroma and contrast

                # Zero-pad clips that are shorter than a single FFT window.
                if len(y) < n_fft:
                    y = np.pad(y, (0, n_fft - len(y)))

                # Magnitude spectrogram shared by chroma and spectral contrast.
                S = np.abs(lb.stft(y, n_fft=n_fft))

                # One averaged vector per feature family.
                mfcc = _frame_mean(lb.feature.mfcc(y=y, sr=sr, n_fft=n_fft))
                mel = _frame_mean(lb.feature.melspectrogram(y=y, sr=sr))
                chroma = _frame_mean(lb.feature.chroma_stft(S=S, sr=sr, n_fft=n_fft))
                contrast = _frame_mean(lb.feature.spectral_contrast(S=S, sr=sr, n_fft=n_fft))

                # Tonnetz is computed on the harmonic component of the signal.
                h = lb.effects.harmonic(y)
                t = _frame_mean(lb.feature.tonnetz(y=h, sr=sr))

                # The empty float64 seed keeps the concatenated vector's dtype
                # the same as building it up piece by piece from np.array([]).
                result = np.hstack((np.array([]), mfcc, mel, chroma, contrast, t))

                features_data.append([fileName, *result])

Processing IWTGH: 100%|██████████| 18/18 [00:13<00:00,  1.32it/s]
Processing IJRTB: 100%|██████████| 18/18 [00:15<00:00,  1.16it/s]
Processing WWIBA: 100%|██████████| 18/18 [00:15<00:00,  1.18it/s]
Processing TNAML: 100%|██████████| 18/18 [00:17<00:00,  1.04it/s]
Processing DYWDI: 100%|██████████| 18/18 [00:15<00:00,  1.15it/s]
Processing HDYFO: 100%|██████████| 18/18 [00:13<00:00,  1.35it/s]
Processing YSURE: 100%|██████████| 18/18 [00:06<00:00,  2.79it/s]
Processing NTHIN: 100%|██████████| 17/17 [00:12<00:00,  1.37it/s]
Processing WDYSM: 100%|██████████| 18/18 [00:08<00:00,  2.01it/s]
Processing TBMBW: 100%|██████████| 17/17 [00:17<00:00,  1.00s/it]


In [11]:
# Root folder path with 10 folders
root_path = "/content/drive/MyDrive/speech_wav"

# FFT window length used for the STFT, MFCC, chroma and spectral-contrast
# features. It is also the minimum clip length: shorter clips are zero-padded
# up to it. Defined once here instead of being re-assigned for every file.
n_fft = 1024

# One row per clip: [clipName, feature_0, feature_1, ...]
features_data = []

# Loop through all folders inside root_path
for folder_name in os.listdir(root_path):
    folder_path = os.path.join(root_path, folder_name)

    # Skip if not a folder
    if not os.path.isdir(folder_path):
        continue

    # Loop through all wav files in the folder (the progress bar counts every
    # directory entry, including any non-.wav files that are skipped below)
    for item in tqdm(os.listdir(folder_path), desc=f"Processing {folder_name}"):
        if not item.endswith(".wav"):
            continue

        item_path = os.path.join(folder_path, item)
        fileName = os.path.splitext(item)[0]  # e.g., YSURE_18

        try:
            # Load at the file's native sampling rate and pad short audio so
            # at least one full FFT window is available.
            y, sr = lb.load(item_path, sr=None)
            if len(y) < n_fft:
                y = np.pad(y, (0, n_fft - len(y)))

            # Magnitude spectrogram shared by chroma and spectral contrast
            S = np.abs(lb.stft(y, n_fft=n_fft))

            # Extract features; each matrix is averaged down to one value per
            # feature row.
            mfcc = np.mean(lb.feature.mfcc(y=y, sr=sr, n_fft=n_fft).T, axis=0)
            # NOTE(review): only the first 108 mel values are kept so the row
            # width matches the column list built in the next cell; the
            # remaining mel bands are discarded. melspectrogram is also called
            # without n_fft, so it uses librosa's default window rather than
            # the n_fft above — confirm both are intended.
            mel = np.mean(lb.feature.melspectrogram(y=y, sr=sr).T, axis=0)[:108]  # Ensure length = 108
            chroma = np.mean(lb.feature.chroma_stft(S=S, sr=sr, n_fft=n_fft).T, axis=0)
            contrast = np.mean(lb.feature.spectral_contrast(S=S, sr=sr, n_fft=n_fft).T, axis=0)
            tonnetz = np.mean(lb.feature.tonnetz(y=lb.effects.harmonic(y), sr=sr).T, axis=0)

            # Combine all features into one flat vector; NaNs become 0 so the
            # table contains only finite numbers.
            result = np.hstack((mfcc, mel, chroma, contrast, tonnetz))
            result = np.nan_to_num(result)

            # Append to dataset
            features_data.append([fileName, *result])

        except Exception as e:
            # Best-effort: report the failing clip and keep processing the
            # rest; the clip is simply absent from features_data.
            print(f" Error processing {fileName}: {e}")
            continue

Processing IWTGH: 100%|██████████| 18/18 [00:13<00:00,  1.33it/s]
Processing IJRTB: 100%|██████████| 18/18 [00:15<00:00,  1.14it/s]
Processing WWIBA: 100%|██████████| 18/18 [00:15<00:00,  1.15it/s]
Processing TNAML: 100%|██████████| 18/18 [00:16<00:00,  1.06it/s]
Processing DYWDI: 100%|██████████| 18/18 [00:15<00:00,  1.13it/s]
Processing HDYFO: 100%|██████████| 18/18 [00:12<00:00,  1.39it/s]
Processing YSURE: 100%|██████████| 18/18 [00:07<00:00,  2.57it/s]
Processing NTHIN: 100%|██████████| 17/17 [00:12<00:00,  1.35it/s]
Processing WDYSM: 100%|██████████| 18/18 [00:08<00:00,  2.16it/s]
Processing TBMBW: 100%|██████████| 17/17 [00:17<00:00,  1.03s/it]


In [13]:
# Define column names
# Widths of the fixed-size feature families in each row of features_data.
n_mfcc, n_chroma, n_contrast, n_tonnetz = 20, 12, 7, 6

# The mel block is whatever remains of a row after the clip name and the
# fixed-size families. Deriving it from the data keeps the column list in step
# with the extraction cell: 108 when the mel vector is truncated to 108 values,
# more if all mel bands are kept. Falls back to 108 for an empty dataset.
if features_data:
    n_mel = (len(features_data[0]) - 1) - (n_mfcc + n_chroma + n_contrast + n_tonnetz)
    if n_mel <= 0:
        raise ValueError(
            f"Feature rows are too short: expected more than "
            f"{n_mfcc + n_chroma + n_contrast + n_tonnetz} values, "
            f"got {len(features_data[0]) - 1}"
        )
else:
    n_mel = 108

# Column suffixes are running indices into the flat feature vector, so each
# family starts where the previous one ended.
mel_start = n_mfcc
chroma_start = mel_start + n_mel
spect_start = chroma_start + n_chroma
tone_start = spect_start + n_contrast

mfcc_feats = [f'mfcc_{i}' for i in range(n_mfcc)]
mel_feats = [f'mel_{i}' for i in range(mel_start, chroma_start)]
chroma_feats = [f'chroma_{i}' for i in range(chroma_start, spect_start)]
spect_feats = [f'spect_{i}' for i in range(spect_start, tone_start)]
# The 'tonnets_' spelling is kept as-is so existing column names stay stable.
tone_feats = [f'tonnets_{i}' for i in range(tone_start, tone_start + n_tonnetz)]

column_names = ['clipName'] + mfcc_feats + mel_feats + chroma_feats + spect_feats + tone_feats

# Create DataFrame
features_df = pd.DataFrame(features_data, columns=column_names)

features_df.head()

Unnamed: 0,clipName,mfcc_0,mfcc_1,mfcc_2,mfcc_3,mfcc_4,mfcc_5,mfcc_6,mfcc_7,mfcc_8,...,spect_143,spect_144,spect_145,spect_146,tonnets_147,tonnets_148,tonnets_149,tonnets_150,tonnets_151,tonnets_152
0,IWTGH_1,-456.161011,173.850739,-0.44593,43.220448,-2.20823,5.250477,2.358801,16.227489,4.849324,...,12.345949,13.21458,16.673017,31.7761,0.028255,-0.018589,0.115786,-0.013089,-0.03185,-0.050886
1,IWTGH_7,-395.242798,171.847427,0.3713,34.765068,-2.50512,14.97781,-3.485596,1.698898,-0.775851,...,12.642402,14.346615,17.441942,34.994604,-0.025708,-0.027114,0.042067,-0.113314,0.005059,-0.014681
2,IWTGH_8,-385.991058,164.279984,-10.153129,42.339943,-1.90993,2.02521,0.001517,1.005306,0.368697,...,12.748602,13.69466,16.364253,36.140007,0.021706,-0.019084,0.15282,-0.025098,-0.027136,-0.037164
3,IWTGH_4,-460.136444,171.376282,-0.733322,44.148335,-5.689566,9.734161,-3.116978,13.025571,3.540501,...,12.829127,13.194763,17.238967,32.216306,0.008253,-0.017944,0.081751,-0.002341,-0.019149,-0.016467
4,IWTGH_2,-403.748169,158.715408,10.69232,27.654495,1.954888,2.280664,0.531536,9.43469,6.756381,...,12.661829,13.684149,17.27658,33.785295,0.000486,0.028319,0.064798,-0.031453,0.001535,-0.01727


In [21]:
# Write the feature table to Drive as CSV, leaving the DataFrame index out of the file.
features_csv_path = '/content/drive/MyDrive/speech_wav/features.csv'
features_df.to_csv(features_csv_path, index=False)