In [123]:
import os
import json
import pandas as pd
import torch
import torchaudio
import numpy as np
from datasets import load_dataset
from datasets import load_from_disk
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split

# Dataset locations are looked up in <cwd>/data/datasets_path.json.
current_directory = os.getcwd()
data_folder_path = os.path.join(current_directory, 'data')
datasets_path_file = os.path.join(current_directory, 'data', 'datasets_path.json')

In [115]:
# Read the JSON mapping of dataset name -> root directory.
with open(datasets_path_file, mode='r', encoding='utf-8') as file:
    datasets_path = json.loads(file.read())

# A dataset that is absent from the JSON file resolves to None.
path_to_UrbanSound8K = datasets_path.get("UrbanSound8K")
path_to_Cat = datasets_path.get("CatMeow")
path_to_ESC = datasets_path.get("ESC50")
path_to_minds14 = datasets_path.get("MINDS14")

### Audio preprocessing helper functions

In [15]:
def to_mono_if_stereo(waveform):
    """Collapse a multi-channel waveform to a single channel.

    Args:
        waveform: Tensor whose first dimension is the channel axis.

    Returns:
        The input object itself when it has at most one channel; otherwise the
        mean over the channel axis, with that axis kept at size 1.
    """
    if waveform.shape[0] <= 1:
        return waveform
    return waveform.mean(dim=0, keepdim=True)

def resample_if_not_sample_rate(waveform, sr, target_sample_rate=16000):
    """Resample a waveform to ``target_sample_rate`` when its rate differs.

    This is a module-level function, so there is no ``self`` to read the
    target rate from; it is taken as an explicit argument instead.

    Args:
        waveform: Audio tensor accepted by ``torchaudio.transforms.Resample``.
        sr: Sample rate of ``waveform``.
        target_sample_rate: Desired sample rate. Defaults to 16000, the
            default ``sample_rate`` of the dataset classes in this notebook.

    Returns:
        The input object itself when ``sr`` already equals
        ``target_sample_rate``; otherwise the resampled tensor.
    """
    if sr == target_sample_rate:
        return waveform
    resampler = torchaudio.transforms.Resample(orig_freq=sr, new_freq=target_sample_rate)
    return resampler(waveform)

def make_fixed_length(waveform, num_samples=160000):
    """Pad or randomly crop a waveform so its last axis has ``num_samples`` items.

    This is a module-level function, so there is no ``self`` to read the
    target length from; it is taken as an explicit argument instead.

    Args:
        waveform: 2-D tensor indexed as (channel, sample).
        num_samples: Desired length of the sample axis. Defaults to 160000,
            the default ``num_samples`` of the dataset classes in this notebook.

    Returns:
        The input object itself when it already has the requested length;
        a right-zero-padded copy when it is shorter; a randomly positioned
        contiguous window (drawn with ``np.random``) when it is longer.
    """
    length = waveform.shape[1]
    if length < num_samples:
        # Too short: append zeros on the right.
        return torch.nn.functional.pad(waveform, (0, num_samples - length))
    if length > num_samples:
        # Too long: pick a random window. randint's upper bound is exclusive,
        # so +1 keeps the last valid start offset reachable.
        start = np.random.randint(0, length - num_samples + 1)
        return waveform[:, start:start + num_samples]
    return waveform

### Define Datasets Classes

#### UrbanSound Dataset

In [22]:
class UrbanSoundDataset(Dataset):
    """UrbanSound8K clips served as fixed-length mono waveforms with integer labels.

    Args:
        metadata_file: Path to a CSV with at least the columns
            ``slice_file_name``, ``fold`` and ``class``.
        audio_dir: Directory that contains the ``fold<N>`` sub-directories.
        sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is padded/cropped to.
        transform: Optional callable applied to the preprocessed waveform.
    """

    def __init__(self, metadata_file, audio_dir, sample_rate=16000, num_samples=160000, transform=None):
        self.metadata = pd.read_csv(metadata_file)
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.transform = transform
        # Map class labels to integers. The mapping is derived from this
        # metadata file only, in sorted label order.
        self.class_mapping = {label: idx for idx, label in enumerate(sorted(self.metadata['class'].unique()))}

    def __len__(self):
        return len(self.metadata)

    def _preprocess(self, waveform, sr):
        """Return ``waveform`` as mono, at ``self.sample_rate``, with ``self.num_samples`` samples.

        Preprocessing lives on the instance so that it honours this dataset's
        own ``sample_rate`` and ``num_samples`` settings.
        """
        if waveform.shape[0] > 1:
            # Average the channels, keeping the channel axis.
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
        length = waveform.shape[1]
        if length < self.num_samples:
            # Pad with zeros on the right if too short.
            waveform = torch.nn.functional.pad(waveform, (0, self.num_samples - length))
        elif length > self.num_samples:
            # Randomly crop if too long; +1 because randint's upper bound is exclusive.
            start = np.random.randint(0, length - self.num_samples + 1)
            waveform = waveform[:, start:start + self.num_samples]
        return waveform

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(waveform, label)``."""
        row = self.metadata.iloc[idx]  # look the row up once, reuse it below
        audio_path = os.path.join(self.audio_dir, f"fold{row['fold']}", row['slice_file_name'])
        # TODO: load once up front, then for each item fetch by idx
        waveform, sr = torchaudio.load(audio_path)  # waveform and its sample rate
        waveform = self._preprocess(waveform, sr)

        if self.transform is not None:
            waveform = self.transform(waveform)

        label = self.class_mapping[row['class']]

        return waveform, label

#### MINDS-14 Dataset

In [127]:
class MINDS14Dataset(Dataset):
    """One split of a locally saved MINDS-14 dataset as fixed-length waveforms.

    Args:
        dataset_path: Directory passed to ``load_from_disk``.
        split: Key of the split to read from the loaded dataset.
        sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is padded/cropped to.
        transform: Optional callable applied to the preprocessed waveform.
    """

    def __init__(self, dataset_path, split='train', sample_rate=16000, num_samples=160000, transform=None):
        self.dataset_path = dataset_path
        self.split = split
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.transform = transform
        self.dataset = load_from_disk(dataset_path)
        self.data = self.dataset[split]
        # Distinct intent ids in sorted order, and their position in that order.
        self.intents = sorted(set(self.data['intent_class']))
        self.intent_to_idx = {intent: idx for idx, intent in enumerate(self.intents)}

    def __len__(self):
        return len(self.data)

    def _preprocess(self, waveform, sr):
        """Return ``waveform`` as mono, at ``self.sample_rate``, with ``self.num_samples`` samples.

        Preprocessing lives on the instance so that it honours this dataset's
        own ``sample_rate`` and ``num_samples`` settings.
        """
        if waveform.shape[0] > 1:
            # Average the channels, keeping the channel axis.
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
        length = waveform.shape[1]
        if length < self.num_samples:
            # Pad with zeros on the right if too short.
            waveform = torch.nn.functional.pad(waveform, (0, self.num_samples - length))
        elif length > self.num_samples:
            # Randomly crop if too long; +1 because randint's upper bound is exclusive.
            start = np.random.randint(0, length - self.num_samples + 1)
            waveform = waveform[:, start:start + self.num_samples]
        return waveform

    def __getitem__(self, idx):
        """Return ``(waveform, label)`` for example ``idx``."""
        example = self.data[idx]  # fetch the row once instead of once per field
        audio_data = example['audio']
        sr = audio_data['sampling_rate']
        # The audio arrives as a plain array; convert it to a float32 tensor
        # and add a leading channel axis.
        waveform = torch.as_tensor(np.asarray(audio_data['array']), dtype=torch.float32).unsqueeze(0)
        waveform = self._preprocess(waveform, sr)

        if self.transform is not None:
            waveform = self.transform(waveform)

        label = self.intent_to_idx[example['intent_class']]

        return waveform, label

#### CatMeow Dataset

In [28]:
class CatMeowDataset(Dataset):
    """CatMeow recordings as fixed-length mono waveforms labelled by emission context.

    Args:
        metadata_df: DataFrame with at least the columns ``filename`` and ``context``.
        audio_dir: Directory containing the audio files named in ``filename``.
        sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is padded/cropped to.
        transform: Optional callable applied to the preprocessed waveform.
    """

    def __init__(self, metadata_df, audio_dir, sample_rate=16000, num_samples=160000, transform=None):
        self.metadata = metadata_df
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.transform = transform
        # Map class labels to integers (using emission context as class).
        # The mapping is derived from this DataFrame only, in sorted label order.
        self.class_mapping = {label: idx for idx, label in enumerate(sorted(self.metadata['context'].unique()))}

    def __len__(self):
        return len(self.metadata)

    def _preprocess(self, waveform, sr):
        """Return ``waveform`` as mono, at ``self.sample_rate``, with ``self.num_samples`` samples.

        Preprocessing lives on the instance so that it honours this dataset's
        own ``sample_rate`` and ``num_samples`` settings.
        """
        if waveform.shape[0] > 1:
            # Average the channels, keeping the channel axis.
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
        length = waveform.shape[1]
        if length < self.num_samples:
            # Pad with zeros on the right if too short.
            waveform = torch.nn.functional.pad(waveform, (0, self.num_samples - length))
        elif length > self.num_samples:
            # Randomly crop if too long; +1 because randint's upper bound is exclusive.
            start = np.random.randint(0, length - self.num_samples + 1)
            waveform = waveform[:, start:start + self.num_samples]
        return waveform

    def __getitem__(self, idx):
        """Load recording ``idx`` and return ``(waveform, label)``."""
        row = self.metadata.iloc[idx]  # positional lookup; the index may be shuffled
        audio_path = os.path.join(self.audio_dir, row['filename'])

        waveform, sr = torchaudio.load(audio_path)
        waveform = self._preprocess(waveform, sr)

        if self.transform is not None:
            waveform = self.transform(waveform)

        # The label column is 'context', the same column class_mapping is built from.
        label = self.class_mapping[row['context']]

        return waveform, label

#### ESC-50 Dataset

In [31]:
class ESC50Dataset(Dataset):
    """ESC-50 clips as fixed-length mono waveforms with integer category labels.

    Args:
        metadata_df: DataFrame with at least the columns ``filename`` and ``category``.
        audio_dir: Directory containing the audio files named in ``filename``.
        sample_rate: Sample rate every clip is resampled to.
        num_samples: Number of samples every clip is padded/cropped to.
        transform: Optional callable applied to the preprocessed waveform.
    """

    def __init__(self, metadata_df, audio_dir, sample_rate=16000, num_samples=160000, transform=None):
        self.metadata = metadata_df
        self.audio_dir = audio_dir
        self.sample_rate = sample_rate
        self.num_samples = num_samples
        self.transform = transform
        # Map class labels to integers. The mapping is derived from this
        # DataFrame only, in sorted label order.
        self.class_mapping = {label: idx for idx, label in enumerate(sorted(self.metadata['category'].unique()))}

    def __len__(self):
        return len(self.metadata)

    def _preprocess(self, waveform, sr):
        """Return ``waveform`` as mono, at ``self.sample_rate``, with ``self.num_samples`` samples.

        Preprocessing lives on the instance so that it honours this dataset's
        own ``sample_rate`` and ``num_samples`` settings.
        """
        if waveform.shape[0] > 1:
            # Average the channels, keeping the channel axis.
            waveform = waveform.mean(dim=0, keepdim=True)
        if sr != self.sample_rate:
            waveform = torchaudio.transforms.Resample(sr, self.sample_rate)(waveform)
        length = waveform.shape[1]
        if length < self.num_samples:
            # Pad with zeros on the right if too short.
            waveform = torch.nn.functional.pad(waveform, (0, self.num_samples - length))
        elif length > self.num_samples:
            # Randomly crop if too long; +1 because randint's upper bound is exclusive.
            start = np.random.randint(0, length - self.num_samples + 1)
            waveform = waveform[:, start:start + self.num_samples]
        return waveform

    def __getitem__(self, idx):
        """Load clip ``idx`` and return ``(waveform, label)``."""
        row = self.metadata.iloc[idx]  # positional lookup; the index may be shuffled
        audio_path = os.path.join(self.audio_dir, row['filename'])

        waveform, sr = torchaudio.load(audio_path)
        waveform = self._preprocess(waveform, sr)

        if self.transform is not None:
            waveform = self.transform(waveform)

        label = self.class_mapping[row['category']]

        return waveform, label

### Preparation Functions per Dataset

#### UrbanSound Dataset

In [85]:
def prepare_urbansound_dataset(path_to_UrbanSound8K, batch_size=32, num_workers=4):
    """
    Prepare the UrbanSound8K dataset for training, validation, and testing.

    Reads ``metadata/UrbanSound8K.csv``, makes a stratified 60/20/20 split on
    the ``class`` column, writes the three splits next to the original
    metadata as ``train.csv`` / ``val.csv`` / ``test.csv`` and wraps each one
    in a ``DataLoader``.

    Args:
        path_to_UrbanSound8K (str): Path to the UrbanSound8K dataset root directory
        batch_size (int): Batch size for data loaders
        num_workers (int): Number of worker processes per data loader; pass 0
            to load in the calling process.

    Returns:
        train_loader, val_loader, test_loader, class_mapping
    """
    metadata_dir = os.path.join(path_to_UrbanSound8K, 'metadata')
    audio_dir = os.path.join(path_to_UrbanSound8K, 'audio')

    # Load metadata
    metadata = pd.read_csv(os.path.join(metadata_dir, 'UrbanSound8K.csv'))

    # Create stratified splits.
    # NOTE(review): this ignores the 'fold' column shipped with the metadata;
    # confirm that a random split (rather than a fold-based one) is intended.
    train_val_meta, test_meta = train_test_split(
        metadata, test_size=0.2, random_state=42, stratify=metadata['class']
    )
    train_meta, val_meta = train_test_split(
        train_val_meta, test_size=0.25, random_state=42, stratify=train_val_meta['class']
    )  # 0.25 x 0.8 = 0.2 of total

    # Save splits; UrbanSoundDataset reads its metadata back from these files.
    split_files = {}
    for split_name, split_meta in (('train', train_meta), ('val', val_meta), ('test', test_meta)):
        split_files[split_name] = os.path.join(metadata_dir, f'{split_name}.csv')
        split_meta.to_csv(split_files[split_name], index=False)

    # Define data augmentation transformations (training split only).
    # NOTE(review): these are spectrogram-masking transforms, but the dataset
    # yields raw waveforms of shape (1, num_samples) - confirm this is intended.
    train_transform = torch.nn.Sequential(
        torchaudio.transforms.FrequencyMasking(freq_mask_param=15),
        torchaudio.transforms.TimeMasking(time_mask_param=35)
    )

    # Create datasets
    train_dataset = UrbanSoundDataset(split_files['train'], audio_dir, transform=train_transform)
    val_dataset = UrbanSoundDataset(split_files['val'], audio_dir)
    test_dataset = UrbanSoundDataset(split_files['test'], audio_dir)

    # Create data loaders; only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader, train_dataset.class_mapping

#### MINDS-14 Dataset

In [131]:
def prepare_minds14_local_dataset(dataset_path, batch_size=32, num_workers=4):
    """
    Prepare MINDS-14 dataset for training from local files

    Only the ``'train'`` split of the saved dataset is read; it is divided
    here into stratified 60/20/20 train/validation/test subsets so that all
    three returned loaders are built by this function (none is taken from
    notebook-level state).

    Args:
        dataset_path: Path to the downloaded dataset
        batch_size: Batch size for dataloaders
        num_workers: Number of worker processes per data loader; pass 0 to
            load in the calling process.

    Returns:
        train_loader, val_loader, test_loader, intents, intent_to_idx
    """
    # One dataset object backs all three subsets, so they share one label mapping.
    full_dataset = MINDS14Dataset(
        dataset_path,
        split='train',
    )

    # Split example indices, stratified by intent.
    labels = list(full_dataset.data['intent_class'])
    all_indices = list(range(len(full_dataset)))
    train_val_idx, test_idx = train_test_split(
        all_indices, test_size=0.2, random_state=42, stratify=labels
    )
    train_idx, val_idx = train_test_split(
        train_val_idx, test_size=0.25, random_state=42,
        stratify=[labels[i] for i in train_val_idx]
    )  # 0.25 x 0.8 = 0.2 of total

    train_subset = torch.utils.data.Subset(full_dataset, train_idx)
    val_subset = torch.utils.data.Subset(full_dataset, val_idx)
    test_subset = torch.utils.data.Subset(full_dataset, test_idx)

    # Create data loaders; only the training loader shuffles.
    train_loader = DataLoader(
        train_subset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=num_workers
    )
    val_loader = DataLoader(
        val_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )
    test_loader = DataLoader(
        test_subset,
        batch_size=batch_size,
        shuffle=False,
        num_workers=num_workers
    )

    return train_loader, val_loader, test_loader, full_dataset.intents, full_dataset.intent_to_idx

#### CatMeow Dataset

In [68]:
def parse_cat_meow_filename(filename):
    """
    Derive the emission context of a CatMeow recording from its filename.

    Filenames follow the pattern C_NNNNN_BB_SS_OOOOO_RXX:
    * C = emission context (B = brushing; F = waiting for food; I = isolation)
    * NNNNN = cat's unique ID
    * BB = breed (MC = Maine Coon; EU = European Shorthair)
    * SS = sex (FI = female, intact; FN = female, neutered; MI = male, intact; MN = male, neutered)
    * OOOOO = cat owner's unique ID
    * R = recording session (1, 2 or 3)
    * XX = vocalization counter (01..99)

    Only the leading context letter is interpreted; any other first character
    (or an empty name) is reported as 'unknown'.

    Returns:
        dict with the keys 'filename' and 'context'.
    """
    context_by_prefix = {
        'B': 'brushing',
        'F': 'waiting_for_food',
        'I': 'isolation',
    }
    # filename[:1] is '' for an empty name, which falls through to the default.
    context = context_by_prefix.get(filename[:1], 'unknown')
    return {'filename': filename, 'context': context}

In [72]:
def prepare_catmeow_dataset(audio_dir_path, batch_size=32, num_workers=4):
    """
    Prepare the CatMeow dataset for training, validation, and testing.

    Builds the metadata from the ``.wav`` filenames in ``audio_dir_path``,
    makes a 60/20/20 split (stratified by context when at least two contexts
    are present), writes the metadata and the splits to a ``metadata``
    directory next to the audio directory, and wraps each split in a
    ``DataLoader``.

    Args:
        audio_dir_path (str): Path to the dataset root directory
        batch_size (int): Batch size for data loaders
        num_workers (int): Number of worker processes per data loader; pass 0
            to load in the calling process.

    Returns:
        train_loader, val_loader, test_loader, class_mapping

    Raises:
        FileNotFoundError: If ``audio_dir_path`` contains no ``.wav`` files.
    """
    # Sort so the seeded split does not depend on the OS directory order.
    audio_files = sorted(f for f in os.listdir(audio_dir_path) if f.endswith('.wav'))
    if not audio_files:
        raise FileNotFoundError(f"No .wav files found in {audio_dir_path}")

    # Parse metadata from filenames
    metadata_df = pd.DataFrame([parse_cat_meow_filename(filename) for filename in audio_files])

    print("First few rows of metadata:")
    print(metadata_df.head())

    # Check if we have at least one file from each context
    context_counts = metadata_df['context'].value_counts()
    print(f"Context counts: {context_counts}")

    # The output directory is needed by both branches below, so create it
    # before branching, and save the full metadata.
    metadata_dir = os.path.join(os.path.dirname(audio_dir_path), 'metadata')
    os.makedirs(metadata_dir, exist_ok=True)
    metadata_df.to_csv(os.path.join(metadata_dir, 'CatMeow_metadata.csv'), index=False)

    if len(context_counts) < 2:
        print("Warning: Not enough different contexts for stratification!")
        # If we don't have enough contexts, just do a random split without stratification
        train_meta, test_meta = train_test_split(
            metadata_df, test_size=0.2, random_state=42
        )
        train_meta, val_meta = train_test_split(
            train_meta, test_size=0.25, random_state=42
        )
    else:
        # Create stratified splits
        train_meta, test_meta = train_test_split(
            metadata_df, test_size=0.2, random_state=42, stratify=metadata_df['context']
        )
        train_meta, val_meta = train_test_split(
            train_meta, test_size=0.25, random_state=42, stratify=train_meta['context']
        )  # 0.25 x 0.8 = 0.2 of total

    # Save splits
    train_meta.to_csv(os.path.join(metadata_dir, 'train.csv'), index=False)
    val_meta.to_csv(os.path.join(metadata_dir, 'val.csv'), index=False)
    test_meta.to_csv(os.path.join(metadata_dir, 'test.csv'), index=False)

    # Create datasets
    train_dataset = CatMeowDataset(train_meta, audio_dir_path)
    val_dataset = CatMeowDataset(val_meta, audio_dir_path)
    test_dataset = CatMeowDataset(test_meta, audio_dir_path)

    # Create data loaders; only the training loader shuffles.
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader, train_dataset.class_mapping

#### ESC-50 Dataset

In [75]:
def prepare_esc50_dataset(esc50_path, batch_size=32, num_workers=4):
    """
    Prepare the ESC-50 dataset for training, validation, and testing.

    Reads ``meta/esc50.csv``. When a ``fold`` column is present, fold 5 is the
    test set and folds 1-4 are split 75/25 into train/validation (stratified
    by category); otherwise a stratified 60/20/20 random split is used. The
    splits are written to ``meta/train.csv`` / ``val.csv`` / ``test.csv``.

    Args:
        esc50_path (str): Path to the ESC-50 dataset root directory
        batch_size (int): Batch size for data loaders
        num_workers (int): Number of worker processes per data loader; pass 0
            to load in the calling process.

    Returns:
        train_loader, val_loader, test_loader, class_mapping

    Raises:
        ValueError: If the metadata has no ``filename`` column, or has neither
            a ``category`` nor a ``target`` column.
    """
    audio_dir = os.path.join(esc50_path, 'audio')
    meta_dir = os.path.join(esc50_path, 'meta')

    metadata_df = pd.read_csv(os.path.join(meta_dir, 'esc50.csv'))

    if 'category' not in metadata_df.columns and 'target' in metadata_df.columns:
        # map target numbers to category names if we have a target column but no category
        metadata_df['category'] = metadata_df['target'].apply(lambda x: f'class_{x}')

    # 'fold' is optional (a random split is used without it), so only the
    # columns the dataset class actually reads are mandatory. Check after the
    # category fallback so that a missing 'filename' is always reported.
    missing_columns = [col for col in ('filename', 'category') if col not in metadata_df.columns]
    if missing_columns:
        raise ValueError(f"Metadata is missing required columns: {missing_columns}")

    if 'fold' in metadata_df.columns:
        # Use folds 1-4 for training, fold 5 for testing (the standard ESC-50 split)
        train_val_meta = metadata_df[metadata_df['fold'] <= 4].copy()
        test_meta = metadata_df[metadata_df['fold'] == 5].copy()

        # Split training into train and validation
        train_meta, val_meta = train_test_split(
            train_val_meta, test_size=0.25, random_state=42,
            stratify=train_val_meta['category']
        )
    else:
        # If no fold information, do a standard split
        train_meta, test_meta = train_test_split(
            metadata_df, test_size=0.2, random_state=42,
            stratify=metadata_df['category']
        )
        train_meta, val_meta = train_test_split(
            train_meta, test_size=0.25, random_state=42,
            stratify=train_meta['category']
        )  # 0.25 x 0.8 = 0.2 of total

    # save splits (meta_dir already exists: esc50.csv was just read from it)
    train_meta.to_csv(os.path.join(meta_dir, 'train.csv'), index=False)
    val_meta.to_csv(os.path.join(meta_dir, 'val.csv'), index=False)
    test_meta.to_csv(os.path.join(meta_dir, 'test.csv'), index=False)

    # create datasets
    train_dataset = ESC50Dataset(train_meta, audio_dir)
    val_dataset = ESC50Dataset(val_meta, audio_dir)
    test_dataset = ESC50Dataset(test_meta, audio_dir)

    # create data loaders; only the training loader shuffles
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=num_workers)

    return train_loader, val_loader, test_loader, train_dataset.class_mapping

### Some Tests

#### ESC-50 Dataset

In [43]:
# Smoke test: build the ESC-50 loaders and show the label -> index mapping.
# Rebinds the notebook-level train_loader / val_loader / test_loader / class_mapping names.
train_loader, val_loader, test_loader, class_mapping = prepare_esc50_dataset(path_to_ESC)
print(f"Class mapping: {class_mapping}")
print(f"Number of classes: {len(class_mapping)}")

Class mapping: {'airplane': 0, 'breathing': 1, 'brushing_teeth': 2, 'can_opening': 3, 'car_horn': 4, 'cat': 5, 'chainsaw': 6, 'chirping_birds': 7, 'church_bells': 8, 'clapping': 9, 'clock_alarm': 10, 'clock_tick': 11, 'coughing': 12, 'cow': 13, 'crackling_fire': 14, 'crickets': 15, 'crow': 16, 'crying_baby': 17, 'dog': 18, 'door_wood_creaks': 19, 'door_wood_knock': 20, 'drinking_sipping': 21, 'engine': 22, 'fireworks': 23, 'footsteps': 24, 'frog': 25, 'glass_breaking': 26, 'hand_saw': 27, 'helicopter': 28, 'hen': 29, 'insects': 30, 'keyboard_typing': 31, 'laughing': 32, 'mouse_click': 33, 'pig': 34, 'pouring_water': 35, 'rain': 36, 'rooster': 37, 'sea_waves': 38, 'sheep': 39, 'siren': 40, 'sneezing': 41, 'snoring': 42, 'thunderstorm': 43, 'toilet_flush': 44, 'train': 45, 'vacuum_cleaner': 46, 'washing_machine': 47, 'water_drops': 48, 'wind': 49}
Number of classes: 50


#### CatMeow Dataset

In [83]:
# Smoke test: build the CatMeow loaders and show the context -> index mapping.
# Rebinds the notebook-level train_loader / val_loader / test_loader / class_mapping names.
train_loader, val_loader, test_loader, class_mapping = prepare_catmeow_dataset(path_to_Cat)
print(f"Class mapping: {class_mapping}")
print(f"Number of classes: {len(class_mapping)}")

First few rows of metadata:
                      filename   context
0  B_ANI01_MC_FN_SIM01_101.wav  brushing
1  B_ANI01_MC_FN_SIM01_102.wav  brushing
2  B_ANI01_MC_FN_SIM01_103.wav  brushing
3  B_ANI01_MC_FN_SIM01_301.wav  brushing
4  B_ANI01_MC_FN_SIM01_302.wav  brushing
Context counts: context
isolation           221
brushing            127
waiting_for_food     92
Name: count, dtype: int64
Class mapping: {'brushing': 0, 'isolation': 1, 'waiting_for_food': 2}
Number of classes: 3


#### UrbanSound Dataset

In [91]:
# Smoke test: build the UrbanSound8K loaders and show the label -> index mapping.
# Rebinds the notebook-level train_loader / val_loader / test_loader / class_mapping names.
train_loader, val_loader, test_loader, class_mapping = prepare_urbansound_dataset(path_to_UrbanSound8K)
print(f"Class mapping: {class_mapping}")
print(f"Number of classes: {len(class_mapping)}")

Class mapping: {'air_conditioner': 0, 'car_horn': 1, 'children_playing': 2, 'dog_bark': 3, 'drilling': 4, 'engine_idling': 5, 'gun_shot': 6, 'jackhammer': 7, 'siren': 8, 'street_music': 9}
Number of classes: 10


#### MINDS-14 Dataset

In [133]:
# Smoke test: build the MINDS-14 loaders and list the intent classes.
# Rebinds the notebook-level train_loader / val_loader / test_loader names.
train_loader, val_loader, test_loader, intents, intent_to_idx = prepare_minds14_local_dataset(path_to_minds14)

# Print dataset info
print(f"Number of classes: {len(intents)}")
print(f"Intent classes: {intents}")

Number of classes: 14
Intent classes: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]
