## Datasets Used

- Crema - [https://www.kaggle.com/datasets/ejlok1/cremad](https://www.kaggle.com/datasets/ejlok1/cremad) 
- Ravdess - [https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio](https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio)
- Savee - [https://www.kaggle.com/datasets/ejlok1/surrey-audiovisual-expressed-emotion-savee](https://www.kaggle.com/datasets/ejlok1/surrey-audiovisual-expressed-emotion-savee)
- Tess - [https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess](https://www.kaggle.com/datasets/ejlok1/toronto-emotional-speech-set-tess)


In [1]:
import os
import shutil
import pandas as pd
import json
from tqdm.notebook import tqdm

In [2]:
def load_datasets(base_path):
    """
    Consolidated function to load all datasets and merge them.

    Scans the Crema, Savee, Tess and Ravdess folders under
    ``<base_path>/speech-emotion-recognition-en``, derives an emotion label
    from each file name, and returns one row per labelled file (in that
    dataset order). Directory listings are sorted so the row order is
    reproducible. Files whose names do not follow the dataset's naming
    pattern, or whose emotion code is unknown, are skipped.

    Args:
        base_path: Base path to the datasets

    Returns:
        DataFrame with Path and Emotions columns
    """
    print("Loading datasets...")
    dataset_paths = {
        'crema': os.path.join(base_path, "speech-emotion-recognition-en/Crema"),
        'ravdess': os.path.join(base_path, "speech-emotion-recognition-en/Ravdess/audio_speech_actors_01-24"),
        'savee': os.path.join(base_path, "speech-emotion-recognition-en/Savee"),
        'tess': os.path.join(base_path, "speech-emotion-recognition-en/Tess")
    }

    # File-name code -> emotion label, one table per dataset.
    crema_codes = {
        'SAD': 'sad', 'ANG': 'angry', 'DIS': 'disgust',
        'FEA': 'fear', 'HAP': 'happy', 'NEU': 'neutral',
    }
    savee_codes = {
        'a': 'angry', 'd': 'disgust', 'f': 'fear', 'h': 'happy',
        'n': 'neutral', 'sa': 'sad', 'su': 'surprise',
    }
    ravdess_codes = {
        1: 'neutral', 2: 'neutral', 3: 'happy', 4: 'sad',
        5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise',
    }

    records = []

    # Load CREMA dataset: the emotion code is the third '_'-separated field.
    for file in sorted(os.listdir(dataset_paths['crema'])):
        pieces = file.split('_')
        emotion = crema_codes.get(pieces[2]) if len(pieces) > 2 else None
        if emotion:
            records.append({'Path': os.path.join(dataset_paths['crema'], file), 'Emotions': emotion})

    # Load SAVEE dataset: the second '_'-separated field is the emotion
    # letters followed by six trailing characters, which are stripped.
    for file in sorted(os.listdir(dataset_paths['savee'])):
        pieces = file.split('_')
        emotion = savee_codes.get(pieces[1][:-6]) if len(pieces) > 1 else None
        if emotion:
            records.append({'Path': os.path.join(dataset_paths['savee'], file), 'Emotions': emotion})

    # Load TESS dataset: one level of sub-folders; the label is the third
    # '_'-separated field of the file stem, used verbatim except 'ps'.
    for folder in sorted(os.listdir(dataset_paths['tess'])):
        folder_path = os.path.join(dataset_paths['tess'], folder)
        if not os.path.isdir(folder_path):
            continue
        for file in sorted(os.listdir(folder_path)):
            pieces = file.split('.')[0].split('_')
            if len(pieces) < 3:
                continue
            emotion = 'surprise' if pieces[2] == 'ps' else pieces[2]
            if emotion:
                records.append({'Path': os.path.join(folder_path, file), 'Emotions': emotion})

    # Load RAVDESS dataset: one sub-folder per listing entry; the emotion is
    # the third '-'-separated field of the file stem, as an integer code.
    for actor in sorted(os.listdir(dataset_paths['ravdess'])):
        actor_path = os.path.join(dataset_paths['ravdess'], actor)
        if not os.path.isdir(actor_path):
            continue
        for file in sorted(os.listdir(actor_path)):
            pieces = file.split('.')[0].split('-')
            try:
                emotion_code = int(pieces[2])
            except (IndexError, ValueError):
                continue
            emotion = ravdess_codes.get(emotion_code)
            if emotion:
                records.append({'Path': os.path.join(actor_path, file), 'Emotions': emotion})

    # Create the merged DataFrame. Explicit columns keep the schema stable
    # even when no file was labelled at all.
    print("Creating merged dataset...")
    merged_df = pd.DataFrame(records, columns=['Path', 'Emotions'])

    # Verify all paths exist. The total is captured BEFORE filtering so the
    # "removed" figure reflects what was actually dropped.
    total_before = len(merged_df)
    print(f"Checking file paths... ({total_before} total)")
    exists_mask = merged_df['Path'].map(os.path.exists).astype(bool)
    merged_df = merged_df[exists_mask].reset_index(drop=True)
    print(f"Files verified: {len(merged_df)} (removed {total_before - len(merged_df)} invalid paths)")

    return merged_df

In [3]:
# Load datasets
# base_path is the directory that contains the "speech-emotion-recognition-en"
# folder; load_datasets() appends the per-dataset sub-folders itself.
# df is the merged table (Path, Emotions) that the later cells consume.
base_path = '/kaggle/input/'
df = load_datasets(base_path)

Loading datasets...
Creating merged dataset...
Checking file paths... (12162 total)
Files verified: 12162 (removed 0 invalid paths)


In [4]:
def organize_ser_dataset(df, target_dir='/kaggle/working/merged_ser_dataset'):
    """
    Organize speech emotion recognition files into a new directory structure.

    Every file listed in ``df`` is copied (the sources are left in place) to
    ``<target_dir>/<emotion>/<dataset>_<original stem>.wav``. If two different
    source files would land on the same destination, later ones get a numeric
    suffix (``_1``, ``_2``, ...) instead of overwriting the earlier copy.
    A ``file_mapping.csv`` and a ``README.md`` are written into ``target_dir``.

    Copy errors do not abort the run: the row's ``NewPath`` is left as None,
    the error is printed, and a summary warning is printed at the end.

    Args:
        df: DataFrame with Path and Emotions columns
        target_dir: Target directory for organized files

    Returns:
        DataFrame with original and new paths (a copy of ``df`` plus a
        ``NewPath`` column; ``df`` itself is not modified)
    """
    # Create target directory
    os.makedirs(target_dir, exist_ok=True)

    # Create subdirectories for each emotion
    for emotion in df['Emotions'].unique():
        os.makedirs(os.path.join(target_dir, emotion), exist_ok=True)

    # Substring of the source path -> dataset prefix; first match wins.
    dataset_markers = (
        ('Crema', 'crema'),
        ('Ravdess', 'ravdess'),
        ('Savee', 'savee'),
        ('Tess', 'tess'),
    )

    new_paths = []
    claimed = {}          # destination path -> source path copied there in this run
    failed_count = 0
    disambiguated_count = 0

    print(f"Moving and renaming files to {target_dir}...")
    for src_path, emotion in tqdm(zip(df['Path'], df['Emotions']), total=len(df)):
        # Determine dataset source
        dataset = 'unknown'
        for marker, label in dataset_markers:
            if marker in src_path:
                dataset = label
                break

        # New filename: dataset_originalname.wav in the emotion subdirectory
        orig_filename = os.path.splitext(os.path.basename(src_path))[0]
        stem = f"{dataset}_{orig_filename}"
        emotion_dir = os.path.join(target_dir, emotion)
        dst_path = os.path.join(emotion_dir, f"{stem}.wav")

        # Never let a different source overwrite a file copied earlier in this
        # run. Only this run's copies are tracked (not what is on disk), so
        # re-running the cell still overwrites its own previous output.
        suffix = 0
        while dst_path in claimed and claimed[dst_path] != src_path:
            suffix += 1
            dst_path = os.path.join(emotion_dir, f"{stem}_{suffix}.wav")
        if suffix:
            disambiguated_count += 1

        # Copy file
        try:
            shutil.copy2(src_path, dst_path)
        except Exception as e:
            print(f"Error copying {src_path} to {dst_path}: {e}")
            new_paths.append(None)
            failed_count += 1
        else:
            claimed[dst_path] = src_path
            new_paths.append(dst_path)

    # Add new paths to dataframe
    df_with_paths = df.copy()
    df_with_paths['NewPath'] = new_paths

    # Save mapping file
    df_with_paths.to_csv(os.path.join(target_dir, 'file_mapping.csv'), index=False)

    # Create README file with dataset information
    with open(os.path.join(target_dir, 'README.md'), 'w', encoding='utf-8') as f:
        f.write("# Speech Emotion Recognition Dataset\n\n")

        f.write("## Overview\n\n")
        f.write("This dataset is a merged collection from CREMA, RAVDESS, SAVEE, and TESS datasets.\n")
        f.write("It contains audio files labeled with 7 emotions: angry, disgust, fear, happy, neutral, sad, and surprise.\n\n")

        f.write("## Dataset Statistics\n\n")
        f.write(f"Total Files: {len(df)}\n\n")

        emotion_counts = df['Emotions'].value_counts()
        f.write("### Files by Emotion:\n")
        for emotion, count in emotion_counts.items():
            f.write(f"- {emotion}: {count}\n")

        f.write("\n### Directory Structure:\n")
        f.write("The dataset is organized into subdirectories by emotion. Each file is named using the convention:\n")
        f.write("`[dataset]_[originalfilename].wav`\n\n")

        f.write("## Usage\n\n")
        f.write("This dataset is suitable for speech emotion recognition tasks.\n")

    print(f"Dataset organized successfully to {target_dir}")
    print(f"Total files: {len(df)}")
    # Surface problems that would otherwise be buried in the per-file output.
    if disambiguated_count:
        print(f"Note: {disambiguated_count} files were given a numeric suffix to avoid name collisions")
    if failed_count:
        print(f"Warning: {failed_count} of {len(df)} files could not be copied; their NewPath is empty")

    return df_with_paths

In [5]:
def prepare_for_kaggle_upload(dataset_dir, username, dataset_title='speech-emotion-recognition-dataset'):
    """
    Write a ``dataset-metadata.json`` describing the dataset for a Kaggle upload.

    Only the metadata file is produced; nothing is uploaded by this function.
    ``dataset_dir`` must already exist.

    Args:
        dataset_dir: Directory containing the dataset
        username: Your Kaggle username
        dataset_title: Title for the dataset (slug); also used as the
            second half of the ``username/slug`` id

    Returns:
        Path to the metadata file that was written
    """
    dataset_id = f"{username}/{dataset_title}"
    print(f"Preparing to push dataset to Kaggle as {dataset_id}")

    # Markdown blurb stored verbatim (including its surrounding whitespace)
    # in the "description" field.
    description_text = """
# Speech Emotion Recognition Dataset

This dataset is a merged collection from CREMA, RAVDESS, SAVEE, and TESS datasets.
It contains audio files labeled with 7 emotions: angry, disgust, fear, happy, neutral, sad, and surprise.

Each emotion has its own directory, and files are named using the convention: dataset_originalfilename.wav
        """

    # Key order is kept as title, id, licenses, description so the emitted
    # JSON is laid out the same way every time.
    metadata = {}
    metadata["title"] = dataset_title
    metadata["id"] = dataset_id
    metadata["licenses"] = [{"name": "CC0-1.0"}]
    metadata["description"] = description_text

    # Serialize first, then write the finished text in one go.
    metadata_path = os.path.join(dataset_dir, 'dataset-metadata.json')
    serialized = json.dumps(metadata, indent=2)
    with open(metadata_path, 'w') as handle:
        handle.write(serialized)

    print(f"Created metadata file at {metadata_path}")

    return metadata_path

In [6]:
# Process and organize the dataset
# Copies every file listed in df into per-emotion sub-folders of target_dir.
# organized_df is a copy of df with an added NewPath column; target_dir is
# reused by the ZIP cell further down.
target_dir = '/kaggle/working/merged_ser_dataset'
organized_df = organize_ser_dataset(df, target_dir)

Moving and renaming files to /kaggle/working/merged_ser_dataset...


  0%|          | 0/12162 [00:00<?, ?it/s]

Dataset organized successfully to /kaggle/working/merged_ser_dataset
Total files: 12162


In [7]:
import zipfile

def create_zip_file(source_dir, zip_path=None):
    """
    Create a ZIP file from the organized dataset directory.

    Entries are stored relative to the parent of ``source_dir``, so the
    archive contains a single top-level folder named after ``source_dir``.
    ``source_dir`` is normalized first, so a trailing path separator does not
    change the archive location or the entry names. Directories and files are
    visited in sorted order, and the archive itself is never added to itself
    (relevant when ``zip_path`` points inside ``source_dir``).

    Args:
        source_dir: Directory containing the organized dataset
        zip_path: Path where the ZIP file will be saved (default: source_dir + '.zip')

    Returns:
        Path to the created ZIP file
    """
    # Without normalization, "data/" would yield the archive "data/.zip"
    # inside the folder being archived, and dirname() would return the folder
    # itself, dropping the top-level folder from the entry names.
    source_dir = os.path.normpath(source_dir)
    if zip_path is None:
        zip_path = source_dir + '.zip'

    print(f"Creating ZIP file at {zip_path}...")

    archive_abspath = os.path.abspath(zip_path)
    archive_root = os.path.dirname(source_dir)

    with zipfile.ZipFile(zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        # Walk through all files and subdirectories
        for root, dirs, files in os.walk(source_dir):
            dirs.sort()  # in-place sort makes os.walk descend in a stable order
            for file in tqdm(sorted(files), desc=f"Adding files from {os.path.basename(root)}"):
                file_path = os.path.join(root, file)
                # The archive is created before the walk starts, so it shows
                # up in the listing when it lives under source_dir; skip it.
                if os.path.abspath(file_path) == archive_abspath:
                    continue
                # Path relative to the parent of source_dir; this preserves
                # the directory structure in the ZIP.
                rel_path = os.path.relpath(file_path, start=archive_root)
                zipf.write(file_path, rel_path)

    zip_size_mb = os.path.getsize(zip_path) / (1024 * 1024)
    print(f"ZIP file created successfully: {zip_path} ({zip_size_mb:.2f} MB)")

    return zip_path

# Create ZIP file
# No zip_path is passed, so the archive is written to target_dir + '.zip'.
zip_path = create_zip_file(target_dir)

Creating ZIP file at /kaggle/working/merged_ser_dataset.zip...


Adding files from merged_ser_dataset:   0%|          | 0/2 [00:00<?, ?it/s]

Adding files from happy:   0%|          | 0/1923 [00:00<?, ?it/s]

Adding files from sad:   0%|          | 0/1923 [00:00<?, ?it/s]

Adding files from neutral:   0%|          | 0/1895 [00:00<?, ?it/s]

Adding files from angry:   0%|          | 0/1923 [00:00<?, ?it/s]

Adding files from fear:   0%|          | 0/1923 [00:00<?, ?it/s]

Adding files from surprise:   0%|          | 0/652 [00:00<?, ?it/s]

Adding files from disgust:   0%|          | 0/1923 [00:00<?, ?it/s]

ZIP file created successfully: /kaggle/working/merged_ser_dataset.zip (985.64 MB)
