In [1]:
# Mount Google Drive at /content/drive so files under it are reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Create the project folder on Drive if it is missing, then make it the working
# directory so the relative paths used in later cells resolve inside it.
!mkdir -p /content/drive/MyDrive/EE641/Project/
%cd /content/drive/MyDrive/EE641/Project/

/content/drive/MyDrive/EE641/Project


In [3]:
!pip install pydub
!pip install librosa



In [4]:
import os
import random
import librosa
import numpy as np
import soundfile as sf
from pydub import AudioSegment
import pandas as pd

In [5]:
# Parameters
num_clips = 5000  # Total number of composite clips to generate
clip_duration = 60  # Length of each composite clip, in seconds
sample_rate = 22050  # Rate (Hz) the source sounds are loaded at and the clips are written with
sounds_per_clip = 30  # Number of source sounds mixed into each clip (classes may repeat)
overlap_range = (0.1, 0.5)  # Bounds of the random fraction trimmed off the end of each source sound
output_dir = "UrbanSound8K_Composite"  # Output directory for the dataset
random_seed = 641  # Seed for the `random` module so a re-run regenerates the same dataset

# Seed the stdlib RNG used for class/file selection and sound placement
random.seed(random_seed)

# Load the UrbanSound8K metadata table (read below via the "class", "fold"
# and "slice_file_name" columns)
data = pd.read_csv("UrbanSound8K/metadata/UrbanSound8K.csv")

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Function to get file paths for a specific class
def get_file_paths(class_name):
    """Return the audio file paths of every metadata row labelled ``class_name``.

    Reads the module-level ``data`` DataFrame and uses its "class", "fold" and
    "slice_file_name" columns.

    Args:
        class_name: Value to match against the "class" column.

    Returns:
        list[str]: One path per matching row, in table order, of the form
        ``UrbanSound8K/audio/fold<fold>/<slice_file_name>``. Empty when no row
        matches.
    """
    # Select only the two needed columns and iterate them column-wise; this
    # avoids iterrows(), which materialises a full Series for every row.
    rows = data.loc[data["class"] == class_name, ["fold", "slice_file_name"]]
    return [
        os.path.join("UrbanSound8K/audio", f"fold{fold}", file_name)
        for fold, file_name in zip(rows["fold"], rows["slice_file_name"])
    ]

# Function to mix sounds and record metadata
def mix_sounds_with_labels(sound_files, overlap_range, duration, sample_rate):
    """Overlay several sound files on one silent canvas and label each placement.

    Every file is loaded at ``sample_rate``; a fraction drawn uniformly from
    ``overlap_range`` is trimmed off its end and the remaining samples are
    added into the canvas. The first sound starts at sample 0, the last sound
    is placed so that it ends on the final sample, and every other sound gets
    a random start at which its kept part fits entirely inside the canvas.

    The class of each sound is looked up in the module-level ``data``
    DataFrame by matching the file's base name against "slice_file_name".

    Args:
        sound_files: Sequence of audio file paths to mix, in placement order.
        overlap_range: ``(low, high)`` bounds of the trimmed fraction.
        duration: Length of the canvas in seconds.
        sample_rate: Sample rate used for loading and for time conversion.

    Returns:
        tuple: ``(mixed_audio, labels)`` where ``mixed_audio`` is the
        normalised 1-D array of ``int(duration * sample_rate)`` samples and
        ``labels`` is a list of dicts with keys "class", "start_time" and
        "end_time" (times in seconds), one per successfully placed sound.

    A file that cannot be loaded or labelled is reported on stdout and
    skipped; it contributes neither audio nor a label.
    """
    total_samples = int(duration * sample_rate)  # int() also accepts a fractional duration
    mixed_audio = np.zeros(total_samples)
    labels = []  # To store metadata for each sound
    last_idx = len(sound_files) - 1

    for idx, file in enumerate(sound_files):
        try:
            y, sr = librosa.load(file, sr=sample_rate)

            # Resolve the label before touching the canvas, so a failed
            # lookup cannot leave unlabelled audio in the mix.
            class_name = data[data["slice_file_name"] == os.path.basename(file)]["class"].values[0]

            # Samples kept after trimming, clamped so the segment always
            # fits the canvas (a sound longer than the clip is cut to fit).
            overlap = random.uniform(*overlap_range)
            kept = max(0, min(int(len(y) * (1 - overlap)), total_samples))
            latest_start = total_samples - kept

            if idx == 0:
                # Force the first clip to start at 0
                start_sample = 0
            elif idx == last_idx:
                # Force the last clip to end exactly on the final sample
                start_sample = latest_start
            else:
                # Random start point for other clips
                start_sample = random.randint(0, latest_start)

            end_sample = start_sample + kept
            start_time = start_sample / sample_rate
            end_time = end_sample / sample_rate

            # Mix audio
            mixed_audio[start_sample:end_sample] += y[:kept]

            labels.append({"class": class_name, "start_time": start_time, "end_time": end_time})

        except Exception as e:
            # Best effort: one bad file must not abort the whole clip
            print(f"Error with file {file}: {e}")

    # Normalize mixed audio
    mixed_audio = librosa.util.normalize(mixed_audio)
    return mixed_audio, labels

# Generate the dataset
metadata = []  # One row per placed sound: clip_id, class, start_time, end_time

# The class list and the per-class file lists never change between clips, so
# build them once instead of re-filtering the metadata for every selection.
class_names = list(data["class"].unique())
paths_by_class = {name: get_file_paths(name) for name in class_names}

try:
    for clip_id in range(1, num_clips + 1):
        # Select `sounds_per_clip` random sound files: pick a class uniformly,
        # then a file from that class (classes can be reselected)
        selected_files = [
            random.choice(paths_by_class[random.choice(class_names)])
            for _ in range(sounds_per_clip)
        ]

        # Mix sounds and get labels
        composite_audio, labels = mix_sounds_with_labels(selected_files, overlap_range, clip_duration, sample_rate)

        # Save the mixed audio to a file
        output_path = os.path.join(output_dir, f"composite_clip_{clip_id}.wav")
        sf.write(output_path, composite_audio, sample_rate)

        # Record metadata
        metadata.extend(
            {
                "clip_id": clip_id,
                "class": label["class"],
                "start_time": label["start_time"],
                "end_time": label["end_time"],
            }
            for label in labels
        )

        if clip_id % 100 == 0:
            print(f"Generated and saved {clip_id}/{num_clips} clips.")
finally:
    # Save metadata to a CSV file. Done in `finally` so that an interrupted or
    # failed run still leaves labels for the clips already written to disk.
    metadata_df = pd.DataFrame(metadata, columns=["clip_id", "class", "start_time", "end_time"])
    metadata_df.to_csv(os.path.join(output_dir, "metadata.csv"), index=False)

print(f"Dataset generation complete. Audio files and metadata saved to {output_dir}")


Generated and saved 100/5000 clips.
Generated and saved 200/5000 clips.
Generated and saved 300/5000 clips.
Generated and saved 400/5000 clips.
Generated and saved 500/5000 clips.
Generated and saved 600/5000 clips.
Generated and saved 700/5000 clips.
Generated and saved 800/5000 clips.
Generated and saved 900/5000 clips.
Generated and saved 1000/5000 clips.
Generated and saved 1100/5000 clips.
Generated and saved 1200/5000 clips.
Generated and saved 1300/5000 clips.
Generated and saved 1400/5000 clips.
Generated and saved 1500/5000 clips.
Generated and saved 1600/5000 clips.
Generated and saved 1700/5000 clips.
Generated and saved 1800/5000 clips.
Generated and saved 1900/5000 clips.
Generated and saved 2000/5000 clips.
Generated and saved 2100/5000 clips.
Generated and saved 2200/5000 clips.
Generated and saved 2300/5000 clips.
Generated and saved 2400/5000 clips.
Generated and saved 2500/5000 clips.
Generated and saved 2600/5000 clips.
Generated and saved 2700/5000 clips.
Generated 