# Generate training data

This notebook takes a YouTube video as input, downloads the audio, and splits it into multiple 5-second audio clips.

Each clip is then moved to the relevant directory (`music` or `not-music`) depending on whether its start time falls within one of the `music_sections` time ranges defined in the SETTINGS section below.

## Imports

In [None]:
from pathlib import Path

## SETTINGS (adjust as required)

Data Directories

In [None]:
# Root of all data folders; '..' steps up one level to the project root.
DATA_DIR = Path("..", "data")

# The downloaded source audio goes under raw/, everything cut from it under clips/.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CLIPS_DATA_DIR = DATA_DIR.joinpath("clips")

# Freshly split segments are staged here until they have been labelled.
STAGING_DIR = CLIPS_DATA_DIR.joinpath("segments")

# Final destinations, one folder per label.
MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("music")
NOT_MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("not-music")

# === Create the folders if they don't exist ===
for _required_dir in (RAW_DATA_DIR, STAGING_DIR, MUSIC_CLIPS_DIR, NOT_MUSIC_CLIPS_DIR):
    _required_dir.mkdir(parents=True, exist_ok=True)

Output Audio filepath

In [None]:
# Destination file for the downloaded audio track.
OUTPUT_AUDIO = RAW_DATA_DIR.joinpath("training_audio.m4a")

YouTube video URL

In [None]:
# Address of the YouTube video whose audio is downloaded by the yt-dlp cell.
URL = "https://www.youtube.com/watch?v=VpAegQyncmQ"

Clip size

In [None]:
# Length of each clip. It is passed to ffmpeg as the segment duration and is
# also used to turn a clip's index into its start time when labelling.
CLIP_SIZE = 5  # seconds

Specify the time ranges during which music is playing. Start and end times should be multiples of `CLIP_SIZE` seconds, because each clip is labelled according to its start time.

In [None]:
# Time ranges of the video that contain music, as (start, end) pairs of "mm:ss"
# strings. The minutes field is not capped at 59 (e.g. "76:30" means 76 minutes
# and 30 seconds into the video). When clips are labelled, a clip counts as
# music if its start time t satisfies start <= t < end.
music_sections = [
    # start, end
    ("03:05", "12:05"),
    ("29:30", "33:30"),
    ("42:45", "46:05"),
    ("76:30", "79:55"),
]


# === Compute music_sections_processed ===
# Convert the timestamp strings to raw seconds
def get_sec(min_sec_str: str) -> int:
    """Convert a colon-separated timestamp into a total number of seconds.

    Accepts "mm:ss" (the minutes field may exceed 59, e.g. "76:30") and
    "hh:mm:ss". Surrounding whitespace is ignored.

    Args:
        min_sec_str: Timestamp such as "03:05" or "1:16:30".

    Returns:
        The timestamp expressed in whole seconds.

    Raises:
        ValueError: If the string does not have two or three components, or
            if any component is not an integer.
    """
    fields = min_sec_str.strip().split(":")
    if len(fields) not in (2, 3):
        raise ValueError(
            f"Expected a timestamp like 'mm:ss' or 'hh:mm:ss', got {min_sec_str!r}"
        )

    # Fold left to right in base 60, so every component is counted rather
    # than anything after the second one being silently dropped.
    total_seconds = 0
    for field in fields:
        total_seconds = total_seconds * 60 + int(field)
    return total_seconds


# The same ranges as `music_sections`, with both ends converted to integer seconds.
music_sections_processed = [
    (get_sec(begin), get_sec(finish))
    for (begin, finish) in music_sections
]
music_sections_processed

## Download Audio for YouTube Video

Download audio

In [None]:
!yt-dlp --force-overwrites -f "bestaudio[ext=m4a]/bestaudio" -o "{OUTPUT_AUDIO}" {URL}

## Split Audio File into 5 Second Clips

Split the downloaded audio into clips of `CLIP_SIZE` seconds each (5 seconds by default)

In [None]:
# Cut OUTPUT_AUDIO into consecutive segments of CLIP_SIZE seconds, written to
# STAGING_DIR as clip_<NNN>.m4a (index zero-padded to at least three digits).
# "-c copy" copies the audio stream instead of re-encoding it, and
# "-loglevel error" restricts console output to errors.
# NOTE(review): the labelling step computes each clip's start time as
# index * CLIP_SIZE, which presumes numbering starts at 0 and that every
# segment is exactly CLIP_SIZE long — confirm for stream-copied audio.
!ffmpeg -loglevel error -i "{OUTPUT_AUDIO}" -f segment -segment_time {CLIP_SIZE} -c copy "{STAGING_DIR}/clip_%03d.m4a"

## Move Clips to Relevant Data Folder

In [None]:
# Collect every staged segment written by the splitting step.
files = list(STAGING_DIR.glob('clip_*.m4a'))

moved_count = 0

for file_path in files:
    try:
        # The clip index is the text after the underscore in the stem:
        # "clip_005.m4a" -> "clip_005" -> "005" -> 5
        file_num = int(file_path.stem.split('_')[1])

        # Index times clip length gives the clip's start offset in seconds.
        clip_time = file_num * CLIP_SIZE

        # Music if that offset lies inside at least one [start, end) range.
        is_music = any(start <= clip_time < end for start, end in music_sections_processed)

        # Pick the folder matching the label.
        if is_music:
            target_dir = MUSIC_CLIPS_DIR
        else:
            target_dir = NOT_MUSIC_CLIPS_DIR
        dest_path = target_dir / file_path.name

        # Move the clip out of staging (an existing file of that name is replaced).
        file_path.replace(dest_path)
    except (ValueError, IndexError) as e:
        print(f"Skipping {file_path.name}: Could not parse number. Error: {e}")
    else:
        # Only count clips that made it through every step above.
        moved_count += 1

print(f"Done! Moved {moved_count} files into {CLIPS_DATA_DIR} sub-directories.")


Remove the staging directory (only if it's empty)

In [None]:
# Remove the staging folder once all clips have been moved out of it.
# Path.rmdir() only deletes empty directories, so leftover files are never lost.
try:
    STAGING_DIR.rmdir()
    print(f"Cleaned up: {STAGING_DIR} has been removed.")
except FileNotFoundError:
    # Already gone (e.g. this cell was run twice) — not the same as "not empty".
    print(f"Note: {STAGING_DIR} does not exist; nothing to remove.")
except OSError as err:
    # Usually "directory not empty", but report the real reason rather than
    # guessing (it could also be a permissions problem).
    print(f"Note: {STAGING_DIR} was not removed: {err}")
