# Predict Song Blocks Using Classifier

This notebook predicts song blocks using the trained model. It downloads the audio from a specified YouTube video and runs the analysis on that audio file.

Inputs:
- YouTube video URL
- Model (trained in notebook 02)

Outputs:
- Music Timeline Plot (PNG)
- Music Block summary (CSV)

## Imports

In [None]:
from pathlib import Path
import subprocess
import pandas as pd
import numpy as np
import joblib
import librosa

## SETTINGS

Model directory and filename (copied from notebook 02)

In [None]:
# Where the trained classifier lives; the leading ".." climbs one level to the project root.
MODELS_DIR = Path("..", "models")
MODEL_FILENAME = "music_classifier.pkl"

# Make sure the model folder is present (no error if it already exists).
MODELS_DIR.mkdir(exist_ok=True, parents=True)

Data Directories (copied from notebook 01)

In [None]:
# Root of the data tree; the leading ".." climbs one level to the project root.
DATA_DIR = Path("..", "data")

# First-level sub-folders of the data tree.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CLIPS_DATA_DIR = DATA_DIR.joinpath("clips")

# Sub-folders of the clips directory.
STAGING_DIR = CLIPS_DATA_DIR.joinpath("segments")
MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("music")
NOT_MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("not-music")

# Create every leaf folder (and any missing parents); existing folders are left alone.
for _folder in (RAW_DATA_DIR, STAGING_DIR, MUSIC_CLIPS_DIR, NOT_MUSIC_CLIPS_DIR):
    _folder.mkdir(exist_ok=True, parents=True)

Data Results Directory

In [None]:
# Folder that receives this notebook's outputs (timeline plot and block summary).
RESULTS_DIR = DATA_DIR.joinpath("results")

# Make sure the folder is present (no error if it already exists).
RESULTS_DIR.mkdir(exist_ok=True, parents=True)

Output Audio filepath

In [None]:
# Destination file for the downloaded audio track; also the input to the analysis.
OUTPUT_AUDIO = RAW_DATA_DIR.joinpath("output_audio.m4a")

YouTube video URL

In [None]:
URL = "https://www.youtube.com/watch?v=GA6knmK1UKs"  # YouTube video whose audio track is downloaded and analysed

Clip size

In [None]:
CLIP_SIZE = 5  # Length of each analysis window in seconds - use the same value as in notebook 01

## Download Audio for YouTube Video

Download audio

In [None]:
!yt-dlp -q --force-overwrites -f "bestaudio[ext=m4a]/bestaudio" -o "{OUTPUT_AUDIO}" {URL}

## Make Predictions Using Model

Generate dataframe of results

In [None]:
# Load the classifier trained in notebook 02. joblib files are pickle-based and can execute
# arbitrary code when loaded, so only load model files from a trusted source.
model = joblib.load(MODELS_DIR / MODEL_FILENAME)


def get_duration(file_path):
    """Return the duration of ``file_path`` in seconds, as reported by ffprobe.

    ffprobe is asked to print only the container's ``duration`` field, with no
    key name or section wrapper, so its output parses directly as a float.

    Raises:
        subprocess.CalledProcessError: if ffprobe exits with a non-zero status.
        ValueError: if ffprobe's output is not a number.
    """
    probe_args = ['ffprobe']
    probe_args += ['-v', 'error']                                  # print errors only
    probe_args += ['-show_entries', 'format=duration']             # just the duration field
    probe_args += ['-of', 'default=noprint_wrappers=1:nokey=1']    # bare value, no labels
    probe_args.append(file_path)

    raw_output = subprocess.check_output(probe_args)
    return float(raw_output)


def predict_music_timeline(file_path, window_sec, sr=22050):
    """Classify consecutive fixed-length windows of an audio file as music / not-music.

    ffmpeg decodes the file to mono float32 PCM at ``sr`` Hz and streams it through
    a pipe, so only one window is held in memory at a time. Each window is reduced
    to 27 features (13 MFCC means, 13 MFCC standard deviations, mean spectral
    centroid) and passed to the module-level ``model``, which must provide
    ``predict`` and ``predict_proba``. A predicted class of 1 is reported as
    "music"; anything else as "not-music".

    Args:
        file_path: Audio file to analyse (``str`` or ``Path``).
        window_sec: Window length in seconds.
        sr: Sample rate in Hz that ffmpeg resamples the audio to.

    Returns:
        pandas.DataFrame with one row per complete window and the columns
        ``start_sec``, ``end_sec``, ``label`` and ``confidence`` (highest class
        probability, rounded to 4 decimals). A trailing partial window is
        dropped. If no complete window could be read, the frame is empty but
        still has these columns.
    """
    columns = ["start_sec", "end_sec", "label", "confidence"]

    # Only used to scale the progress display.
    total_duration = get_duration(file_path)
    total_chunks = int(total_duration // window_sec)

    # Whole samples per window, 4 bytes each (float32). Rounding to whole samples
    # first keeps the byte count an integer that is aligned to the sample size,
    # even when window_sec is a float.
    samples_per_chunk = int(window_sec * sr)
    bytes_per_chunk = samples_per_chunk * 4
    results = []

    # Use FFmpeg to pipe RAW PCM data to Python
    command = [
        'ffmpeg', '-i', str(file_path),
        '-f', 'f32le', '-acodec', 'pcm_f32le',
        '-ar', str(sr), '-ac', '1', '-'
    ]

    # The context manager closes the pipe and waits for ffmpeg on exit, so no
    # zombie process or open file handle is left behind, even if feature
    # extraction or prediction raises part-way through.
    with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL) as process:
        try:
            chunk_idx = 0
            while True:
                raw_bytes = process.stdout.read(bytes_per_chunk)
                # Empty read = end of stream; short read = trailing partial window.
                if not raw_bytes or len(raw_bytes) < bytes_per_chunk:
                    break

                # Convert bytes to numpy
                y_block = np.frombuffer(raw_bytes, dtype=np.float32)

                # --- FEATURE EXTRACTION ---
                mfccs = librosa.feature.mfcc(y=y_block, sr=sr, n_mfcc=13)
                mfccs_mean = np.mean(mfccs.T, axis=0)
                mfccs_std = np.std(mfccs.T, axis=0)
                centroid = librosa.feature.spectral_centroid(y=y_block, sr=sr)
                centroid_mean = np.mean(centroid)

                # Single-row feature matrix in the layout the model is called with.
                features = np.hstack([mfccs_mean, mfccs_std, centroid_mean]).reshape(1, -1)

                # --- PREDICTION ---
                pred_idx = model.predict(features)[0]
                prob = np.max(model.predict_proba(features))

                start_time = chunk_idx * window_sec
                results.append({
                    "start_sec": start_time,
                    "end_sec": start_time + window_sec,
                    "label": "music" if pred_idx == 1 else "not-music",
                    "confidence": round(prob, 4)
                })

                chunk_idx += 1

                # --- DISPLAY PERCENTAGE UPDATE ---
                # Guard against a zero chunk estimate (reported duration shorter
                # than one window) to avoid dividing by zero.
                if chunk_idx % 20 == 0 and total_chunks > 0:
                    percent = int((chunk_idx / total_chunks) * 100)
                    # Limits display to 100% and prints on one line
                    print(f"Progress: {min(100, percent):>3}% complete", end="\r")
        finally:
            # Stop ffmpeg if it is still running (e.g. after an error above).
            if process.poll() is None:
                process.terminate()

    # Passing the column list keeps the schema stable when results is empty.
    return pd.DataFrame(results, columns=columns)

# Classify the downloaded audio in consecutive CLIP_SIZE-second windows.
file_path = OUTPUT_AUDIO
df = predict_music_timeline(file_path=file_path, window_sec=CLIP_SIZE)

# The leading newline moves past the in-place progress line, which is printed with end="\r".
print("\nHere are first few rows..")
# Last expression in the cell, so the notebook displays it.
df.head()


Show predictions on a graph

In [None]:
import matplotlib.pyplot as plt

# Encode the labels numerically for the y-axis: music -> 1, not-music -> 0.
# Note: this adds a 'label_num' column to df itself.
df['label_num'] = df['label'].map({'music': 1, 'not-music': 0})

# Window start times, converted from seconds to minutes for the x-axis.
minutes = df['start_sec'] / 60

# Wide, short canvas: the timeline is long in x and only two-valued in y.
fig, ax = plt.subplots(figsize=(15, 4))

# Step line plus a translucent fill under it; 'post' holds each value until the next window start.
ax.step(minutes, df['label_num'], where='post', color='teal', linewidth=2)
ax.fill_between(minutes, df['label_num'], step="post", alpha=0.3, color='teal')

# Axis labelling.
ax.set_yticks([0, 1])
ax.set_yticklabels(['Not-Music', 'Music'])
ax.set_xlabel('Time (Minutes)')
ax.set_ylabel('Classification')
ax.set_title('Music Detection Timeline')
ax.grid(axis='x', linestyle='--', alpha=0.7)

fig.tight_layout()

# Write the PNG before showing the figure.
plot_path = RESULTS_DIR / "music_timeline_plot.png"
fig.savefig(plot_path, dpi=300, bbox_inches='tight')

plt.show()
print(f"Plot successfully saved to: {plot_path}")

Extract song blocks from the data

In [None]:
def extract_music_blocks(df, min_duration_sec=120, max_gap_seconds=15, clip_size=None):
    """Merge per-window predictions into continuous music blocks.

    Steps:
      1. Smooth the labels with a 3-window median filter, which removes isolated
         single-window flips.
      2. Relabel short not-music gaps that lie between two music runs as music.
         A gap is bridged when it spans fewer than
         ``max_gap_seconds // clip_size`` windows, i.e. when it is shorter than
         ``max_gap_seconds``. Only the gap itself is relabelled; the outer edges
         of a block are left untouched so the reported start and end times are
         not padded.
      3. Group consecutive windows with the same label into blocks and keep the
         music blocks that last at least ``min_duration_sec``.

    Args:
        df: Per-window predictions with the columns ``start_sec``, ``end_sec``
            and ``label`` ("music" / "not-music"), in time order.
        min_duration_sec: Minimum length in seconds for a block to be reported.
        max_gap_seconds: Not-music gaps shorter than this are bridged.
        clip_size: Window length in seconds. When omitted it is taken from the
            first row of ``df`` (``end_sec - start_sec``).

    Returns:
        pandas.DataFrame indexed by ``block_id`` with the columns
        ``start_timestamp`` and ``end_timestamp`` (MM:SS strings), ``start_sec``,
        ``end_sec`` and ``duration``. Empty, with the same columns, when ``df``
        has no rows.

    Raises:
        ValueError: if the window length is not positive.
    """
    out_columns = ['start_timestamp', 'end_timestamp', 'start_sec', 'end_sec', 'duration']

    # Nothing to analyse: return an empty frame with the usual columns rather
    # than failing on missing input columns.
    if df.empty:
        return pd.DataFrame(columns=out_columns)

    if clip_size is None:
        clip_size = float(df['end_sec'].iloc[0] - df['start_sec'].iloc[0])
    if clip_size <= 0:
        raise ValueError(f"clip_size must be positive, got {clip_size}")

    # Create a copy so we don't overwrite the original dataframe
    df_clean = df.copy()

    # 1. Convert to numeric for processing
    df_clean['is_music'] = df_clean['label'].map({'music': 1, 'not-music': 0})

    # 2. SMOOTHING: Median filter - requires at least 2/3 chunks to match.
    #    The first and last rows have no full window; they keep their original value.
    df_clean['is_music'] = (
        df_clean['is_music']
        .rolling(window=3, center=True)
        .median()
        .fillna(df_clean['is_music'])
    )

    # 3. BRIDGE GAPS: relabel short not-music runs that sit between two music runs.
    gap_chunks = int(max_gap_seconds // clip_size)
    run_id = (df_clean['is_music'] != df_clean['is_music'].shift()).cumsum()
    run_len = run_id.map(run_id.value_counts())
    # Runs alternate between labels, so any not-music run that is neither the
    # first nor the last run has music on both sides.
    is_interior = (run_id != run_id.iloc[0]) & (run_id != run_id.iloc[-1])
    short_gap = (df_clean['is_music'] == 0) & is_interior & (run_len < gap_chunks)
    df_clean.loc[short_gap, 'is_music'] = 1

    # 4. Identify song blocks using cumsum logic: the id increases at every label change
    df_clean['block_id'] = (df_clean['is_music'] != df_clean['is_music'].shift()).cumsum()

    # 5. Group and Aggregate
    blocks = df_clean.groupby('block_id').agg({
        'is_music': 'first',
        'start_sec': 'min',
        'end_sec': 'max'
    })

    blocks['duration'] = blocks['end_sec'] - blocks['start_sec']

    # 6. Filter by music label (1) and min_duration threshold
    songs = blocks[(blocks['is_music'] == 1) & (blocks['duration'] >= min_duration_sec)].copy()

    # Formatting helper: seconds -> "MM:SS" (minutes are not wrapped at 60)
    def format_time(seconds):
        return f"{int(seconds // 60):02d}:{int(seconds % 60):02d}"

    songs['start_timestamp'] = songs['start_sec'].apply(format_time)
    songs['end_timestamp'] = songs['end_sec'].apply(format_time)

    return songs[out_columns]

# Collapse the per-window predictions into music blocks. The thresholds passed here
# match the function's defaults: blocks of at least 120 s, with a 15 s gap setting.
music_blocks = extract_music_blocks(df, min_duration_sec=120, max_gap_seconds=15)
# Last expression in the cell, so the notebook displays it.
music_blocks

Save music_blocks to the results directory as CSV (overwrites any existing file)

In [None]:
# index=True writes the block_id index as the first CSV column.
music_blocks.to_csv(RESULTS_DIR / "music_timeline.csv", index=True)
# The message reports the results directory, not the CSV file name.
print(f"Summary saved to {RESULTS_DIR}")