# Train Classifier

This notebook trains a binary classifier that predicts whether an audio clip contains music.

## Imports

In [None]:
import subprocess
from pathlib import Path

import joblib
import librosa
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

## SETTINGS

Model directory and filename

In [None]:
# Where the trained classifier is stored: <project root>/models/<MODEL_FILENAME>.
# The leading '..' steps up one level to the project root.
MODEL_FILENAME = "music_classifier.pkl"
MODELS_DIR = Path("..", "models")

# Make sure the output folder is there before anything writes into it.
MODELS_DIR.mkdir(parents=True, exist_ok=True)

Data directories (copied from notebook 01)

In [None]:
# Root of the data tree; the leading '..' steps up one level to the project root.
DATA_DIR = Path("..", "data")

# First-level folders under the data root.
RAW_DATA_DIR = DATA_DIR.joinpath("raw")
CLIPS_DATA_DIR = DATA_DIR.joinpath("clips")
RESULTS_DIR = DATA_DIR.joinpath("results")

# Clip sub-folders: unsorted segments plus one folder per class label.
STAGING_DIR = CLIPS_DATA_DIR.joinpath("segments")
MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("music")
NOT_MUSIC_CLIPS_DIR = CLIPS_DATA_DIR.joinpath("not-music")

# Create every folder that is missing. CLIPS_DATA_DIR itself is created
# implicitly as a parent of its three sub-folders (parents=True).
for _data_dir in (
    RAW_DATA_DIR,
    RESULTS_DIR,
    STAGING_DIR,
    MUSIC_CLIPS_DIR,
    NOT_MUSIC_CLIPS_DIR,
):
    _data_dir.mkdir(parents=True, exist_ok=True)

## Get Features and Target

Helper for extracting features from audio files

In [None]:
def extract_features(file_path, sr=22050):
    """Decode an audio file with FFmpeg and summarise it as a feature vector.

    The file is decoded to mono 32-bit float PCM at ``sr`` Hz, then reduced
    to per-clip statistics so every clip yields a vector of the same length.

    Parameters
    ----------
    file_path : str or os.PathLike
        Audio file to decode.
    sr : int, default 22050
        Sample rate (Hz) the audio is resampled to before feature extraction.

    Returns
    -------
    numpy.ndarray or None
        1-D array of 27 values: the per-coefficient mean of 13 MFCCs, the
        per-coefficient standard deviation of the same 13 MFCCs, and the mean
        spectral centroid. ``None`` when FFmpeg exits with an error or yields
        no samples, so callers can skip unreadable files.

    Raises
    ------
    OSError
        If the ``ffmpeg`` executable cannot be started (typically
        ``FileNotFoundError`` when it is not on ``PATH``).
    """
    # 1. Use FFmpeg to decode the file to raw PCM on stdout.
    command = [
        'ffmpeg', '-nostdin', '-i', str(file_path),
        '-f', 'f32le', '-acodec', 'pcm_f32le',
        '-ar', str(sr), '-ac', '1', '-'
    ]
    # stdin is detached so FFmpeg never reads from the kernel's input stream;
    # stderr is discarded to keep the notebook output quiet.
    result = subprocess.run(
        command,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
    )

    # A non-zero exit code means the decode failed; whatever reached stdout may
    # be truncated mid-sample, so it must not be parsed.
    if result.returncode != 0:
        return None

    # 'f32le' is little-endian by definition, so decode it as such and convert
    # to the native float32 (no copy is made on little-endian machines).
    y = np.frombuffer(result.stdout, dtype='<f4').astype(np.float32, copy=False)

    # If the file decoded to nothing, return None to skip it
    if len(y) == 0:
        return None

    # 2. Extract features. mfccs has one row per coefficient and one column per
    # frame, so reducing over axis=1 gives one statistic per coefficient.
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs, axis=1)
    mfccs_std = np.std(mfccs, axis=1)

    centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
    centroid_mean = np.mean(centroid)

    return np.hstack([mfccs_mean, mfccs_std, centroid_mean])



Build `X` and `y` from the audio clips in the `data/clips/music` and `data/clips/not-music` directories.

In [None]:
# Folder holding the clips for each class label.
class_folders = {
    'not-music': NOT_MUSIC_CLIPS_DIR,
    'music': MUSIC_CLIPS_DIR
}
# Integer target assigned to each class label.
classes = {'not-music': 0, 'music': 1}

# Accumulate into separately named lists so X and y are only ever arrays
# and re-running the cell always starts from a clean slate.
feature_rows = []
labels = []
skipped_files = []  # files extract_features could not decode

# Loop through and extract
for label_name, label_idx in classes.items():
    folder_path = class_folders[label_name]
    # glob() yields files in filesystem order, which varies between machines.
    # Sorting fixes the sample order so the seeded train/test split downstream
    # is reproducible.
    files = sorted(folder_path.glob("*.m4a"))
    total_files = len(files)

    print(f"Processing {total_files} files in '{label_name}'...")

    for i, file_path in enumerate(files, 1):
        features = extract_features(str(file_path))
        if features is not None:
            feature_rows.append(features)
            labels.append(label_idx)
        else:
            skipped_files.append(file_path)

        # Print the percentage (rounded down) every 20 files and on the last
        # one; '\r' makes each update overwrite the previous line.
        if i % 20 == 0 or i == total_files:
            percent = int((i / total_files) * 100)
            print(f"Progress: {percent:>3}% complete ({i}/{total_files})", end="\r")

    print() # Print a newline after each folder is finished

X = np.array(feature_rows)
y = np.array(labels)

# Surface files that were dropped instead of losing them silently.
if skipped_files:
    print(f"\nSkipped {len(skipped_files)} unreadable file(s):")
    for skipped_path in skipped_files:
        print(f"  {skipped_path}")

# Fail here with a clear message rather than later inside the training step.
if len(X) == 0:
    raise ValueError(
        "No features extracted: no decodable .m4a clips found in "
        f"{MUSIC_CLIPS_DIR} or {NOT_MUSIC_CLIPS_DIR}."
    )

print(f"\nSuccess! Features shape: {X.shape}")

## Train Model

1. Split data into train and test sets
2. Fit RandomForestClassifier to training set
3. Evaluate the model against the test set
4. Save the model as a `pkl` file for later use

In [None]:
# Hold-out fraction and seed, named once so the split and the forest share
# the same seed and both values are easy to tune.
TEST_SIZE = 0.2
RANDOM_STATE = 42

# 1. Split data (80% Train, 20% Test).
# stratify=y keeps the music / not-music ratio the same in both subsets, so
# the test metrics are not skewed by an unlucky draw on imbalanced data.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
)

# 2. Initialize and Train the Random Forest
model = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)
model.fit(X_train, y_train)

# 3. Evaluate
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.2%}")
print("\nClassification Report:")
# labels pins the report rows to targets 0 (not-music) and 1 (music), so the
# two target_names always line up even if one class is absent from y_test.
print(classification_report(
    y_test,
    y_pred,
    labels=[0, 1],
    target_names=['Not-Music', 'Music'],
))

Save model to models folder (overwrite)

In [None]:
# Persist the fitted classifier under the models folder.
model_path = MODELS_DIR / MODEL_FILENAME
joblib.dump(model, model_path)
print(f"Model saved as {MODEL_FILENAME}")