# SurfPerch Integration for Acoustic Reef

This notebook demonstrates how to integrate Google's SurfPerch model to generate audio embeddings for the Acoustic Reef project.

## Overview
- Load and use Google's SurfPerch model from Kaggle
- Generate embeddings from reef audio recordings
- Prepare data for downstream classification tasks


In [None]:
# End-to-end: Load embeddings/dataset, align, load RF model, predict
from src.models.reef_classifier import (
    load_embeddings_from_csv,
    load_master_dataset,
    align_embeddings_and_labels,
    load_trained_rf_model,
    predict_with_model,
)
from src.utils import config
import numpy as np
import pandas as pd

# Step 1: read the saved embeddings (feature matrix plus its source frame)
# and the master dataset that carries the labels.
X_emb, emb_df = load_embeddings_from_csv()
dataset_df = load_master_dataset()

# Step 2: line up embeddings with labels. The helper is called with its
# defaults; per the original note it tries 'clip_id' as the join key
# (defined in reef_classifier, not visible here -- confirm there).
X, y, merged = align_embeddings_and_labels(emb_df, dataset_df)
print(f"Embeddings: {X_emb.shape} | Aligned features: {X.shape} | Labels shape: {getattr(y, 'shape', None)}")

# Step 3: restore the trained RandomForest and score the aligned features.
rf_model = load_trained_rf_model()
preds, probs = predict_with_model(rf_model, X)

# Step 4: attach predictions to a copy so `merged` itself stays untouched.
results_df = merged.copy()
results_df["prediction"] = preds

if isinstance(probs, list):
    # A list of probability arrays (presumably one per output of a
    # multi-output model): regroup so each row holds a tuple with one
    # entry per array.
    results_df["proba"] = list(zip(*probs))
elif probs is not None:
    # Single probability array: keep only the row-wise maximum. Objects
    # without a `.max` method yield a column of None.
    if hasattr(probs, "max"):
        max_prob = probs.max(axis=1)
    else:
        max_prob = None
    results_df["prob_max"] = max_prob

# Notebook display: show the first ten rows.
results_df.head(10)
