# Audio Analysis

In [None]:
# --- Make the project's `backend` package importable from this notebook ---
import os
import sys

# Two search-path entries are appended, in this order: the absolute path of
# the parent of the current working directory, then a fixed local checkout.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), '..'))
for extra_path in (parent_dir, '/home/darkangel/ai-light-show/backend'):
    sys.path.append(extra_path)

from backend.config import SONGS_TEMP_DIR, SONGS_DIR

# --- Song under analysis ---
# `songs_folder` ends with a slash, so the file path is a plain concatenation.
songs_folder = "/home/darkangel/ai-light-show/songs/"
song_name = 'Queen of Kings - Alessandra'
song_file = f"{songs_folder}{song_name}.mp3"
print(f"song: {song_name} | {song_file}")

# --- Metadata object for the song (written to by a later cell) ---
from backend.song_metadata import SongMetadata

song = SongMetadata(song_name=song_name, songs_folder=songs_folder)
print(f"song metadata: {song}")


In [None]:
# Split the song into stems. The result is indexed with 'output_folder'
# below to locate the drum stem on disk.
from backend.ai.demucs_split import extract_stems

stems_folder = extract_stems(song_file)

stems_output_dir = stems_folder['output_folder']
drums_path = f"{stems_output_dir}/drums.wav"

In [None]:
# Load the isolated drum stem and peak-normalize it.
import librosa
import numpy as np

print(f"drums: {drums_path}")

# sr=None asks librosa to keep the file's own sample rate.
y, sr = librosa.load(drums_path, sr=None)

# Peak normalize. A silent (all-zero) or empty signal is left untouched:
# dividing by a zero peak would fill `y` with NaN/inf and poison every
# later plot and model input.
peak = float(np.max(np.abs(y))) if y.size else 0.0
if peak > 0:
    y = y / peak

In [None]:
!pip install matplotlib --quiet
import matplotlib.pyplot as plt
import numpy as np

# 🔍 Plot waveform
plt.figure(figsize=(14, 4))
librosa.display.waveshow(y, sr=sr, alpha=0.6)
plt.title("Waveform of drums.wav")
plt.xlabel("Time (s)")

num_points = len(y)
times = np.linspace(0, num_points / sr, num_points)
plt.ylabel("Amplitude")
plt.grid(True)
plt.tight_layout()
plt.show()

In [None]:
# Spectrogram of the drum stem on a log-frequency axis.
import librosa.display
import matplotlib.pyplot as plt

# Short-time Fourier transform -> magnitude -> dB relative to the largest magnitude.
stft_params = dict(n_fft=2048, hop_length=512)
S = librosa.stft(y, **stft_params)
magnitude = np.abs(S)
S_db = librosa.amplitude_to_db(magnitude, ref=np.max)

# Draw the figure; the colorbar is labelled in dB.
plt.figure(figsize=(14, 6))
librosa.display.specshow(
    S_db,
    sr=sr,
    x_axis='time',
    y_axis='log',
    cmap='magma',
)
plt.colorbar(format='%+2.0f dB')
plt.title('Spectrogram of drums.wav')
plt.xlabel('Time (s)')
plt.ylabel('Frequency (Hz)')
plt.tight_layout()
plt.show()

In [None]:
# Sliding-window drum-hit classification with a pretrained wav2vec2
# sequence classifier. Produces `events`: one dict per window whose top
# class probability exceeds the threshold.
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch, librosa, numpy as np

model_name = "yojul/wav2vec2-base-one-shot-hip-hop-drums-clf"
feat = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# NOTE: this reloads the stem as 16 kHz mono and overwrites the `y` / `sr`
# created by the earlier loading cell.
y, sr = librosa.load(drums_path, sr=16000, mono=True)
window, hop = int(0.1*sr), int(0.05*sr)  # 100 ms window, 50 ms hop (in samples)
events = []

# `+ 1` keeps the final full window: without it the last window is dropped
# whenever (len(y) - window) is a multiple of `hop`, and a signal exactly one
# window long yields no windows at all.
for start in range(0, len(y) - window + 1, hop):
    chunk = y[start:start+window]
    inputs = feat(chunk, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
    top = int(np.argmax(probs))
    label = model.config.id2label[top]
    conf = float(probs[top])
    if conf > 0.6:  # tune down if needed
        # Event time is the centre of the window, in seconds.
        events.append({"time": (start+window/2)/sr, "type": label, "confidence": conf})

print(events[:10])


In [None]:
# Show every detected event as the cell output. The list holds one dict per
# accepted window, so it can be long; slice it (e.g. events[:10]) for a preview.
events

In [None]:
# Save events to song metadata
# Group event times by drum type in a single pass. A dict keeps the types in
# first-seen order, so the saved structure is identical on every run;
# iterating a set of strings (as before) gives an arbitrary order.
times_by_type = {}
for e in events:
    times_by_type.setdefault(e['type'], []).append(e['time'])

unique_types = set(times_by_type)

# One entry per drum type; "time" holds the list of event times for that type.
drums_events = [
    {"type": drum_type, "time": type_times}
    for drum_type, type_times in times_by_type.items()
]

song.drums = drums_events
song.save()

In [None]:
!pip install seaborn --quiet
import numpy as np
import pandas as pd

# Convert events to a DataFrame
df = pd.DataFrame(events)
pivot_df = df.pivot_table(index="type", columns="time", values="confidence", aggfunc="max", fill_value=0)
pivot_df

# Plot the heatmap
import seaborn as sns
plt.figure(figsize=(12, 6))
sns.heatmap(pivot_df, cmap="YlOrBr", annot=True, fmt=".2f", cbar_kws={'label': 'Confidence'})
plt.title("Drum Events Heatmap")
plt.xlabel("Time (s)")
plt.ylabel("Drum Type")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Install necessary packages and clone the repository
!pip install tensorflow keras --quiet
!git clone https://github.com/aabalke33/drum-audio-classifier.git
!echo $PWD

In [None]:
# 🥁 aabalke33 Drum Detection (Keras 3 compatible)
!pip install -q tensorflow librosa matplotlib soundfile

# Clone model if needed
import os
if not os.path.exists("drum-audio-classifier"):
    !git clone https://github.com/aabalke33/drum-audio-classifier.git

import librosa, numpy as np, json, tensorflow as tf
from keras.layers import TFSMLayer

# Load model using TFSMLayer
model_path = "drum-audio-classifier/saved_model/model_20230607_02"
layer = TFSMLayer(model_path, call_endpoint="serving_default")
class_names = ['Clap', 'Closed_Hi-Hat', 'Kick', 'Open_Hi-Hat', 'Snare']

# Load audio
y, sr = librosa.load("drums.wav", sr=16000, mono=True)
window, hop = int(0.1 * sr), int(0.05 * sr)
events = []

# Detect events using sliding window + spectrogram
for start in range(0, len(y) - window, hop):
    chunk = y[start:start+window]
    mel = librosa.feature.melspectrogram(chunk, sr=sr, n_mels=128, fmax=8000)
    mel_db = librosa.power_to_db(mel, ref=np.max)
    mel_db = np.stack([mel_db]*3, axis=-1)
    mel_db = tf.image.resize(mel_db, (128, 128))
    mel_db = tf.expand_dims(mel_db, axis=0)

    pred = layer(mel_db)[0].numpy()
    label = class_names[np.argmax(pred)]
    conf = float(np.max(pred))
    if conf > 0.5:
        events.append({"time": (start + window/2) / sr, "type": label, "confidence": conf})

# Save results
with open("aabalke_events.json", "w") as f:
    json.dump(events, f, indent=2)

print(f"✅ Saved aabalke_events.json with {len(events)} events")


## Use DunnBC22/wav2vec2-base-Drum_Kit_Sounds

In [None]:
!pip install torch transformers flax jax librosa soundfile --quiet


# load model and tokenizer
from transformers import Wav2Vec2ForSequenceClassification, Wav2Vec2FeatureExtractor
import torch, librosa, numpy as np

model_name = "DunnBC22/wav2vec2-base-Drum_Kit_Sounds"
feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(model_name)
model = Wav2Vec2ForSequenceClassification.from_pretrained(model_name)

# Load drums.wav
y, sr = librosa.load(drums_path, sr=16000, mono=True)
y = y / np.max(np.abs(y))  # peak normalize

window = int(0.1 * sr)
hop = int(0.05 * sr)

events = []
for start in range(0, len(y) - window, hop):
    chunk = y[start:start+window]
    inputs = feature_extractor(chunk, sampling_rate=sr, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    probs = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
    label = model.config.id2label[np.argmax(probs)]
    confidence = float(np.max(probs))

    if confidence > 0.3:
        time_sec = (start + window/2) / sr
        events.append({"time": time_sec, "type": label, "confidence": confidence})


In [None]:
import pandas as pd

# One row per detected event, ordered by time, with a fresh 0..n-1 index.
df = (
    pd.DataFrame(events)
    .sort_values(by='time')
    .reset_index(drop=True)
)
df

In [None]:
# Sanity check: classify one fixed 2-second excerpt of the stem (44 s to 46 s)
# and print the full class distribution.
chunk_start_s, chunk_end_s = 44, 46
test_chunk = y[sr * chunk_start_s:sr * chunk_end_s]
if len(test_chunk) == 0:
    # The slice is empty when the audio is shorter than the start offset;
    # fail here with a clear message instead of inside the model.
    raise ValueError(
        f"Audio is shorter than {chunk_start_s} s; choose an earlier excerpt."
    )
inputs = feature_extractor(test_chunk, sampling_rate=sr, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits

probs = torch.softmax(logits, dim=-1).squeeze().cpu().numpy()
top_idx = int(np.argmax(probs))
label = model.config.id2label[top_idx]
confidence = float(probs[top_idx])

print(f"Predicted label: {label}, confidence: {confidence:.3f}")

# Per-class probabilities. A separate loop variable is used so `label` keeps
# the top prediction after the loop.
for idx, prob in enumerate(probs):
    class_label = model.config.id2label[idx]
    print(f"{class_label}: {prob:.3f}")