# 02 Librosa Audio Analysis

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the base track table (one row per track, including its `preview_url`)
# from the processed-data directory, addressed relative to this notebook.
df = pd.read_parquet("../data/processed/base_tracks.parquet")

In [10]:
# Display the loaded frame as the cell output for a quick visual check.
df

Unnamed: 0,apple_track_id,apple_artist_id,track_name,artist_name,genre,country,release_date,track_time_ms,preview_url,advisory_rating,track_explicitness,apple_collection_id,collection_name
0,298321904,46087,Music,Erick Sermon,Hip-Hop/Rap,USA,2001-08-27T12:00:00Z,223133,https://audio-ssl.itunes.apple.com/itunes-asse...,Explicit,explicit,298321651,Music
1,80815173,20044,Music,Madonna,Pop,USA,2000-08-21T07:00:00Z,225973,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,80815197,Music
2,169003415,486597,Don't Stop Believin' (2024 Remaster),Journey,Rock,USA,1981-06-03T07:00:00Z,250835,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,169003304,Greatest Hits (2024 Remaster)
3,277635828,156987,I'm Yours,Jason Mraz,Pop,USA,2008-02-12T08:00:00Z,242947,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,277635758,We Sing. We Dance. We Steal Things
4,298429596,46087,Music,Erick Sermon,Hip-Hop/Rap,USA,2001-05-29T07:00:00Z,223133,https://audio-ssl.itunes.apple.com/itunes-asse...,Clean,cleaned,298429528,Music
...,...,...,...,...,...,...,...,...,...,...,...,...,...
8736,261553091,5192163,So Damn Lucky,Dave Matthews & Tim Reynolds,Rock,USA,2007-08-14T12:00:00Z,411640,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,261552981,Live At Radio City (Bonus Track Version)
8737,416155943,214135551,Faceless,Red,Rock,USA,2011-02-01T12:00:00Z,203520,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,416155893,Until We Have Faces
8738,198017838,121582,"That Lady, Pts. 1 & 2",The Isley Brothers,R&B/Soul,USA,1973-07-14T07:00:00Z,334387,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,198017385,The Essential Isley Brothers
8739,283379767,91157642,Sweet Victory,David Eisley & Bob Kulick,Children's Music,USA,2005-11-15T12:00:00Z,126653,https://audio-ssl.itunes.apple.com/itunes-asse...,,notExplicit,283379708,Spongebob Squarepants - The Yellow Album


In [3]:
# List the available columns; `preview_url` is the one the audio analysis below reads.
df.columns

Index(['apple_track_id', 'apple_artist_id', 'track_name', 'artist_name',
       'genre', 'country', 'release_date', 'track_time_ms', 'preview_url',
       'advisory_rating', 'track_explicitness', 'apple_collection_id',
       'collection_name'],
      dtype='object')

In [4]:
import requests
import tempfile
import librosa
import os
import warnings
from tqdm import tqdm


# Register tqdm's pandas integration so Series/DataFrame objects gain
# `progress_apply` (used below to show a progress bar during extraction).
tqdm.pandas()
# Silence every warning for the rest of the kernel session.
# NOTE(review): this blanket filter also hides unrelated warnings from
# pandas/numpy — consider narrowing it to the specific categories that are noisy.
warnings.filterwarnings('ignore')

# HTTP headers sent with every preview download; the User-Agent string
# imitates a desktop Chrome browser.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36'
}

def extract_audio_features_final(preview_url):
    """Download one audio preview and summarise it as five scalar features.

    The preview is fetched over HTTP, written to a temporary ``.m4a`` file,
    decoded with librosa (at most the first 30 seconds) and reduced to a few
    averaged descriptors. The temporary file is always removed before the
    function returns, whichever path is taken.

    Parameters
    ----------
    preview_url : str or missing value
        URL of the preview clip. Anything ``pd.isna`` reports as missing
        (``None``, ``NaN``, ...) is skipped without a network request.

    Returns
    -------
    dict or None
        On success, a dict of Python floats with the keys:

        * ``bpm`` - tempo estimate from ``librosa.beat.beat_track``
        * ``energy`` - mean of ``librosa.feature.rms``
        * ``brightness`` - mean of ``librosa.feature.spectral_centroid``
        * ``noisiness`` - mean of ``librosa.feature.zero_crossing_rate``
        * ``rhythm_strength`` - mean of ``librosa.onset.onset_strength``

        ``None`` when the URL is missing, the response is not HTTP 200, the
        response body is empty, or any exception occurs while downloading,
        decoding or analysing (deliberate best-effort: a single bad track must
        not abort a batch run).
    """
    if pd.isna(preview_url):
        return None

    temp_path = None
    try:
        response = requests.get(preview_url, headers=HEADERS, timeout=10)
        # Reject failed or empty downloads before creating any file on disk.
        if response.status_code != 200 or not response.content:
            return None

        # librosa.load is given a filesystem path, so the payload is spooled
        # to a named temp file; delete=False keeps it alive after the `with`.
        with tempfile.NamedTemporaryFile(suffix=".m4a", delete=False) as temp_file:
            temp_path = temp_file.name
            temp_file.write(response.content)

        y, sr = librosa.load(temp_path, duration=30)

        tempo, _ = librosa.beat.beat_track(y=y, sr=sr)
        # beat_track may hand back the tempo as an ndarray rather than a
        # scalar; unwrap it, falling back to 0 for an empty array.
        if isinstance(tempo, np.ndarray):
            tempo = tempo.item() if tempo.size > 0 else 0

        rms = np.mean(librosa.feature.rms(y=y))
        spec_cent = np.mean(librosa.feature.spectral_centroid(y=y, sr=sr))
        zcr = np.mean(librosa.feature.zero_crossing_rate(y))
        onset_env = librosa.onset.onset_strength(y=y, sr=sr)
        rhythm_strength = np.mean(onset_env)

        return {
            "bpm": float(tempo),
            "energy": float(rms),
            "brightness": float(spec_cent),
            "noisiness": float(zcr),
            "rhythm_strength": float(rhythm_strength)
        }

    except Exception:
        # Best-effort by design: any network/decoding/analysis failure marks
        # this track as "no features" instead of stopping the whole run.
        return None

    finally:
        # Single cleanup point covering success, early returns and errors, so
        # the temp file can never be left behind.
        if temp_path and os.path.exists(temp_path):
            try:
                os.remove(temp_path)
            except OSError:
                # A failed delete must not discard features already computed.
                pass

In [5]:
print(f"ðŸš€ Starting {len(df)} Song Analysis...")

# Download and analyse every preview. Each element of the result is either a
# feature dict or None when extraction failed for that track.
audio_data = df['preview_url'].progress_apply(extract_audio_features_final)

expected_cols = ["bpm", "energy", "brightness", "noisiness", "rhythm_strength"]

# pd.DataFrame cannot build from a list that mixes dicts with None: a leading
# None collapses everything into one object column, and a later None raises
# AttributeError. Map failures to empty records and pin the column set so each
# failed track becomes an all-NaN row and the columns always exist.
feature_records = [
    features if isinstance(features, dict) else {}
    for features in audio_data
]
df_audio_features = pd.DataFrame(feature_records, columns=expected_cols)

# Rows are aligned by position, so the source index is reset to 0..n-1 first.
df_complete = pd.concat([df.reset_index(drop=True), df_audio_features], axis=1)

# A track counts as successful when a tempo estimate is present.
df_clean = df_complete.dropna(subset=['bpm'])

# Guard the ratio so an empty input frame does not raise ZeroDivisionError.
success_rate = len(df_clean) / len(df) if len(df) else 0.0

print("-" * 50)
print(f"ðŸŽ‰ðŸŽ‰ðŸŽ‰ Success!")
print(f"âœ… Total {len(df)} | Success {len(df_clean)}")
print(f"âœ… Success Rate : {success_rate}")
print("-" * 50)

ðŸš€ Starting 8741 Song Analysis...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 8741/8741 [52:44<00:00,  2.76it/s]  

--------------------------------------------------
ðŸŽ‰ðŸŽ‰ðŸŽ‰ Success!
âœ… Total 8741 | Success 8741
âœ… Success Rate : 1.0
--------------------------------------------------





In [8]:
# Names of the extracted feature columns to keep.
audio_feature_cols = ["bpm", "energy", "brightness", "noisiness", "rhythm_strength"]

# Keep only the track key, its preview URL and the features, then drop every
# track that has no tempo estimate.
df_audio_only = (
    df_complete[["apple_track_id", "preview_url", *audio_feature_cols]]
    .copy()
    .dropna(subset=["bpm"])
)

In [9]:
# Display the feature-only frame as the cell output before it is written to disk.
df_audio_only

Unnamed: 0,apple_track_id,preview_url,bpm,energy,brightness,noisiness,rhythm_strength
0,298321904,https://audio-ssl.itunes.apple.com/itunes-asse...,99.384014,0.164533,1887.936273,0.080000,2.063508
1,80815173,https://audio-ssl.itunes.apple.com/itunes-asse...,117.453835,0.181498,2826.828547,0.118955,2.087084
2,169003415,https://audio-ssl.itunes.apple.com/itunes-asse...,117.453835,0.093140,2063.243547,0.109971,1.150188
3,277635828,https://audio-ssl.itunes.apple.com/itunes-asse...,151.999081,0.172077,1737.766877,0.061636,1.706438
4,298429596,https://audio-ssl.itunes.apple.com/itunes-asse...,99.384014,0.167033,1868.193043,0.078049,2.053480
...,...,...,...,...,...,...,...
8736,261553091,https://audio-ssl.itunes.apple.com/itunes-asse...,135.999178,0.260211,2186.494979,0.085391,1.209752
8737,416155943,https://audio-ssl.itunes.apple.com/itunes-asse...,112.347147,0.338366,2701.542131,0.128145,1.002540
8738,198017838,https://audio-ssl.itunes.apple.com/itunes-asse...,123.046875,0.165641,2207.091845,0.090233,1.471007
8739,283379767,https://audio-ssl.itunes.apple.com/itunes-asse...,123.046875,0.207391,3029.900644,0.147286,1.021104


In [11]:
# Write the feature table next to the input data; the pandas index is not stored.
audio_features_path = "../data/processed/audio_features.parquet"
df_audio_only.to_parquet(audio_features_path, index=False)
print("ðŸ’¾ Saved as audio_features.parquet")

ðŸ’¾ Saved as audio_features.parquet
