# 01 — Data Audit (Trust & Definitions)

This notebook answers one question:
**Can we trust this dataset enough to analyze what drives music popularity?**

We will:
1. Verify schema + missingness + duplicates
2. Validate ranges + unit consistency (tempo, duration, loudness, etc.)
3. Define the target variable ("popularity") and known confounds
4. Produce a modeling-ready table and a data dictionary

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Read the raw Apple Music CSV once; later cells work on a copy so the
# untouched frame stays available under `df_raw`.
df_raw = pd.read_csv("../data/raw/apple_music_dataset.csv")
track_df = df_raw.copy(deep=True)

# Preview both ends of the table.
display(track_df.head(), track_df.tail())

# Fetching Data
- Get popularity data from the Spotify API

In [None]:
import os
from dotenv import load_dotenv

# Load variables from a local .env file (if one is found) into the environment.
load_dotenv()

CLIENT_ID = os.getenv("SPOTIFY_CLIENT_ID")
CLIENT_SECRET = os.getenv("SPOTIFY_CLIENT_SECRET")

# Fail here with an actionable message instead of later during authentication.
# Truthiness (rather than `is not None`) also rejects a variable that is
# present but blank, e.g. `SPOTIFY_CLIENT_ID=` in .env.
assert CLIENT_ID, (
    "SPOTIFY_CLIENT_ID is missing or empty; set it in the environment or in .env"
)
assert CLIENT_SECRET, (
    "SPOTIFY_CLIENT_SECRET is missing or empty; set it in the environment or in .env"
)

## Get Track Info

In [None]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from tqdm import tqdm # Progress bar

# Local callback address handed to the OAuth manager as its redirect URI.
# NOTE(review): presumably this must match the redirect URI registered for the
# Spotify app — confirm in the developer dashboard.
URI = 'http://127.0.0.1:9090/callback'

# Build the auth manager first, then the client (`sp`) used by the cells below.
_auth_manager = SpotifyOAuth(
    scope="user-read-private",
    redirect_uri=URI,
    client_secret=CLIENT_SECRET,
    client_id=CLIENT_ID,
)
sp = spotipy.Spotify(auth_manager=_auth_manager)

def get_spotify_details(row, client=None):
    """Search Spotify for the track described by ``row`` and return its metadata.

    Parameters
    ----------
    row : mapping
        Must provide ``'artistName'`` and ``'trackName'`` (for example one row
        of the track table when used with ``apply(axis=1)``).
    client : optional
        Object exposing ``search(q=..., type=..., limit=...)``. Defaults to the
        notebook-level ``sp`` client, so existing one-argument calls still work.

    Returns
    -------
    dict or None
        A dict with keys ``spotify_id``, ``spotify_artist_id``,
        ``spotify_popularity``, ``spotify_explicit``, ``cover_art_url`` and
        ``spotify_release_date`` taken from the first search hit, or ``None``
        when a name is missing/blank, the search has no hits, or the lookup
        fails (a ``RuntimeWarning`` is emitted in the failure case).
    """
    import warnings  # local import keeps this cell self-contained

    raw_artist = row['artistName']
    raw_track = row['trackName']

    # A missing name would otherwise be stringified to "nan"/"None" and sent to
    # Spotify as a real search term, which can attach an unrelated track.
    if pd.isna(raw_artist) or pd.isna(raw_track):
        return None

    # Apostrophes are stripped from both names before they go into the query.
    artist = str(raw_artist).replace("'", "")
    track = str(raw_track).replace("'", "")
    if not artist.strip() or not track.strip():
        return None

    query = f"track:{track} artist:{artist}"
    spotify = sp if client is None else client

    try:
        results = spotify.search(q=query, type='track', limit=1)
        items = results['tracks']['items']

        if not items:
            return None

        t = items[0]
        images = t['album']['images']

        return {
            "spotify_id": t['id'],
            "spotify_artist_id": t['artists'][0]['id'],
            "spotify_popularity": t['popularity'],
            "spotify_explicit": t['explicit'],
            "cover_art_url": images[0]['url'] if images else None,
            "spotify_release_date": t['album']['release_date'],
        }

    except Exception as exc:
        # Stay best-effort (one bad row must not abort the whole run), but make
        # the failure visible: otherwise auth or rate-limit errors are
        # indistinguishable from "no match" and quietly lower the match rate.
        warnings.warn(
            f"Spotify lookup failed for {query!r}: {exc!r}",
            RuntimeWarning,
        )
        return None

# Keys of the dict returned by get_spotify_details, in output column order.
SPOTIFY_TRACK_COLUMNS = [
    "spotify_id",
    "spotify_artist_id",
    "spotify_popularity",
    "spotify_explicit",
    "cover_art_url",
    "spotify_release_date",
]

print("⏳ Searching Spotify...")
tqdm.pandas()
spotify_data = track_df.progress_apply(get_spotify_details, axis=1)

# Unmatched rows (None) become empty records, i.e. all-NaN rows after framing.
clean_data = [x if x is not None else {} for x in spotify_data]

# reindex guarantees the expected columns exist even when no row matched at
# all; without it the frame has zero columns and the 'spotify_id' lookup below
# raises KeyError.
df_spotify_features = pd.DataFrame(clean_data).reindex(columns=SPOTIFY_TRACK_COLUMNS)

# The concat below aligns rows purely by position, so both sides must have
# exactly one row per input track.
assert len(df_spotify_features) == len(track_df), (
    "Spotify results are not aligned 1:1 with track_df rows"
)

# Merge
final_df = pd.concat([track_df.reset_index(drop=True), df_spotify_features], axis=1)

print(f"✅ Match Success Rate: {final_df['spotify_id'].notna().mean():.2%}")
display(final_df.head())

## Get Artist Info

In [None]:
# Columns produced for each artist; the first one is the merge key.
ARTIST_COLUMNS = [
    "spotify_artist_id",
    "artist_popularity",
    "artist_genres",
    "artist_followers",
    "artist_image_url",
]

print(f"🎤 Artists to fetch: {final_df['spotify_artist_id'].nunique()}")

unique_ids = final_df['spotify_artist_id'].dropna().unique().tolist()

# NOTE(review): 50 is presumably the per-request ID cap of the artists
# endpoint — confirm against the Spotify Web API reference.
batch_size = 50
artist_data = []

print("Fetching Artist Popularity & Other Info...")
for i in tqdm(range(0, len(unique_ids), batch_size)):
    batch = unique_ids[i:i + batch_size]

    try:
        response = sp.artists(batch)
        for artist in response['artists']:
            if artist is None:
                # An ID the API could not resolve (assumed to come back as
                # None): skip just this entry. Subscripting it would raise
                # TypeError and discard the remainder of the batch.
                continue

            image_url = artist['images'][0]['url'] if artist['images'] else None

            artist_data.append({
                "spotify_artist_id": artist['id'],
                "artist_popularity": artist['popularity'],
                "artist_genres": artist['genres'],
                "artist_followers": artist['followers']['total'],
                "artist_image_url": image_url,
            })
    except Exception as e:
        # Best-effort: report the failed batch and carry on with the next one.
        print(f"Error on batch {i}: {e}")

# Passing `columns` keeps the merge key present even when nothing was fetched;
# de-duplicating the key guarantees the left merge cannot multiply track rows.
df_artists_only = (
    pd.DataFrame(artist_data, columns=ARTIST_COLUMNS)
    .drop_duplicates(subset="spotify_artist_id")
)

# Drop artist columns left over from an earlier run of this cell so that
# re-running it does not produce *_x / *_y duplicates.
stale_artist_cols = [c for c in ARTIST_COLUMNS[1:] if c in final_df.columns]
final_df = (
    final_df
    .drop(columns=stale_artist_cols)
    .merge(df_artists_only, on="spotify_artist_id", how="left")
)

print("✅ Added Artist Info!")
display(final_df[['trackName', 'artist_popularity']].head())

In [None]:
# Continue under the short working name `df` (an independent copy), then
# remove the `final_df` name from the namespace.
df = final_df.copy(deep=True)
del final_df

In [2]:
import pandas as pd

# Resume point: fall back to the cached base table only when no `df` exists in
# this kernel (e.g. the fetch cells were skipped after a restart). Loading it
# unconditionally would replace the freshly enriched frame and silently drop
# every Spotify/artist column before the export cells run.
if "df" not in globals():
    df = pd.read_parquet('../data/processed/base_tracks.parquet')

In [None]:
# Source column name -> snake_case name used in the rest of the notebook.
rename_map = dict(
    artistId="apple_artist_id",
    trackId="apple_track_id",
    artistName="artist_name",
    trackName="track_name",
    primaryGenreName="genre",
    releaseDate="release_date",
    trackTimeMillis="track_time_ms",
    previewUrl="preview_url",
    contentAdvisoryRating="advisory_rating",
    collectionId="apple_collection_id",
    collectionName="collection_name",
    trackExplicitness="track_explicitness",
    country="country",
)

# rename() ignores mapping keys that are not among the columns (its default
# errors="ignore"), so the whole map can be passed without pre-filtering.
df = df.rename(columns=rename_map)

In [None]:
MS_PER_MINUTE = 60_000

# Unparseable dates become NaT instead of raising.
df["release_date"] = pd.to_datetime(df["release_date"], errors="coerce")
df["release_year"] = df["release_date"].dt.year
df["release_month"] = df["release_date"].dt.month
# "YYYY-MM" label. strftime leaves missing dates as NaN, whereas
# to_period("M").astype(str) would store the literal string "NaT" and hide
# them from any later missingness check on this column.
df["release_ym"] = df["release_date"].dt.strftime("%Y-%m")

# Non-numeric durations become NaN instead of raising.
df["track_time_ms"] = pd.to_numeric(df["track_time_ms"], errors="coerce")
df["track_time_min"] = df["track_time_ms"] / MS_PER_MINUTE

In [4]:
# Columns of the base track table, in output order; only the ones actually
# present in `df` are kept.
base_cols = [
    col
    for col in (
        "apple_track_id", "apple_artist_id",
        "track_name", "artist_name",
        "genre", "country",
        "release_date", "release_year", "release_month", "release_ym",
        "track_time_ms", "track_time_min",
        "preview_url", "advisory_rating", "track_explicitness",
        "apple_collection_id", "collection_name",
    )
    if col in df.columns
]

# Independent copy, so edits to `base` never touch `df`.
base = df.loc[:, base_cols].copy()

In [5]:
# Keep the first row per Apple track id and renumber the rows from 0.
base = base.drop_duplicates(subset=["apple_track_id"], ignore_index=True)

# Persist the base table without the (now meaningless) index.
base.to_parquet("../data/processed/base_tracks.parquet", index=False)

In [None]:
# Candidate columns for the Spotify enrichment table; the ones after the blank
# line are optional extras and are simply skipped when absent.
spotify_cols = [
    "apple_track_id",
    "spotify_id", "spotify_artist_id",
    "spotify_popularity", "spotify_explicit",
    "spotify_release_date", "cover_art_url",
    "spotify_popularity_fetched_at",

    "match_confidence", "spotify_name", "spotify_artist_name",
]
spotify_cols = [c for c in spotify_cols if c in df.columns]

# Without both keys the table carries no enrichment at all. Refuse to write it
# rather than silently overwriting an existing enrichment file with a frame
# that holds only the join key.
_missing_spotify_keys = {"apple_track_id", "spotify_id"}.difference(spotify_cols)
if _missing_spotify_keys:
    raise KeyError(
        f"df is missing required column(s) {sorted(_missing_spotify_keys)}; "
        "run the Spotify fetch cells before exporting the enrichment table"
    )

spotify_enrich = df[spotify_cols].copy()

spotify_enrich.to_parquet("../data/processed/spotify_enrichment.parquet", index=False)

In [None]:
# Artist-level columns to export; only those present in `df` are used.
artist_cols = [
    col
    for col in (
        "spotify_artist_id",
        "artist_followers",
        "artist_popularity",
        "artist_genres",
        "artist_image_url",
    )
    if col in df.columns
]

# One row per known Spotify artist: drop rows without an artist id, keep the
# first row per id, and renumber from 0.
artist_enrich = df.loc[:, artist_cols]
artist_enrich = artist_enrich.dropna(subset=["spotify_artist_id"])
artist_enrich = artist_enrich.drop_duplicates(
    subset=["spotify_artist_id"], ignore_index=True
)

artist_enrich.to_parquet("../data/processed/artist_enrichment.parquet", index=False)