# ðŸ”— Building the Unified Track Dataset
## Notebook 03 â€” Constructing `base_aligned`
Merging Apple Music base tracks, Spotify enrichment, audio features, and artist-level metadata into a single track-level analysis table.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

pd.set_option("display.max_columns", None)

In [3]:
BASE_DIR = "../data/processed"

base_tracks = pd.read_parquet(f"{BASE_DIR}/base_tracks.parquet")
spotify = pd.read_parquet(f"{BASE_DIR}/spotify_enrichment.parquet")
audio = pd.read_parquet(f"{BASE_DIR}/audio_features.parquet")
artist_enrich = pd.read_parquet(f"{BASE_DIR}/artist_enrichment.parquet")

print("base_tracks:", base_tracks.shape)
print("spotify:", spotify.shape)
print("audio:", audio.shape)
print("artist_enrich:", artist_enrich.shape)

base_tracks: (8741, 13)
spotify: (10000, 7)
audio: (8741, 7)
artist_enrich: (1652, 5)


In [4]:
df = base_tracks.copy()

In [5]:
df = df.merge(
    spotify,
    on="apple_track_id",
    how="left"
)

In [6]:
df = df.merge(
    audio.drop(columns=["preview_url"]),
    on="apple_track_id",
    how="left"
)

In [7]:
df = df.merge(
    artist_enrich,
    on="spotify_artist_id",
    how="left"
)

In [8]:
df = df.rename(columns={"spotify_popularity": "track_popularity"})

In [9]:
df.to_parquet("../data/processed/base_aligned.parquet", index=False)
print("done:", df.shape)

done: (10000, 28)


In [10]:
df.track_popularity.describe()

count    9541.000000
mean       51.104706
std        18.110031
min         0.000000
25%        39.000000
50%        53.000000
75%        64.000000
max        96.000000
Name: track_popularity, dtype: float64

In [11]:
df.artist_popularity.describe()

count    9541.000000
mean       71.873703
std        13.466446
min         0.000000
25%        64.000000
50%        73.000000
75%        82.000000
max       100.000000
Name: artist_popularity, dtype: float64