In [4]:
import os
import pandas as pd
from datasets import load_dataset

# NOTE(review): this is a relative path, so the directory is created under the
# current working directory. If the home-folder Downloads directory was
# intended, the path needs a leading "/" -- confirm before changing.
OUT_DIR = "Users/Phillip/Downloads"
os.makedirs(OUT_DIR, exist_ok=True)

# Spotify tracks dataset (audio features + genres)
ds = load_dataset("maharshipandya/spotify-tracks-dataset")

# The loader may hand back a mapping of split name -> dataset, or a single
# dataset. Prefer the "train" split; if it is absent, fall back to the first
# available split instead of passing the whole mapping on to `.to_pandas()`.
if isinstance(ds, dict):
    if not ds:
        raise ValueError("load_dataset returned no splits")
    split_name = "train" if "train" in ds else next(iter(ds))
    d = ds[split_name]
else:
    # Not a mapping of splits: use the returned object directly.
    d = ds

df = d.to_pandas()

# Quick sanity check of what was loaded; the last expression renders a preview.
print("Rows:", len(df))
print("Columns:", list(df.columns)[:25], "...")
df.head(3)


Generating train split: 100%|██████████| 114000/114000 [00:00<00:00, 349155.50 examples/s]

Rows: 114000
Columns: ['Unnamed: 0', 'track_id', 'artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'explicit', 'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature', 'track_genre'] ...





Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic


In [5]:
# Standardized column name -> source column names to look for, in priority
# order. Key order matters: it becomes the column order of the cleaned output.
cols_map_candidates = {
    # Columns whose name differs between dataset variants.
    "track_id": ["track_id", "id", "spotify_id"],
    "track_name": ["track_name", "name", "track"],
    "artist_name": ["artists", "artist_name", "artist"],
    "album_name": ["album_name", "album"],
    "genre": ["genre", "track_genre", "genres"],
    "popularity": ["popularity"],
    "duration_ms": ["duration_ms", "duration"],
    # Columns looked up only under their own name.
    **{
        feature: [feature]
        for feature in (
            "explicit",
            "danceability",
            "energy",
            "key",
            "loudness",
            "mode",
            "speechiness",
            "acousticness",
            "instrumentalness",
            "liveness",
            "valence",
            "tempo",
            "time_signature",
        )
    },
}

def pick_col(df, candidates):
    """Return the first name in `candidates` that is a column of `df`.

    Candidates are checked in the order given, so earlier names take
    priority. Returns None when none of them is present.
    """
    return next((name for name in candidates if name in df.columns), None)

# Resolve each standardized name to the first matching source column in `df`;
# names with no match are collected in `missing` and simply left out.
selected = {}
missing = []
for std, cand in cols_map_candidates.items():
    c = pick_col(df, cand)
    if c is None:
        missing.append(std)
    else:
        selected[std] = c

print("Selected columns mapping:", selected)
print("Missing (ok if some):", missing)

# Build the standardized frame in one step (columns keep the mapping's order).
out = pd.DataFrame({std: df[src] for std, src in selected.items()})

# Basic cleaning
if "popularity" in out.columns:
    # Coerce first, then drop: errors="coerce" turns unparseable values into
    # NaN, so dropping beforehand would let those rows through. `.assign`
    # returns a new frame rather than writing into a possible copy.
    out = out.assign(popularity=pd.to_numeric(out["popularity"], errors="coerce"))
    out = out.dropna(subset=["popularity"])

# De-dup (if track_id exists); the first row seen for each track_id is kept.
if "track_id" in out.columns:
    out = out.drop_duplicates(subset=["track_id"])

# Persist the cleaned table without the index column.
out_path = os.path.join(OUT_DIR, "spotify_tracks_clean.csv")
out.to_csv(out_path, index=False)
print("Wrote:", out_path)
out.head(3)


Selected columns mapping: {'track_id': 'track_id', 'track_name': 'track_name', 'artist_name': 'artists', 'album_name': 'album_name', 'genre': 'track_genre', 'popularity': 'popularity', 'duration_ms': 'duration_ms', 'explicit': 'explicit', 'danceability': 'danceability', 'energy': 'energy', 'key': 'key', 'loudness': 'loudness', 'mode': 'mode', 'speechiness': 'speechiness', 'acousticness': 'acousticness', 'instrumentalness': 'instrumentalness', 'liveness': 'liveness', 'valence': 'valence', 'tempo': 'tempo', 'time_signature': 'time_signature'}
Missing (ok if some): []
Wrote: Users/Phillip/Downloads/spotify_tracks_clean.csv


Unnamed: 0,track_id,track_name,artist_name,album_name,genre,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,5SuOikwiRyPMVoIQDJUgSV,Comedy,Gen Hoshino,Comedy,acoustic,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4
1,4qPNDBW1i3p13qLCt0Ki3A,Ghost - Acoustic,Ben Woodward,Ghost (Acoustic),acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4
2,1iJBSr7s7jYXzM8EGcbK5b,To Begin Again,Ingrid Michaelson;ZAYN,To Begin Again,acoustic,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4
