In [1]:
import glob
import json
import os

import numpy as np
import pandas as pd

In [2]:
# Load the first MPD slice (playlists 0-999) into memory as a parsed JSON object.
first_slice_path = "spotify_million_playlist_dataset/data/mpd.slice.0-999.json"
with open(first_slice_path, "r", encoding="utf-8") as slice_file:
    mpd_data = json.load(slice_file)

In [None]:

# Gather the distinct Spotify track ids that occur anywhere in the MPD slice files.
mpd_folder = "spotify_million_playlist_dataset/data"
mpd_track_ids = set()

for slice_path in glob.glob(f"{mpd_folder}/mpd.slice.*.json"):
    with open(slice_path, 'r', encoding='utf-8') as slice_file:
        slice_data = json.load(slice_file)
        # Every track entry of every playlist; entries without a URI yield ''.
        all_uris = (
            entry.get('track_uri', '')
            for one_playlist in slice_data['playlists']
            for entry in one_playlist['tracks']
        )
        # Keep only spotify:track:<id> URIs; the id is the last ':'-separated field.
        mpd_track_ids.update(
            one_uri.split(':')[-1].strip()
            for one_uri in all_uris
            if one_uri.startswith('spotify:track:')
        )

print(f"✅ Collected {len(mpd_track_ids):,} unique track_ids from MPD.")


✅ Collected 2,262,292 unique track_ids from MPD.


In [None]:
# Audio-feature table taken from the Kaggle dataset; show its size and first rows.
kaggle_features_path = "SpotifyFeatures.csv/SpotifyFeatures.csv"
feat_df = pd.read_csv(kaggle_features_path)
print(feat_df.shape)
feat_df.head()

(232725, 18)


Unnamed: 0,genre,artist_name,track_name,track_id,popularity,acousticness,danceability,duration_ms,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,Movie,Henri Salvador,C'est beau de faire un Show,0BRjO6ga9RKCKjfDqeFgWV,0,0.611,0.389,99373,0.91,0.0,C#,0.346,-1.828,Major,0.0525,166.969,4/4,0.814
1,Movie,Martin & les fées,Perdu d'avance (par Gad Elmaleh),0BjC1NfoEOOusryehmNudP,1,0.246,0.59,137373,0.737,0.0,F#,0.151,-5.559,Minor,0.0868,174.003,4/4,0.816
2,Movie,Joseph Williams,Don't Let Me Be Lonely Tonight,0CoSDzoNIKCRs124s9uTVy,3,0.952,0.663,170267,0.131,0.0,C,0.103,-13.879,Minor,0.0362,99.488,5/4,0.368
3,Movie,Henri Salvador,Dis-moi Monsieur Gordon Cooper,0Gc6TVm52BwZD07Ki6tIvf,0,0.703,0.24,152427,0.326,0.0,C#,0.0985,-12.178,Major,0.0395,171.758,4/4,0.227
4,Movie,Fabien Nataf,Ouverture,0IuslXpMROHdEPvSl1fTQK,4,0.95,0.331,82625,0.225,0.123,F,0.202,-21.15,Major,0.0456,140.576,4/4,0.39


In [None]:
# Audio-feature table from the class dataset. The CSV carries a saved row-index
# column ("Unnamed: 0"), which is dropped right after loading.
class_features_raw = pd.read_csv("spotify_data.csv")
class_df = class_features_raw.drop("Unnamed: 0", axis="columns")
print(class_df.shape)
class_df.head()

(1159764, 19)


Unnamed: 0,artist_name,track_name,track_id,popularity,year,genre,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Jason Mraz,I Won't Give Up,53QF56cjZA9RTuuMZDrSA6,68,2012,acoustic,0.483,0.303,4,-10.058,1,0.0429,0.694,0.0,0.115,0.139,133.406,240166,3
1,Jason Mraz,93 Million Miles,1s8tP3jP4GZcyHDsjvw218,50,2012,acoustic,0.572,0.454,3,-10.286,1,0.0258,0.477,1.4e-05,0.0974,0.515,140.182,216387,4
2,Joshua Hyslop,Do Not Let Me Go,7BRCa8MPiyuvr2VU3O9W0F,57,2012,acoustic,0.409,0.234,3,-13.711,1,0.0323,0.338,5e-05,0.0895,0.145,139.832,158960,4
3,Boyce Avenue,Fast Car,63wsZUhUZLlh1OsyrZq7sz,58,2012,acoustic,0.392,0.251,10,-9.845,1,0.0363,0.807,0.0,0.0797,0.508,204.961,304293,4
4,Andrew Belle,Sky's Still Blue,6nXIYClvJAfi6ujLiKqEq8,54,2012,acoustic,0.43,0.791,6,-5.419,0,0.0302,0.0726,0.0193,0.11,0.217,171.864,244320,4


In [None]:
# Bring both audio-feature tables to a common shape before they are stacked.

# Columns kept from each table, in the order they will appear afterwards.
merge_cols = [
    'artist_name', 'track_name', 'track_id', 'popularity', 'genre',
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'tempo', 'duration_ms',
]

# track_id becomes a whitespace-trimmed string in both tables, then each table
# is cut down to merge_cols.
feat_df = feat_df.assign(track_id=feat_df['track_id'].astype(str).str.strip())[merge_cols]
class_df = class_df.assign(track_id=class_df['track_id'].astype(str).str.strip())[merge_cols]

In [None]:
# Stack the two audio-feature tables (class dataset first) and keep one row per
# track_id. With keep='first', a track present in both sources keeps its
# class_df row; a track_id repeated inside one source keeps its first row.
# The index is rebuilt at both steps: a plain concat would carry over each
# source's own row labels and repeat them, and the de-dupe would leave gaps.
combined_df = (
    pd.concat([class_df, feat_df], axis=0, ignore_index=True)
    .drop_duplicates(subset='track_id', keep='first')
    .reset_index(drop=True)
)

In [None]:
# For each audio-feature table, count the rows whose track_id occurs in the MPD.
# These are row counts: a track_id that appears on several rows of a table is
# counted once per row.
feat_in_mpd = feat_df['track_id'].isin(mpd_track_ids)
class_in_mpd = class_df['track_id'].isin(mpd_track_ids)

feat_matches = feat_in_mpd.sum()
class_matches = class_in_mpd.sum()

print(f"feat_df matches: {feat_matches} / {feat_df.shape[0]}")
print(f"class_df matches: {class_matches} / {class_df.shape[0]}")

feat_df matches: 105823 / 232725
class_df matches: 188298 / 1159764


In [None]:
# Rows of the combined audio-feature table whose track_id occurs in the MPD.
combined_in_mpd = combined_df['track_id'].isin(mpd_track_ids)
coverage = combined_in_mpd.sum()
print(f"Combined match count: {coverage} / {combined_df.shape[0]}")

Combined match count: 238552 / 1283516


In [None]:
# Build one (playlist_id, track_id) row for every track occurrence in the MPD.

# Directory with the mpd.slice.*.json files
json_dir = "spotify_million_playlist_dataset/data"

# Track ids to keep.
# NOTE(review): mpd_track_ids was collected from these same slice files, so this
# filter is expected to keep every track row. Confirm whether the intent was to
# filter on the audio-feature track ids instead.
valid_track_ids = set(mpd_track_ids)

records = []

# Use the same file pattern as the track-id collection cell so that unrelated
# .json files in the directory are ignored, and sort the paths so the row order
# of mpd_df is the same on every run and platform.
for file_path in sorted(glob.glob(f"{json_dir}/mpd.slice.*.json")):
    # Parse the slice, then release the file handle before the nested loops.
    with open(file_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for playlist in data["playlists"]:
        pid = playlist["pid"]
        for track in playlist["tracks"]:
            # Parse the URI exactly as mpd_track_ids was built: require the
            # spotify:track: prefix, take the last ':' field, trim whitespace.
            uri = track.get("track_uri", "")
            if not uri.startswith("spotify:track:"):
                continue
            tid = uri.split(":")[-1].strip()
            if tid in valid_track_ids:
                records.append((pid, tid))

# Build the frame once from the accumulated tuples.
mpd_df = pd.DataFrame(records, columns=["playlist_id", "track_id"])

In [None]:
# Persist the (playlist_id, track_id) map so it can be reloaded from disk
# instead of re-parsing the MPD slice files. index=False keeps the row index
# out of the file.
mpd_df.to_csv("mpd_track_map.csv", index=False)


In [10]:
# Reload the saved (playlist_id, track_id) map, then show its size and first rows.
mpd_map_path = "mpd_track_map.csv"
mpd_df = pd.read_csv(mpd_map_path)
print(mpd_df.shape)
mpd_df.head()

(66346428, 2)


Unnamed: 0,playlist_id,track_id
0,0,0UaMYEvWZi0ZqiDOoHU3YI
1,0,6I9VzXrHxO9rA9A5euc8Ak
2,0,0WqIKmW4BTrj3eJFmnCKMv
3,0,1AWQoqb9bSvzTjaLralEkT
4,0,1lzr43nnXAijIGYnCT8M8H


In [11]:
# Keep only the audio-feature rows whose track_id occurs in mpd_df.
track_id_set = set(mpd_df["track_id"].unique())
has_mpd_track = combined_df["track_id"].isin(track_id_set)
# .copy() gives an independent frame rather than a slice of combined_df.
filtered_combined_df = combined_df.loc[has_mpd_track].copy()


In [None]:
# Attach audio features to each (playlist_id, track_id) row. The inner join
# drops playlist rows whose track has no audio-feature entry.
playlist_tracks = pd.merge(
    mpd_df,
    filtered_combined_df,
    how="inner",
    on="track_id",
)

In [24]:
# Count how many matched tracks each playlist has.
# playlist_tracks comes from an inner merge, so a playlist with no matched
# track has no rows here and is left out of both the counts and the mean below.
playlist_counts = playlist_tracks.groupby("playlist_id").size()
playlist_counts.mean()

33.566731964737016

In [None]:
# Keep only the playlists that have at least 20 matched tracks, and the
# track rows belonging to them.
has_enough_tracks = playlist_counts.ge(20)
valid_playlists = playlist_counts.loc[has_enough_tracks].index
in_valid_playlist = playlist_tracks["playlist_id"].isin(valid_playlists)
filtered = playlist_tracks.loc[in_valid_playlist]

In [27]:
# One vector per playlist: the mean of each audio feature over the playlist's
# matched tracks.
feature_cols = [
    'danceability', 'energy', 'loudness', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence',
]

# as_index=False keeps playlist_id as an ordinary column next to the feature means.
playlist_vectors = filtered.groupby("playlist_id", as_index=False)[feature_cols].mean()

In [28]:
# Persist the per-playlist mean feature vectors without the row index.
# The "thresh20" in the file name presumably mirrors the >= 20 matched-track
# cutoff used to build `filtered`; keep the two in sync if the cutoff changes.
playlist_vectors.to_csv("playlist_vectors_thresh20.csv", index=False)


In [29]:
# Reload the playlist vectors from the CSV file written above.
playlist_vectors_df = pd.read_csv("playlist_vectors_thresh20.csv")

In [30]:
# Preview the first rows of the reloaded playlist vectors.
playlist_vectors_df.head()

Unnamed: 0,playlist_id,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence
0,0,0.646759,0.775379,-4.734897,0.113386,0.113513,0.001199,0.185438,0.674034
1,2,0.656568,0.716838,-4.760973,0.098397,0.236678,0.001081,0.155524,0.585432
2,3,0.521366,0.583368,-9.969902,0.062844,0.325415,0.258824,0.15758,0.431712
3,5,0.696459,0.732054,-5.395784,0.068976,0.098065,0.001151,0.192219,0.573541
4,8,0.519435,0.716957,-6.05113,0.049935,0.157382,0.084758,0.173174,0.582


In [31]:
# (rows, columns) of the reloaded playlist-vector table.
playlist_vectors_df.shape

(562107, 9)

In [None]:
# Save the de-duplicated audio-feature table without the row index. This is the
# full combined_df, not the MPD-restricted filtered_combined_df.
combined_df.to_csv("combined_feats_df.csv", index=False)