In [8]:
import json
from pathlib import Path
import pandas as pd

# Folder that holds the "mpd.slice.*.json" playlist slice files.
# NOTE(review): this is a machine-specific absolute path; point it at your own
# copy of the data before running.
folder_path = Path(r"/Users/noa/Desktop/02805 - Social Graphs/playlist_data/")

# Collect every slice file. sorted() orders the paths lexicographically, which
# only influences the row order of the resulting DataFrame.
file_list = sorted(folder_path.glob("mpd.slice.*.json"))

# Fail fast when nothing matched (typically a wrong folder_path) instead of
# silently continuing with no data and breaking in a later cell.
if not file_list:
    raise FileNotFoundError(
        f"No 'mpd.slice.*.json' files found in {folder_path}"
    )

# Merge the "playlists" list of every slice file into one flat list.
playlists = []
for fp in file_list:
    with open(fp, 'r', encoding='utf-8') as f:
        data = json.load(f)
    # A slice without a "playlists" key contributes nothing.
    playlists.extend(data.get("playlists", []))

# Same layout as a single slice file: an "info" header plus the playlists.
mpd_slice = {
    "info": {"merged_from_files": len(file_list)},
    "playlists": playlists
}

print(f"Loaded {len(file_list)} files, total playlists merged: {len(mpd_slice['playlists'])}")
print(f"Total playlists in this slice: {len(mpd_slice['playlists'])}")

# --- Creating the DataFrame ---

# 1. Playlist-level fields copied onto every track row.
playlist_meta = [
    'pid', 'name', 'num_tracks', 'num_albums',
    'num_artists', 'duration_ms', 'modified_at'
]

# Flatten to one row per track (record_path='tracks').
# record_prefix='track_' keeps the track-level 'duration_ms' from colliding
# with the playlist-level 'duration_ms' listed in playlist_meta.
# errors='ignore' tolerates playlists that lack one of the meta keys.
mpd_slice_df = pd.json_normalize(
    data=mpd_slice['playlists'],
    record_path='tracks',
    meta=playlist_meta,
    record_prefix='track_',
    errors='ignore'
)

# 2. Replace any '.' in column names (produced by nested fields) with '_'.
mpd_slice_df.columns = mpd_slice_df.columns.str.replace('.', '_', regex=False)

print(f"\nDataframe shape: {mpd_slice_df.shape}")
print("\nDataFrame Head:")
mpd_slice_df.head()

Loaded 6 files, total playlists merged: 6000
Total playlists in this slice: 6000

Dataframe shape: (402016, 15)

DataFrame Head:


Unnamed: 0,track_pos,track_artist_name,track_track_uri,track_artist_uri,track_track_name,track_album_uri,track_duration_ms,track_album_name,pid,name,num_tracks,num_albums,num_artists,duration_ms,modified_at
0,0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,0,Throwbacks,52,47,37,11532414,1493424000
1,1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,0,Throwbacks,52,47,37,11532414,1493424000
2,2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),0,Throwbacks,52,47,37,11532414,1493424000
3,3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,0,Throwbacks,52,47,37,11532414,1493424000
4,4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,0,Throwbacks,52,47,37,11532414,1493424000


In [15]:
import numpy as np
import networkx as nx

# --- 1. Filter Playlists by Song Count ---
# mpd_slice_df has one row per track, each carrying its playlist's metadata.
# Keep the ids of playlists whose declared 'num_tracks' lies in [40, 100].
valid_pids = mpd_slice_df[
    (mpd_slice_df['num_tracks'] >= 40) & (mpd_slice_df['num_tracks'] <= 100)
]['pid'].unique()

# Restrict the track rows to those playlists; .copy() detaches the result from
# mpd_slice_df so later steps never touch the original frame.
artist_playlist_df = mpd_slice_df[
    mpd_slice_df['pid'].isin(valid_pids)
].copy()

# --- 2. Select Relevant Columns and Remove Duplicates ---
# Rename only the columns that actually exist in mpd_slice_df. The previous
# mapping also contained 'track_title' -> 'track_track_name', a key that matches
# no column and was silently ignored by rename(); the track name column is
# therefore kept under its existing name 'track_track_name'.
core_columns = {
    'track_artist_uri': 'artist_uri',
    'track_artist_name': 'artist_name',
}
artist_playlist_df = artist_playlist_df.rename(columns=core_columns)[
    ['pid', 'artist_uri', 'track_track_name', 'artist_name']
]

# One row per (playlist, artist): an artist URI is counted once per playlist,
# keeping the first of that artist's tracks in the playlist.
artist_playlist_df = artist_playlist_df.drop_duplicates(
    subset=['pid', 'artist_uri']
)

# --- 3. Create the Bipartite Incidence Matrix (Artist x Playlist) ---
# Rows were de-duplicated by artist_uri but the matrix is indexed by
# artist_name, so two different URIs sharing a display name inside one playlist
# would otherwise produce a count of 2. clip(upper=1) keeps the matrix a strict
# 0/1 incidence matrix without changing its shape or labels.
incidence_matrix = pd.crosstab(
    artist_playlist_df['artist_name'],
    artist_playlist_df['pid']
).clip(upper=1)

print(f"Filtered Playlists (Unique PIDs): {len(valid_pids)}")
# Counts distinct artist names (the matrix rows), not distinct artist URIs.
print(f"Unique Artists: {incidence_matrix.shape[0]}")

incidence_matrix.head()

Filtered Playlists (Unique PIDs): 2243
Unique Artists: 15553


pid,0,2,5,7,8,10,17,18,19,22,...,5971,5973,5976,5978,5984,5986,5991,5996,5998,5999
artist_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
!!!,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
!llmind,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
"""Weird Al"" Yankovic",0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
#1 Dads,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
$uicideBoy$,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
