In [2]:
import pandas as pd
import glob
import json
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

In [3]:
# Path to your JSON files
path = '../spotify_million_playlist_dataset/data/*.json'

# Function to read a single JSON file
def read_json_file(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
        return data['playlists']

# Use ThreadPoolExecutor to read files in parallel
all_files = glob.glob(path)
with ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(read_json_file, all_files)

data_list = [item for sublist in results for item in sublist]

# Convert to DataFrame
df = pd.DataFrame(data_list)
df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Bob Dylan,false,549000,1454803200,75,65,1,"[{'pos': 0, 'artist_name': 'Bob Dylan', 'track...",28,18425368,39,
1,ON THE RUN,false,549001,1419552000,61,30,1,"[{'pos': 0, 'artist_name': 'JAY Z', 'track_uri...",4,15224873,5,
2,Biking,false,549002,1435708800,144,117,1,"[{'pos': 0, 'artist_name': 'Sander van Doorn',...",12,32723071,72,
3,August,false,549003,1505001600,38,32,1,"[{'pos': 0, 'artist_name': 'C-Trox', 'track_ur...",20,8439352,33,
4,run mix,false,549004,1459555200,16,15,1,"[{'pos': 0, 'artist_name': 'Buzzcocks', 'track...",3,3804816,12,
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,Hip Hop and R&B,false,302995,1465344000,83,65,1,"[{'pos': 0, 'artist_name': 'The Notorious B.I....",5,22126318,34,
999996,MAY,false,302996,1464393600,21,19,2,"[{'pos': 0, 'artist_name': 'Nebu Kiniza', 'tra...",6,4725029,18,
999997,Road Trip,false,302997,1455840000,137,106,1,"[{'pos': 0, 'artist_name': 'Sheppard', 'track_...",2,30219710,71,
999998,Guilty Pleasure,false,302998,1503705600,223,209,2,"[{'pos': 0, 'artist_name': 'Bernhoft', 'track_...",57,50603668,185,


In [None]:
# #convert to parquet
# selected = pa.Table.from_pandas(df)
# pq.write_table(selected, '1mil_spotify_playlists.parquet')
# print("Full playlists saved successfully!")

# # Load parquet file into DataFrame
# df = pd.read_parquet('1mil_spotify_playlists.parquet')
# df

Filter

In [4]:
# Filter by collaboration == False
filtered_df = df[df['collaborative'] == 'false'].copy()

# Sort by most recently updated date
filtered_df['modified_at'] = pd.to_datetime(filtered_df['modified_at'], unit='s')
sorted_df = filtered_df.sort_values(by='modified_at', ascending=False)

# Select top k playlists
num_of_playlist = 25000
selected_playlists = sorted_df.head(num_of_playlist)

# Determine the split sizes
total_size = len(selected_playlists)
split_size = total_size // 3

# Splitting the dataframe into three roughly equal parts
splits = [
    selected_playlists.iloc[:split_size],
    selected_playlists.iloc[split_size:2*split_size],
    selected_playlists.iloc[2*split_size:]
]

# Saving each split into separate parquet files
for i, split_df in enumerate(splits, start=1):
    split_table = pa.Table.from_pandas(split_df)
    pq.write_table(split_table, f'filtered_spotify_playlists_part{i}.parquet')
    print(f"Part {i} saved successfully!")


Part 1 saved successfully!
Part 2 saved successfully!
Part 3 saved successfully!


In [5]:
# Path pattern to read all parquet parts
parquet_files = glob.glob('filtered_spotify_playlists_part*.parquet')

# Read and concatenate all parquet files into one DataFrame
playlists_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# View combined dataframe
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Gym,false,605377,2017-11-01,102,89,2,[{'album_name': 'Never Seen Runaway (Lost Soul...,18,25180469,64,
1,party !!,false,332166,2017-11-01,120,101,2,"[{'album_name': 'ANTI', 'album_uri': 'spotify:...",37,27526989,78,
2,Indie,false,93216,2017-11-01,120,88,1,"[{'album_name': 'Need Your Light', 'album_uri'...",32,25299899,79,
3,new,false,180293,2017-11-01,202,187,1,"[{'album_name': 'Lake Superior', 'album_uri': ...",129,52349366,173,
4,Spooky,false,525040,2017-11-01,28,19,2,"[{'album_name': 'Beetlejuice', 'album_uri': 's...",6,5394942,21,
...,...,...,...,...,...,...,...,...,...,...,...,...
24995,jamz,false,626336,2017-10-31,123,90,1,"[{'album_name': 'BEYONCÉ [Platinum Edition]', ...",34,27330388,65,
24996,eletronic,false,912587,2017-10-31,16,16,1,"[{'album_name': 'Can't Fight It', 'album_uri':...",12,4075514,15,
24997,Feelin Good,false,691796,2017-10-31,9,9,1,"[{'album_name': 'You Can Tune a Piano, But You...",7,2446943,8,
24998,Gym,false,773851,2017-10-31,125,106,1,"[{'album_name': 'Call Me a Spaceman', 'album_u...",74,30168892,82,


In [7]:
# Dictionary to hold track_uri as key and playlist pids & track details as values
track_details_dict = defaultdict(lambda: {'inside_playlists': [], 'track_info': {}})

# Iterate through each playlist to populate the dictionary
for _, playlist in playlists_df.iterrows():
    pid = playlist['pid']  # Correctly access pid from current row
    for track in playlist['tracks']:
        track_uri = track['track_uri']
        track_details_dict[track_uri]['inside_playlists'].append(pid)
        # Store track details if not already stored
        if not track_details_dict[track_uri]['track_info']:
            track_details_dict[track_uri]['track_info'] = track

# Convert dictionary to DataFrame
track_details_df = pd.DataFrame([
    {'track_uri': track_uri, **details['track_info'], 'inside_playlists': details['inside_playlists']}
    for track_uri, details in track_details_dict.items()
])

# Convert DataFrame to PyArrow Table
track_details_table = pa.Table.from_pandas(track_details_df)

# Save the track details to a parquet file
pq.write_table(track_details_table, 'spotify_track_details.parquet')

print("Track details saved successfully!")

Track details saved successfully!


In [8]:
pd.read_parquet('spotify_track_details.parquet')

Unnamed: 0,track_uri,album_name,album_uri,artist_name,artist_uri,duration_ms,pos,track_name,inside_playlists
0,spotify:track:20KUzCMs22osRkavPuShDl,Never Seen Runaway (Lost Soul & Genjo Mix),spotify:album:4WcGBbSbdkMZlxF6utjmIh,Jay Kill & The Hustle Standard,spotify:artist:26hpsv5zh8ETviLJC8sasT,244029,0,Never Seen Runaway (Lost Soul & Genjo Mix),"[605377, 224893]"
1,spotify:track:1rIQU62pjgoz3sFgVo3j5Q,Born Ready (Motivational Speeches Gym & Workout),spotify:album:0dl53H4UUDyrqDSh2lGcQZ,Fearless Motivation,spotify:artist:1FhamVtJlNqaekPnwxQpbk,288057,1,Stick It out so You Can Say I Made It (Motivat...,"[605377, 768281]"
2,spotify:track:3ejrk4i0LWIofaaBEmwX0A,Stay Schemin,spotify:album:75lbUHs6pIRT3OoB4J2LCx,Rick Ross,spotify:artist:1sBkRIssrMs1AbVkOJbc7a,267720,2,Stay Schemin,"[605377, 881984, 419793, 138838, 533450, 48115..."
3,spotify:track:1zfEeGXv4M5HkZTK4PX8gQ,Church In These Streets,spotify:album:5RHakS5UXhhnyT96maP8ic,Jeezy,spotify:artist:4yBK75WVCQXej1p04GWqxH,231746,3,Just Win,[605377]
4,spotify:track:5I6YA7oHtraxwWQzzCugDt,There Is No Competition 2: The Grieving Music ...,spotify:album:3BBRkEHu6nmOVs7juIJ1KN,Fabolous,spotify:artist:0YWxKQj2Go9CGHCp77UOyy,209693,4,You Be Killin Em,"[605377, 818482, 419793, 506940, 941636, 66762..."
...,...,...,...,...,...,...,...,...,...
286918,spotify:track:2NqgPxYL2JY5f8Lx5XUE7j,Dreamchasers,spotify:album:33Uf0G89S8CulADPn7a8zZ,DJ Drama,spotify:artist:5oNgAs7j5XcBMzWv3HAnHG,140409,110,Im On One,[773851]
286919,spotify:track:3gcBEbI8fg64IeyyNPQWrB,Roadkill,spotify:album:68EZ1UZKRPZm19Ay9LYgwg,Dubfire,spotify:artist:3bVYqr2NfmwmL4YJisWhJI,469493,113,Roadkill - Original Club Mix,[773851]
286920,spotify:track:7qm884sKOAoF5AuZUgBUfa,Barrio Fino (Bonus Track Version),spotify:album:5i2HhIrYyMNQvmsBk9h8Im,Daddy Yankee,spotify:artist:4VMYDCV2IEDYJArk749S6m,186133,120,Golpe de Estado,[773851]
286921,spotify:track:2QFtrrayGZ1wKEMQ9bMZE2,Alive,spotify:album:5UA9jSWeH1SfSYG8Fsq4oh,VAN HØF,spotify:artist:63MsklKb1ks7r8zMzuJcsn,301613,40,Alive,[436394]
