In [2]:
import pandas as pd
import glob
import json
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

In [3]:
# Path to your JSON files
path = '../spotify_million_playlist_dataset/data/*.json'

# Function to read a single JSON file
def read_json_file(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
        return data['playlists']

# Use ThreadPoolExecutor to read files in parallel
all_files = glob.glob(path)
with ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(read_json_file, all_files)

data_list = [item for sublist in results for item in sublist]

# Convert to DataFrame
df = pd.DataFrame(data_list)
df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Bob Dylan,false,549000,1454803200,75,65,1,"[{'pos': 0, 'artist_name': 'Bob Dylan', 'track...",28,18425368,39,
1,ON THE RUN,false,549001,1419552000,61,30,1,"[{'pos': 0, 'artist_name': 'JAY Z', 'track_uri...",4,15224873,5,
2,Biking,false,549002,1435708800,144,117,1,"[{'pos': 0, 'artist_name': 'Sander van Doorn',...",12,32723071,72,
3,August,false,549003,1505001600,38,32,1,"[{'pos': 0, 'artist_name': 'C-Trox', 'track_ur...",20,8439352,33,
4,run mix,false,549004,1459555200,16,15,1,"[{'pos': 0, 'artist_name': 'Buzzcocks', 'track...",3,3804816,12,
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,Hip Hop and R&B,false,302995,1465344000,83,65,1,"[{'pos': 0, 'artist_name': 'The Notorious B.I....",5,22126318,34,
999996,MAY,false,302996,1464393600,21,19,2,"[{'pos': 0, 'artist_name': 'Nebu Kiniza', 'tra...",6,4725029,18,
999997,Road Trip,false,302997,1455840000,137,106,1,"[{'pos': 0, 'artist_name': 'Sheppard', 'track_...",2,30219710,71,
999998,Guilty Pleasure,false,302998,1503705600,223,209,2,"[{'pos': 0, 'artist_name': 'Bernhoft', 'track_...",57,50603668,185,


In [4]:
# #convert to parquet
# selected = pa.Table.from_pandas(df)
# pq.write_table(selected, '1mil_spotify_playlists.parquet')
# print("Full playlists saved successfully!")

# # Load parquet file into DataFrame
# df = pd.read_parquet('1mil_spotify_playlists.parquet')
# df

Filter

In [9]:
# Filter by collaboration == False
filtered_df = df[df['collaborative'] == 'false'].copy()

# Sort by most recently updated date
filtered_df['modified_at'] = pd.to_datetime(filtered_df['modified_at'], unit='s')
sorted_df = filtered_df.sort_values(by='modified_at', ascending=False)

# Select top k playlists
num_of_playlist = 200000
selected_playlists = sorted_df.head(num_of_playlist)

# Determine the split sizes
num_splits = 15
split_size = len(selected_playlists) // num_splits

# Splitting the dataframe into 15 roughly equal parts
splits = [
    selected_playlists.iloc[i*split_size : (i+1)*split_size] if i < num_splits - 1 else selected_playlists.iloc[i*split_size:]
    for i in range(num_splits)
]

# Saving each split into separate parquet files
for i, split_df in enumerate(splits, start=1):
    split_table = pa.Table.from_pandas(split_df)
    pq.write_table(split_table, f'filtered_spotify_playlists_part{i}.parquet')
    print(f"Part {i} saved successfully!")


Part 1 saved successfully!
Part 2 saved successfully!
Part 3 saved successfully!
Part 4 saved successfully!
Part 5 saved successfully!
Part 6 saved successfully!
Part 7 saved successfully!
Part 8 saved successfully!
Part 9 saved successfully!
Part 10 saved successfully!
Part 11 saved successfully!
Part 12 saved successfully!
Part 13 saved successfully!
Part 14 saved successfully!
Part 15 saved successfully!


In [10]:
# Path pattern to read all parquet parts
parquet_files = glob.glob('filtered_spotify_playlists_part*.parquet')

# Read and concatenate all parquet files into one DataFrame
playlists_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# View combined dataframe
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Work Out Music,false,584466,2017-10-22,79,56,1,"[{'album_name': 'Dangerous Woman', 'album_uri'...",25,17114509,46,
1,2017,false,287290,2017-10-22,94,71,1,"[{'album_name': 'We Are The Wrecks', 'album_ur...",58,20143378,59,
2,Post-Rock,false,612105,2017-10-22,49,35,1,"[{'album_name': 'Avoid the Light', 'album_uri'...",23,23846185,24,
3,Vibes,false,241594,2017-10-22,182,100,2,"[{'album_name': 'Yamborghini High', 'album_uri...",44,42089629,60,
4,Happy,false,389065,2017-10-22,96,77,1,"[{'album_name': 'Simple Pleasures', 'album_uri...",26,20639281,65,
...,...,...,...,...,...,...,...,...,...,...,...,...
199995,Worship,false,158433,2017-10-27,157,36,1,"[{'album_name': 'This Is Living', 'album_uri':...",32,54224089,17,
199996,Christian,false,692845,2017-10-27,191,77,1,"[{'album_name': 'Get Ready', 'album_uri': 'spo...",48,56743086,44,
199997,Chillax,false,547030,2017-10-27,104,83,1,"[{'album_name': 'Barry Dransfield', 'album_uri...",61,25495205,73,
199998,Just Good Music,false,692830,2017-10-27,114,69,1,[{'album_name': 'Britney Jean (Deluxe Version)...,36,24191843,54,


In [11]:
# Dictionary to hold track_uri as key and playlist pids & track details as values
track_details_dict = defaultdict(lambda: {'inside_playlists': [], 'track_info': {}})

# Iterate through each playlist to populate the dictionary
for _, playlist in playlists_df.iterrows():
    pid = playlist['pid']  # Correctly access pid from current row
    for track in playlist['tracks']:
        track_uri = track['track_uri']
        track_details_dict[track_uri]['inside_playlists'].append(pid)
        # Store track details if not already stored
        if not track_details_dict[track_uri]['track_info']:
            track_details_dict[track_uri]['track_info'] = track

# Convert dictionary to DataFrame
track_details_df = pd.DataFrame([
    {'track_uri': track_uri, **details['track_info'], 'inside_playlists': details['inside_playlists']}
    for track_uri, details in track_details_dict.items()
])

# Splitting track details dataframe into 2 roughly equal parts
num_splits_tracks = 2
track_split_size = len(track_details_df) // num_splits_tracks
track_splits = [
    track_details_df.iloc[i*track_split_size : (i+1)*track_split_size] if i < num_splits_tracks - 1 else track_details_df.iloc[i*track_split_size:]
    for i in range(num_splits_tracks)
]

# Saving each track detail split into separate parquet files
for i, split_df in enumerate(track_splits, start=1):
    split_table = pa.Table.from_pandas(split_df)
    pq.write_table(split_table, f'spotify_track_details_part{i}.parquet')
    print(f"Track details part {i} saved successfully!")


Track details part 1 saved successfully!
Track details part 2 saved successfully!


In [13]:
# Path pattern to read all parquet parts
parquet_files = glob.glob('spotify_track_details_part*.parquet')

# Read and concatenate all parquet files into one DataFrame
tracks_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# View combined dataframe
tracks_df

Unnamed: 0,track_uri,album_name,album_uri,artist_name,artist_uri,duration_ms,pos,track_name,inside_playlists
0,spotify:track:4pLwZjInHj3SimIyN9SnOz,Dangerous Woman,spotify:album:4lVR2fg3DAUQpGVJ6DciHW,Ariana Grande,spotify:artist:66CXWjxzNUsdJxJ2JdwvnR,226160,0,Side To Side,"[584466, 290613, 339107, 923257, 629621, 77242..."
1,spotify:track:37f4ITSlgPX81ad2EvmVQr,Wildfire,spotify:album:0mFDIOqypzHp6Xd0el1hoT,Rachel Platten,spotify:artist:3QLIkT4rD2FMusaqmkepbq,204013,1,Fight Song,"[584466, 290613, 540271, 809633, 47448, 719590..."
2,spotify:track:6i0V12jOa3mr6uu4WYhUBr,Heathens,spotify:album:3J8W9AOjQhnBLCX33m3atT,Twenty One Pilots,spotify:artist:3YQKmKGau1PzlVlkL1iodx,195920,2,Heathens,"[584466, 290613, 339107, 396751, 145671, 23842..."
3,spotify:track:2DpCdPMg1BADE4HDnxt3Rd,"Sit Still, Look Pretty",spotify:album:2cE2eOy7alOZHpuelJEV8Q,Daya,spotify:artist:6Dd3NScHWwnW6obMFbl1BH,202226,3,"Sit Still, Look Pretty","[584466, 414677, 809633, 601812, 971763, 34065..."
4,spotify:track:6Knv6wdA0luoMUuuoYi2i1,MY HOUSE,spotify:album:5lkNnHVlnCCCV304t89wOH,Flo Rida,spotify:artist:0jnsk9HBra6NMjO2oANoPY,192190,4,My House,"[584466, 290613, 968716, 339107, 145671, 23842..."
...,...,...,...,...,...,...,...,...,...
929587,spotify:track:5iWGY5orVRvEKsZoD9Uk5b,Hubert Parry: Songs of Farewell,spotify:album:5U09crMp3cVi8Y2ckS9G3d,Ralph Vaughan Williams,spotify:artist:7wNkISK49lKeXuRaZcQVFe,198040,90,The Turtle Dove,[547030]
929588,spotify:track:2TGilZTUy8oRdpbtkoTSNG,Chasing the Dawn,spotify:album:3DahB7tw1jvl4ZggK6MdEM,Jenny Crook and Henry Sears,spotify:artist:5IS01g8VDKZ5bLwl9LbIX5,255746,91,Ten Thousand Miles,[547030]
929589,spotify:track:3CrbGwc6dR9CM5C9QT9c8Z,Autumn Sky,spotify:album:2Uut8UduaobeBZSSLBG4Qs,Blackmore's Night,spotify:artist:5BnNG7QQQDTNd43ozx5y13,259079,98,Health To The Company,[547030]
929590,spotify:track:1TNkOCvbb2pCoLaBRjE7ht,Here for You,spotify:album:5yc4cWS9VOAjJgR2YkZ2pO,Maraaya,spotify:artist:6IVSIh1lrxBjXZdovPbSdN,203907,24,Here for You,[692830]
