In [2]:
import pandas as pd
import glob
import json
import pyarrow as pa
import pyarrow.parquet as pq
from concurrent.futures import ThreadPoolExecutor
from collections import defaultdict

In [6]:
# Path to your JSON files
path = '../spotify_million_playlist_dataset/data/*.json'

# Function to read a single JSON file
def read_json_file(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
        return data['playlists']

# Use ThreadPoolExecutor to read files in parallel
all_files = glob.glob(path)
with ThreadPoolExecutor(max_workers=10) as executor:
    results = executor.map(read_json_file, all_files)

data_list = [item for sublist in results for item in sublist]

# Convert to DataFrame
df = pd.DataFrame(data_list)
df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Bob Dylan,false,549000,1454803200,75,65,1,"[{'pos': 0, 'artist_name': 'Bob Dylan', 'track...",28,18425368,39,
1,ON THE RUN,false,549001,1419552000,61,30,1,"[{'pos': 0, 'artist_name': 'JAY Z', 'track_uri...",4,15224873,5,
2,Biking,false,549002,1435708800,144,117,1,"[{'pos': 0, 'artist_name': 'Sander van Doorn',...",12,32723071,72,
3,August,false,549003,1505001600,38,32,1,"[{'pos': 0, 'artist_name': 'C-Trox', 'track_ur...",20,8439352,33,
4,run mix,false,549004,1459555200,16,15,1,"[{'pos': 0, 'artist_name': 'Buzzcocks', 'track...",3,3804816,12,
...,...,...,...,...,...,...,...,...,...,...,...,...
999995,Hip Hop and R&B,false,302995,1465344000,83,65,1,"[{'pos': 0, 'artist_name': 'The Notorious B.I....",5,22126318,34,
999996,MAY,false,302996,1464393600,21,19,2,"[{'pos': 0, 'artist_name': 'Nebu Kiniza', 'tra...",6,4725029,18,
999997,Road Trip,false,302997,1455840000,137,106,1,"[{'pos': 0, 'artist_name': 'Sheppard', 'track_...",2,30219710,71,
999998,Guilty Pleasure,false,302998,1503705600,223,209,2,"[{'pos': 0, 'artist_name': 'Bernhoft', 'track_...",57,50603668,185,


In [7]:
# #convert to parquet
# selected = pa.Table.from_pandas(df)
# pq.write_table(selected, '1mil_spotify_playlists.parquet')
# print("Full playlists saved successfully!")

# # Load parquet file into DataFrame
# df = pd.read_parquet('1mil_spotify_playlists.parquet')
# df

Filter

In [8]:
# Filter by collaboration == False
filtered_df = df[df['collaborative'] == 'false'].copy()

# Sort by most recently updated date
filtered_df['modified_at'] = pd.to_datetime(filtered_df['modified_at'], unit='s')
sorted_df = filtered_df.sort_values(by='modified_at', ascending=False)

# Select top k playlists
num_of_playlist = 20000
selected_playlists = sorted_df.head(num_of_playlist)


In [9]:
# Determine the split sizes
num_splits = 10
split_size = len(selected_playlists) // num_splits

# Splitting the dataframe into 15 roughly equal parts
splits = [
    selected_playlists.iloc[i*split_size : (i+1)*split_size] if i < num_splits - 1 else selected_playlists.iloc[i*split_size:]
    for i in range(num_splits)
]

# Saving each split into separate parquet files
for i, split_df in enumerate(splits, start=1):
    split_table = pa.Table.from_pandas(split_df)
    pq.write_table(split_table, f'filtered_spotify_playlists_part{i}.parquet')
    print(f"Part {i} saved successfully!")

Part 1 saved successfully!
Part 2 saved successfully!
Part 3 saved successfully!
Part 4 saved successfully!
Part 5 saved successfully!
Part 6 saved successfully!
Part 7 saved successfully!
Part 8 saved successfully!
Part 9 saved successfully!
Part 10 saved successfully!


In [5]:
# Path pattern to read all parquet files in the folder
parquet_files = glob.glob('playlists/*.parquet')

# Read and concatenate all parquet files into one DataFrame
playlists_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# View combined DataFrame
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Winter 16/17,false,656605,2017-10-31,116,115,1,"[{'album_name': 'With Me', 'album_uri': 'spoti...",84,27464949,112,
1,Country,false,43568,2017-10-31,193,114,1,"[{'album_name': 'Golden Road', 'album_uri': 's...",73,42549856,68,
2,FALL,false,892061,2017-10-31,51,48,2,"[{'album_name': 'Let Me Go (with Alesso, Flori...",34,10589174,47,
3,happy mix,false,284411,2017-10-31,75,72,1,"[{'album_name': 'Blue Empire', 'album_uri': 's...",60,33358703,66,
4,Hype it up,false,44918,2017-10-31,114,83,1,"[{'album_name': '2014 Forest Hills Drive', 'al...",57,27651175,52,
...,...,...,...,...,...,...,...,...,...,...,...,...
19995,HypeBeast,false,805044,2017-10-31,159,113,3,"[{'album_name': 'Painting Pictures', 'album_ur...",37,32453982,67,
19996,chillin,false,805043,2017-10-31,41,39,1,"[{'album_name': 'Virile', 'album_uri': 'spotif...",18,10950319,36,
19997,Rap,false,206020,2017-10-31,56,44,1,"[{'album_name': 'Ill Mind of Hopsin 8', 'album...",40,12831434,36,
19998,Free Spirit,false,206053,2017-10-31,53,38,1,"[{'album_name': 'Gulag Orkestar', 'album_uri':...",6,12893343,26,


In [11]:
# Dictionary to hold track_uri as key and playlist pids & track details as values
track_details_dict = defaultdict(lambda: {'inside_playlists': [], 'track_info': {}})

# Iterate through each playlist to populate the dictionary
for _, playlist in playlists_df.iterrows():
    pid = playlist['pid']  # Correctly access pid from current row
    for track in playlist['tracks']:
        track_uri = track['track_uri']
        track_details_dict[track_uri]['inside_playlists'].append(pid)
        # Store track details if not already stored
        if not track_details_dict[track_uri]['track_info']:
            track_details_dict[track_uri]['track_info'] = track

# Convert dictionary to DataFrame
track_details_df = pd.DataFrame([
    {'track_uri': track_uri, **details['track_info'], 'inside_playlists': details['inside_playlists']}
    for track_uri, details in track_details_dict.items()
])


In [12]:
# Splitting track details dataframe
num_splits_tracks = 5
track_split_size = len(track_details_df) // num_splits_tracks
track_splits = [
    track_details_df.iloc[i*track_split_size : (i+1)*track_split_size] if i < num_splits_tracks - 1 else track_details_df.iloc[i*track_split_size:]
    for i in range(num_splits_tracks)
]

# Saving each track detail split into separate parquet files
for i, split_df in enumerate(track_splits, start=1):
    split_table = pa.Table.from_pandas(split_df)
    pq.write_table(split_table, f'spotify_track_details_part{i}.parquet')
    print(f"Track details part {i} saved successfully!")

Track details part 1 saved successfully!
Track details part 2 saved successfully!
Track details part 3 saved successfully!
Track details part 4 saved successfully!
Track details part 5 saved successfully!


In [7]:
# Path pattern to read all parquet files in the folder
parquet_files = glob.glob('tracks/*.parquet')

# Read and concatenate all parquet files into one DataFrame
tracks_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# View combined DataFrame
tracks_df[tracks_df['track_uri'] == 'spotify:track:6oBWomqWwisfkdCeCZYD7V']

Unnamed: 0,track_uri,album_name,album_uri,artist_name,artist_uri,duration_ms,pos,track_name,inside_playlists
101010,spotify:track:6oBWomqWwisfkdCeCZYD7V,Golden Road,spotify:album:56k96UDsZd4nBpNZrY5fOF,Keith Urban,spotify:artist:0u2FHSq3ln94y5Q57xazwf,323040,0,Somebody Like You,"[43568, 43568, 43568, 91605, 943522, 161559, 8..."
