In [13]:
import pandas as pd
import os
from sklearn.preprocessing import LabelEncoder

def combine_and_encode_tracks(tracks_folder, tracks_output_csv, playlist_mapping=None):
    """
    Combine every track Parquet file in a folder into one DataFrame, add an integer
    'track_idx' column encoded from 'track_uri', and write the result to CSV.

    Parameters
    ----------
    tracks_folder : str
        Directory scanned (non-recursively) for files whose name ends in ".parquet".
    tracks_output_csv : str
        Path the combined DataFrame is written to (the index is not written).
    playlist_mapping : dict, optional
        Mapping from pid to playlist_idx. When it is given and the combined data has
        an 'inside_playlists' column, each list of pids in that column is replaced by
        the mapped playlist_idx values (-1 for pids missing from the mapping), with
        duplicates removed and first-occurrence order preserved.

    Returns
    -------
    tuple
        (combined DataFrame, dict mapping track_uri -> track_idx). When the folder
        holds no Parquet files a message is printed, nothing is written, and
        (None, {}) is returned.
    """
    # os.listdir returns names in arbitrary order; sort them so the row order of the
    # combined DataFrame (and therefore of the CSV) is reproducible across runs.
    parquet_files = sorted(
        name for name in os.listdir(tracks_folder) if name.endswith(".parquet")
    )
    if not parquet_files:
        print("No Parquet files found in the tracks folder.")
        return None, {}

    frames = [pd.read_parquet(os.path.join(tracks_folder, name)) for name in parquet_files]
    combined_df = pd.concat(frames, ignore_index=True)

    encoder = LabelEncoder()
    combined_df['track_idx'] = encoder.fit_transform(combined_df['track_uri'])

    # Translate 'inside_playlists' only when a mapping was supplied and the column exists.
    if playlist_mapping is not None and 'inside_playlists' in combined_df.columns:
        def transform_inside_playlists(pid_list):
            # Unknown pids become -1; dict.fromkeys drops repeats while keeping
            # the order in which each value first appeared.
            mapped = (playlist_mapping.get(pid, -1) for pid in pid_list)
            return list(dict.fromkeys(mapped))

        combined_df['inside_playlists'] = combined_df['inside_playlists'].apply(
            transform_inside_playlists
        )

    combined_df.to_csv(tracks_output_csv, index=False)
    print(f"Combined tracks CSV saved to {tracks_output_csv}")
    track_mapping = dict(zip(combined_df['track_uri'], combined_df['track_idx']))
    return combined_df, track_mapping

def process_playlist_tracks(df, track_mapping):
    """
    Rewrite the 'tracks' column of a playlist DataFrame in place.

    Every cell of 'tracks' is expected to be an iterable of dictionaries carrying at least:
        - 'track_uri': the identifier for the track.
        - 'pos': the position of the track in the playlist.

    Each cell is replaced by a dictionary whose keys are the stringified 'pos' values
    and whose values are the stringified track index looked up in track_mapping
    ('-1' when the track_uri is not in the mapping).

    The same DataFrame object is returned; it is returned untouched when it has no
    'tracks' column.
    """
    if 'tracks' not in df.columns:
        return df

    def positions_to_indices(entries):
        # Build {position: track index} for a single playlist; both sides are strings.
        encoded = {}
        for entry in entries:
            position = str(entry['pos'])
            encoded[position] = str(track_mapping.get(entry['track_uri'], -1))
        return encoded

    df['tracks'] = df['tracks'].apply(positions_to_indices)
    return df

def combine_and_encode_playlists(playlists_folder, playlists_output_csv):
    """
    Combine every playlist Parquet file in a folder into one DataFrame, add an integer
    'playlist_idx' column encoded from the primary key 'pid', and write the result to CSV.

    The 'tracks' column is NOT modified here: this function has no track mapping.
    Use process_playlist_tracks on the returned DataFrame once a track mapping exists.

    Parameters
    ----------
    playlists_folder : str
        Directory scanned (non-recursively) for files whose name ends in ".parquet".
    playlists_output_csv : str
        Path the combined DataFrame is written to (the index is not written).

    Returns
    -------
    tuple
        (combined DataFrame, dict mapping pid -> playlist_idx). When the folder
        holds no Parquet files a message is printed, nothing is written, and
        (None, {}) is returned.
    """
    # os.listdir returns names in arbitrary order; sort them so the row order of the
    # combined DataFrame (and therefore of the CSV) is reproducible across runs.
    parquet_files = sorted(
        name for name in os.listdir(playlists_folder) if name.endswith(".parquet")
    )
    if not parquet_files:
        print("No Parquet files found in the playlists folder.")
        return None, {}

    frames = [pd.read_parquet(os.path.join(playlists_folder, name)) for name in parquet_files]
    combined_df = pd.concat(frames, ignore_index=True)

    # Fit the encoder on the combined data so indices are consistent across all files.
    encoder = LabelEncoder()
    combined_df['playlist_idx'] = encoder.fit_transform(combined_df['pid'])

    combined_df.to_csv(playlists_output_csv, index=False)
    print(f"Combined playlists CSV saved to {playlists_output_csv}")
    playlist_mapping = dict(zip(combined_df['pid'], combined_df['playlist_idx']))
    return combined_df, playlist_mapping

# Example usage: build both CSVs and cross-reference them through the encoded indices.

# Input folders and output files used by the steps below.
playlists_folder = "parquet datasets/playlists"
playlists_output_csv = "playlists.csv"
tracks_folder = "parquet datasets/tracks"
tracks_output_csv = "tracks.csv"

# Step 1: playlists come first, because the pid -> playlist_idx mapping they yield
# is an input to the tracks step.
playlists_df, playlist_mapping = combine_and_encode_playlists(
    playlists_folder, playlists_output_csv
)

# Step 2: tracks; playlist_mapping is passed so 'inside_playlists' gets translated.
tracks_df, track_mapping = combine_and_encode_tracks(
    tracks_folder, tracks_output_csv, playlist_mapping
)

# Step 3: with the track mapping available, rewrite each playlist's 'tracks' column
# as position -> track_idx and overwrite the playlists CSV written in step 1.
if playlists_df is not None:
    playlists_df = process_playlist_tracks(playlists_df, track_mapping)
    playlists_df.to_csv(playlists_output_csv, index=False)
    print("Updated playlists CSV with processed 'tracks' column saved.")

Combined playlists CSV saved to playlists.csv
Combined tracks CSV saved to tracks.csv
Updated playlists CSV with processed 'tracks' column saved.


In [14]:
# Read the playlists CSV back from disk to check what was actually written.
playlists = pd.read_csv(playlists_output_csv)
print(playlists.shape[0])  # number of playlist rows
playlists.head()

20000


Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description,playlist_idx
0,Winter 16/17,False,656605,2017-10-31,116,115,1,"{'0': '56159', '1': '158741', '2': '130035', '...",84,27464949,112,,13122
1,Country,False,43568,2017-10-31,193,114,1,"{'0': '220590', '1': '199180', '2': '108648', ...",73,42549856,68,,885
2,FALL,False,892061,2017-10-31,51,48,2,"{'0': '167186', '1': '31002', '2': '117854', '...",34,10589174,47,,17788
3,happy mix,False,284411,2017-10-31,75,72,1,"{'0': '217906', '1': '95923', '2': '206234', '...",60,33358703,66,,5836
4,Hype it up,False,44918,2017-10-31,114,83,1,"{'0': '36171', '1': '75724', '2': '68192', '3'...",57,27651175,52,,924


In [15]:
# Read the tracks CSV back from disk to check what was actually written.
tracks = pd.read_csv(tracks_output_csv)
print(tracks.shape[0])  # number of track rows
tracks.head()

252236


Unnamed: 0,track_uri,album_name,album_uri,artist_name,artist_uri,duration_ms,pos,track_name,inside_playlists,track_idx
0,spotify:track:2sywvGUBuSuYICHHKOElSn,Knock Madness,spotify:album:16SMGvTZrwtiJ132g69toP,Hopsin,spotify:artist:7EWU4FhUJM1sZQgQKdENeT,211573,106,Nollie Tre Flip,"[9885, 11157, 4959, 18044]",93755
1,spotify:track:7n1y18wPhZv5YxpR6nFBck,Knock Madness,spotify:album:16SMGvTZrwtiJ132g69toP,Hopsin,spotify:artist:7EWU4FhUJM1sZQgQKdENeT,257600,107,I Need Help,"[9885, 774, 11157, 16421, 15329, 17621, 2799, ...",245549
2,spotify:track:1zZXXOfcYKJbUh494VmGVB,Knock Madness,spotify:album:16SMGvTZrwtiJ132g69toP,Hopsin,spotify:artist:7EWU4FhUJM1sZQgQKdENeT,231493,108,Rip Your Heart Out (feat. Tech N9ne),"[9885, 16421, 3046, 5920, 1662, 17621, 14239, ...",64890
3,spotify:track:4lDRCZho3GlmRqOJRjmLfE,Bout the Business,spotify:album:6kVdsufIniV3hlXSxKxax2,Hopsin,spotify:artist:7EWU4FhUJM1sZQgQKdENeT,272360,110,Bout the Business,"[9885, 774, 4577, 19798, 19041, 12834, 12320, ...",154041
4,spotify:track:5wWDfRF7NQKMx8ZPfrhBwa,Funk Volume 2013 - Single,spotify:album:6pCCQ7R7J4N2femfQVbFg2,Hopsin,spotify:artist:7EWU4FhUJM1sZQgQKdENeT,313195,112,Funk Volume 2013,"[9885, 5920, 17092]",192527
