In [2]:
import ast
from pathlib import Path

import pandas as pd

# Load Datasets

In [7]:
# Read only the columns this notebook needs from each cleaned Parquet file.
playlist_path = "../phase2_data_cleaning/cleaned dataset/playlist_cleaned.parquet"
track_path = "../phase2_data_cleaning/cleaned dataset/track_cleaned.parquet"

# Playlist table: the playlist index plus its 'tracks' column.
playlist = pd.read_parquet(
    playlist_path,
    columns=["playlist_idx", "tracks"],
)

# Track table: the track index and its URI.
track = pd.read_parquet(
    track_path,
    columns=["track_idx", "track_uri"],
)

# Select the first 1000 playlists

In [8]:
# Keep the first 1000 rows of `playlist` (positional slice, row order unchanged).
playlist_selected = playlist.iloc[:1000]

# Select the unique tracks in the 1000 playlists

In [9]:
# Collect every distinct track index referenced by the selected playlists.
unique_values = set()

for raw_tracks in playlist_selected["tracks"]:
    # Each cell holds the text form of a Python literal; parse it without eval().
    parsed_tracks = ast.literal_eval(raw_tracks)

    # Anything that does not parse to a dict contributes nothing.
    if not isinstance(parsed_tracks, dict):
        continue

    # The dict's values are the track indices; coerce each one to int.
    for raw_idx in parsed_tracks.values():
        unique_values.add(int(raw_idx))

# One-column frame of the indices, ascending, with a fresh 0..n-1 row index.
unique_df = (
    pd.DataFrame(unique_values, columns=["track_idx"])
    .sort_values(by="track_idx", ascending=True)
    .reset_index(drop=True)
)

# Show the result
print(unique_df)

       track_idx
0              1
1              5
2             15
3             36
4             38
...          ...
37437     252207
37438     252209
37439     252210
37440     252213
37441     252216

[37442 rows x 1 columns]


# Look up the `track_uri` of each selected track in the Track dataset

In [10]:
# Track indices that appear in the selected playlists
unique_track_idx = unique_df['track_idx']

# Boolean mask over `track`: True where the row's 'track_idx' is one of those indices
is_selected = track['track_idx'].isin(unique_track_idx)

# Keep only the matching rows (both columns of `track` are retained)
filtered_track = track.loc[is_selected]

# Show the result
print(filtered_track)

        track_idx                             track_uri
1               1  spotify:track:000GjfnQc7ggBayDiy1sLW
5               5  spotify:track:000xQL6tZNLJzIrtIgxqSl
15             15  spotify:track:002opcRBgYV5jqoh72QcqA
36             36  spotify:track:005X0FmdtkM1kiutosXLTR
38             38  spotify:track:005drRcJJFgFHgtxSJJO0v
...           ...                                   ...
252207     252207  spotify:track:7zwtlwZslYsynnoDxH7lOR
252209     252209  spotify:track:7zx34MQjW5Svvltow5EzsR
252210     252210  spotify:track:7zx5GJLsFcKpXfvFAIioqI
252213     252213  spotify:track:7zxRMhXxJMQCeDDg0rKAVo
252216     252216  spotify:track:7zxhhMt0j1JV7OuFpQ2Boe

[37442 rows x 2 columns]


# Save files

In [11]:
# Write the sampled playlist and track tables to `datasets/` as Parquet files.

# Make sure the output folder exists so the writes below do not fail on a
# fresh checkout where `datasets/` has not been created yet.
Path("datasets").mkdir(parents=True, exist_ok=True)

# Drop 'tracks' into a NEW frame instead of rebinding `playlist_selected`.
# Rebinding made this cell non-idempotent: running it a second time raised
# KeyError on the drop, and re-running the earlier cell that reads
# playlist_selected["tracks"] failed for the same reason.
playlist_output = playlist_selected.drop(columns=['tracks'])

playlist_output.to_parquet("datasets/1000_playlist.parquet", index=False)
filtered_track.to_parquet("datasets/1000_track.parquet", index=False)