In [36]:
import pandas as pd
import numpy as np
import ast

# Load Datasets

In [65]:
playlist = pd.read_parquet("../phase2_data_cleaning/cleaned dataset/playlist_cleaned.parquet", columns=["playlist_idx", "tracks"])
track = pd.read_parquet("../phase2_data_cleaning/cleaned dataset/track_cleaned.parquet", columns=["track_idx", "track_uri"])

# Select the first 1000 playlists

In [66]:
playlist_selected = playlist.head(1000)

# Select the unique tracks in the 1000 playlists

In [67]:
# Extract unique values from the 'tracks' column
unique_values = set()

# Iterate over the rows in the 'tracks' column
for track_str in playlist_selected["tracks"]:
    # Convert the string representation to a dictionary
    track_dict = ast.literal_eval(track_str)
    
    if isinstance(track_dict, dict):  # Ensure it's a dictionary
        # Add integer values to the set
        unique_values.update(int(value) for value in track_dict.values())

# Convert to DataFrame with 'track_idx' as integers
unique_df = pd.DataFrame(unique_values, columns=["track_idx"])
unique_df = unique_df.sort_values(by="track_idx", ascending=True).reset_index(drop=True)

# Display the DataFrame
print(unique_df)

       track_idx
0              1
1              5
2             15
3             36
4             38
...          ...
37437     252207
37438     252209
37439     252210
37440     252213
37441     252216

[37442 rows x 1 columns]


# Merge with the `track_uri` in the Track dataset

In [68]:
# Extract the unique 'track_idx' from the unique_df
unique_track_idx = unique_df['track_idx']

# Filter the 'track' DataFrame based on whether 'track_idx' is in the unique values
filtered_track = track[track['track_idx'].isin(unique_track_idx)]

# Display the filtered DataFrame
print(filtered_track)

        track_idx                             track_uri
1               1  spotify:track:000GjfnQc7ggBayDiy1sLW
5               5  spotify:track:000xQL6tZNLJzIrtIgxqSl
15             15  spotify:track:002opcRBgYV5jqoh72QcqA
36             36  spotify:track:005X0FmdtkM1kiutosXLTR
38             38  spotify:track:005drRcJJFgFHgtxSJJO0v
...           ...                                   ...
252207     252207  spotify:track:7zwtlwZslYsynnoDxH7lOR
252209     252209  spotify:track:7zx34MQjW5Svvltow5EzsR
252210     252210  spotify:track:7zx5GJLsFcKpXfvFAIioqI
252213     252213  spotify:track:7zxRMhXxJMQCeDDg0rKAVo
252216     252216  spotify:track:7zxhhMt0j1JV7OuFpQ2Boe

[37442 rows x 2 columns]


# Save files

In [69]:
playlist_selected = playlist_selected.drop(columns=['tracks'])
playlist_selected.to_parquet("datasets/1000_playlist_idx_only.parquet", index=False)
filtered_track.to_parquet("datasets/1000_track_idx_only.parquet", index=False)

# Create Track Table using Sample

### Load Relevant Datasets

In [70]:
pop_era_duration_artist_idx = pd.read_parquet("../phase2_data_cleaning/cleaned dataset/track_cleaned.parquet", columns=["track_idx","artist_idx", "era", "category"])
sentiment = pd.read_parquet("../phase3_feature_engineering/scores/sentiment_scores_1000_subsample.parquet")
genre = pd.read_parquet("../phase3_feature_engineering/scores/genre_scores.parquet")

### Merge `Popularity`, `Era`, `Duration` first

In [71]:
# Perform the merge on 'track_idx'
track_final = pd.merge(filtered_track, pop_era_duration_artist_idx, on='track_idx', how='inner')

# Display the merged DataFrame
track_final.head()

Unnamed: 0,track_idx,track_uri,artist_idx,era,category
0,1,spotify:track:000GjfnQc7ggBayDiy1sLW,26382,2000s,Short
1,5,spotify:track:000xQL6tZNLJzIrtIgxqSl,38821,Modern Era,Medium
2,15,spotify:track:002opcRBgYV5jqoh72QcqA,19124,Modern Era,Medium
3,36,spotify:track:005X0FmdtkM1kiutosXLTR,35662,Classic Era,Short
4,38,spotify:track:005drRcJJFgFHgtxSJJO0v,43820,Modern Era,Long


### Merge Sentiment Scores

In [72]:
track_final = pd.merge(track_final, sentiment, on='track_idx', how='left')
track_final.head()

Unnamed: 0,track_idx,track_uri,artist_idx,era,category,joy,calm,sadness,fear,energizing,dreamy
0,1,spotify:track:000GjfnQc7ggBayDiy1sLW,26382,2000s,Short,,,,,,
1,5,spotify:track:000xQL6tZNLJzIrtIgxqSl,38821,Modern Era,Medium,0.152113,0.155419,0.106975,0.20176,0.196968,0.186765
2,15,spotify:track:002opcRBgYV5jqoh72QcqA,19124,Modern Era,Medium,0.162265,0.030118,0.002151,0.008668,0.429312,0.367487
3,36,spotify:track:005X0FmdtkM1kiutosXLTR,35662,Classic Era,Short,0.29355,0.11537,0.001115,0.00268,0.29522,0.292065
4,38,spotify:track:005drRcJJFgFHgtxSJJO0v,43820,Modern Era,Long,0.005309,0.018578,0.306113,0.346907,0.195283,0.127809


### Merge Genre Scores

In [73]:
track_final = pd.merge(track_final, genre, on='track_uri', how='left')
track_final.head()

Unnamed: 0,track_idx,track_uri,artist_idx,era,category,joy,calm,sadness,fear,energizing,dreamy,Instrumental / Ambient Sounds,Soft Acoustic / Classical,Orchestral / Soundtrack,Mid-tempo Pop / Indie,Upbeat Electronic / Dance,Slow & Melancholic (Sad Songs),Experimental / Jazz Fusion,Lo-Fi / Chill Vibes
0,1,spotify:track:000GjfnQc7ggBayDiy1sLW,26382,2000s,Short,,,,,,,0.683467,0.005597,0.220851,0.018938,0.000612,0.062293,0.006924,0.001318
1,5,spotify:track:000xQL6tZNLJzIrtIgxqSl,38821,Modern Era,Medium,0.152113,0.155419,0.106975,0.20176,0.196968,0.186765,0.012407,0.912539,0.010838,0.000878,0.047247,0.004436,0.002915,0.008741
2,15,spotify:track:002opcRBgYV5jqoh72QcqA,19124,Modern Era,Medium,0.162265,0.030118,0.002151,0.008668,0.429312,0.367487,0.225427,0.017463,0.579103,0.010267,0.002024,0.150953,0.012993,0.00177
3,36,spotify:track:005X0FmdtkM1kiutosXLTR,35662,Classic Era,Short,0.29355,0.11537,0.001115,0.00268,0.29522,0.292065,0.012463,0.902842,0.015791,0.00266,0.022419,0.011421,0.002254,0.030149
4,38,spotify:track:005drRcJJFgFHgtxSJJO0v,43820,Modern Era,Long,0.005309,0.018578,0.306113,0.346907,0.195283,0.127809,0.747282,0.006214,0.12077,0.033136,0.000654,0.082782,0.00713,0.002032


### Save File

In [74]:
track_final.to_parquet("datasets/1000_track_full.parquet", index=False)

# Create Playlist Table using Track Table

### Load Relevant Datasets

In [75]:
pop_era_artist = pd.read_parquet("../phase3_feature_engineering/feature engineered datasets/playlist.parquet", 
                                 columns=["playlist_idx", "track_idx_list", "popularity_var", "artist_diversity", 
                                              "early_years_proportion", "classic_era_proportion", "golden_era_proportion", "2000s_proportion", "modern_era_proportion",
                                              "short_proportion", "medium_proportion", "long_proportion"])

### Filter to only the selected 1000 playlists

In [76]:
# Filter the track_final DataFrame based on 'playlist_idx' from playlist_selected
playlist_final = pop_era_artist[pop_era_artist['playlist_idx'].isin(playlist_selected['playlist_idx'])]

# Display the filtered DataFrame
playlist_final.head()

Unnamed: 0,playlist_idx,track_idx_list,popularity_var,artist_diversity,early_years_proportion,classic_era_proportion,golden_era_proportion,2000s_proportion,modern_era_proportion,short_proportion,medium_proportion,long_proportion
0,0,"[55709, 5427, 52495, 157555, 235367, 126542, 8...",658.67,2.128085,0.0,0.0,0.0,0.0,1.0,0.571429,0.428571,0.0
1,1,"[3689, 207774, 194775, 135193, 218011, 37844, ...",834.42,5.136249,0.0,0.0,0.0,0.0,1.0,0.095238,0.714286,0.190476
2,2,"[160375, 131195, 164629, 147280, 193891, 17077...",300.85,4.321928,0.0,0.45,0.2,0.2,0.15,0.2,0.35,0.45
3,3,"[104436, 229428, 25968, 186871, 81592, 15947, ...",776.5,5.760648,0.0,0.209677,0.467742,0.290323,0.032258,0.241935,0.516129,0.241935
4,4,"[244173, 210510, 9349, 224202, 147251, 35498, ...",365.06,5.004921,0.0,0.0,0.0,0.067568,0.932432,0.0,0.324324,0.675676


### Duration Diversity

In [77]:
def shannon_entropy(row):
    # Convert proportions to a NumPy array from the separate columns
    proportions = np.array([row["short_proportion"], row["medium_proportion"], row["long_proportion"]])
    
    # Mask the zero values (replace them with a small value)
    proportions = np.where(proportions == 0, np.finfo(float).eps, proportions)
    
    # Calculate entropy: -sum(p(x) * log2(p(x)))
    entropy = -np.sum(proportions * np.log2(proportions))
        
    return entropy

# Apply the Shannon entropy function to each row
playlist_final.loc[:, 'duration_var'] = playlist_final.apply(shannon_entropy, axis=1)
playlist_final = playlist_final.drop(columns=['short_proportion', 'medium_proportion', 'long_proportion'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  playlist_final.loc[:, 'duration_var'] = playlist_final.apply(shannon_entropy, axis=1)


### Era Diversity

In [78]:
def shannon_entropy(row):
    # Convert proportions to a NumPy array from the separate columns
    proportions = np.array([row["early_years_proportion"], row["classic_era_proportion"], row["golden_era_proportion"], row["2000s_proportion"], row["modern_era_proportion"]])
    
    # Mask the zero values (replace them with a small value)
    proportions = np.where(proportions == 0, np.finfo(float).eps, proportions)
    
    # Calculate entropy: -sum(p(x) * log2(p(x)))
    entropy = -np.sum(proportions * np.log2(proportions))
        
    return entropy

# Apply the Shannon entropy function to each row
playlist_final.loc[:, 'era_var'] = playlist_final.apply(shannon_entropy, axis=1)
playlist_final = playlist_final.drop(columns=["early_years_proportion", "classic_era_proportion", "golden_era_proportion", "2000s_proportion", "modern_era_proportion"])

### Sentiment Diversity 

### Genre Diversity