In [1]:
from typing import Dict, List, Optional

import numpy as np
import pandas as pd
import seaborn as sns
from pandas import DataFrame, Series

from data_preprocessing import isolate_columns, preprocess_df
from relative_path import PATH_DATA, PATH_OUTPUT_DATA, PATH_OUTPUT_GRAPH, PATH_OUTPUT_PROF
from relative_path import TRACK_DATA, ARTIST_DATA, FEATURE_COLS, PROCESSED_FEATURE_DATA

## Standard

In [2]:
# Load the raw track and artist tables from their parquet files.
df_tracks = pd.read_parquet(TRACK_DATA)
df_artists = pd.read_parquet(ARTIST_DATA)

# Separate the main criteria from the feature columns listed in FEATURE_COLS.
dfp_tracks, dfp_features = isolate_columns(df_tracks, FEATURE_COLS)

In [3]:
# Preprocess both tables (sorting and re-indexing), ordered by popularity.
dfp_tracks = preprocess_df(dfp_tracks, sort_by="popularity")
dfp_artists = preprocess_df(df_artists, sort_by="popularity")

# Give the columns table-specific names so that tracks and artists can be told
# apart once their values are combined.
dfp_tracks.rename(
    columns={
        "id": "track_id",
        "name": "track_name",
        "popularity": "track_popularity",
        "artists": "artist_name",
        "id_artists": "artist_id",
    },
    inplace=True,
)
dfp_artists.rename(
    columns={
        "id": "artist_id",
        "name": "artist_name",
        "popularity": "artist_popularity",
        "followers": "artist_followers",
    },
    inplace=True,
)
dfp_features.rename(columns={"id": "track_id"}, inplace=True)

In [4]:
# Checking and processing the DATATYPES.
# Parse the release date once and keep only the release year; the raw date
# column is not needed afterwards, so it is dropped.
dfp_tracks["year"] = pd.to_datetime(dfp_tracks["release_date"]).dt.year
dfp_tracks.drop(columns=["release_date"], inplace=True)

In [5]:
# Process Data - Remove rows whose popularity is less than 20: there are too many of them and they distort the distribution. (The filter below is currently commented out.)
#dfp_tracks = dfp_tracks[dfp_tracks["track_popularity"] >= 20]
#dfp_artists = dfp_artists[dfp_artists["artist_popularity"] >= 20]

In [6]:
### Track Dataframe
# Track Popularity over time / year - distribution, boxplot. (Render the top 10, 100 and 1000.)
# Extract - Top solo artists vs Top various artists

### Artists Dataframe
# Artist Popularity over Artist Followers
# Frequency of genres, relative to the top artist. (Top 10% and Top 25%, Top 10 and 100)


###  Combination of Track and Artists Dataframe
# Track Popularity over Total Artists Followers, Track Popularity over Average Artists Popularity

In [7]:
def remove_characters(string: str, special_char: Optional[str] = None) -> str:
    """Return ``string`` with every character found in ``special_char`` removed.

    Args:
        string: Text to clean.
        special_char: Characters to strip, given as one string. When ``None``
            the default set ``"@#$*&"`` is used.

    Returns:
        The characters of ``string`` that are not in ``special_char``, joined
        back together in their original order.
    """
    if special_char is None:
        special_char = "@#$*&"

    unwanted = set(special_char)
    return "".join(char for char in string if char not in unwanted)


def _artist_values(string: str, column: str, artists: Optional[DataFrame] = None) -> list:
    """Collect the ``column`` values of every artist id listed in ``string``.

    ``string`` is the text form of a list of ids (e.g. ``"['id1', 'id2']"``):
    the characters ``[``, ``]`` and ``'`` are removed, the rest is split on
    commas and each piece is stripped of surrounding whitespace.

    Args:
        string: Stringified list of artist ids.
        column: Name of the artist-table column to read.
        artists: Table with an ``artist_id`` column and ``column``. When
            ``None`` the notebook-level ``dfp_artists`` table is used.

    Returns:
        All matching values in id order, or an empty list as soon as one id
        has no row in the table (the whole track is then treated as unknown).
        Every row matching an id contributes, so duplicate ids in the table
        are counted more than once.
    """
    if artists is None:
        artists = dfp_artists

    artist_ids = [part.strip() for part in remove_characters(string, "[]'").split(",")]

    values = []
    for artist_id in artist_ids:
        # Each lookup compares against the whole artist_id column.
        matches = artists.loc[artists["artist_id"] == artist_id, column].tolist()
        if not matches:
            return []
        values.extend(matches)
    return values


def apply_fn_mean_popularity(string: str, artists: Optional[DataFrame] = None) -> float:
    """Return the mean popularity of the artists credited on a track.

    Args:
        string: Stringified list of artist ids (see ``_artist_values``).
        artists: Optional artist table; defaults to ``dfp_artists``.

    Returns:
        ``np.mean`` of the matching ``artist_popularity`` values, or ``0`` when
        at least one id has no match. NaN values are not filtered out, so they
        propagate into the result.
    """
    popularity_list = _artist_values(string, "artist_popularity", artists)
    return np.mean(popularity_list) if popularity_list else 0


def apply_fn_total_followers(string: str, artists: Optional[DataFrame] = None) -> float:
    """Return the total followers of the artists credited on a track.

    Args:
        string: Stringified list of artist ids (see ``_artist_values``).
        artists: Optional artist table; defaults to ``dfp_artists``.

    Returns:
        ``np.sum`` of the matching ``artist_followers`` values, or ``0`` when
        at least one id has no match. NaN values are not filtered out, so they
        propagate into the result.
    """
    followers_list = _artist_values(string, "artist_followers", artists)
    return np.sum(followers_list) if followers_list else 0

In [8]:
# Total followers across the artists credited on each track.
dfp_tracks["total_followers"] = (
    dfp_tracks["artist_id"]
    .apply(apply_fn_total_followers)
)

In [9]:
# Mean popularity of the artists credited on each track.
dfp_tracks["mean_popularity"] = (
    dfp_tracks["artist_id"]
    .apply(apply_fn_mean_popularity)
)

In [10]:
# Persist the processed track table; the relative path is resolved against the
# current working directory.
dfp_tracks.to_parquet("processed_track.parquet")

In [11]:
# Show the processed track table as the cell output.
dfp_tracks

Unnamed: 0,track_id,track_name,track_popularity,duration_ms,artist_name,artist_id,year,total_followers,mean_popularity
0,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),100,198082,"['Justin Bieber', 'Daniel Caesar', 'Giveon']","['1uNFoZAHBGtllmzznpCI3s', '20wkVLutqVOYrc0kxF...",2021,48141557.0,93.333333
1,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,['Olivia Rodrigo'],['1McMsnEElThX1knmY4oliG'],2021,1444702.0,88.000000
2,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,98,132780,['Masked Wolf'],['1uU7g3DNSbsu0QjSEqZtEd'],2021,177401.0,85.000000
3,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020,31308207.0,96.000000
4,6tDDoYIxWvMLTdKpjFkc1B,telepatía,97,160191,['Kali Uchis'],['1U1el3k54VvEUzo3ybLPlM'],2020,1698014.0,88.000000
...,...,...,...,...,...,...,...,...,...
586667,0sniGbmm1sjg51BxO6AHD4,O Pardesi Musafir,0,198053,"['Suraiya', 'Lata Mangeshkar']","['71lk7UDkZK1mabUE6LCASW', '61JrslREXq98hurYL2...",1949,2672219.0,46.000000
586668,0sk8KA96i9MIU8St2nmbIx,Mere Dil Mein Aao Aur Bas Jao,0,171133,['Suraiya'],['71lk7UDkZK1mabUE6LCASW'],1949,60022.0,22.000000
586669,0sdkzsc3WtZIrSi1nNyl7v,"Diversions, Op. 21: Variation 9, Toccata I",0,46720,"['Benjamin Britten', 'Leon Fleisher', 'Seiji O...","['7MJ1pB5d6Vjmzep2zQlorn', '6ncNdxBc8zVWMOF7nJ...",1949,124473.0,54.500000
586670,0s6h3lMdJsWkpUAW6xC7m3,John Henry,0,189973,['Lead Belly'],['3Ovf2lytXSXWFM2cwsJACC'],1949,138336.0,46.000000
