In [1]:

import pandas as pd

from relative_path import PATH_DATA, PATH_OUTPUT_DATA, PATH_OUTPUT_GRAPH, PATH_OUTPUT_PROF
from relative_path import TRACK_DATA, ARTIST_DATA, FEATURE_COLS, PROCESSED_FEATURE_DATA

from pandas import DataFrame, Series
from typing import List, Dict

from data_preprocessing import isolate_columns, preprocess_df

import seaborn as sns
import numpy as np

## Standard

In [2]:
# Load the raw track and artist tables from their parquet files.
df_tracks = pd.read_parquet(TRACK_DATA)
df_artists = pd.read_parquet(ARTIST_DATA)

# Separate the main criteria of each track from its feature columns.
dfp_tracks, dfp_features = isolate_columns(df_tracks, FEATURE_COLS)

In [3]:
# Preprocess both frames (sorting and re-indexing), ordered by popularity.
# NOTE(review): after the renames below, dfp_tracks no longer carries a
# "popularity" column, so re-running this cell on its own presumably fails —
# re-run from the loading cell instead. Confirm against preprocess_df.
dfp_tracks = preprocess_df(dfp_tracks, sort_by="popularity")
dfp_artists = preprocess_df(df_artists, sort_by="popularity")

# Give every column a table-specific name so tracks and artists can be
# told apart once their data is combined.
dfp_tracks.rename(
    columns={
        "id": "track_id",
        "name": "track_name",
        "popularity": "track_popularity",
        "artists": "artist_name",
        "id_artists": "artist_id",
    },
    inplace=True,
)
dfp_artists.rename(
    columns={
        "id": "artist_id",
        "name": "artist_name",
        "popularity": "artist_popularity",
        "followers": "artist_followers",
    },
    inplace=True,
)
dfp_features.rename(columns={"id": "track_id"}, inplace=True)

In [4]:
# Derive the release year of every track from its release date.
# The date string is parsed once; the earlier temporary "dates" column was an
# identical second parse that was dropped again right away.
# NOTE(review): if release_date mixes granularities (e.g. "1922" next to
# "1922-02-22"), pandas >= 2.0 may need format="ISO8601" here — confirm
# against the data.
dfp_tracks["year"] = pd.to_datetime(dfp_tracks["release_date"]).dt.year

# The raw date column is dropped once the year has been extracted.
dfp_tracks.drop(columns=["release_date"], inplace=True)

In [5]:
# Process data - remove rows whose popularity is below 20; the large number of low-popularity rows distorts the distribution. (Currently disabled.)
#dfp_tracks = dfp_tracks[dfp_tracks["track_popularity"] >= 20]
#dfp_artists = dfp_artists[dfp_artists["artist_popularity"] >= 20]

In [6]:
### Track Dataframe
# Track Popularity over time / year - distribution, boxplot. (Render the top 10, 100 and 1000.)
# Extract - Top solo artists vs Top various artists

### Artists Dataframe
# Artist Popularity over Artist Followers
# Frequency of genres, relative to the top artist. (Top 10% and Top 25%, Top 10 and 100)


###  Combination of Track and Artists Dataframe
# Track Popularity over Total Artists Followers, Track Popularity over Average Artists Popularity

In [7]:
def remove_characters(string:str, special_char:str = None):
    """Return ``string`` with every character found in ``special_char`` removed.

    Parameters
    ----------
    string : str
        Text to clean.
    special_char : str, optional
        Characters to strip out; each character is treated individually.
        Defaults to ``"@#$*&"`` when omitted or ``None``.

    Returns
    -------
    str
        ``string`` without any of the unwanted characters, order preserved.
    """
    # Identity check: `is None` is the idiomatic test and cannot be fooled by
    # objects that overload `==`.
    if special_char is None:
        special_char = "@#$*&"

    # A set gives constant-time membership tests while filtering.
    unwanted = set(special_char)
    return "".join(char for char in string if char not in unwanted)


def apply_fn_total_followers(string:str) -> int:
    """Return the total followers amongst the artists within the track.

    Parameters
    ----------
    string : str
        String form of a list of artist ids, e.g. ``"['id_a', 'id_b']"``.

    Returns
    -------
    number
        Sum of ``artist_followers`` over every row of the module-level
        ``dfp_artists`` frame whose ``artist_id`` equals one of the ids.
        Missing follower counts are skipped; ``0`` is returned when no usable
        count is found.
    """
    # The ids arrive as the text of a list: strip brackets and quotes, split
    # on commas and trim the surrounding whitespace.
    str_process = remove_characters(string, "[]'")
    artist_ids = [x.strip() for x in str_process.split(",")]

    # Collect the follower counts of every matching artist row.
    followers_list = []
    for artist_id in artist_ids:
        match = dfp_artists.loc[dfp_artists["artist_id"] == artist_id, "artist_followers"]
        # Skip missing counts so one unknown artist does not turn the whole
        # total into NaN.
        followers_list.extend(match.dropna().tolist())

    # No usable count at all (unknown ids, empty list, or only missing values).
    # The previous `== np.NaN` comparison could never be true and fails
    # outright on NumPy >= 2.0, where the `np.NaN` alias no longer exists.
    if not followers_list:
        return 0
    return np.sum(followers_list)


def apply_fn_mean_popularity(string:str) -> int:
    """Return the average popularity amongst the artists within the track.

    Parameters
    ----------
    string : str
        String form of a list of artist ids, e.g. ``"['id_a', 'id_b']"``.

    Returns
    -------
    int
        Mean of ``artist_popularity`` over every row of the module-level
        ``dfp_artists`` frame whose ``artist_id`` equals one of the ids,
        truncated to an integer. Missing popularity values are skipped; ``0``
        is returned when no usable value is found.
    """
    # The ids arrive as the text of a list: strip brackets and quotes, split
    # on commas and trim the surrounding whitespace.
    str_process = remove_characters(string, "[]'")
    artist_ids = [x.strip() for x in str_process.split(",")]

    # Collect the popularity of every matching artist row.
    popularity_list = []
    for artist_id in artist_ids:
        match = dfp_artists.loc[dfp_artists["artist_id"] == artist_id, "artist_popularity"]
        # Skip missing values: int() of a NaN mean would raise ValueError.
        popularity_list.extend(match.dropna().tolist())

    # No usable value at all (unknown ids, empty list, or only missing values).
    # The previous `== np.NaN` comparison could never be true and fails
    # outright on NumPy >= 2.0, where the `np.NaN` alias no longer exists.
    if not popularity_list:
        return 0
    return int(np.mean(popularity_list))

In [8]:
# Per-track follower total, summed over every artist listed in "artist_id".
dfp_tracks["artist_total_followers"] = (
    dfp_tracks["artist_id"]
    .apply(apply_fn_total_followers)
)

In [None]:
# Per-track mean popularity, averaged over every artist listed in "artist_id".
dfp_tracks["artist_mean_popularity"] = (
    dfp_tracks["artist_id"]
    .apply(apply_fn_mean_popularity)
)

0    93.333333
1    88.000000
2    85.000000
3    96.000000
4    88.000000
5    87.333333
6    96.000000
7    87.000000
8    91.000000
9    89.000000
Name: artist_id, dtype: float64

In [None]:
# Persist the processed track table to a parquet file in the working directory.
# NOTE(review): PATH_OUTPUT_DATA is imported at the top of the notebook but is
# not used here — confirm whether this file belongs in that location instead.
dfp_tracks.to_parquet(path="Sample.parquet")