In [1]:
import pandas as pd

from relative_path import PATH_DATA, PATH_OUTPUT_DATA, PATH_OUTPUT_GRAPH, PATH_OUTPUT_PROF
from relative_path import TRACK_DATA, ARTIST_DATA, FEATURE_COLS, PROCESSED_FEATURE_DATA, OUTPUT_TRANSFORM

from pandas import DataFrame, Series
from typing import List, Dict

from data_preprocessing import isolate_columns, preprocess_df

import seaborn as sns
import numpy as np

ImportError: cannot import name 'PATH_OUTPUT_DATA' from 'relative_path' (/Users/mavwong/Desktop/Github/SpotifyDataAnalysis/relative_path.py)

## Standard

In [2]:
# Load the raw track and artist tables (tracks first, then artists).
df_tracks, df_artists = (pd.read_parquet(source) for source in (TRACK_DATA, ARTIST_DATA))

# Separate the main criteria from the features.
# NOTE(review): isolate_columns lives in data_preprocessing; presumably the second
# frame holds the FEATURE_COLS columns - confirm against that module.
dfp_tracks, dfp_features = isolate_columns(df_tracks, FEATURE_COLS)

## Renaming

In [3]:
# Preprocess Dataframe - sorting and re-indexing
dfp_tracks = preprocess_df(dfp_tracks, sort_by="popularity")
dfp_artists = preprocess_df(df_artists, sort_by="popularity")

# Rename columns so every generic name carries its entity (track / artist).
# rename() returns a new frame; rebinding the name (instead of inplace=True) keeps
# any frame that shares data with these ones (e.g. df_tracks / df_artists) untouched.
dfp_tracks = dfp_tracks.rename(columns={
    "id": "track_id",
    "name": "track_name",
    "popularity": "track_popularity",
    "artists": "artist_name",
    "id_artists": "artist_id",
})
dfp_artists = dfp_artists.rename(columns={
    "id": "artist_id",
    "name": "artist_name",
    "popularity": "artist_popularity",
    "followers": "artist_followers",
})
dfp_features = dfp_features.rename(columns={"id": "track_id"})

## Changing Datatypes

In [4]:
# Checking and processing the DATATYPES.
# Parse release_date once and keep only the year; the raw date column is not kept.
# NOTE(review): assumes every release_date value is parseable by pd.to_datetime
# with default settings - confirm on the pandas version in use.
dfp_tracks["year"] = pd.to_datetime(dfp_tracks["release_date"]).dt.year

# errors="ignore": a "dates" column is not expected to exist, but is removed if present.
dfp_tracks = dfp_tracks.drop(columns=["release_date", "dates"], errors="ignore")

In [5]:
# Process Data - Remove rows whose popularity is less than 20. The large number of low-popularity rows distorts the distribution.
#dfp_tracks = dfp_tracks[dfp_tracks["track_popularity"] >= 20]
#dfp_artists = dfp_artists[dfp_artists["artist_popularity"] >= 20]

In [7]:
def remove_characters(string:str, special_char:str = None):
    """ Returns `string` with every character found in `special_char` removed.

    Parameters:
        string: text to clean.
        special_char: characters to strip, given as one string.
            When omitted (None), the default set "@#$*&" is used.

    Returns:
        A new string made of the characters of `string` that are not in `special_char`.
    """
    if special_char is None:
        special_char = "@#$*&"

    # A set gives constant-time membership checks per character.
    banned = set(special_char)
    return "".join(char for char in string if char not in banned)


def apply_fn_mean_popularity(string:str) -> float:
    """ Returns the mean popularity amongst the artists within the track.

    Parameters:
        string: the track's artist ids as the text form of a list,
            e.g. "['id_a', 'id_b']".

    Returns:
        The mean of `artist_popularity` over all matching rows of `dfp_artists`,
        or 0 as soon as one artist id has no match or a missing (NaN) popularity.
    """

    # Process list data (str type): strip the list punctuation, then split on commas.
    str_process = remove_characters(string, "[]'")
    artists_ids = [x.strip() for x in str_process.split(",")]

    # Loop through the artist ids
    popularity_list = []
    for artist_id in artists_ids:
        is_artist = dfp_artists["artist_id"] == artist_id
        match_lst = dfp_artists.loc[is_artist, "artist_popularity"].tolist()

        # A list never equals NaN, so missing values are checked element-wise.
        if not match_lst or any(pd.isna(value) for value in match_lst):
            return 0

        popularity_list.extend(match_lst)
    return np.mean(popularity_list)



def apply_fn_total_followers(string:str) -> int:
    """ Returns the total followers amongst the artists within the track.

    Parameters:
        string: the track's artist ids as the text form of a list,
            e.g. "['id_a', 'id_b']".

    Returns:
        The sum of `artist_followers` over all matching rows of `dfp_artists`,
        or 0 as soon as one artist id has no match or a missing (NaN) follower count.
    """

    # Process list data (str type): strip the list punctuation, then split on commas.
    str_process = remove_characters(string, "[]'")
    artists_ids = [x.strip() for x in str_process.split(",")]

    # Loop through the artist ids
    followers_list = []
    for artist_id in artists_ids:
        is_artist = dfp_artists["artist_id"] == artist_id
        match_lst = dfp_artists.loc[is_artist, "artist_followers"].tolist()

        # A list never equals NaN, so missing values are checked element-wise.
        if not match_lst or any(pd.isna(value) for value in match_lst):
            return 0

        followers_list.extend(match_lst)
    return np.sum(followers_list)

## Map total artist followers and mean artist popularity onto each track, using the artist_id values in the tracks dataset

In [8]:
# Per-track artist aggregates, both computed from the track's artist_id entry.
track_artist_ids = dfp_tracks["artist_id"]
dfp_tracks["total_followers"] = track_artist_ids.apply(apply_fn_total_followers)
dfp_tracks["mean_popularity"] = track_artist_ids.apply(apply_fn_mean_popularity)

## Export: Processed Track File

In [9]:
# Write to the location the next section reads from
# (OUTPUT_TRANSFORM / "processed_track.parquet") rather than the working directory.
dfp_tracks.to_parquet(OUTPUT_TRANSFORM / "processed_track.parquet")

## Processing again: clean the exported track file

In [9]:
from relative_path import OUTPUT_TRANSFORM

In [10]:
# Path of the processed-track parquet file under OUTPUT_TRANSFORM.
FILE_PARQUET = OUTPUT_TRANSFORM / "processed_track.parquet"

In [11]:
# Reading the tracks data
df_tracks_processed = pd.read_parquet(FILE_PARQUET)

# Renaming columns
df_tracks_processed.rename(columns={
        "total_followers": "artist_total_followers", 
        "mean_popularity": "artist_mean_popularity"
    }, errors="ignore", inplace=True)

In [12]:
# Apply functions
def apply_fn_convert_to_int(input):
    """ Apply function that converts the input into an integer.

    Sometimes you can't convert the pandas series datatypes natively.

    Returns:
        int(input), or 0 when the value cannot be converted
        (e.g. None, NaN, infinity or non-numeric text).
    """
    try:
        return int(input)
    except (TypeError, ValueError, OverflowError):
        # TypeError: None / unsupported types; ValueError: NaN or non-numeric text;
        # OverflowError: +/- infinity. Anything else (e.g. KeyboardInterrupt) propagates.
        return 0

# Changing the datatypes
# Both columns go through apply_fn_convert_to_int, so unconvertible values become 0.
# astype("int", errors="ignore") would instead hand the column back unconverted
# as soon as a single value fails to cast.
df_tracks_processed["artist_total_followers"] = df_tracks_processed["artist_total_followers"].apply(apply_fn_convert_to_int)
df_tracks_processed["artist_mean_popularity"] = df_tracks_processed["artist_mean_popularity"].apply(apply_fn_convert_to_int)

In [13]:
# Drop rows where any of these columns is 0.
nonzero_columns = [
    "duration_ms",
    "artist_total_followers",
    "artist_mean_popularity",
    "track_popularity",
]
has_no_zero = (df_tracks_processed[nonzero_columns] != 0).all(axis=1)

# Keep only releases from 1920 to 2020 inclusive - a span of about 100 years.
# An explicit upper bound also excludes any year after 2021, not just 2021 itself.
in_year_range = df_tracks_processed["year"].between(1920, 2020)

df_tracks_processed = df_tracks_processed[has_no_zero & in_year_range]

In [15]:
# Persist the filtered frame. FILE_PARQUET is the same path this frame was read
# from, so the file on disk is overwritten with the filtered rows.
df_tracks_processed.to_parquet(FILE_PARQUET)

In [16]:
# Show the filtered frame as the cell output.
df_tracks_processed

Unnamed: 0,track_id,track_name,track_popularity,duration_ms,artist_name,artist_id,year,artist_total_followers,artist_mean_popularity
3,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020,31308207,96
4,6tDDoYIxWvMLTdKpjFkc1B,telepatía,97,160191,['Kali Uchis'],['1U1el3k54VvEUzo3ybLPlM'],2020,1698014,88
6,0VjIjW4GlUZAMYd2vXMi3b,Blinding Lights,96,200040,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020,31308207,96
7,6f3Slt0GbA2bPZlz0aIFXN,The Business,95,164000,['Tiësto'],['2o5jDhtHVPhrJdv3cEQ99Z'],2020,5527032,87
8,3FAJ6O0NOHQV8Mc5Ri6ENp,Heartbreak Anniversary,94,198371,['Giveon'],['4fxd5Ee7UefO4CUXgwJ7IP'],2020,946550,91
...,...,...,...,...,...,...,...,...,...
541977,4HxvIdH92INqp7Fj1SllFW,Kal Talak Hum Theek Tha,1,203907,"['Geeta Dutt', 'Mohammed Rafi']","['0QsbYX8XslSSUcztc6u4pO', '0gXDpqwYNDODn7fB0R...",1958,2445150,54
541978,7uFR8iwOoC2pdPV9bFogSG,I'm putting all my eggs in one basket,1,166422,['The Boswell Sisters'],['2mflSlnegn3rBmzsh08OcN'],1931,15936,36
541979,5EoIeGRaeGBFkcdBAtHv6D,Maalaiyittu Manamudichu,1,193093,['Jamuna Rani'],['1UwFdPnfr7JcxV9D1Ln8zu'],1958,3056,27
541980,6sZvCShbUUg05tlxbrEiTn,你還不走過來,1,170507,['Ying Yin Wu'],['4Id8e7ATmAtOVCX9S0Jc0F'],1940,1494,25
