In [1]:
import pandas as pd

from relative_path import PATH_DATA, PATH_OUTPUT_DATA, PATH_OUTPUT_GRAPH, PATH_OUTPUT_PROF
from relative_path import TRACK_DATA, ARTIST_DATA, FEATURE_COLS, PROCESSED_FEATURE_DATA

from pandas import DataFrame, Series
from typing import List, Dict

from data_preprocessing import isolate_columns, preprocess_df

## Standard

In [2]:
# Load the raw track and artist tables from their parquet sources.
df_tracks = pd.read_parquet(TRACK_DATA)
df_artists = pd.read_parquet(ARTIST_DATA)

# Separate the main track criteria from the feature columns (FEATURE_COLS);
# `isolate_columns` hands back the two resulting frames.
dfp_tracks, dfp_features = isolate_columns(df_tracks, FEATURE_COLS)

In [3]:
# Preprocess the frames - sorting and re-indexing, keyed on popularity.
# NOTE: `dfp_tracks` is overwritten here and its `popularity` column is renamed
# below, so re-running this cell on its own asks `preprocess_df` to sort by a
# column name that no longer exists. Re-run from the loading cell instead.
dfp_tracks = preprocess_df(dfp_tracks, sort_by="popularity")
dfp_artists = preprocess_df(df_artists, sort_by="popularity")

# Give the generic column names an entity prefix so the track and artist
# tables do not share ambiguous names such as `id`, `name` or `popularity`.
# `rename` is reassigned rather than called with inplace=True, so the new
# labels cannot leak into any other reference to the same underlying frame
# (e.g. `df_artists`, or the frame `isolate_columns` sliced from).
dfp_tracks = dfp_tracks.rename(
    columns={
        "id": "track_id",
        "name": "track_name",
        "popularity": "track_popularity",
        "artists": "artist_name",
        "id_artists": "artist_id",
    }
)
dfp_artists = dfp_artists.rename(
    columns={
        "id": "artist_id",
        "name": "artist_name",
        "popularity": "artist_popularity",
        "followers": "artist_followers",
    }
)
dfp_features = dfp_features.rename(columns={"id": "track_id"})

In [4]:
# Datatype handling: parse `release_date` once and keep only the release year.
# The parsed dates are held in a local Series instead of being written to
# temporary columns, and `assign`/`drop` return a new frame, so no in-place
# mutation is performed on a frame that may be derived from another one.
release_dates = pd.to_datetime(dfp_tracks["release_date"])

# `year` is appended as the last column and the raw `release_date` is removed.
dfp_tracks = dfp_tracks.assign(year=release_dates.dt.year).drop(columns=["release_date"])

In [5]:
# Keep only rows whose popularity is at least the threshold below. The large
# number of low-popularity rows would otherwise distort the distribution.
MIN_POPULARITY = 20

dfp_tracks = dfp_tracks.loc[dfp_tracks["track_popularity"].ge(MIN_POPULARITY)]
dfp_artists = dfp_artists.loc[dfp_artists["artist_popularity"].ge(MIN_POPULARITY)]

In [18]:
# Display the processed tracks frame (rendered as a truncated head/tail preview).
dfp_tracks

Unnamed: 0,track_id,track_name,track_popularity,duration_ms,artist_name,artist_id,year
0,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),100,198082,"['Justin Bieber', 'Daniel Caesar', 'Giveon']","['1uNFoZAHBGtllmzznpCI3s', '20wkVLutqVOYrc0kxF...",2021
1,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,['Olivia Rodrigo'],['1McMsnEElThX1knmY4oliG'],2021
2,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,98,132780,['Masked Wolf'],['1uU7g3DNSbsu0QjSEqZtEd'],2021
3,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020
4,6tDDoYIxWvMLTdKpjFkc1B,telepatía,97,160191,['Kali Uchis'],['1U1el3k54VvEUzo3ybLPlM'],2020
...,...,...,...,...,...,...,...
376777,60Y9NU6XGMHjeUtUKtC51N,Money Can't Save Your Soul,20,331067,['Savoy Brown'],['17obwOahRWI121iMUZznh2'],1970
376778,3ai2Zl9Fw0zipayXShnwJo,High Fashion Queen,20,127333,['The Flying Burrito Brothers'],['0rESpKEusFHxhW59MIf7eM'],1970
376779,42eLA3ghHs4NCmGaWrZUHz,Sticks And Stones - Live At The Fillmore East/...,20,166333,['Joe Cocker'],['3pFCERyEiP5xeN2EsPXhjI'],1970
376780,49oa9eioE1MQm3XpwhHVjl,Give Peace A Chance,20,143867,['Leon Russell'],['6r1Xmz7YUD4z0VRUoGm8XN'],1970


## Export - Processed Data

In [None]:
# Export the final feature table (currently disabled - uncomment the line below to write it).
#dfp_features.to_parquet(PROCESSED_FEATURE_DATA)