## Library

In [1]:
import pandas as pd

from relative_path import PATH_DATA
from exploration_data import VisualizeMissing, EvalMissingData

## Constants (data file paths)

In [2]:
# Locations of the two parquet inputs, both built from the imported PATH_DATA base.
TRACK_DATA = PATH_DATA / "tracks.parquet"
ARTISTS_DATA = PATH_DATA / "artists.parquet"

## Reading the Data

In [3]:
# Load each parquet file into its own DataFrame (tracks first, then artists).
df_tracks = pd.read_parquet(TRACK_DATA)
df_artists = pd.read_parquet(ARTISTS_DATA)

In [11]:
from pandas import DataFrame
import math
from datetime import date
import matplotlib.pyplot as plt

from typing import Any

# Date stamp used as a prefix for exported file names.
TODAY = date.today()
# Zero-padded YYYYMMDD: plain str() concatenation of year/month/day is ambiguous
# (Jan 11 and Nov 1 would both give "2023111") and does not sort chronologically.
DATE_FORMAT = TODAY.strftime("%Y%m%d")
from relative_path import OUTPUT_EXPLORE, OUTPUT_MAIN, OUTPUT_FEATURE, PATH_OUTPUT
# Turn interactive plotting off.
# NOTE: as the last expression of the cell, the call's return value is echoed
# as the cell output (the ExitStack repr shown below).
plt.ioff()

<contextlib.ExitStack at 0x7fd63817e2e0>

In [20]:
from pandas_profiling import ProfileReport
import seaborn as sns

In [42]:
class EvaluateDataset:
    """Standard evaluation for the dataframe.

    Bundles a pandas-profiling report with a few plotting helpers
    (correlation heatmaps, numeric histograms, per-column count plots).
    Plots are written to ``OUTPUT_EXPLORE`` when exporting is enabled and
    are shown only when requested; otherwise every open figure is closed.
    """

    def __init__(
            self,
            input_df:DataFrame,
            data_name:str,

            export_plot:bool=False,
            show_plot:bool=False,
        ) -> None:
        """Store the dataframe and the plotting configuration.

        Args:
            input_df: Dataframe to evaluate.
            data_name: Label for the dataset; stored title-cased and used in
                plot titles and exported file names.
            export_plot: When True, every generated plot is saved to disk.
            show_plot: When True, every generated plot is shown, regardless
                of the per-call ``show`` argument.
        """
        self.df:DataFrame = input_df
        self.name:str = data_name.title()
        self._profile = ProfileReport(self.df, title=self.name)

        self._export = export_plot
        self._show = show_plot

        # Shared plot styling.
        self._color = "orange"
        self._figsize = (16,10)
        self._dpi = 720  # with a 16x10 inch figure this is an 11520x7200 px canvas
        self._aspect = 3

        # Switches matplotlib's interactive mode off (process-wide setting).
        plt.ioff()

    @property
    def dataframe(self) -> DataFrame:
        """The dataframe passed to the constructor."""
        return self.df

    @property
    def profile(self):
        """The ProfileReport built for the dataframe."""
        return self._profile

    def get_profiling(self) -> None:
        """Write the profiling report as a dated HTML file under PATH_OUTPUT."""
        profile_name = f"{DATE_FORMAT}-{self.name}Data_Profiling.html"
        self._profile.to_file(PATH_OUTPUT / profile_name)

    def get_all_corr(self, show:bool=False) -> None:
        """Draw one annotated heatmap per correlation method.

        Covers Spearman, Kendall and Pearson, in that order.

        Args:
            show: Show the figures even if the instance was created with
                ``show_plot=False``.
        """
        # Correlation is only defined for numeric data. Select those columns
        # explicitly rather than relying on DataFrame.corr dropping text
        # columns itself: that implicit behaviour is deprecated in pandas 1.5
        # and raises in pandas >= 2.0.
        numeric_df = self.df.select_dtypes(include=["number", "bool"])

        for corr in ("spearman", "kendall", "pearson"):
            plt.figure(figsize=self._figsize, dpi=self._dpi)
            df_corr = numeric_df.corr(method=corr)

            ax = sns.heatmap(df_corr, annot=True, cmap="inferno", center=0)
            ax.set(title=f"{self.name} - {corr.title()} Correlation Heatmap")

            fig_name = f"{DATE_FORMAT}-{self.name}Data_Corr{corr.title()}.png"
            self._saving_and_showing(_name = fig_name, _show = show)

    def get_all_hist(self, show:bool=False) -> None:
        """Draw a histogram for every numeric column of the dataframe.

        Args:
            show: Show the figure even if the instance was created with
                ``show_plot=False``.
        """
        num_in_df = self.df.select_dtypes(include="number")
        # Figure height: 5 units for every started group of four columns.
        row_size = math.ceil(len(num_in_df.columns) / 4) * 5
        num_in_df.hist(bins=20, color=self._color, figsize=(30, row_size))

        fig_name = f"{DATE_FORMAT}-{self.name}Data_Hist.png"
        self._saving_and_showing(_name = fig_name, _show = show)

    def get_category_hist(self, column_name:str, show:bool=False) -> None:
        """Draw a discrete count histogram for a single column.

        Args:
            column_name: Column of the dataframe to plot.
            show: Show the figure even if the instance was created with
                ``show_plot=False``.
        """
        grid = sns.displot(self.df[column_name], discrete=True, aspect=self._aspect, kind='hist', color=self._color)

        # Setting the x label
        xlabel = column_name.replace("_", " ").title()

        # Setting the title
        title_name = f"{self.name.title()} Data - Count over {column_name.title()}"

        grid.set(title=title_name, xlabel=xlabel)
        grid.tight_layout()

        # Export plot? Written next to the other exploration plots using the
        # same dated naming scheme, instead of into the current working
        # directory under the bare column name.
        if self._export:
            fig_name = f"{DATE_FORMAT}-{self.name}Data_Hist_{column_name}.png"
            grid.savefig(OUTPUT_EXPLORE / fig_name)

        self._show_or_close(show)

    def _saving_and_showing(self, _name:str, _show:bool) -> None:
        """Save the current figure (if exporting is enabled), then show or close.

        Args:
            _name: File name of the exported figure inside OUTPUT_EXPLORE.
            _show: Per-call request to show the figure.
        """
        if self._export:
            plt.savefig(OUTPUT_EXPLORE / _name)

        self._show_or_close(_show)

    def _show_or_close(self, _show:bool) -> None:
        """Show the open figures if requested, otherwise close all of them."""
        if self._show or _show:
            plt.show()
        else:
            plt.close("all")

In [43]:
# Wrap the tracks DataFrame; export_plot and show_plot keep their False defaults.
eval_tracks = EvaluateDataset(df_tracks, "Tracks")

In [44]:
# Display the wrapped DataFrame through the read-only `dataframe` property.
eval_tracks.dataframe

Unnamed: 0,id,name,popularity,duration_ms,explicit,artists,id_artists,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
0,4iJyoBOLtHqaGxP12qzhQI,Peaches (feat. Daniel Caesar & Giveon),100,198082,1,"['Justin Bieber', 'Daniel Caesar', 'Giveon']","['1uNFoZAHBGtllmzznpCI3s', '20wkVLutqVOYrc0kxF...",2021-03-19,0.677,0.6960,0,-6.181,1,0.1190,0.3210,0.000000,0.420,0.464,90.030,4
1,7lPN2DXiMsVn7XUKtOW1CS,drivers license,99,242014,1,['Olivia Rodrigo'],['1McMsnEElThX1knmY4oliG'],2021-01-08,0.585,0.4360,10,-8.761,1,0.0601,0.7210,0.000013,0.105,0.132,143.874,4
2,3Ofmpyhv5UAQ70mENzB277,Astronaut In The Ocean,98,132780,0,['Masked Wolf'],['1uU7g3DNSbsu0QjSEqZtEd'],2021-01-06,0.778,0.6950,4,-6.865,0,0.0913,0.1750,0.000000,0.150,0.472,149.996,4
3,5QO79kh1waicV47BqGRL3g,Save Your Tears,97,215627,1,['The Weeknd'],['1Xyo4u8uXC1ZmMpatF05PJ'],2020-03-20,0.680,0.8260,0,-5.487,1,0.0309,0.0212,0.000012,0.543,0.644,118.051,4
4,6tDDoYIxWvMLTdKpjFkc1B,telepatía,97,160191,0,['Kali Uchis'],['1U1el3k54VvEUzo3ybLPlM'],2020-12-04,0.653,0.5240,11,-9.016,0,0.0502,0.1120,0.000000,0.203,0.553,83.970,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
586667,7cHgF1z7KeampXC54Vp0dd,My Baby Knows How,0,192701,0,"[""Coon-Sanders' Original Nighthawk Orchestra""]",['53E8i8Zawt25O5bICAORNO'],1932,0.525,0.4750,8,-7.581,1,0.1660,0.9930,0.196000,0.293,0.769,209.427,4
586668,0uszXvZLZ2QpVJ45bkjDe9,Satyi Katha Galpo Na,0,174520,0,['Robin Majumdar'],['76TzDmArcIh69amb9LzJff'],1949-12-31,0.559,0.2020,11,-17.190,1,0.1430,0.9940,0.851000,0.151,0.524,125.232,3
586669,0GLyKUbNvQXoDPu6tWldbQ,O Come All Ye Faithful,0,139213,0,['Julie Andrews'],['5RdqZVi36tpDPYNPw8jJbO'],1945,0.394,0.2280,7,-15.435,1,0.0303,0.6960,0.000000,0.366,0.289,104.027,4
586670,0ulNCe6co5KxRPdGqBv8TK,Alegrias,0,159414,0,['Carlos Montoya'],['0fqQJD6wePdVDxuPUVrLyX'],1949-07-09,0.487,0.7570,10,-13.430,1,0.0739,0.9850,0.901000,0.140,0.467,82.625,4
