# Workshop #2 :
## EDA - Spotify Tracks Dataset

------------------------------------------------------------

https://www.kaggle.com/datasets/maharshipandya/-spotify-tracks-dataset

In [58]:
import sys
import os
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from tabulate import tabulate
import logging

sys.path.append(os.path.abspath('../'))
from src.logging_config import setup_logging

In [59]:
setup_logging()

## Data load
-------------------

In [2]:
# Relative path to the Spotify tracks CSV (resolved against the notebook's working directory)
csv_file = '../data/external/spotify_dataset.csv'

# Load the whole file into a DataFrame using pandas' default parsing options
df = pd.read_csv(csv_file)

# Preview the first five rows
df.head()

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,1,4qPNDBW1i3p13qLCt0Ki3A,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,...,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,2,1iJBSr7s7jYXzM8EGcbK5b,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,...,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,3,6lfxq3CG4xtTiEg7opyCyx,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,...,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,4,5vjLSffimiIP26QG5WcN2K,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,...,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


## Data Overview and Descriptive Statistics
-------------------------------------------

### Overview

The number of observations and features is obtained through pandas' `.shape` attribute. The "Spotify" dataset contains **114,000 observations (rows)** and **21 features (columns)**.

In [3]:
df.shape

(114000, 21)

The data types are obtained through pandas' `.info()` method. The DataFrame contains only 1 boolean feature, 5 object type features, 6 int64 type features and 9 float64 type features.


In [116]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 113549 entries, 0 to 113999
Data columns (total 20 columns):
 #   Column            Non-Null Count   Dtype  
---  ------            --------------   -----  
 0   track_id          113549 non-null  object 
 1   artists           113549 non-null  object 
 2   album_name        113549 non-null  object 
 3   track_name        113549 non-null  object 
 4   popularity        113549 non-null  int64  
 5   duration_ms       113549 non-null  int64  
 6   explicit          113549 non-null  bool   
 7   danceability      113549 non-null  float64
 8   energy            113549 non-null  float64
 9   key               113549 non-null  int64  
 10  loudness          113549 non-null  float64
 11  mode              113549 non-null  int64  
 12  speechiness       113549 non-null  float64
 13  acousticness      113549 non-null  float64
 14  instrumentalness  113549 non-null  float64
 15  liveness          113549 non-null  float64
 16  valence           113549 

`Unnamed: 0` column is dropped as it is not part of the original dataset.

In [5]:
# Remove the `Unnamed: 0` column, which is not part of the original dataset.
# errors="ignore" keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of raising KeyError.
df = df.drop(columns=["Unnamed: 0"], errors="ignore")

The duplicated rows are obtained through pandas' `.duplicated()` method. The DataFrame has 450 duplicate rows.

In [6]:
df[df.duplicated()].shape[0]

450

The missing values per feature are obtained through pandas' `.isnull().sum()` chain. Only the features "artists", "album_name" and "track_name" have missing values, one missing value for each feature. These features, as indicated by their object datatype, are qualitative variables.

In [7]:
df.isnull().sum()

track_id            0
artists             1
album_name          1
track_name          1
popularity          0
duration_ms         0
explicit            0
danceability        0
energy              0
key                 0
loudness            0
mode                0
speechiness         0
acousticness        0
instrumentalness    0
liveness            0
valence             0
tempo               0
time_signature      0
track_genre         0
dtype: int64

Rows with null values are filtered  using the `.isnull()` method combined with `.any(axis=1)`. Only one row contains the missing values.

In [8]:
# Select every row that holds at least one missing value in any column
filtered_rows = df.loc[df.isna().any(axis="columns")]

filtered_rows

Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
65900,1kR4gIb7nGxHPI3D2ifs59,,,,0,0,False,0.501,0.583,7,-9.46,0,0.0605,0.69,0.00396,0.0747,0.734,138.391,4,k-pop


The percentage of missing data is approximately 0.0001%, which in itself is not very significant.

In [9]:
round(df.isnull().sum().sum() / df.size * 100, 4) 

np.float64(0.0001)

### Descriptive statistics

#### Quantitative variables

Descriptive statistics of the quantitative variables are generated through pandas' `.describe()` method.



In [10]:
df.describe()

Unnamed: 0,popularity,duration_ms,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature
count,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0,114000.0
mean,33.238535,228029.2,0.5668,0.641383,5.30914,-8.25896,0.637553,0.084652,0.31491,0.15605,0.213553,0.474068,122.147837,3.904035
std,22.305078,107297.7,0.173542,0.251529,3.559987,5.029337,0.480709,0.105732,0.332523,0.309555,0.190378,0.259261,29.978197,0.432621
min,0.0,0.0,0.0,0.0,0.0,-49.531,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,17.0,174066.0,0.456,0.472,2.0,-10.013,0.0,0.0359,0.0169,0.0,0.098,0.26,99.21875,4.0
50%,35.0,212906.0,0.58,0.685,5.0,-7.004,1.0,0.0489,0.169,4.2e-05,0.132,0.464,122.017,4.0
75%,50.0,261506.0,0.695,0.854,8.0,-5.003,1.0,0.0845,0.598,0.049,0.273,0.683,140.071,4.0
max,100.0,5237295.0,0.985,1.0,11.0,4.532,1.0,0.965,0.996,1.0,1.0,0.995,243.372,5.0


#### Qualitative variables

Panda's `.describe` method is used with the parameter `include='object'` for describing all qualitative columns of the DataFrame.

In [11]:
df.describe(include='object') 

Unnamed: 0,track_id,artists,album_name,track_name,track_genre
count,114000,113999,113999,113999,114000
unique,89741,31437,46589,73608,114
top,6S3JlDAGk3uu3NtZbPnuhS,The Beatles,Alternative Christmas 2022,Run Rudolph Run,acoustic
freq,9,279,195,151,1000


1. **Observation Count**: The column `track_id` matches the total number of observations (114,000), which indicates every row has a `track_id` entry.

2. **Uniqueness**: Despite having 114,000 observations, `track_id` contains only 89,741 unique values. This suggests that some `track_id`s are repeated across multiple rows.

3. **track_genre**: This column has very few unique values (only 114), meaning it is highly categorical and repetitive.

4. **Frequent Entries**: The `top` values show the most frequent entry for each column, and `freq` gives the count of that value. "The Beatles" is the mode in `artists`; "Alternative Christmas 2022" is the mode in `album_name`; "Run Rudolph Run" is the mode in `track_name`; and "acoustic" is the mode in `track_genre`.

## Handling missing values
---------------------------

As established previously, only one row contains missing values, amounting to approximately 0.0001% of the data, which in itself is not very significant. So this missing data is going to be dropped.

In [12]:
df = df.dropna()

Now the DataFrame has 113,999 rows and 20 columns.

In [13]:
df.shape

(113999, 20)

## Handling duplicated values
---------------------------

As stated previously, the DataFrame has 450 duplicate rows. They are going to be dropped.

In [65]:
df = df.drop_duplicates()

In [66]:
logging.info(f"The Dataframe without duplicates has {df.shape[0]} rows and {df.shape[1]} columns")

2025-04-05 20:06:42,285 - INFO - root - The Dataframe without duplicates has 113549 rows and 20 columns


### Inspecting the duplicated entries in `track_id`

We create a `duplicates_id` DataFrame containing only the rows whose `track_id` appears more than once, and check its dimensions to assert the number of duplicated rows.

In [68]:
duplicates_id = df[df.duplicated(subset=['track_id'], keep=False)]
logging.info(f"The Dataframe with duplicates has {duplicates_id.shape[0]} rows.")

2025-04-05 20:06:48,207 - INFO - root - The Dataframe with duplicates has 40108 rows.


Now we compute the difference in the number of rows between our DataFrame (`df`) and the filtered DataFrame of duplicates (`duplicates_id`), i.e. the number of rows whose `track_id` is unique, and the percentage of `df` they represent.

In [70]:
# Rows of `df` that are not in `duplicates_id`, i.e. rows whose track_id occurs only once
difference = (df.shape[0] - duplicates_id.shape[0])

# Express that count as a percentage of the rows in `df`
percentage_non_duplicated = (difference / df.shape[0]) * 100

# Log the result (count and percentage rounded to two decimals)
logging.info(f"The number of non-duplicated rows is {difference}, which is {percentage_non_duplicated:.2f}% of the original Spotify DataFrame.")

2025-04-05 20:06:53,162 - INFO - root - The number of non-duplicated rows is 73441, which is 64.68% of the original Spotify DataFrame.


We need to check if pairs with duplicated `track_id` along with their respective `track_name` have a correspondence. To ensure that all pairs of duplicated `track_id`s have the same `track_name`, the data is grouped by `track_id` and we check if each group has only one unique `track_name`.

1. **`groupby('track_id')`**: Groups the DataFrame by `track_id`.
2. **`nunique()`**: Counts the number of unique `track_name` values in each group.
3. **Check for inconsistencies**: Identifies `track_id`s where there is more than one unique `track_name`.

In [71]:
# Number of distinct track_name values recorded under each track_id
grouped = df.groupby('track_id')['track_name'].nunique()

# Keep only the track_ids associated with two or more different names
inconsistent = grouped.loc[grouped.gt(1)]

if not inconsistent.empty:
    logging.info("Some duplicated track_ids have inconsistent track_names.")
else:
    logging.info("All duplicated track_ids have the same track_name.")


2025-04-05 20:06:56,318 - INFO - root - All duplicated track_ids have the same track_name.


As all duplicated `track_id`s have the same `track_name`, we need to further inspect whether anything differentiates these duplicate tracks.

In [61]:
# Group by 'track_id' and check for identical rows within each group
identical_groups = duplicates_id.groupby('track_id').filter(
    lambda group: group.drop_duplicates().shape[0] == 1
)

if identical_groups.empty:
    logging.info("No duplicates are fully identical across all fields.")
else:
    logging.info("Fully identical rows:")
    logging.info(identical_groups.shape[0])

2025-04-05 20:06:27,023 - INFO - root - No duplicates are fully identical across all fields.


Now we are going to check the inconsistencies with a function:


1. **Identify Duplicates**:
   - The code creates a mask to find rows where the specified `id_col` (e.g., `track_id`) has duplicate values. It counts and prints the total number of duplicated rows.

2. **Handle No Duplicates**:
   - If no duplicates are found, it prints a message and returns an empty DataFrame with columns `id_col` and `inconsistent_columns`.

3. **Analyze Inconsistencies**:
   - For each unique duplicate identifier (`track_id`), it identifies columns where values differ across rows (`nunique() > 1`).
   - It stores details about the inconsistencies, including:
     - The identifier (`track_id`).
     - The inconsistent columns.
     - The number of duplicate entries for the identifier.
     - Example values from one of the inconsistent columns.

4. **Return Results**:
   - If inconsistencies are found, it returns a DataFrame summarizing them.
   - If all duplicates are consistent across columns, it prints a message and skips the detailed summary.

In [20]:
def check_inconsistencies(df, id_col='track_id'):
    """
    Summarise which columns disagree among rows that share the same identifier.

    Rows whose `id_col` value occurs more than once are grouped by that value.
    For every group, the columns (other than `id_col`) holding more than one
    distinct non-null value are reported.

    Parameters:
        df (pd.DataFrame): The DataFrame to analyze. It is not modified.
        id_col (str): The name of the column used to identify duplicates. Defaults to 'track_id'.

    Returns:
        pd.DataFrame: One row per identifier with at least one inconsistent column,
        in order of first appearance of the identifier, with the columns:
            - `id_col`: the duplicated identifier.
            - 'inconsistent_columns': comma-separated names of the columns that differ.
            - 'n_duplicates': number of rows sharing the identifier.
            - 'example_values': string form of up to three distinct values of the
              first inconsistent column.
        The frame is empty (but keeps these columns) when there are no duplicates
        or when all duplicates are consistent.

    Behavior:
        - Logs the total number of duplicated rows.
        - Logs a message when there are no duplicates to analyze.
        - Logs a message when duplicates exist but agree on every column.

    Notes:
        `nunique()` ignores missing values, so a column that differs only by a
        NaN in one of the rows is not reported as inconsistent.
    """
    summary_columns = [id_col, 'inconsistent_columns', 'n_duplicates', 'example_values']

    dup_mask = df.duplicated(subset=id_col, keep=False)
    logging.info(f"Total duplicated registers: {dup_mask.sum()}")

    if not dup_mask.any():
        logging.info("No duplicates to analyze")
        return pd.DataFrame(columns=summary_columns)

    results = []
    # A single groupby pass replaces one full-frame boolean scan per identifier;
    # sort=False keeps identifiers in order of first appearance.
    for record_id, group in df.loc[dup_mask].groupby(id_col, sort=False):
        inconsistent = [
            col for col in group.columns
            if col != id_col and group[col].nunique() > 1
        ]
        if inconsistent:
            results.append({
                id_col: record_id,
                'inconsistent_columns': ', '.join(inconsistent),
                'n_duplicates': len(group),
                # Show the first distinct values of the first flagged column
                'example_values': str(group[inconsistent[0]].unique()[:3])
            })

    if not results:
        logging.info("Duplicates found but consistent in all columns")

    # Passing the column list keeps the schema stable even when `results` is empty
    return pd.DataFrame(results, columns=summary_columns)

The `check_inconsistencies` function is applied to the `duplicates_id` DataFrame, returning an `inconsistencies` DataFrame.

In [21]:
# Summarise the columns that disagree among rows sharing a track_id
inconsistencies = check_inconsistencies(duplicates_id)
if not inconsistencies.empty:
    logging.info("Inconsistencies found:")
    display(inconsistencies)
else:
    logging.info("No inconsistencies found in duplicates")

Unnamed: 0,track_id,inconsistent_columns,n_duplicates,example_values
0,5SuOikwiRyPMVoIQDJUgSV,track_genre,4,['acoustic' 'j-pop' 'singer-songwriter']
1,4qPNDBW1i3p13qLCt0Ki3A,track_genre,2,['acoustic' 'chill']
2,01MVOl9KtVTNfFiBU9I7dc,track_genre,2,['acoustic' 'indie-pop']
3,6Vc5wAMmXdKIAM7WUoEb7N,track_genre,2,['acoustic' 'piano']
4,1EzrEOXmMH3G43AXT1y7pA,track_genre,2,['acoustic' 'rock']
...,...,...,...,...
16294,79cxnmnGiC0qZfxi5ogp4j,track_genre,2,['techno' 'trance']
16295,1B0FEDRzzN5GP7HGZZfNQl,track_genre,2,['techno' 'trance']
16296,4D41idYLHmXYGaHZeRWtPT,track_genre,2,['techno' 'trip-hop']
16297,27nGU2v3syK7aU3AVY2vUO,track_genre,2,['techno' 'trance']


In the `inconsistencies` DataFrame the column `inconsistent_columns` records which columns had inconsistencies for each duplicated entry. Through pandas' `.value_counts()` method we count the occurrences of each unique value in the `inconsistent_columns` column to pinpoint which column or columns are inconsistent among duplicated tracks.

In [72]:
logging.info(inconsistencies['inconsistent_columns'].value_counts())

2025-04-05 20:07:09,375 - INFO - root - inconsistent_columns
track_genre                15579
popularity, track_genre      720
Name: count, dtype: int64


It seems only `track_genre` and `popularity` are the inconsistent columns between duplicated tracks. The feature `track_genre` is a qualitative nominal variable indicating the genre in which the track belongs; whereas `popularity` is an int64 value between 0 and 100, indicating the popularity of a track.

This entails that we cannot handle duplicates for `popularity` using the mean, as it is not ideal for integers due to potential floating-point results. The median poses the same problem when the number of duplicates is even, with two being the most common number of duplicates.


In [74]:
logging.info(inconsistencies['n_duplicates'].value_counts())

2025-04-05 20:07:13,566 - INFO - root - n_duplicates
2    11424
3     2955
4     1361
5      431
6      104
7       21
8        2
9        1
Name: count, dtype: int64


#### Handling `popularity` duplicates

pandas' `Series.mode()` does not fail when every value among the duplicates is distinct: it returns all of them as tied modes. We therefore need a tie-breaking rule, and here we keep the maximum. Only when a group has no non-null values at all is the mode empty, and in that case we fall back to the median.

In [75]:
def resolve_popularity(x):
    """
    Pick one representative popularity value for a group of duplicated tracks.

    Parameters:
        x (pd.Series): Popularity values of the rows sharing one track_id.

    Returns:
        The single most frequent value; the largest of the tied values when
        several are equally frequent (this includes the case where every value
        is distinct); or the median of `x` when `mode()` yields nothing, which
        happens when `x` has no non-null values.
    """
    candidates = x.mode()

    # Nothing to choose from: defer to the median
    if candidates.empty:
        return x.median()

    # One clear winner, otherwise break the tie with the highest value
    return candidates.iloc[0] if candidates.size == 1 else max(candidates)

In [76]:
df1 = df.copy() # work on a copy so `df` keeps its original popularity values
# Give every row the popularity chosen by resolve_popularity for its track_id group, stored as int
df1['popularity'] = df1.groupby('track_id')['popularity'].transform(resolve_popularity).astype(int)


#### Handling `track_genre` duplicates

In [77]:
np.sort(df1.track_genre.unique())

array(['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient',
       'anime', 'black-metal', 'bluegrass', 'blues', 'brazil',
       'breakbeat', 'british', 'cantopop', 'chicago-house', 'children',
       'chill', 'classical', 'club', 'comedy', 'country', 'dance',
       'dancehall', 'death-metal', 'deep-house', 'detroit-techno',
       'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm',
       'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk',
       'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove',
       'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle',
       'heavy-metal', 'hip-hop', 'honky-tonk', 'house', 'idm', 'indian',
       'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol',
       'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino',
       'malay', 'mandopop', 'metal', 'metalcore', 'minimal-techno', 'mpb',
       'new-age', 'opera', 'pagode', 'party', 'piano', 'pop', 'pop-film',
       'pow

To focus on genre solely on sound characteristics, we will remove genres like ‘British’, ‘French’, or ‘German’ from the target variable. These classifications are based on origin or language, which aren’t captured by the audio features in the dataset.

In [81]:
# Genres defined by origin or language rather than by sound.
# 'brazilian' is not a raw label of this dataset: the raw labels are 'brazil'
# and 'mpb', which the consolidation step only later renames to 'brazilian'.
# Listing the raw labels makes the filter effective on a fresh top-to-bottom
# run; 'brazilian' is kept so re-running the cell after consolidation still works.
non_sound_based_categories = ['british','brazil','mpb','brazilian','french','german','iranian','swedish','spanish','indian','malay','turkish','world-music','gospel']

# Drop every row whose genre is in the list above
df1 = df1.drop(df1[df1['track_genre'].isin(non_sound_based_categories)].index)

Now we are going to consolidate genres into major genres and subgenres with a dictionary (this dictionary is the result of a machine learning exercise performed on this same dataset by Juan Francisco Leonhardt). It can be found in [Music Genre Classification: A Machine Learning Exercise](https://medium.com/@juanfraleonhardt/music-genre-classification-a-machine-learning-exercise-9c83108fd2bb)

In [82]:
# Consolidated genre name -> list of original `track_genre` labels merged into it.
# Some consolidated names reuse an original label (e.g. 'rock', 'pop', 'edm').

consolidated_genres = {'agressive-fusion': ['dubstep', 'grunge', 'metal'],
                       'industrial': ['goth', 'heavy-metal', 'industrial'],
                       'punk-rock': ['alt-rock', 'garage', 'hard-rock', 'j-rock', 'punk', 'punk-rock'],
                       'hardstyle': ['happy', 'hardstyle'],
                       'disco-ska': ['disco', 'ska', 'synth-pop'],
                       'rock': ['alternative', 'rock'],
                       'anime': ['anime', 'club'],
                       'edm-house': ['deep-house', 'electronic', 'progressive-house'],
                       'edm': ['dub', 'edm', 'electro', 'groove', 'house'],
                       'j-dance': ['dancehall', 'j-dance'],
                       'funk-hip-hop': ['funk', 'hip-hop'],
                       'latin': ['dance', 'latin', 'latino', 'reggae', 'reggaeton'],
                       'pop': ['k-pop', 'pop', 'pop-film'],
                       'brazilian': ['brazil', 'mpb'],
                       'blues-rnb': ['blues', 'j-pop', 'r-n-b'],
                       'indie': ['folk', 'indie', 'indie-pop', 'psych-rock'],
                       'chill': ['chill', 'sad'],
                       'pagode-samba': ['pagode', 'samba', 'sertanejo'],
                       'country-soul': ['country', 'soul'],
                       'rock-n-roll': ['rock-n-roll', 'rockabilly'],
                       'chicago-house': ['chicago-house', 'detroit-techno'],
                       'jazz-tango': ['honky-tonk', 'jazz', 'tango'],
                       'vocal-pop': ['acoustic', 'cantopop', 'mandopop', 'singer-songwriter', 'songwriter'],
                       'disney': ['disney', 'guitar'],
                       'soundscape': ['ambient', 'new-age']}

# Invert the dictionary into a flat lookup: original label -> consolidated name
genre_map = {old_genre: new_genre for new_genre, old_genres in consolidated_genres.items() for old_genre in old_genres}

# Rewrite `track_genre` using the lookup; labels absent from the lookup are left unchanged
df1['track_genre'] = df1['track_genre'].replace(genre_map)

The remaining genres are the following.

In [83]:
logging.info(np.sort(df1.track_genre.unique()))
logging.info(f"Total number of unique values: {df1['track_genre'].nunique()}")


2025-04-05 20:08:09,504 - INFO - root - ['afrobeat' 'agressive-fusion' 'anime' 'black-metal' 'bluegrass'
 'blues-rnb' 'breakbeat' 'chicago-house' 'children' 'chill' 'classical'
 'comedy' 'country-soul' 'death-metal' 'disco-ska' 'disney'
 'drum-and-bass' 'edm' 'edm-house' 'emo' 'forro' 'funk-hip-hop'
 'grindcore' 'hardcore' 'hardstyle' 'idm' 'indie' 'industrial' 'j-dance'
 'j-idol' 'jazz-tango' 'kids' 'latin' 'metalcore' 'minimal-techno' 'opera'
 'pagode-samba' 'party' 'piano' 'pop' 'power-pop' 'punk-rock' 'rock'
 'rock-n-roll' 'romance' 'salsa' 'show-tunes' 'sleep' 'soundscape' 'study'
 'techno' 'trance' 'trip-hop' 'vocal-pop']
2025-04-05 20:08:09,511 - INFO - root - Total number of unique values: 54


For `track_genre` we are going to handle duplicates with a function in the following way:


1. **Group by `track_id`**:
   - Groups rows based on the unique identifier column (`track_id`).

2. **Mode Check**:
   - Uses `.mode()` to check for the most frequent value in the `track_genre` column.
   - If there’s a single mode, it is selected as the resolved genre.

3. **Fallback to First**:
   - If there’s no mode (or multiple modes in a tie), it defaults to the first genre in the group (`iloc[0]`).

4. **Reindexing**:
   - Aligns the resolved genres with the original DataFrame indices for proper assignment.


In [84]:
def resolve_track_genre(df, id_col='track_id', genre_col='track_genre'):
    """
    Resolves inconsistencies in the `track_genre` column for duplicate entries.

    Every row receives the genre selected for its identifier: the single most
    frequent genre of the group when one exists, otherwise (tie between several
    genres, or no non-null genre) the genre of the group's first row.

    Parameters:
        df (pd.DataFrame): The DataFrame containing track data. Its `genre_col`
            column is overwritten in place.
        id_col (str): The column representing unique identifiers (e.g., 'track_id').
        genre_col (str): The column containing track genres (e.g., 'track_genre').

    Returns:
        pd.DataFrame: The same DataFrame object, with a consistent `genre_col`
                      value for each `id_col` value. No rows are removed.
    """
    def resolve_genre(genres):
        # `mode()` returns every value tied for the highest frequency
        modes = genres.mode()
        if len(modes) == 1:
            # One clear winner
            return modes.iloc[0]
        # Tie or no non-null value: fall back to the first genre of the group
        return genres.iloc[0]

    # Aggregate only the genre column. Applying a function to whole groups
    # (grouping column included) is deprecated in pandas and is not needed here.
    resolved_genres = df.groupby(id_col)[genre_col].agg(resolve_genre)

    # Map the resolved genres back to the original DataFrame
    df[genre_col] = df[id_col].map(resolved_genres)

    return df

In [85]:
# Resolve track_genre inconsistencies
df2 = df1.copy()
df2 = resolve_track_genre(df2, id_col='track_id', genre_col='track_genre')

  resolved_genres = df.groupby(id_col).apply(


The final number of genres is 54.

In [87]:
logging.info(np.sort(df2.track_genre.unique()))
logging.info(f"Total number of unique values: {df2['track_genre'].nunique()}")


2025-04-05 20:08:29,790 - INFO - root - ['afrobeat' 'agressive-fusion' 'anime' 'black-metal' 'bluegrass'
 'blues-rnb' 'breakbeat' 'chicago-house' 'children' 'chill' 'classical'
 'comedy' 'country-soul' 'death-metal' 'disco-ska' 'disney'
 'drum-and-bass' 'edm' 'edm-house' 'emo' 'forro' 'funk-hip-hop'
 'grindcore' 'hardcore' 'hardstyle' 'idm' 'indie' 'industrial' 'j-dance'
 'j-idol' 'jazz-tango' 'kids' 'latin' 'metalcore' 'minimal-techno' 'opera'
 'pagode-samba' 'party' 'piano' 'pop' 'power-pop' 'punk-rock' 'rock'
 'rock-n-roll' 'romance' 'salsa' 'show-tunes' 'sleep' 'soundscape' 'study'
 'techno' 'trance' 'trip-hop' 'vocal-pop']
2025-04-05 20:08:29,797 - INFO - root - Total number of unique values: 54


#### Final handling

The functions for handling these columns only modify the data; they do not remove duplicate rows automatically, so we need to verify and clean up duplicates explicitly. Bearing this in mind, we verify that the duplicated rows in `df2` are identical.

In [88]:
# A track_id group is made of fully identical rows exactly when every column
# holds a single distinct value within the group (NaN counted as a value, the
# same way drop_duplicates treats it). Counting distinct values for all groups
# in one vectorised pass avoids calling a Python lambda once per track_id.
# Groups with a single row satisfy the condition trivially and are included.
distinct_per_group = df2.groupby('track_id').nunique(dropna=False)
identical_ids = distinct_per_group.index[distinct_per_group.eq(1).all(axis=1)]

# Keep the rows (in their original order) that belong to those groups
identical_groups = df2[df2['track_id'].isin(identical_ids)]

if identical_groups.empty:
    logging.info("No duplicates are fully identical across all fields.")
else:
    logging.info("Fully identical rows:")
    logging.info(identical_groups.shape[0])

2025-04-05 20:10:29,532 - INFO - root - Fully identical rows:
2025-04-05 20:10:29,533 - INFO - root - 100606


Now that we have verified it, we are going to drop the duplicates. We keep the first occurrence, as the inconsistencies were already handled and the duplicated rows contain the same data.

In [89]:
df2 = df2.drop_duplicates(subset='track_id', keep='first')

We verify the new dimensions of the `df2` Dataframe.

In [91]:
logging.info(f"The new dimensions of the Dataframe after handling and dropping duplicates are {df2.shape[0]} rows and {df2.shape[1]} columns.")

2025-04-05 20:10:35,294 - INFO - root - The new dimensions of the Dataframe after handling and dropping duplicates are 80081 rows and 20 columns.


 ## Transforming `duration_ms` into minutes for readability
 ----------------------------------------------------------

We convert `duration_ms` into whole minutes in a new column `minutes` for better understanding.

In [92]:
# Convert milliseconds to minutes
df2['minutes'] = (df2['duration_ms'] // 60000)  # Divide by 60000 to get minutes

The result might be stored as floats instead of integers, so we make sure the values are treated as integers.

In [93]:
# Store the whole-minute values with an integer dtype. The cast must read from
# `minutes`: casting `duration_ms` here would overwrite the column with raw
# milliseconds.
df2['minutes'] = df2['minutes'].astype(int)

## Transforming `mode`

Mode indicates the modality (major or minor) of a track, the type of scale from which its melodic content is derived. Major is represented by 1 and minor is 0. We are going to verify if the data of this feature is boolean and if its values are "0" and "1". The result is that its values are indeed "0" and "1", but its data type is int64.

In [100]:
logging.info(f"Number of unique values: {df2['mode'].nunique()}")
logging.info(f"Unique values: {df2['mode'].unique()}")
logging.info(f"Data type:{df2['mode'].dtype}")


2025-04-05 20:14:55,737 - INFO - root - Number of unique values: 2
2025-04-05 20:14:55,738 - INFO - root - Unique values: [0 1]
2025-04-05 20:14:55,740 - INFO - root - Data type:int64


We are going to transform this feature into a nominal feature for better understanding, mapping the zeros to "minor" and the ones to "major".

In [103]:
df2['mode_nominal'] = df2['mode'].map({0: 'minor', 1: 'major'}).astype(str)

We verify.

In [115]:
logging.info(f"Number of unique values: {df2['mode_nominal'].nunique()}")
logging.info(f"Unique values: {df2['mode_nominal'].unique()}")
logging.info(f"Data type:{df2['mode_nominal'].dtype}")

2025-04-05 20:30:48,958 - INFO - root - Number of unique values: 2
2025-04-05 20:30:48,972 - INFO - root - Unique values: ['minor' 'major']
2025-04-05 20:30:48,975 - INFO - root - Data type:object


## Transforming `key`

The key the track is in. Integers map to pitches using standard Pitch Class notation, e.g. 0 = C, 1 = C♯/D♭, 2 = D, and so on. If no key was detected, the value is -1. We are going to verify that the data type of this feature is int64 and that its values correspond to the documented ones. The result is that its values are indeed numerical with dtype int64, all keys are present (there are 12 chromatic `key`s), and there are no tracks without a detected key.

In [109]:
logging.info(f"Number of unique values: {df2['key'].nunique()}")
logging.info(f"Unique values: {np.sort(df2.key.unique())}") 
logging.info(f"Data type:{df2['key'].dtype}")

2025-04-05 20:26:50,727 - INFO - root - Number of unique values: 12
2025-04-05 20:26:50,730 - INFO - root - Unique values: [ 0  1  2  3  4  5  6  7  8  9 10 11]
2025-04-05 20:26:50,732 - INFO - root - Data type:int64


We are going to transform these values into nominal ones for better understanding. This is done through mapping with a dictionary constructed from the documentation for this dataset.

In [111]:
# Pitch names keyed by their integer pitch-class code (0 = C ... 11 = B);
# -1 is the documented code for a track with no detected key.
key_mapping = {
    -1: 'No Key Detected',
    **dict(enumerate([
        'C', 'C♯/D♭', 'D', 'D♯/E♭', 'E', 'F',
        'F♯/G♭', 'G', 'G♯/A♭', 'A', 'A♯/B♭', 'B',
    ])),
}

# Translate each numeric key into its pitch name in a new column
df2['key_nominal'] = df2['key'].map(key_mapping)

We verify again.

In [113]:
logging.info(f"Number of unique values: {df2['key_nominal'].nunique()}")
logging.info(f"Unique values: {np.sort(df2.key_nominal.unique())}") 
logging.info(f"Data type:{df2['key_nominal'].dtype}")

2025-04-05 20:29:55,473 - INFO - root - Number of unique values: 12
2025-04-05 20:29:55,485 - INFO - root - Unique values: ['A' 'A♯/B♭' 'B' 'C' 'C♯/D♭' 'D' 'D♯/E♭' 'E' 'F' 'F♯/G♭' 'G' 'G♯/A♭']
2025-04-05 20:29:55,488 - INFO - root - Data type:object


# Final exploration

-----------------------------------------------------------------------------------------------

In [None]:
# Number of tracks per duration value, ordered by duration
minutes_counts = df2['minutes'].value_counts().sort_index()

# Draw on an explicit figure/axes pair instead of the implicit pyplot state
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=minutes_counts.index, y=minutes_counts.values, palette="magma", dodge=False, ax=ax)

# Add labels and title
ax.tick_params(axis='x', labelrotation=90)  # Rotate x-axis labels for better readability
ax.set_xlabel("Track duration (minutes)")
ax.set_ylabel("Count")
ax.set_title("Distribution of Track Duration in Minutes")
fig.tight_layout()  # Adjust layout for better fit

# Display the plot (once)
plt.show()