In [2]:

# importamos las librerías que necesitamos

# Tratamiento de datos
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualización
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Evaluar linealidad de las relaciones entre las variables
# y la distribución de las variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Imputación de nulos usando métodos avanzados estadísticos
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuración
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # para poder visualizar todas las columnas de los DataFrames

# Gestión de los warnings
# -----------------------------------------------------------------------
import warnings
warnings.filterwarnings("ignore")


# Explorando CSV para línea del tiempo de música rock

In [3]:
# cargamos el dataframe correspondiente 
df = pd.read_csv("archivos/UltimateClassicRock.csv")

display(df.head())

Unnamed: 0,Track,Artist,Album,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity
0,Play A Simple Song,38 Special,38 Special,1977,3:13,4,0.521,0.367,0,-13.866,1,0.0278,0.692,3e-06,0.108,0.789,83.412,16
1,Four Wheels,38 Special,38 Special,1977,4:43,4,0.535,0.71,2,-12.287,1,0.0428,0.01,0.023,0.0495,0.445,160.361,10
2,Fly Away,38 Special,38 Special,1977,5:13,4,0.563,0.563,2,-10.781,1,0.0263,0.0357,0.00185,0.14,0.564,106.739,13
3,Tell Everybody,38 Special,38 Special,1977,4:09,4,0.638,0.694,11,-10.206,0,0.031,0.161,3.4e-05,0.0908,0.936,124.962,10
4,Just Wanna Rock & Roll,38 Special,38 Special,1977,5:57,4,0.388,0.701,2,-9.984,1,0.036,0.013,0.0422,0.115,0.769,126.769,11


In [4]:
df.columns

Index(['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature',
       'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness',
       'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
       'Popularity'],
      dtype='object')

In [5]:
def exploracion(df):
    df_info = pd.DataFrame()
    df_info["% nulos"] = round(df.isna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["% no_nulos"] = round(df.notna().sum()/df.shape[0]*100, 2).astype(str)+"%"
    df_info["tipo_dato"] = df.dtypes
    df_info["num_valores_unicos"] = df.nunique()
    print(f"""El DataFrame tiene {df.shape[0]} filas y {df.shape[1]} columnas.
Tiene {df.duplicated().sum()} datos duplicados, lo que supone un porcentaje de {round(df.duplicated().sum()/df.shape[0], 2)}% de los datos.
Hay {len(list(df_info[(df_info["% nulos"] != "0.0%")].index))} columnas con datos nulos, y son:
{list(df_info[(df_info["% nulos"] != "0.0%")].index)}
y sin nulos hay {len(list(df_info[(df_info["% nulos"] == "0.0%")].index))} columnas y son:
{list(df_info[(df_info["% nulos"] == "0.0%")].index)}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df_info.head())
    print("Principales estadísticos de las columnas categóricas:")
    display(df.describe(include="O").T)
    print("Principales estadísticos de las columnas numéricas:")
    display(df.describe(exclude="O").T)
    return df_info

exploracion(df)

El DataFrame tiene 14418 filas y 18 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 0 columnas con datos nulos, y son:
[]
y sin nulos hay 18 columnas y son:
['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Popularity']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Track,0.0%,100.0%,object,13328
Artist,0.0%,100.0%,object,94
Album,0.0%,100.0%,object,1247
Year,0.0%,100.0%,int64,63
Duration,0.0%,100.0%,object,911


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Track,14418,13328,Start Me Up - Live,8
Artist,14418,94,Bob Dylan,454
Album,14418,1247,Fleetwood Mac,35
Duration,14418,911,4:13,102


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,14418.0,1987.634693,15.318819,1962.0,1975.0,1983.0,1999.0,2024.0
Time_Signature,14418.0,3.917811,0.356628,0.0,4.0,4.0,4.0,5.0
Danceability,14418.0,0.503063,0.142619,0.0,0.405,0.509,0.603,0.987
Energy,14418.0,0.656563,0.229607,0.0,0.493,0.6905,0.854,0.998
Key,14418.0,5.166597,3.503423,0.0,2.0,5.0,9.0,11.0
Loudness,14418.0,-9.438675,4.179623,-60.0,-11.91425,-8.8105,-6.3415,-0.203
Mode,14418.0,0.730129,0.443908,0.0,0.0,1.0,1.0,1.0
Speechiness,14418.0,0.051354,0.046291,0.0,0.0318,0.039,0.0543,0.952
Acousticness,14418.0,0.226924,0.268857,0.0,0.0131,0.104,0.37,0.995
Instrumentalness,14418.0,0.089682,0.215783,0.0,1.3e-05,0.000737,0.028375,0.992


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Track,0.0%,100.0%,object,13328
Artist,0.0%,100.0%,object,94
Album,0.0%,100.0%,object,1247
Year,0.0%,100.0%,int64,63
Duration,0.0%,100.0%,object,911
Time_Signature,0.0%,100.0%,int64,5
Danceability,0.0%,100.0%,float64,788
Energy,0.0%,100.0%,float64,1075
Key,0.0%,100.0%,int64,12
Loudness,0.0%,100.0%,float64,8993


## Valores unicos de las variables categoricas ✨

In [6]:
# creamos una lista con los nombres de las columnas categoricas 
columnas = df.select_dtypes(include='object').columns.tolist()
print(columnas)
# empezamos a iterar por cada una de las columnas para sacar sus valores únicos y sus frecuencias
for columna in columnas:
    print(f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n")
    print(f"Sus valores únicos son: {df[columna].unique()}\n")
    print(f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ")

['Track', 'Artist', 'Album', 'Duration']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'TRACK' -----------

Sus valores únicos son: ['Play A Simple Song' 'Four Wheels' 'Fly Away' ... 'Have A Little Mercy'
 "Flyin' High" 'Heartache In Blue']

Las frecuencias de los valores únicos de las categorías son: Track
Start Me Up - Live         8
Tumbling Dice - Live       8
Honky Tonk Women - Live    8
Tonight                    7
Without You                7
                          ..
Four Little Diamonds       1
Rock 'N' Roll Is King      1
Without Someone            1
Sorrow About to Fall       1
Heartache In Blue          1
Name: count, Length: 13328, dtype: int64 
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'ARTIST' -----------

Sus valores únicos son: ['38 Special' 'ABBA' 'Aerosmith' 'Air Suppy' 'Alice Cooper'
 'Allman Brothers' 'America' 'Bachman-Turner Overdrive' 'Bad Company'
 'Bee Gees' 'Billy Joel' 'Black Sabbath' 'Blondie' 'Blue Oyster Cult'
 'Bob Dylan' 'Bob Seger' 'Bon Jovi' 'Bos

## DUPLICADOS

In [7]:
get_duplicate_rows(df)

NameError: name 'get_duplicate_rows' is not defined

In [43]:
# index=False es útil si solo quieres los datos y no necesitas la columna de índice.
#df.to_csv("archivos/informacion_artista_MOD.csv", index=False)

# Coger desde el 2010 hasta el 2020 de los dos géneros para que el CSV pese menos. Haciendo limpieza.

In [10]:
df_filtrado = df[(df['Year'] >= 2000) & (df['Year'] <= 2020)]

In [13]:
df_filtrado.to_csv('archivos/UltimateClassicRock_Final.csv')

### Explorando otro archivo de música electrónica para ver de cuántos años hay datos.

In [19]:
# cargamos el dataframe correspondiente 
df2 = pd.read_csv("archivos/Electro_music_on_Spotify.csv", encoding="utf-8")

display(df2.head())

Unnamed: 0,track_name,track_id,track_popularity,track_number,explicit,available_markets,artists_names,artists_ids,album_id,main_artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_sec,time_signature,album_name,album_release_date,total_tracks,type,image_url,album_popularity,album_label,followers,genres,artist_name,artist_popularity,lowest position,mean_position,position_std,best_position,times_in_rating,born_or_founded_in,positions_and_years_data,dj_score,release_year,release_month,track_name_length,main_artist_name_length,album_name_length,available_markets_count,artists_count,cover_id
0,God Is A Dancer,6mIrY9axk9DkBCk4eHXL6c,41.0,1.0,False,"['AD', 'AE', 'AG', 'AL', 'AR', 'AT', 'AU', 'BA...","['Tiësto', 'Mabel']","['2o5jDhtHVPhrJdv3cEQ99Z', '1MIVXf74SZHmTIp4V4...",6CIslPQSknp875cigkhKJC,2o5jDhtHVPhrJdv3cEQ99Z,0.773,0.747,1.0,-3.908,0.0,0.082,0.0241,6e-06,0.342,0.833,119.94,168.125,4.0,The London Sessions,2020-05-15,13.0,album,https://i.scdn.co/image/ab67616d00001e029759d6...,66.0,"Universal Music, a division of Universal Inter...",6229039.0,"['big room', 'brostep', 'dance pop', 'dutch ed...",TIËSTO,87.0,16.0,4.945,4.234,1.0,18.0,"Breda, Breda, Noord-Brabant, Netherlands","{2004: 1, 2005: 2, 2006: 3, 2007: 2, 2008: 2, ...",94.56,2020,5,15,6,19,106,2,ab67616d00001e029759d6dfa2c19091814fccb3
1,Nothing Really Matters,39TATbzOKDwiWdrmuQBLGK,40.0,2.0,False,"['AD', 'AE', 'AG', 'AL', 'AR', 'AT', 'AU', 'BA...","['Tiësto', 'Becky Hill']","['2o5jDhtHVPhrJdv3cEQ99Z', '4EPJlUEBy49EX1wuFO...",6CIslPQSknp875cigkhKJC,2o5jDhtHVPhrJdv3cEQ99Z,0.732,0.847,5.0,-4.254,0.0,0.0426,0.00491,2e-06,0.281,0.697,123.0,157.478,4.0,The London Sessions,2020-05-15,13.0,album,https://i.scdn.co/image/ab67616d00001e029759d6...,66.0,"Universal Music, a division of Universal Inter...",6229039.0,"['big room', 'brostep', 'dance pop', 'dutch ed...",TIËSTO,87.0,16.0,4.945,4.234,1.0,18.0,"Breda, Breda, Noord-Brabant, Netherlands","{2004: 1, 2005: 2, 2006: 3, 2007: 2, 2008: 2, ...",94.56,2020,5,22,6,19,106,2,ab67616d00001e029759d6dfa2c19091814fccb3
2,Ride,6GpoUPegO1TBbZCoE7FxZ7,38.0,3.0,True,"['AD', 'AE', 'AG', 'AL', 'AR', 'AT', 'AU', 'BA...","['Tiësto', 'The Kid Daytona', 'ROE']","['2o5jDhtHVPhrJdv3cEQ99Z', '1U77TS18o4qUO3bwq0...",6CIslPQSknp875cigkhKJC,2o5jDhtHVPhrJdv3cEQ99Z,0.734,0.855,11.0,-4.438,1.0,0.0412,0.0397,0.000278,0.105,0.223,117.0,203.479,4.0,The London Sessions,2020-05-15,13.0,album,https://i.scdn.co/image/ab67616d00001e029759d6...,66.0,"Universal Music, a division of Universal Inter...",6229039.0,"['big room', 'brostep', 'dance pop', 'dutch ed...",TIËSTO,87.0,16.0,4.945,4.234,1.0,18.0,"Breda, Breda, Noord-Brabant, Netherlands","{2004: 1, 2005: 2, 2006: 3, 2007: 2, 2008: 2, ...",94.56,2020,5,4,6,19,106,3,ab67616d00001e029759d6dfa2c19091814fccb3
3,Ritual,0teJO13Uua0AamcZ681qOd,51.0,4.0,False,"['AD', 'AE', 'AG', 'AL', 'AR', 'AT', 'AU', 'BA...","['Tiësto', 'Jonas Blue', 'Rita Ora']","['2o5jDhtHVPhrJdv3cEQ99Z', '1HBjj22wzbscIZ9sEb...",6CIslPQSknp875cigkhKJC,2o5jDhtHVPhrJdv3cEQ99Z,0.647,0.726,3.0,-4.39,0.0,0.0552,0.157,0.0,0.0844,0.769,115.0,198.996,4.0,The London Sessions,2020-05-15,13.0,album,https://i.scdn.co/image/ab67616d00001e029759d6...,66.0,"Universal Music, a division of Universal Inter...",6229039.0,"['big room', 'brostep', 'dance pop', 'dutch ed...",TIËSTO,87.0,16.0,4.945,4.234,1.0,18.0,"Breda, Breda, Noord-Brabant, Netherlands","{2004: 1, 2005: 2, 2006: 3, 2007: 2, 2008: 2, ...",94.56,2020,5,6,6,19,106,3,ab67616d00001e029759d6dfa2c19091814fccb3
4,Jackie Chan,4vvnuJlgBeNVwq3TNmLMNX,69.0,5.0,True,"['AD', 'AE', 'AG', 'AL', 'AR', 'AT', 'AU', 'BA...","['Tiësto', 'Dzeko', 'Preme', 'Post Malone']","['2o5jDhtHVPhrJdv3cEQ99Z', '5vQfv3s2Z2SRdPZKr8...",6CIslPQSknp875cigkhKJC,2o5jDhtHVPhrJdv3cEQ99Z,0.747,0.833,3.0,-2.867,0.0,0.045,0.374,0.0,0.0586,0.687,128.0,215.76,4.0,The London Sessions,2020-05-15,13.0,album,https://i.scdn.co/image/ab67616d00001e029759d6...,66.0,"Universal Music, a division of Universal Inter...",6229039.0,"['big room', 'brostep', 'dance pop', 'dutch ed...",TIËSTO,87.0,16.0,4.945,4.234,1.0,18.0,"Breda, Breda, Noord-Brabant, Netherlands","{2004: 1, 2005: 2, 2006: 3, 2007: 2, 2008: 2, ...",94.56,2020,5,11,6,19,106,4,ab67616d00001e029759d6dfa2c19091814fccb3


In [20]:
df2.sample()

Unnamed: 0,track_name,track_id,track_popularity,track_number,explicit,available_markets,artists_names,artists_ids,album_id,main_artist_id,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,duration_sec,time_signature,album_name,album_release_date,total_tracks,type,image_url,album_popularity,album_label,followers,genres,artist_name,artist_popularity,lowest position,mean_position,position_std,best_position,times_in_rating,born_or_founded_in,positions_and_years_data,dj_score,release_year,release_month,track_name_length,main_artist_name_length,album_name_length,available_markets_count,artists_count,cover_id
28398,K-una,7sV5xw7mIZ6SUFq0P9ob9q,0.0,8.0,False,"['MX', 'US']",['Sébastien Léger'],['17j0kFtqn9Fss3D916jSlp'],7glyJ72Yf7ZaQ160657A1t,17j0kFtqn9Fss3D916jSlp,0.688,0.887,11.0,-8.75,1.0,0.0641,0.00124,0.217,0.0612,0.748,130.1,467.306,4.0,The Collection (1999-2006),2013-01-01,72.0,album,https://i.scdn.co/image/ab67616d00001e02814976...,17.0,DJ Center,52404.0,"['organic house', 'tech house']",SÉBASTIEN LÉGER,48.0,93.0,86.7,7.766,78.0,3.0,DJ and producer from France. Owner of the labe...,"{2007: 93, 2008: 89, 2009: 78}",1.847,2013,1,5,15,26,2,1,ab67616d00001e02814976f1f6a139aeca9ae8db


In [21]:
df2.columns

Index(['track_name', 'track_id', 'track_popularity', 'track_number',
       'explicit', 'available_markets', 'artists_names', 'artists_ids',
       'album_id', 'main_artist_id', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'duration_sec', 'time_signature',
       'album_name', 'album_release_date', 'total_tracks', 'type', 'image_url',
       'album_popularity', 'album_label', 'followers', 'genres', 'artist_name',
       'artist_popularity', 'lowest position', 'mean_position', 'position_std',
       'best_position', 'times_in_rating', 'born_or_founded_in',
       'positions_and_years_data', 'dj_score', 'release_year', 'release_month',
       'track_name_length', 'main_artist_name_length', 'album_name_length',
       'available_markets_count', 'artists_count', 'cover_id'],
      dtype='object')

In [None]:
exploracion(df2)

In [23]:
# Viendo el mínimo y máximo de release_year. 1900 no es correcto. Cambiar por 1990.
print(df2['release_year'].min())
print(df2['release_year'].max())

1900
2022


In [24]:
unique_years = sorted(df2['release_year'].unique(), reverse=True)

In [None]:
sorted(map(int, unique_years), reverse=True)

In [26]:
if not df2[df2['release_year'] == 1900].empty:
    df2.loc[df2['release_year'] == 1900, 'release_year'] = 1990


In [27]:
df2['release_year'].unique()

array([2020, 2019, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009,
       2008, 2022, 2021, 2018, 2007, 2005, 2004, 2003, 2006, 2000, 1999,
       1998, 2002, 1994, 1993, 1992, 1995, 2001, 1997, 1991, 1990, 1996,
       1959, 1989, 1987])

In [28]:
filtered_rows = df2[df2['release_year'] == 1959]

In [29]:
#Borrando las filas donde el release year es 1959, ya que el género es blues, no electrónica.
df2 = df2.drop(filtered_rows.index)

In [32]:
df2_filtrado = df2[(df2['release_year'] >= 2000) & (df2['release_year'] <= 2020)]

In [34]:
df2_filtrado['release_year'].unique()

array([2020, 2019, 2017, 2016, 2015, 2014, 2013, 2012, 2011, 2010, 2009,
       2008, 2018, 2007, 2005, 2004, 2003, 2006, 2000, 2002, 2001])

In [36]:
# index=False es útil si solo quieres los datos y no necesitas la columna de índice.
df2_filtrado.to_csv("archivos/Electro_music_on_Spotify_final.csv", index=False)