In [1]:

# import the libraries we need

# Data handling
# -----------------------------------------------------------------------
import pandas as pd
import numpy as np

# Visualization
# ------------------------------------------------------------------------------
import matplotlib.pyplot as plt
import seaborn as sns

# Assess the linearity of the relationships between variables
# and the distribution of the variables
# ------------------------------------------------------------------------------
import scipy.stats as stats
from scipy.stats import chi2_contingency, ttest_ind

# Null imputation using advanced statistical methods
# -----------------------------------------------------------------------
from sklearn.impute import SimpleImputer
# enable_iterative_imputer has to be imported before IterativeImputer:
# scikit-learn keeps that estimator behind this experimental opt-in import.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
from sklearn.impute import KNNImputer

# Configuration
# -----------------------------------------------------------------------
pd.set_option('display.max_columns', None) # show every column when displaying DataFrames

# Warning handling
# -----------------------------------------------------------------------
import warnings
# NOTE(review): this silences every warning for the whole session, including
# deprecation and numerical warnings; consider filtering specific categories instead.
warnings.filterwarnings("ignore")


# Explorando CSV para línea del tiempo de música electrónica

In [5]:
# Load the Discogs electronic-music CSV into a DataFrame and preview its first rows
df = pd.read_csv(filepath_or_buffer="archivos/discogs_electronic.csv")

display(df.head(n=5))

Unnamed: 0,artist,title,label,country,format,release_date,genre,styles,have,want,num_ratings,average_rating,lowest_price,median_price,highest_price
0,Subterfuge,The Foundation Series Volume One,Visillusion,US,Vinyl,1997,Electronic,"House,Techno,Electro",93,423,31,3.81,$2.00,$39.02,$86.96
1,Titiyo,My Body Says Yes,Arista,UK,Vinyl,1991-04-01,Electronic,House,136,30,11,4.36,$0.43,$1.88,$5.43
2,Mariah Carey,Joy To The World,Columbia,US,Vinyl,1994-11,Electronic,"House,Garage House,Holiday",75,106,5,4.4,$1.99,$16.29,$33.71
3,Rhythmstate,Everybody,Nitebeat,US,Vinyl,1997,Electronic,"House,Breakbeat",22,57,6,4.0,$2.00,$7.00,$25.00
4,Exposé,"Stop, Listen, Look & Think",Arista,US,Vinyl,1990,Electronic,House,115,19,12,3.83,$0.79,$1.50,$4.34


In [6]:
def exploracion(df):
    """
    Print and display an exploratory summary of a DataFrame.

    Reports the shape, the number and percentage of fully duplicated rows,
    which columns do / do not contain nulls, and the ``describe`` statistics
    of the object-dtype columns and of the remaining columns.

    Args:
        df (pd.DataFrame): The DataFrame to explore.

    Returns:
        pd.DataFrame: One row per column of ``df`` with the columns
            "% nulos" and "% no_nulos" (percentage strings rounded to two
            decimals), "tipo_dato" (dtype) and "num_valores_unicos"
            (number of distinct values).
    """
    num_filas, num_columnas = df.shape
    nulos_por_columna = df.isna().sum()

    df_info = pd.DataFrame()
    df_info["% nulos"] = round(nulos_por_columna/num_filas*100, 2).astype(str)+"%"
    df_info["% no_nulos"] = round(df.notna().sum()/num_filas*100, 2).astype(str)+"%"
    df_info["tipo_dato"] = df.dtypes
    df_info["num_valores_unicos"] = df.nunique()

    # Classify columns from the raw null counts. Comparing the rounded
    # percentage string against "0.0%" would report a column as null-free
    # whenever its share of nulls rounds down to 0.0.
    columnas_con_nulos = list(nulos_por_columna[nulos_por_columna > 0].index)
    columnas_sin_nulos = list(nulos_por_columna[nulos_por_columna == 0].index)

    num_duplicados = df.duplicated().sum()
    # Scale by 100 so the number matches the "%" printed right after it;
    # an empty frame has no duplicates, so avoid dividing by zero rows.
    porcentaje_duplicados = round(num_duplicados/num_filas*100, 2) if num_filas else 0.0

    print(f"""El DataFrame tiene {num_filas} filas y {num_columnas} columnas.
Tiene {num_duplicados} datos duplicados, lo que supone un porcentaje de {porcentaje_duplicados}% de los datos.
Hay {len(columnas_con_nulos)} columnas con datos nulos, y son:
{columnas_con_nulos}
y sin nulos hay {len(columnas_sin_nulos)} columnas y son:
{columnas_sin_nulos}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df_info.head())

    # describe() raises ValueError when its include/exclude selection matches
    # no columns, so each summary is only requested when there is something
    # to describe.
    print("Principales estadísticos de las columnas categóricas:")
    if df.select_dtypes(include="O").shape[1] > 0:
        display(df.describe(include="O").T)
    else:
        print("No hay columnas categóricas.")
    print("Principales estadísticos de las columnas numéricas:")
    if df.select_dtypes(exclude="O").shape[1] > 0:
        display(df.describe(exclude="O").T)
    else:
        print("No hay columnas numéricas.")
    return df_info

# Run the exploratory summary on the electronic-music DataFrame; the summary
# table returned by the function is also rendered as this cell's output.
exploracion(df)

El DataFrame tiene 34923 filas y 15 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 2 columnas con datos nulos, y son:
['country', 'styles']
y sin nulos hay 13 columnas y son:
['artist', 'title', 'label', 'format', 'release_date', 'genre', 'have', 'want', 'num_ratings', 'average_rating', 'lowest_price', 'median_price', 'highest_price']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
artist,0.0%,100.0%,object,18034
title,0.0%,100.0%,object,27329
label,0.0%,100.0%,object,7911
country,3.07%,96.93%,object,56
format,0.0%,100.0%,object,7


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
artist,34923,18034,Various,1616
title,34923,27329,Untitled,195
label,34923,7911,Virgin,358
country,33850,56,UK,11229
format,34923,7,Vinyl,31816
release_date,34923,1770,1996,3345
genre,34923,5,Electronic,32214
styles,33461,6367,House,3338
average_rating,34923,256,4,2523
lowest_price,34923,1149,$1.09,2002


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
have,34923.0,216.222633,334.54966,0.0,48.5,116.0,254.0,10643.0
want,34923.0,155.975002,266.956669,0.0,28.0,67.0,172.5,7699.0
num_ratings,34923.0,32.956934,55.145888,0.0,6.0,16.0,39.0,1771.0


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
artist,0.0%,100.0%,object,18034
title,0.0%,100.0%,object,27329
label,0.0%,100.0%,object,7911
country,3.07%,96.93%,object,56
format,0.0%,100.0%,object,7
release_date,0.0%,100.0%,object,1770
genre,0.0%,100.0%,object,5
styles,4.19%,95.81%,object,6367
have,0.0%,100.0%,int64,1652
want,0.0%,100.0%,int64,1427


## Valores únicos de las variables categóricas ✨

In [7]:
# Gather the names of the categorical (object-dtype) columns
columnas = list(df.select_dtypes(include=['object']).columns)
print(columnas)
# Walk through each categorical column and report its distinct values and their frequencies
for columna in columnas:
    print(
        f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n",
        f"Sus valores únicos son: {df[columna].unique()}\n",
        f"Las frecuencias de los valores únicos de las categorías son: {df[columna].value_counts()} ",
        sep="\n",
    )

['artist', 'title', 'label', 'country', 'format', 'release_date', 'genre', 'styles', 'average_rating', 'lowest_price', 'median_price', 'highest_price']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'ARTIST' -----------

Sus valores únicos son: ['Subterfuge' 'Titiyo' 'Mariah Carey' ... "The '06 Style" 'Navigators (2)'
 'Orishas']

Las frecuencias de los valores únicos de las categorías son: artist
Various           1616
Madonna             89
Unknown Artist      64
Moby                58
Underworld          49
                  ... 
Control D.C.         1
Ram Science          1
Symbiotic (2)        1
Movie Cops           1
Orishas              1
Name: count, Length: 18034, dtype: int64 
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'TITLE' -----------

Sus valores únicos son: ['The Foundation Series Volume One' 'My Body Says Yes' 'Joy To The World'
 ... '2030' "That's What I Like (No Cream In My Coffee)" 'A Lo Cubano']

Las frecuencias de los valores únicos de las categorías son: title
Un

## DUPLICADOS

In [8]:
def get_duplicate_rows(df):
    """
    Return every row of a DataFrame that is an exact copy of another row.

    All occurrences of a repeated row are kept (including the first one),
    and rows are compared across all columns.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: The subset of ``df`` made of fully duplicated rows.
    """
    # keep=False flags every member of a duplicate group, not only the repeats
    is_repeated = df.duplicated(keep=False)
    return df.loc[is_repeated]

# Display the fully duplicated rows of the electronic-music DataFrame, if any
get_duplicate_rows(df)

Unnamed: 0,artist,title,label,country,format,release_date,genre,styles,have,want,num_ratings,average_rating,lowest_price,median_price,highest_price


In [7]:
# index=False es útil si solo quieres los datos y no necesitas la columna de índice.
#df.to_csv("archivos/informacion_artista_MOD.csv", index=False)

# Explorando CSV para línea del tiempo de música rock

In [9]:
# Load the Ultimate Classic Rock CSV into a DataFrame and preview its first rows
df1 = pd.read_csv(filepath_or_buffer="archivos/UltimateClassicRock.csv")

display(df1.head(n=5))

Unnamed: 0,Track,Artist,Album,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity
0,Play A Simple Song,38 Special,38 Special,1977,3:13,4,0.521,0.367,0,-13.866,1,0.0278,0.692,3e-06,0.108,0.789,83.412,16
1,Four Wheels,38 Special,38 Special,1977,4:43,4,0.535,0.71,2,-12.287,1,0.0428,0.01,0.023,0.0495,0.445,160.361,10
2,Fly Away,38 Special,38 Special,1977,5:13,4,0.563,0.563,2,-10.781,1,0.0263,0.0357,0.00185,0.14,0.564,106.739,13
3,Tell Everybody,38 Special,38 Special,1977,4:09,4,0.638,0.694,11,-10.206,0,0.031,0.161,3.4e-05,0.0908,0.936,124.962,10
4,Just Wanna Rock & Roll,38 Special,38 Special,1977,5:57,4,0.388,0.701,2,-9.984,1,0.036,0.013,0.0422,0.115,0.769,126.769,11


In [10]:
def exploracion(df):
    """
    Print and display an exploratory summary of a DataFrame.

    Reports the shape, the number and percentage of fully duplicated rows,
    which columns do / do not contain nulls, and the ``describe`` statistics
    of the object-dtype columns and of the remaining columns.

    NOTE(review): a function with this same name is already defined earlier
    in the notebook; this cell shadows it. Consider keeping one definition.

    Args:
        df (pd.DataFrame): The DataFrame to explore.

    Returns:
        pd.DataFrame: One row per column of ``df`` with the columns
            "% nulos" and "% no_nulos" (percentage strings rounded to two
            decimals), "tipo_dato" (dtype) and "num_valores_unicos"
            (number of distinct values).
    """
    # Everything below works on the `df` argument; the previous body read the
    # notebook-level `df1` and silently ignored whatever was passed in.
    num_filas, num_columnas = df.shape
    nulos_por_columna = df.isna().sum()

    df1_info = pd.DataFrame()
    df1_info["% nulos"] = round(nulos_por_columna/num_filas*100, 2).astype(str)+"%"
    df1_info["% no_nulos"] = round(df.notna().sum()/num_filas*100, 2).astype(str)+"%"
    df1_info["tipo_dato"] = df.dtypes
    df1_info["num_valores_unicos"] = df.nunique()

    # Classify columns from the raw null counts. Comparing the rounded
    # percentage string against "0.0%" would report a column as null-free
    # whenever its share of nulls rounds down to 0.0.
    columnas_con_nulos = list(nulos_por_columna[nulos_por_columna > 0].index)
    columnas_sin_nulos = list(nulos_por_columna[nulos_por_columna == 0].index)

    num_duplicados = df.duplicated().sum()
    # Scale by 100 so the number matches the "%" printed right after it;
    # an empty frame has no duplicates, so avoid dividing by zero rows.
    porcentaje_duplicados = round(num_duplicados/num_filas*100, 2) if num_filas else 0.0

    print(f"""El DataFrame tiene {num_filas} filas y {num_columnas} columnas.
Tiene {num_duplicados} datos duplicados, lo que supone un porcentaje de {porcentaje_duplicados}% de los datos.
Hay {len(columnas_con_nulos)} columnas con datos nulos, y son:
{columnas_con_nulos}
y sin nulos hay {len(columnas_sin_nulos)} columnas y son:
{columnas_sin_nulos}
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:""")
    display(df1_info.head())

    # describe() raises ValueError when its include/exclude selection matches
    # no columns, so each summary is only requested when there is something
    # to describe.
    print("Principales estadísticos de las columnas categóricas:")
    if df.select_dtypes(include="O").shape[1] > 0:
        display(df.describe(include="O").T)
    else:
        print("No hay columnas categóricas.")
    print("Principales estadísticos de las columnas numéricas:")
    if df.select_dtypes(exclude="O").shape[1] > 0:
        display(df.describe(exclude="O").T)
    else:
        print("No hay columnas numéricas.")
    return df1_info

# Run the exploratory summary on the classic-rock DataFrame; the summary
# table returned by the function is also rendered as this cell's output.
exploracion(df1)

El DataFrame tiene 14418 filas y 18 columnas.
Tiene 0 datos duplicados, lo que supone un porcentaje de 0.0% de los datos.
Hay 0 columnas con datos nulos, y son:
[]
y sin nulos hay 18 columnas y son:
['Track', 'Artist', 'Album', 'Year', 'Duration', 'Time_Signature', 'Danceability', 'Energy', 'Key', 'Loudness', 'Mode', 'Speechiness', 'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo', 'Popularity']
A continuación tienes un detalle sobre los datos nulos y los tipos y número de datos:


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Track,0.0%,100.0%,object,13328
Artist,0.0%,100.0%,object,94
Album,0.0%,100.0%,object,1247
Year,0.0%,100.0%,int64,63
Duration,0.0%,100.0%,object,911


Principales estadísticos de las columnas categóricas:


Unnamed: 0,count,unique,top,freq
Track,14418,13328,Start Me Up - Live,8
Artist,14418,94,Bob Dylan,454
Album,14418,1247,Fleetwood Mac,35
Duration,14418,911,4:13,102


Principales estadísticos de las columnas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Year,14418.0,1987.634693,15.318819,1962.0,1975.0,1983.0,1999.0,2024.0
Time_Signature,14418.0,3.917811,0.356628,0.0,4.0,4.0,4.0,5.0
Danceability,14418.0,0.503063,0.142619,0.0,0.405,0.509,0.603,0.987
Energy,14418.0,0.656563,0.229607,0.0,0.493,0.6905,0.854,0.998
Key,14418.0,5.166597,3.503423,0.0,2.0,5.0,9.0,11.0
Loudness,14418.0,-9.438675,4.179623,-60.0,-11.91425,-8.8105,-6.3415,-0.203
Mode,14418.0,0.730129,0.443908,0.0,0.0,1.0,1.0,1.0
Speechiness,14418.0,0.051354,0.046291,0.0,0.0318,0.039,0.0543,0.952
Acousticness,14418.0,0.226924,0.268857,0.0,0.0131,0.104,0.37,0.995
Instrumentalness,14418.0,0.089682,0.215783,0.0,1.3e-05,0.000737,0.028375,0.992


Unnamed: 0,% nulos,% no_nulos,tipo_dato,num_valores_unicos
Track,0.0%,100.0%,object,13328
Artist,0.0%,100.0%,object,94
Album,0.0%,100.0%,object,1247
Year,0.0%,100.0%,int64,63
Duration,0.0%,100.0%,object,911
Time_Signature,0.0%,100.0%,int64,5
Danceability,0.0%,100.0%,float64,788
Energy,0.0%,100.0%,float64,1075
Key,0.0%,100.0%,int64,12
Loudness,0.0%,100.0%,float64,8993


## Valores únicos de las variables categóricas ✨

In [11]:
# Gather the names of the categorical (object-dtype) columns
columnas = list(df1.select_dtypes(include=['object']).columns)
print(columnas)
# Walk through each categorical column and report its distinct values and their frequencies
for columna in columnas:
    print(
        f" \n----------- ESTAMOS ANALIZANDO LA COLUMNA: '{columna.upper()}' -----------\n",
        f"Sus valores únicos son: {df1[columna].unique()}\n",
        f"Las frecuencias de los valores únicos de las categorías son: {df1[columna].value_counts()} ",
        sep="\n",
    )

['Track', 'Artist', 'Album', 'Duration']
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'TRACK' -----------

Sus valores únicos son: ['Play A Simple Song' 'Four Wheels' 'Fly Away' ... 'Have A Little Mercy'
 "Flyin' High" 'Heartache In Blue']

Las frecuencias de los valores únicos de las categorías son: Track
Start Me Up - Live         8
Tumbling Dice - Live       8
Honky Tonk Women - Live    8
Tonight                    7
Without You                7
                          ..
Four Little Diamonds       1
Rock 'N' Roll Is King      1
Without Someone            1
Sorrow About to Fall       1
Heartache In Blue          1
Name: count, Length: 13328, dtype: int64 
 
----------- ESTAMOS ANALIZANDO LA COLUMNA: 'ARTIST' -----------

Sus valores únicos son: ['38 Special' 'ABBA' 'Aerosmith' 'Air Suppy' 'Alice Cooper'
 'Allman Brothers' 'America' 'Bachman-Turner Overdrive' 'Bad Company'
 'Bee Gees' 'Billy Joel' 'Black Sabbath' 'Blondie' 'Blue Oyster Cult'
 'Bob Dylan' 'Bob Seger' 'Bon Jovi' 'Bos

## DUPLICADOS

In [12]:
def get_duplicate_rows(df):
    """
    Find and return the fully duplicated rows of a DataFrame.

    Rows are compared across all columns, and every occurrence of a repeated
    row is returned (including the first one).

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: A new DataFrame with the fully duplicated rows of ``df``.
    """
    # Operate on the `df` argument; the previous body read the notebook-level
    # `df1` and ignored the DataFrame that was passed in.
    # keep=False flags every member of a duplicate group, not only the repeats.
    duplicate_rows = df[df.duplicated(keep=False)]
    return duplicate_rows

# Display the fully duplicated rows of the classic-rock DataFrame, if any
get_duplicate_rows(df1)

Unnamed: 0,Track,Artist,Album,Year,Duration,Time_Signature,Danceability,Energy,Key,Loudness,Mode,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Popularity


In [None]:
# index=False es útil si solo quieres los datos y no necesitas la columna de índice.
#df.to_csv("archivos/informacion_artista_MOD.csv", index=False)