In [33]:
import pandas as pd
import os
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns


In [None]:
import pandas as pd

# Cargar el archivo completo (si puedes en tu equipo local)
file_path = "df_sample_processed.csv"
df = pd.read_csv(file_path, encoding="utf-8", low_memory=False)

# Dividir en partes de 10,000 filas cada una
chunk_size = 10000
for i, chunk in enumerate(range(0, len(df), chunk_size)):
    df_chunk = df.iloc[chunk:chunk+chunk_size]
    df_chunk.to_csv(f"df_part_{i}.csv", index=False)


In [1]:
import pandas as pd
import numpy as np
from googletrans import Translator
from collections import Counter
from tqdm import tqdm
import os

# 📌 Ruta de los archivos
dataset_path = r"C:\Users\solan\Downloads\get_data_from_songs\data\df_lyrics_faltan_traduc_actualizado.csv"
temp_file_path = r"C:\Users\solan\Downloads\get_data_from_songs\data\temp_language_detection.csv"
final_file_path = r"C:\Users\solan\Downloads\get_data_from_songs\data\df_lyrics_faltan_traduc_language_detected.csv"

# 📌 Cargar el dataset
print("🔄 Cargando el dataset...")
df_main = pd.read_csv(dataset_path, encoding="utf-8", low_memory=False)
print("✅ Dataset cargado correctamente.")

# 📌 Verificar si hay un archivo temporal para reanudar
if os.path.exists(temp_file_path):
    print("🔄 Archivo temporal encontrado. Cargando progreso...")
    df_temp = pd.read_csv(temp_file_path, encoding="utf-8")
    df_main.update(df_temp)  # Actualizar df_main con el progreso guardado
    print("✅ Progreso recuperado.")

# 📌 Inicializar el traductor de Google
translator = Translator()

# 📌 Función para dividir texto en fragmentos de 500 caracteres
def split_text(text, chunk_size=500):
    return [text[i:i+chunk_size] for i in range(0, len(text), chunk_size)]

# 📌 Función para detectar idioma con Google Translate, manejando textos largos
def detect_language_google(text):
    try:
        if not isinstance(text, str) or not text.strip():
            return np.nan  # Si el texto está vacío, devolver NaN

        if len(text) <= 1000:
            return translator.detect(text).lang  # Detectar directamente si el texto es corto

        # 📌 Si el texto es largo, dividirlo y detectar idioma en cada fragmento
        fragments = split_text(text, chunk_size=500)
        detected_languages = [translator.detect(fragment).lang for fragment in fragments]

        # 📌 Determinar el idioma más frecuente
        most_common_lang = Counter(detected_languages).most_common(1)[0][0]
        return most_common_lang

    except Exception:
        return np.nan  # Si hay error en la detección, devolver NaN

# 📌 Filtrar filas donde `language` está vacío y `lyrics` no es NaN
mask_missing_lang = df_main["language"].isna() & df_main["lyrics"].notna()
missing_lang_indices = df_main[mask_missing_lang].index

# 📌 Aplicar detección de idioma con barra de progreso y guardado automático
print("🔍 Detectando idioma en las letras de canciones...")
for i, idx in enumerate(tqdm(missing_lang_indices, desc="Procesando letras", unit=" canción")):
    df_main.at[idx, "language"] = detect_language_google(df_main.at[idx, "lyrics"])

    # 📌 Guardar progreso cada 100 canciones procesadas
    if i % 100 == 0:
        df_main.to_csv(temp_file_path, index=False, encoding="utf-8")
        print(f"💾 Progreso guardado en {temp_file_path} ({i+1} canciones procesadas).")

# 📌 Guardar el dataset final con los idiomas detectados
df_main.to_csv(final_file_path, index=False, encoding="utf-8")
print("✅ Detección de idiomas completada. Archivo final guardado en:", final_file_path)

# 📌 Eliminar el archivo temporal después de completar la detección
if os.path.exists(temp_file_path):
    os.remove(temp_file_path)
    print("🗑️ Archivo temporal eliminado.")


🔄 Cargando el dataset...
✅ Dataset cargado correctamente.
🔍 Detectando idioma en las letras de canciones...


Procesando letras:   0%|          | 2/20352 [00:13<31:57:23,  5.65s/ canción]

💾 Progreso guardado en C:\Users\solan\Downloads\get_data_from_songs\data\temp_language_detection.csv (1 canciones procesadas).


Procesando letras:   0%|          | 100/20352 [01:05<3:40:18,  1.53 canción/s]


KeyboardInterrupt: 

In [41]:
import pandas as pd
import os

# 📌 Rutas de los archivos
main_file_path = r"C:\Users\solan\Downloads\get_data_from_songs\src\functions b\df_lyrics_faltan_traduc.csv"
new_data_file_path = r"C:\Users\solan\Downloads\get_data_from_songs\data\traducidas.csv"

# 📌 Verificar si los archivos existen
if not os.path.exists(main_file_path):
    raise FileNotFoundError(f"❌ No se encontró el archivo principal: {main_file_path}")
if not os.path.exists(new_data_file_path):
    raise FileNotFoundError(f"❌ No se encontró el archivo de traducciones: {new_data_file_path}")

# 📌 Cargar los datasets con `low_memory=False` para evitar problemas de tipos de datos
df_main = pd.read_csv(main_file_path, encoding="utf-8", low_memory=False)
df_new = pd.read_csv(new_data_file_path, encoding="utf-8", low_memory=False)

# 📌 Limpiar nombres de columnas en df_new (eliminar espacios extra)
df_new.columns = df_new.columns.str.strip()

# 📌 Asegurar que `translated_lyrics` existe en df_main
if 'translated_lyrics' not in df_main.columns:
    df_main['translated_lyrics'] = pd.NA  # Crear la columna si no existe

# 📌 Forzar tipos de datos a `str` en `translated_lyrics` para evitar problemas en el merge
df_main["translated_lyrics"] = df_main["translated_lyrics"].astype(str)
df_new["translated_lyrics"] = df_new["translated_lyrics"].astype(str)

# 📌 Hacer merge usando `recording_id` sin modificar otras columnas
df_main = df_main.merge(df_new[['recording_id', 'translated_lyrics', 'language']], 
                        on="recording_id", how="left", suffixes=("", "_new"))

# 📌 Actualizar solo las columnas que tienen nuevos valores
df_main["language"] = df_main["language_new"].combine_first(df_main["language"])
df_main["translated_lyrics"] = df_main["translated_lyrics_new"].combine_first(df_main["translated_lyrics"])

# 📌 Eliminar columnas temporales `_new`
df_main.drop(columns=["language_new", "translated_lyrics_new"], inplace=True)

# 📌 Restaurar el orden original basado en `recording_id`
df_main = df_main.sort_values(by="recording_id").reset_index(drop=True)

# 📌 Guardar los cambios en un nuevo archivo para evitar sobreescribir el original
updated_file_path = r"C:\Users\solan\Downloads\get_data_from_songs\data\df_lyrics_faltan_traduc_actualizado.csv"
df_main.to_csv(updated_file_path, index=False, encoding="utf-8")

# 📌 Verificar que todo está correcto
print("✅ Archivo actualizado guardado en:", updated_file_path)
print(df_main.info())  # Para verificar tipos de datos
print(df_main.head())  # Para ver las primeras filas del dataset actualizado


✅ Archivo actualizado guardado en: C:\Users\solan\Downloads\get_data_from_songs\data\df_lyrics_faltan_traduc_actualizado.csv
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 157811 entries, 0 to 157810
Data columns (total 88 columns):
 #   Column                              Non-Null Count   Dtype  
---  ------                              --------------   -----  
 0   artist_name                         157811 non-null  object 
 1   song_name                           157809 non-null  object 
 2   recording_id                        157811 non-null  object 
 3   danceable                           157811 non-null  float64
 4   not_danceable                       157811 non-null  float64
 5   male                                157811 non-null  float64
 6   female                              157811 non-null  float64
 7   timbre_bright                       157811 non-null  float64
 8   timbre_dark                         157811 non-null  float64
 9   tonal                            

In [42]:
df["translated_lyrics"] = df["translated_lyrics"].astype(str).replace("nan", "").fillna("")


In [9]:
import pandas as pd

# 📂 Cargar el CSV
file_path = r"C:\Users\solan\Downloads\get_data_from_songs\src\functions b\df_final_correcto.csv"
df = pd.read_csv(file_path, low_memory=False)

# 📌 Ver las primeras filas de lyrics
print(df[['artist_name', 'song_name', 'lyrics']].head(10))

# 📌 Revisar valores extraños en lyrics
print(df['lyrics'].apply(type).value_counts())


         artist_name                  song_name  \
0   anthony hamilton          change your world   
1  adriano celentano                 rock matto   
2   jedi mind tricks  poison in the birth water   
3           buckshot                 we in here   
4            g herbo                     street   
5     strand of oaks                    shut in   
6         tori kelly                  city dove   
7            tinashe                        bet   
8        wild beasts                  albatross   
9       snoh aalegra                      peace   

                                                                                                                                                                                                                                                                                                                                                                                                                                                      

In [10]:
# mostrar las filas completas
pd.set_option('display.max_colwidth', None)     
#mostrar las columnas language, song_name , artist_name, lyrics, lyrcis_translated, spotify_url y recording_id     
df[['language', 'song_name', 'artist_name', 'lyrics', 'translated_lyrics', 'spotify_url', 'recording_id']].sample(10)                 

Unnamed: 0,language,song_name,artist_name,lyrics,translated_lyrics,spotify_url,recording_id
133539,en,neon rust,frank carter & the rattlesnakes,"[Verse 1] Don't breathe, there's no oxygen here Only ammonia to suffocate fear And the pastures are flooded And all the soil is sour And the trees are brittle And so are the flowers All dried up just an empty shell Away on the wind being blown to hell What happened, it was all going so well [Chorus] Sweet golden rose, don't ever rust Be ever so delicate, when lending your trust And don't come apart at the seams Sweat the nightmares, live in the dreams Be anything you believe Be anything you believe [Post-Chorus] We don't belong in a wasteland We don't belong in a wasteland We don't belong in a wasteland [Verse 2] Don't leave, there's nothing out there to see Everything is murdered, there's no mystery And history's eyes, they have seen it all It's a Modern Ruin and we are responsible It's bleak and we are pale We are the savages of a future fail No sun, no stars, just endless nights Repeat prescriptions of drunken fights What happened to us, under these neon lights [Chorus] Sweet golden rose, don't ever rust Be ever so delicate, when lending your trust And don't come apart at the seams Sweat the nightmares, live in the dreams Be anything you believe Be anything you believe [Post-Chorus] We don't belong in a wasteland We don't belong in a wasteland We don't belong in a wasteland [Bridge] We don't belong in a wasteland We don't belong in a wasteland We don't belong in a wasteland We don't belong in a wasteland [Chorus] Sweet golden rose, don't ever rust Be ever so delicate, when lending your trust And don't come apart at the seams Sweat the nightmares, live in the dreams Be anything you believe Be anything you believe [Post-Chorus] We don't belong in a wasteland You don't belong in a wasteland You don't belong in a wasteland",,https://open.spotify.com/track/5OCH598greI36cfms3c0h5,d87507df-78fe-432e-a2a6-65a3c01922fc
8416,en,the christmas blues,bob dylan,the jingle bells are jingling the streets are white with snow the happy crowds are mingling but theres no one that i know im sure that youll forgive me if i dont enthuse i guess ive got the christmas blues ive done my window shopping theres not a store ive missed but whats the use of stopping when theres no one on your list youll know the way im feeling when you love and you lose i guess ive got the christmas blues when somebody wants you somebody needs you christmas is a joy of joy but friends when youre lonely youll find that its only a thing for little girls and little boys may all your days be merry your seasons full of cheer but till its january ill just go and disappear old santa may have brought you some stars for your shoes but santa only brought me the blues those brightly packaged tinsel covered christmas blues old santa may have brought you some stars for your shoes but santa only brought me the blues those brightly packaged tinsel covered christmas blues,,https://open.spotify.com/track/6d69Gd9D4dpSfahKTOiyWw,0da68734-6f0b-441b-b86a-8f0d7bd4fb20
91292,en,come on in,the association,come on in you know youre welcome here come on in its been a long cold year now i dont know what went wrong but one whole year has passed us by and gone were back together now where we belong so baby come on in come on in you sure look good to me come on in im as lonely as i can be over there in the corner is your old chair a waitin sittin in the corner justa anticipatin what a long long time its been so baby come on in come on in you know youre welcome here come on in its been a long cold year now you know why i waited around time is nothin that could bring me down i knew if i waited i would win so baby come on in come on in,,https://open.spotify.com/track/5DYErbB4qyQv5MJUMpNt0D,93cabf63-ed6e-4d9c-82c4-9d6ce72af9a2
114935,de,staub,max herre,strophe 1 ich seh staub im ersten licht augen auf beweg mich nicht er legt sich auf alles drauf auf meine seele meine haut macht mich stumm macht mich taub und all das woran ich glaub wird zu staub ich seh staub in jedem blick alles grau jeder geht fr sich chorus und das schweigen wird so laut macht uns stumm macht uns taub legt sich auf alles drauf und all das woran man glaubt wird zu staub strophe 2 unter all dem schutt begraben liegt ein traum unter all dem liegt der grund auf den wir bauen ich trume wie aus dunklen wolken regen bricht nach dem regen bleibt kein staub im sonnenlicht ich seh staub und denk an dich siehst dus auch denkst du an mich chorus,verse 1 i see dust in the first light open my eyes don't move it settles on everything on my soul my skin makes me mute makes me deaf and everything that I believe in turns to dust i see dust in every glance everything is grey everyone goes by themselves chorus and the silence becomes so loud makes us mute makes us deaf settles on everything and everything that we believe in turns to dust verse 2 buried under all the rubble lies a dream under all this lies the foundation that we build on i dream like rain breaks out of dark clouds after the rain there is no dust left in the sunlight I see dust and think of you do you see it too do you think of me chorus,https://open.spotify.com/track/7fZyBOWu5qPPb80xJ3t1Oq,ba4ebb3e-8c14-471e-8eb6-cf639e32933d
135423,en,a little bit of sorry,buju banton,taught you mother taught you well cause you papa taught you well humble yourself right now a little bit of sorry that never cause no strife anyhow you think you bigger than say youre sorry then you have a problem in life excuse me thats a simple word of courtesy i rather pack up and move than to have you walk all over me and please and thank you let me say that in advance man if you havent been brought up too well you need to give yourself a chance nothing to lose and you got nothing to prove let nothing and no one stand in your way just get inside the groove you can be never be to be ignored to look listen and care mans greatest in life that is your inner fear little bit of sorry that never cause no strife anyhow you think you bigger than say youre sorry then you have a problem in life excuse me thats a simple word of courtesy i rather pack up and move than to have you walk all over me nothing to lose and you got nothing to prove let no one stand in your way just get inside the groove a little bit of sorry that never cause no strife anyhow you think you bigger than say youre sorry then you have a problem in life they make you abandon and not a frown and think you going down bombs and crown on another side i hope they dont alive a little bit of sorry that never cause no strife anyhow you think you bigger than say youre sorry then you have a problem in life excuse me thats a simple word of courtesy i rather pack up and move than to have you walk all over me and please and thank you let me say that in advance man if you havent been brought up too well you need to give yourself a chance man if you havent been brought up too well you need to give yourself a chance man if you havent been brought up too well you need to give yourself a chance man if you havent been brought up too well you need to give yourself a chance,,https://open.spotify.com/track/4ucsxZi1B3H81GtKEupKqA,db8a91c1-75d4-4c9d-a72e-7fc98f95eb3d
93571,en,west coast town,chris shiflett,verse 1 i walked home all the way from east beach with an oil spill stickin to my feet to that little house on salinas street throwin rocks in the creek my brothers and me verse 2 yeah mom held it down on a county wage clothes on our backs and food on our plates every saturday night mariachis played and i listened through the window to the low rider sway chorus yeah i grew up in a west coast town back before they chased the working class out you know we dont fuck around where i grew up in a west coast town verse 3 fourth of july leadbetter beach salt air digging in my lungs so deep chased her all night til she couldnt see it was paradise but i had to leave chorus yeah i grew up in a west coast town back before they chased the working class out you know we dont fuck around where i grew up in a west coast town bridge now im long gone like stoplights on the freeway you can tear me down but ill ill never change instrumental break chorus yeah i grew up in a west coast town back before they chased the working class out you know we dont fuck around where i grew up yeah i grew up in a west coast town back before they chased the working class out you know we dont fuck around where i grew up in a west coast town in a west coast town,,https://open.spotify.com/track/4jfoj6rS1kEtVQ88D1TE9A,979782fd-7234-46ab-93f9-7edc3c82c7a1
102338,en,recollections,a sense of purpose,the sickening sense of anxiety bears down on me it crushes my spirit and leaves behind a lifeless entity from what nature from does this hell come a darkness that binds weakens and blinds i succumb i watch my world turn before my eyes some things just cant stay the same regrets they break me down they break me down and i struggle again just to make a sound take this from my weary weathered soul a cancer that crucifies and seeks to agonize till im no longer whole im no longer whole what fear still lays in me and what conquering will set me free all of the misery comes from within i watch my world turn before my eyes some things just cant stay the same regrets they break me down they break me down and i struggle again just to make a sound unburden me from the demons that dwell in me,,https://open.spotify.com/track/4kXYpHqFWtDUd7AwyH1J9e,a60a8a11-b29a-4334-a00a-91b5fabb053c
55339,en,food chains,napalm death,artificial for these strictly conscious times organic prothesis with a view to paying in kind to ease guilt of scores of undignified ends strung up disemboweled right out of the pen so unbeknowing in their anonimity cause when youre marked for death ears switch off to the screams primal urges blindly cull tear and chew remember dont scorn what god gave to you what god gave to you reverting technologically advanced yet bloodily we regress reversal looking forward to a pressure bolt through the head numbness second only to dumbness sure they dont feel a thing travesty communication block ensures no further usage travesty travesty travesty,,https://open.spotify.com/track/7mlPZOx0t5N3GFsvz2BCWb,599bb4e8-8b99-4afa-b16d-77127824594c
74875,en,the beginningat last,black label society,have all you are and all youre to be fade in your world and all that you see this hole in the wall is all under your thumb pulling your senses until you become until you become i gotta run i just gotta run cought in a world where you could never run too fast to finally reach the beginningat last dismantling comfort of those you surround place joy upon strangers now aint that profound this hole in the wall is under your thumb pulling your senses until you become where you going never knowing just who and where you gotta turn turn forever caring forever sharing you never learn learn lifetime of getting burned burned,,https://open.spotify.com/track/2shBXMWGDusaxhkO6WL8RE,790b662e-7709-40c7-bafe-7c83a104309e
58839,en,hard luck kid,beach slang,i hardly ever listen cause its all a lie almost everything is a waste of time i never really care and i never really try it never really matters so whatever never mind im mostly kind of holy but thats half a lie ive got a pretty clean heart but a dirty mind ive got a foot in the gutter the other in the light i was born all wrong but im dying alright im a hard luck kid so why even try im a nowhere bum im dumb i dont mind nothing really happens if you think it might nothing really changes when you change your mind i try to shake it off try to get alive really im okay kinda getting by im a hard luck kid so why even try im a nowhere bum im dumb i dont mind im a hard luck kid so why even try im a nowhere bum im dumb i dont mind,,https://open.spotify.com/track/4oUzpzjiaJIXwueMMedzlf,5f138da1-5f10-41ba-8d29-f9ad5810fb78


In [7]:
df.sample(10)

Unnamed: 0,artist_name,song_name,recording_id,danceable,not_danceable,male,female,timbre_bright,timbre_dark,tonal,...,duration_ms,popularity,language,views,track_uri,playlist_ids,positions,playlists_names,combined_genres,translated_lyrics
82887,melee,what good is love without you,861fcba1-4ac6-4cb3-a681-561bb6901d28,0.995,0.005,0.313,0.687,0.978,0.022,0.436,...,,25.0,en,1153.0,,,,,rock,
25123,loretta lynn,what sundown does to you,28890965-3d77-41a0-b9e3-e00e97353010,0.195,0.805,0.018,0.982,1.0,0.0,0.952,...,134893.0,9.0,en,12134.0,,,,,"country country female vocalists classic country singersongwriter americana, country female female vocalists female vocals female vocalist, country female female vocalists female vocals female vocalist",
23547,cloud nothings,water turns back,2612a4ed-aeb0-4983-8ec9-2e1218aec818,0.004,0.996,0.409,0.591,0.577,0.423,0.737,...,277139.0,17.0,en,339.0,,,,,"indie rock lofi seen live indie posthardcore, indie lofi, indie lofi",
126169,wardruna,runaljod,ccade59f-fcec-4dda-8076-f767b9b4f45a,0.157,0.843,0.173,0.827,0.677,0.323,0.064,...,454493.0,28.0,en,526.0,spotify:track:3KdGXep9wfDwqsCBG3b0c5,"[1123, 122730, 141332, 500986, 559303]","[25, 35, 150, 108, 7]","['Folk', 'Pagan', 'Collab', 'Vikings', 'Wardruna']","folk neofolk ambient pagan folk norwegian, ambient, dark folk, nordic folk, norwegian, ambient, dark folk, nordic folk, norwegian",
103266,bon jovi,work for the working man,a7866027-6a5d-40bc-a825-d34f3a9d3287,0.775,0.225,0.02,0.98,0.94,0.06,0.787,...,244053.0,27.0,en,864.0,spotify:track:6iQ6aFY6qEjh9E6JvfC0GF,"[115649, 305787, 31740, 316812, 544883, 576104, 578778, 583628]","[47, 89, 27, 150, 42, 20, 1, 99]","['Workout Playlist....', 'Bon Jovi', 'Bon Jovi', 'All-time Favorites', 'Jordan', 'Rock of Ages', 'Classic Rock', 'Metal']","['Hard Rock'] rock, hard rock, classic rock, 80s, hair metal, rock, hard rock, hair metal, heavy metal, metal, rock, hard rock, hair metal, heavy metal, metal",
52596,reed deming,mercy on me,55266225-c6bd-4a39-81ef-71201d6b6a26,0.849,0.151,0.066,0.934,0.982,0.018,0.245,...,186785.0,5.0,en,219.0,,,,,"pop pop male vocalists kid rock brutal death metal creepy, c hair metal fat 3 g, c hair metal fat 3 g",
3789,nas,war is necessary,063bb03a-a2c3-45c1-a2f3-31165a0d051c,1.0,0.0,0.5,0.5,0.054,0.946,0.016,...,148401.0,58.0,en,13310.0,spotify:track:0g9NT0ve6iUVUfNbRIaGNz,[175192],[58],['Rap & Hip-Hop'],"['Soundtrack'] Hip-Hop, rap, hip hop, east coast rap, new york, GTA IV, The Beat, Grand Theft Auto, Hip-Hop, GTA, GTA IV, The Beat, Grand Theft Auto, Hip-Hop, GTA",
34635,tom petty and the heartbreakers,when a kid goes bad,380b4359-437a-4955-af9e-47f9639f2f05,0.0,1.0,0.622,0.378,0.005,0.995,0.006,...,296546.0,57.0,en,753.0,spotify:track:3fSVMOj2BXP3nexp80BHyR,"[157628, 16429, 276712, 387716, 434665]","[27, 65, 5, 109, 41]","['Old', 'Petty', 'oldies', 'Tom Petty', 'tom petty']","classic rock, rock, 80s, singer-songwriter, folk, rock, Rock Roll, guitar, rock, Rock Roll, guitar",
92209,waxahatchee,swan dive,95454d08-960e-47cf-8399-cfaa284c79ac,0.0,1.0,0.237,0.763,0.922,0.078,0.902,...,194861.0,23.0,en,9352.0,,,,,"singersongwriter seen live indie female vocalists acoustic, female vocalists indie rock 10s indie rock, female vocalists indie rock 10s indie rock",
102492,hermans hermits,show me girl,a6493bf7-f0c4-4558-8ca6-b6f373c23c7b,0.999,0.001,0.011,0.989,0.913,0.087,0.853,...,156066.0,15.0,en,647.0,,,,,"rock 60s classic rock oldies british invasion british, 60s beat pop rock oldies, 60s beat pop rock oldies",


### Nulos

In [11]:
# Análisis de valores nulos
missing_values = df.isnull().sum()
missing_percentage = (missing_values / len(df)) * 100

# Crear un resumen de los nulos
missing_summary = pd.DataFrame({
    'Column': df.columns,
    'Missing Values': missing_values,
    'Missing Percentage': missing_percentage
}).sort_values(by='Missing Percentage', ascending=False)

# Mostrar columnas con nulos
print(missing_summary[missing_summary['Missing Percentage'] > 0])



                                Column  Missing Values  Missing Percentage
translated_lyrics    translated_lyrics          142842           90.514603
playlists_names        playlists_names           63084           39.974400
positions                    positions           63084           39.974400
playlist_ids              playlist_ids           63084           39.974400
track_uri                    track_uri           63072           39.966796
views                            views           43204           27.377052
duration_ms                duration_ms            4451            2.820462
lyrics                          lyrics            2711            1.717878
language                      language            2705            1.714076
combined_genres        combined_genres            1174            0.743928
album_release_date  album_release_date             198            0.125467
album_name                  album_name             198            0.125467
spotify_url              

- Columnas con más del 60% de nulos:   
Estas son candidatas para eliminación si no son críticas para el análisis. Ejemplos: playlists_names *Necesito saber como funciona*, positions *es de playlist?*, playlist_ids *idem*, track_uri *podemos quitarla teniendo ya las urls*  
- Columnas con entre 12% y 40% de nulos:  
Decidimos caso por caso. Por ejemplo, genre, views, popularity podrían ser importantes.  
- Columnas críticas (por ejemplo, lyrics o genre). Ver qué falta.

##### - Language la he pasado por mi script detector de idiomas y ya estaría lista excepto por las que no tienen lyrics    
##### - Track Uri una vez tenemos los datos no nos hace falta  
##### - Tags tiene muchas columnas que solo sale unknown

In [19]:
# columnas que se deciden eliminar (a ir rellenando)
# df = df.drop(columns=['track_uri'], errors='ignore')

## Análisis de categóricas

In [17]:
# Configurar pandas para mostrar todas las columnas completas
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)

Lyrics

In [18]:
# Crear un dataframe con filas que tienen valores nulos en 'lyrics'
df_no_lyrics = df[df['lyrics'].isnull()]

# Verificar las primeras filas del nuevo dataframe
print(df_no_lyrics.head())

# Guardar el nuevo dataframe como un archivo CSV
df_no_lyrics.to_csv('df_no_lyrics.csv', index=False)
print("El archivo `df_no_lyrics.csv` ha sido guardado exitosamente.")

              artist_name                song_name  \
578   pupkulies & rebecca         la vie est belle   
1956      lata mangeshkar   pyar kiya to darna kya   
2178            the hives      civilizations dying   
3964        the saturdays  dont let me dance alone   
4073        demis roussos      good days have gone   

                              recording_id  danceable  not_danceable   male  \
578   29361f21-4857-451f-999f-800ff226390d      0.416          0.584  0.823   
1956  8746ded9-a4f4-4f2c-89af-1ae34c5b501f      0.161          0.839  0.573   
2178  95a33ad5-aa04-4411-b864-d334678ee861      0.149          0.851  0.412   
3964  007bc866-4bf5-44d3-8f89-2fe7f193d38b      1.000          0.000  0.173   
4073  00bd88f9-9257-42a8-b29f-50c66dbe5fb6      0.355          0.645  0.577   

      female  timbre_bright  timbre_dark  tonal  ...  duration_ms  popularity  \
578    0.177          0.977        0.023  0.246  ...     465533.0        38.0   
1956   0.427          0.581        0.4

### 'album_name', 'popularity', 'album_release_date', 'spotify_url'

In [16]:
# Crear un nuevo dataframe con las columnas de interés
columns_to_check = ['album_name', 'popularity', 'album_release_date', 'spotify_url']
df_no_url = df[columns_to_check]

# Contar nulos en cada columna
null_counts = df_no_url.isnull().sum()

# Mostrar resultados
print("Número de nulos por columna:")
print(null_counts)

# Verificar si todas las columnas tienen el mismo número de nulos
same_nulls = null_counts.nunique() == 1
print(f"\n¿Tienen todas las columnas el mismo número de nulos? {'Sí' if same_nulls else 'No'}")
print(f"\nParece igualmente que es caso de urls no encontradas, voy a pasarlas de nuevo a buscar urls por si hubiera suerte")


Número de nulos por columna:
album_name            480
popularity            462
album_release_date    462
spotify_url           461
dtype: int64

¿Tienen todas las columnas el mismo número de nulos? No

Parece igualmente que es caso de urls no encontradas, voy a pasarlas de nuevo a buscar urls por si hubiera suerte


In [75]:
# Crear un dataframe con filas que tienen valores nulos en 'spotify_url'
df_no_url = df[df['spotify_url'].isnull()]

# Verificar las primeras filas del nuevo dataframe
print(df_no_url.head())

# Guardar el nuevo dataframe como un archivo CSV
df_no_url.to_csv('df_no_url.csv', index=False)
print("El archivo `df_no_url.csv` ha sido guardado exitosamente.")


              artist_name                  song_name  \
121886     ray lamontagne           change your mind   
121888          dane cook           tire in the face   
121890  car seat headrest             leave together   
121892       teddy geiger  for you i will confidence   
121894      alyson stoner                       flow   

                                recording_id  danceable  not_danceable   male  \
121886  00539498-834a-4f92-bf66-e373bcee774c      0.011          0.989  0.989   
121888  0064ee08-1005-47a2-88a6-5eb867619215      0.625          0.375  0.035   
121890  0094daaa-f455-44a3-90fa-08dd4bf73b20      0.187          0.813  0.919   
121892  009f96ec-0c89-4733-a6a7-dfab81424dcc      0.939          0.061  0.827   
121894  00b44294-dc3e-46c6-af3b-77b4265f3084      0.979          0.021  0.008   

        female  timbre_bright  timbre_dark  tonal  ...  album_release_date  \
121886   0.011          0.996        0.004  0.549  ...                 NaN   
121888   0.965      

Duplicados

In [11]:
# Paso 1: Contar duplicados por columna
duplicate_counts = df.apply(lambda col: col.duplicated().sum())

# Mostrar el número de duplicados por columna
print("Número de duplicados por columna:")
print(duplicate_counts)



Número de duplicados por columna:
artist_name                           137166
song_name                              43992
recording_id                               0
danceable                             156822
not_danceable                         156822
male                                  156818
female                                156818
timbre_bright                         156822
timbre_dark                           156822
tonal                                 156822
atonal                                156822
instrumental                          156818
voice                                 156818
dortmund_alternative                  157216
dortmund_blues                        157058
dortmund_electronic                   156810
dortmund_folkcountry                  156953
dortmund_funksoulrnb                  157670
dortmund_jazz                         157221
dortmund_pop                          157595
dortmund_raphiphop                    157001
dortmund_rock        