In [None]:
import pandas as pd
import glob
import os

# Load all Parquet files and merge them
parquet_columns = ["track_uri", "album_name", "artist_name", "track_name"]
parquet_files = glob.glob("audio_features_for_all_songs.parquet")
parquet_df = pd.concat([pd.read_parquet(file, columns=parquet_columns) for file in parquet_files], ignore_index=True)

# Load CSV file containing song attributes and lyrics
csv_df = pd.read_csv("songs_with_attributes_and_lyrics.csv", usecols=["id", "lyrics"], dtype=str)  # Read as string to avoid type issues

# Remove "spotify:track:" prefix from track_uri in Parquet dataset
parquet_df["clean_track_id"] = parquet_df["track_uri"].str.replace("spotify:track:", "", regex=False)

# Merge Parquet dataset with lyrics from CSV
merged_df = parquet_df.merge(csv_df, left_on="clean_track_id", right_on="id", how="left").drop(columns=["clean_track_id", "id"])

# Keep only songs with available lyrics
songs_with_lyrics_df = merged_df.dropna(subset=["lyrics"])
songs_without_lyrics_df = merged_df[merged_df["lyrics"].isna()]

# Count the number of tracks with lyrics
num_tracks_with_lyrics = len(songs_with_lyrics_df)
num_without_lyrics = len(songs_without_lyrics_df)

print(f"Number of songs with available lyrics: {num_tracks_with_lyrics}")
print(f"Number of songs without lyrics: {num_without_lyrics}")

# Save the filtered dataset to Parquet
output_folder = "lyrics"
output_file = os.path.join(output_folder, "songs_with_lyrics_part1.parquet")

songs_with_lyrics_df.to_parquet(output_file, engine="pyarrow", compression="snappy", index=False)

Number of songs with available lyrics: 72987
                               track_uri                 album_name  \
4   spotify:track:5wWDfRF7NQKMx8ZPfrhBwa  Funk Volume 2013 - Single   
10  spotify:track:1Prj89bq2VcT6YC6rNI14D                      Bravo   
12  spotify:track:5m8amrP9cCBCeUZ1X0mTQk             All Your Fault   
14  spotify:track:6sZHCkmQa8hzxfdI9MuywS                Sho Me Love   
15  spotify:track:37fulLx1QJTGmxcuvEszaA    Gazing at the Moonlight   

        artist_name        track_name lyrics  
4            Hopsin  Funk Volume 2013    NaN  
10  Jerry Purpdrank             Bravo    NaN  
12           Hopsin    All Your Fault    NaN  
14        Rich Gang       Sho Me Love    NaN  
15           Hopsin     Story of Mine    NaN  


In [3]:
# Load CSV file containing lyrics
csv_df = pd.read_csv("spotify_millsongdata.csv", usecols=["artist", "song", "text"], dtype=str)
csv_df.rename(columns={"text": "lyrics"}, inplace=True)  # Rename lyrics column for consistency

# Load Parquet file (only required columns)
filtered_parquet_df = songs_without_lyrics_df.drop(columns=["lyrics"])

# Create a set of (artist, song_name) tuples for both datasets
parquet_tracks_set = set(zip(filtered_parquet_df["artist_name"].str.lower(), filtered_parquet_df["track_name"].str.lower()))
csv_tracks_set = set(zip(csv_df["artist"].str.lower(), csv_df["song"].str.lower()))

# Find common tracks based on (artist, song name)
common_tracks = parquet_tracks_set.intersection(csv_tracks_set)

# Merge Parquet dataset with lyrics based on (artist, track_name)
merged_df = filtered_parquet_df.merge(csv_df, 
                                      left_on=["artist_name", "track_name"], 
                                      right_on=["artist", "song"], 
                                      how="left").drop(columns=["artist", "song"])

# Split datasets
songs_with_lyrics_df = merged_df.dropna(subset=["lyrics"])   
songs_without_lyrics_df = merged_df[merged_df["lyrics"].isna()]

# Count and print stats
num_with_lyrics = len(songs_with_lyrics_df)
num_without_lyrics = len(songs_without_lyrics_df)

print(f"Number of songs with lyrics: {num_with_lyrics}")
print(f"Number of songs without lyrics: {num_without_lyrics}")

# Save datasets to Parquet
songs_with_lyrics_df.to_parquet(os.path.join(output_folder, "songs_with_lyrics_part2.parquet"), engine="pyarrow", compression="snappy", index=False)

Number of songs with lyrics: 3160
Number of songs without lyrics: 176089


In [4]:
# Load CSV file containing labeled lyrics (with the 'seq' column for lyrics) and rename 'seq' to 'lyrics'
csv_df = pd.read_csv("labeled_lyrics_cleaned.csv", usecols=["artist", "song", "seq"], dtype=str)
csv_df.rename(columns={"seq": "lyrics"}, inplace=True)

# Load Parquet file (only required columns)
filtered_parquet_df = songs_without_lyrics_df.drop(columns=["lyrics"])

# Create a set of (artist, song_name) tuples for both datasets
parquet_tracks_set = set(zip(filtered_parquet_df["artist_name"].str.lower(), filtered_parquet_df["track_name"].str.lower()))
csv_tracks_set = set(zip(csv_df["artist"].str.lower(), csv_df["song"].str.lower()))

# Find common tracks based on (artist, song name)
common_tracks = parquet_tracks_set.intersection(csv_tracks_set)

# Merge Parquet dataset with lyrics (from the 'lyrics' column in the CSV) based on (artist, track_name)
merged_df = filtered_parquet_df.merge(csv_df, 
                                      left_on=["artist_name", "track_name"], 
                                      right_on=["artist", "song"], 
                                      how="left").drop(columns=["artist", "song"])

# Split datasets
songs_with_lyrics_df = merged_df.dropna(subset=["lyrics"])   # Songs that have lyrics (based on 'lyrics' column)
songs_without_lyrics_df = merged_df[merged_df["lyrics"].isna()]  # Songs that don't have lyrics

# Count and print stats
num_with_lyrics = len(songs_with_lyrics_df)
num_without_lyrics = len(songs_without_lyrics_df)

print(f"Number of songs with lyrics: {num_with_lyrics}")
print(f"Number of songs without lyrics: {num_without_lyrics}")

# Save datasets to Parquet
songs_with_lyrics_df.to_parquet(os.path.join(output_folder, "songs_with_lyrics_part3.parquet"), engine="pyarrow", compression="snappy", index=False)

Number of songs with lyrics: 6493
Number of songs without lyrics: 169599


In [5]:
# Load CSV file containing song attributes and lyrics
csv_df = pd.read_csv("spotify_tracks.csv", usecols=["uri", "lyrics"], dtype=str)  

# Load Parquet file (only required columns)
filtered_parquet_df = songs_without_lyrics_df.drop(columns=["lyrics"])

# Merge Parquet dataset with lyrics from CSV
merged_df = filtered_parquet_df.merge(csv_df, left_on="track_uri", right_on="uri", how="left").drop(columns=["uri"])

# Keep only songs with available lyrics
songs_with_lyrics_df = merged_df.dropna(subset=["lyrics"])
songs_without_lyrics_df = merged_df[merged_df["lyrics"].isna()]

# Count and print stats
num_with_lyrics = len(songs_with_lyrics_df)
num_without_lyrics = len(songs_without_lyrics_df)

print(f"Number of songs with lyrics: {num_with_lyrics}")
print(f"Number of songs without lyrics: {num_without_lyrics}")

# Save datasets to Parquet
songs_with_lyrics_df.to_parquet(os.path.join(output_folder, "songs_with_lyrics_part4.parquet"), engine="pyarrow", compression="snappy", index=False)

Number of songs with lyrics: 3400
Number of songs without lyrics: 166199


In [7]:
parquet_files = glob.glob("songs")
parquet_df = pd.concat([pd.read_parquet(file) for file in parquet_files], ignore_index=True)

# Load Parquet file (only required columns)
filtered_parquet_df = pd.read_parquet("songs_without_lyrics.parquet")

# Create a set of (artist, song_name) tuples for both datasets
parquet_tracks_set = set(zip(filtered_parquet_df["artist_name"].str.lower(), filtered_parquet_df["track_name"].str.lower()))
csv_tracks_set = set(zip(parquet_df["artist"].str.lower(), parquet_df["title"].str.lower()))

# Find common tracks based on (artist, song name)
common_tracks = parquet_tracks_set.intersection(csv_tracks_set)

# Merge Parquet dataset with lyrics (from the 'lyrics' column in the CSV) based on (artist, track_name)
merged_df = filtered_parquet_df.merge(parquet_df, 
                                      left_on=["artist_name", "track_name"], 
                                      right_on=["artist", "title"], 
                                      how="left").drop(columns=["artist", "title"])

# Split datasets
songs_with_lyrics_df = merged_df.dropna(subset=["lyrics"])   # Songs that have lyrics (based on 'lyrics' column)
songs_without_lyrics_df = merged_df[merged_df["lyrics"].isna()]  # Songs that don't have lyrics

# Count and print stats
num_with_lyrics = len(songs_with_lyrics_df)
num_without_lyrics = len(songs_without_lyrics_df)

print(f"Number of songs with lyrics: {num_with_lyrics}")
print(f"Number of songs without lyrics: {num_without_lyrics}")

# Save datasets to Parquet
songs_with_lyrics_df.to_parquet(os.path.join(output_folder, "songs_with_lyrics_part5.parquet"), engine="pyarrow", compression="snappy", index=False)

: 