# PART 1: THIS SECTION AIMS TO FETCH SONG METADATA FOR A LIST OF SONGS

In [1]:
#fetch spotify metadata
import pandas as pd
import requests
import base64
import time
import os

# FUNCTION TO GET SPOTIFY API ACCESS TOKEN
def get_my_spotify_token(client_id, client_secret, timeout=10):
    """Request a Spotify access token using the client-credentials grant.

    Args:
        client_id: Spotify application client ID.
        client_secret: Spotify application client secret.
        timeout: Seconds to wait for the token endpoint before giving up.
            Without a timeout a stalled connection would block the cell forever.

    Returns:
        The value of "access_token" from the JSON response.

    Raises:
        Exception: if the endpoint answers with any status other than 200.
    """
    url = "https://accounts.spotify.com/api/token"
    # HTTP Basic auth: base64("<client_id>:<client_secret>")
    basic_credentials = base64.b64encode(f"{client_id}:{client_secret}".encode()).decode()
    headers = {"Authorization": "Basic " + basic_credentials}
    data = {"grant_type": "client_credentials"}

    response = requests.post(url, headers=headers, data=data, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Failed to retrieve token: {response.status_code}, {response.text}")
    return response.json()["access_token"]

# FUNCTION TO FETCH METADATA FROM SPOTIFY API WITH RATE LIMITING FUNCTIONALITY
def get_my_spotify_metadata(track_name, artist_name, access_token, timeout=10):
    """Look up one track on the Spotify search endpoint and return basic metadata.

    Args:
        track_name: Track title to search for.
        artist_name: Artist name to search for.
        access_token: Bearer token for the Spotify Web API.
        timeout: Seconds to wait for each HTTP request.

    Returns:
        A dict with the keys "Track Name", "Artist", "Album", "Release Year"
        and "Track Popularity" for the first search hit, or None when the
        search returns no items or the API answers with an error status.
        On HTTP 429 the function sleeps for the advertised Retry-After
        interval and tries again.
    """
    from urllib.parse import quote  # local import: the cell's import block stays untouched

    # Percent-encode the whole query so that characters such as '&', '#', '?'
    # or '+' inside a title/artist cannot truncate or corrupt the `q` parameter.
    query = quote(f"track:{track_name} artist:{artist_name}", safe="")
    search_url = f"https://api.spotify.com/v1/search?q={query}&type=track&limit=1"
    headers = {"Authorization": f"Bearer {access_token}"}

    while True:
        response = requests.get(search_url, headers=headers, timeout=timeout)

        if response.status_code == 200:
            data = response.json()
            items = (data.get("tracks") or {}).get("items") or []
            if not items:
                print(f"Track '{track_name}' by {artist_name} not found.")
                return None

            track = items[0]
            album = track.get("album") or {}
            artists = track.get("artists") or []
            # EXTRACTING ONLY THE RELEASE YEAR FROM RELEASE DATE
            release_date = album.get("release_date")
            release_year = release_date.split("-")[0] if release_date else None
            return {
                "Track Name": track.get("name"),
                "Artist": artists[0].get("name") if artists else None,
                "Album": album.get("name"),
                "Release Year": release_year,
                "Track Popularity": track.get("popularity")
            }

        if response.status_code == 429:  # Rate limit hit
            try:
                retry_after = int(response.headers.get("Retry-After", 1))
            except (TypeError, ValueError):
                # Header present but not an integer number of seconds: fall back to 1s.
                retry_after = 1
            print(f"Rate limit hit. Retrying after {retry_after} seconds...")
            time.sleep(retry_after)
            continue

        print(f"Error: {response.status_code}, {response.text}")
        return None

# LOADING THE SONG LIST FROM CSV FILE
song_list_df = pd.read_csv("C:/Users/Nutan/Downloads/songs.csv", encoding="ISO-8859-1")

# READING SPOTIFY API CREDENTIALS FROM THE ENVIRONMENT
# Secrets must not live in the notebook source: anyone who can read the file
# (or its version history) could use them. Export SPOTIFY_CLIENT_ID and
# SPOTIFY_CLIENT_SECRET before starting the kernel. Any secret that was
# previously committed in this notebook should be rotated.
SPOTIFY_CLIENT_ID = os.environ.get("SPOTIFY_CLIENT_ID")
SPOTIFY_CLIENT_SECRET = os.environ.get("SPOTIFY_CLIENT_SECRET")
if not SPOTIFY_CLIENT_ID or not SPOTIFY_CLIENT_SECRET:
    raise RuntimeError(
        "Set the SPOTIFY_CLIENT_ID and SPOTIFY_CLIENT_SECRET environment variables "
        "before running this cell."
    )

# GETTING THE SPOTIFY ACCESS TOKEN
spotify_access_token = get_my_spotify_token(SPOTIFY_CLIENT_ID, SPOTIFY_CLIENT_SECRET)

# BATCH CONFIGURATION AND OUTPUT LOCATION
batch_size = 300
output_file = "SPOTIFY_GENIUS_SONGS_DATASET.csv"

# RESUME SUPPORT: START EMPTY, THEN RELOAD EARLIER RESULTS IF THE OUTPUT FILE IS ALREADY THERE
# NOTE(review): the keys below are the (Track Name, Artist) values stored in the output file;
# confirm they match the spelling used in the input song list, otherwise resumed runs
# will not recognise previously fetched rows.
my_spotify_data = []
processed_tracks = set()
if os.path.exists(output_file):
    my_spotify_data = pd.read_csv(output_file).to_dict(orient="records")
    processed_tracks = {(record["Track Name"], record["Artist"]) for record in my_spotify_data}

# PROCESSING THE SONGS IN BATCHES TO AVOID RATE LIMIT
total_tracks = len(song_list_df)
for start_idx in range(0, total_tracks, batch_size):
    end_idx = min(start_idx + batch_size, total_tracks)
    batch = song_list_df.iloc[start_idx:end_idx]
    print(f"Processing batch {start_idx + 1} to {end_idx} of {total_tracks}...")

    # Counts API lookups made in this batch so the cool-down below can be
    # skipped when every row was already processed (resume case).
    requests_in_batch = 0

    for _, row in batch.iterrows():
        track_name = row['Track Name']
        artist_name = row['Artist']

        # SKIPPING ALREADY PROCESSED TRACKS
        if (track_name, artist_name) in processed_tracks:
            continue

        # FETCHING THE SONG METADATA FROM SPOTIFY
        song_metadata = get_my_spotify_metadata(track_name, artist_name, spotify_access_token)
        requests_in_batch += 1

        if song_metadata:
            my_spotify_data.append(song_metadata)
            processed_tracks.add((track_name, artist_name))
        else:
            print(f"Metadata not found for {track_name} by {artist_name}")

        # ADDING A DELAY BETWEEN REQUESTS
        time.sleep(0.1)

    # SAVING PROGRESS AFTER EVERY BATCH
    pd.DataFrame(my_spotify_data).to_csv(output_file, index=False)
    print(f"Batch {start_idx + 1} to {end_idx} saved to {output_file}.")

    # PAUSING BETWEEN BATCHES
    # Only wait when another batch follows and this batch actually hit the API;
    # sleeping after the final batch or after a fully skipped batch is wasted time.
    if end_idx < total_tracks and requests_in_batch:
        print("Pausing for 30 seconds between batches...")
        time.sleep(30)

# SAVING ALL DATA TO CSV FILE
pd.DataFrame(my_spotify_data).to_csv(output_file, index=False)

print(f"All batches processed. Final dataset saved to {output_file}.")


Processing batch 1 to 300 of 1494...
Batch 1 to 300 saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.
Pausing for 30 seconds between batches...
Processing batch 301 to 600 of 1494...
Batch 301 to 600 saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.
Pausing for 30 seconds between batches...
Processing batch 601 to 900 of 1494...
Batch 601 to 900 saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.
Pausing for 30 seconds between batches...
Processing batch 901 to 1200 of 1494...
Batch 901 to 1200 saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.
Pausing for 30 seconds between batches...
Processing batch 1201 to 1494 of 1494...
Track 'One Way Up' by Fredde Le Grand not found.
Metadata not found for One Way Up by Fredde Le Grand
Batch 1201 to 1494 saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.
Pausing for 30 seconds between batches...
All batches processed. Final dataset saved to SPOTIFY_GENIUS_SONGS_DATASET.csv.


In [3]:
# RELOAD THE CSV WRITTEN BY THE PREVIOUS CELL AND SHOW ITS FIRST TEN ROWS
fetched_songs = pd.read_csv("SPOTIFY_GENIUS_SONGS_DATASET.csv")
fetched_songs.head(10)

Unnamed: 0,Track Name,Artist,Album,Release Year,Track Popularity
0,Oops!...I Did It Again,Britney Spears,Oops!... I Did It Again,2000,79
1,Bye Bye Bye - From Deadpool and Wolverine Soun...,*NSYNC,No Strings Attached,2000,85
2,What a Girl Wants,Christina Aguilera,Christina Aguilera (Expanded Edition),1999,64
3,Candy,Mandy Moore,So Real,1999,50
4,Shape of My Heart,Backstreet Boys,Black & Blue,2000,70
5,I Think I'm in Love with You,Jessica Simpson,Sweet Kisses,1999,47
6,Lucky,Britney Spears,Oops!... I Did It Again,2000,64
7,Give Me Just One Night (Una Noche),98º,Revelation,2000,41
8,Reach,S Club,"""7""",2000,52
9,Halfway Around The World,A*Teens,Teen Spirit,2001,36


# PART 2: THIS SECTION AIMS AT FETCHING THE LYRICS OF SONGS USING THE GENIUS API

In [21]:
# fetch lyrics from genius using fuzzy match
import lyricsgenius
import pandas as pd
import time
import os
from rapidfuzz import fuzz, process  # For fuzzy matching

# INITIALIZING GENIUS WITH AN ACCESS TOKEN READ FROM THE ENVIRONMENT
# The token must not be stored in the notebook source. Export GENIUS_ACCESS_TOKEN
# before starting the kernel. Any token that was previously committed in this
# notebook should be revoked and regenerated.
GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN")
if not GENIUS_ACCESS_TOKEN:
    raise RuntimeError("Set the GENIUS_ACCESS_TOKEN environment variable before running this cell.")
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)
genius.remove_section_headers = True #THIS REMOVES UNREQUIRED HEADINGS IN LYRICS

# LOADING THE SPOTIFY METADATA CSV AND THE ORIGINAL SONGS LIST
spotify_df = pd.read_csv("SPOTIFY_GENIUS_SONGS_DATASET.csv")
original_list_df = pd.read_csv("C:/Users/Nutan/Downloads/songs.csv", encoding="ISO-8859-1")

# FUNCTION TO NORMALIZE SONG TITLES
def normalize_title(title):
    """Return the title truncated at the first " (feat." and then at the first " (with", stripped."""
    # Keep only what precedes each marker; partition leaves the text intact when a marker is absent.
    before_feat, _, _ = title.partition(" (feat.")
    before_with, _, _ = before_feat.partition(" (with")
    return before_with.strip()


# MAKE SURE THE SPOTIFY DATA HAS A LYRICS COLUMN
if 'Lyrics' not in spotify_df.columns:
    spotify_df['Lyrics'] = ""

# WHERE THE COMBINED DATASET IS WRITTEN
output_file = "my_SPOTIFY_GENIUS_SONG_DATASET.csv"

# RESUME SUPPORT: START EMPTY, THEN RELOAD EARLIER RESULTS AND DROP ROWS THAT ARE ALREADY DONE
lyrics_data = []
processed_tracks = set()
if os.path.exists(output_file):
    processed_df = pd.read_csv(output_file)
    processed_tracks = set(zip(processed_df['Track Name'], processed_df['Artist']))
    already_processed = spotify_df.set_index(['Track Name', 'Artist']).index.isin(processed_tracks)
    spotify_df = spotify_df[~already_processed]
    lyrics_data = processed_df.to_dict(orient="records")  # Carry the existing rows forward

# FUZZY MATCHING THRESHOLD
# ONLY A LOOSE TITLE MATCH IS WANTED (NOT THE FULL TITLE AS PROVIDED BY SPOTIFY), HENCE THE LOW VALUE OF 10
FUZZY_MATCH_THRESHOLD = 10

# BATCH CONFIGURATION
batch_size = 100
total_tracks = len(spotify_df)
spotify_df = spotify_df.reset_index(drop=True)

print(f"Total songs to process: {total_tracks}")

# WALK THE REMAINING SONGS BATCH BY BATCH, WRITING THE CSV AFTER EACH BATCH
for start_idx in range(0, total_tracks, batch_size):
    end_idx = min(start_idx + batch_size, total_tracks)
    batch = spotify_df.iloc[start_idx:end_idx]
    print(f"\nProcessing batch {start_idx + 1} to {end_idx}...")

    for index, row in batch.iterrows():
        track_name = row['Track Name']
        artist_name = row['Artist']

        # VALUE STORED WHEN NO LYRICS COULD BE RETRIEVED; OVERWRITTEN BELOW ON SUCCESS OR ERROR
        lyrics_text = "Lyrics not found"

        try:
            # FIRST ATTEMPT: SEARCH GENIUS WITH THE NORMALIZED SPOTIFY TITLE
            normalized_track_name = normalize_title(track_name)
            song = genius.search_song(normalized_track_name, artist_name)
            if song:
                lyrics_text = song.lyrics
            else:
                print(f"Lyrics not found for '{track_name}' by {artist_name}. Attempting with original list...")

                # SECOND ATTEMPT: FUZZY MATCH AGAINST THE ORIGINAL LIST, RESTRICTED TO THE SAME ARTIST
                same_artist = original_list_df['Artist'].str.lower() == artist_name.lower()
                artist_matches = original_list_df[same_artist]
                if artist_matches.empty:
                    print(f"No matching artist found in original list for '{track_name}' by {artist_name}.")
                else:
                    best_match = process.extractOne(
                        track_name,
                        artist_matches['Track Name'],
                        scorer=fuzz.token_sort_ratio
                    )
                    if not best_match or best_match[1] < FUZZY_MATCH_THRESHOLD:
                        print(f"No suitable fuzzy match found for '{track_name}' by {artist_name}.")
                    else:
                        original_title = best_match[0]
                        print(f"Fuzzy match found: '{original_title}' with score {best_match[1]}")
                        song = genius.search_song(original_title, artist_name)
                        if song:
                            lyrics_text = song.lyrics
                            print(f"Lyrics found using fuzzy match title '{original_title}'.")
                        else:
                            print(f"Lyrics not found for fuzzy match title '{original_title}'.")
        except Exception as exc:
            lyrics_text = "Error fetching lyrics"
            print(f"Error fetching lyrics for '{track_name}' by {artist_name}: {exc}")

        # EXACTLY ONE RECORD PER SONG, WHATEVER THE OUTCOME ABOVE
        lyrics_data.append({
            "Track Name": track_name,
            "Artist": artist_name,
            "Album": row['Album'],
            "Release Year": row['Release Year'],
            "Track Popularity": row['Track Popularity'],
            "Lyrics": lyrics_text
        })

        # DELAY TO AVOID RATE LIMITS
        time.sleep(0.5)

    # SAVING PROGRESS AFTER EACH BATCH
    pd.DataFrame(lyrics_data).to_csv(output_file, index=False)
    print(f"Batch {start_idx + 1} to {end_idx} processed and saved.")

print("\nAll batches processed. Final dataset saved.")


Total songs to process: 1490

Processing batch 1 to 100...
Searching for "Oops!...I Did It Again" by Britney Spears...
Done.
Searching for "Bye Bye Bye - From Deadpool and Wolverine Soundtrack" by *NSYNC...
Done.
Searching for "What a Girl Wants" by Christina Aguilera...
Done.
Searching for "Candy" by Mandy Moore...
Done.
Searching for "Shape of My Heart" by Backstreet Boys...
Done.
Searching for "I Think I'm in Love with You" by Jessica Simpson...
Done.
Searching for "Lucky" by Britney Spears...
Done.
Searching for "Give Me Just One Night (Una Noche)" by 98º...
Done.
Searching for "Reach" by S Club...
Done.
Searching for "Halfway Around The World" by A*Teens...
Done.
Searching for "Summer Girls" by LFO...
Done.
Searching for "He Loves You Not" by Dream...
Done.
Searching for "I'm a Slave 4 U" by Britney Spears...
Done.
Searching for "Pop" by *NSYNC...
Done.
Searching for "Irresistible" by Jessica Simpson...
Done.
Searching for "Whenever, Wherever" by Shakira...
Done.
Searching for "I 

### RETRYING FOR SONGS WHICH HAD ERROR FETCHING LYRICS

In [27]:
# retrying for songs where there was error fetching lyrics
import lyricsgenius
import pandas as pd
import time
from rapidfuzz import fuzz, process  # For fuzzy matching

# INITIALIZING GENIUS WITH AN ACCESS TOKEN READ FROM THE ENVIRONMENT
# The token must not be stored in the notebook source. Export GENIUS_ACCESS_TOKEN
# before starting the kernel. Any token that was previously committed in this
# notebook should be revoked and regenerated.
import os  # this cell's import block does not bring in `os`

GENIUS_ACCESS_TOKEN = os.environ.get("GENIUS_ACCESS_TOKEN")
if not GENIUS_ACCESS_TOKEN:
    raise RuntimeError("Set the GENIUS_ACCESS_TOKEN environment variable before running this cell.")
genius = lyricsgenius.Genius(GENIUS_ACCESS_TOKEN)
genius.remove_section_headers = True

# LOADING THE DATASET WITH ERRORS
input_file = "my_SPOTIFY_GENIUS_SONG_DATASET.csv"
original_list_df = pd.read_csv("C:/Users/Nutan/Downloads/songs.csv", encoding="ISO-8859-1")

# FUNCTION TO NORMALIZE SONG TITLES
def normalize_title(title):
    """Cut the title at " (feat." and then at " (with", returning the stripped remainder."""
    for marker in (" (feat.", " (with"):
        # split()[0] is the text before the marker, or the whole string if the marker is absent
        title = title.split(marker)[0]
    return title.strip()
    
# LOADING DATASET
df = pd.read_csv(input_file)

# KEEP ONLY THE ROWS WHOSE EARLIER FETCH ENDED IN AN ERROR
failed_mask = df['Lyrics'] == "Error fetching lyrics"
retry_df = df[failed_mask].copy()

# FUZZY MATCH THRESHOLD
FUZZY_MATCH_THRESHOLD = 10

# RETRY EACH FAILED ROW
print(f"Retrying lyrics fetching for {len(retry_df)} rows with errors...")

for index, row in retry_df.iterrows():
    track_name = row['Track Name']
    artist_name = row['Artist']

    # VALUE WRITTEN BACK WHEN NOTHING IS FOUND; OVERWRITTEN BELOW ON SUCCESS OR ERROR
    lyrics_text = "Lyrics not found"

    try:
        # FIRST ATTEMPT: SEARCH GENIUS WITH THE NORMALIZED SPOTIFY TITLE
        normalized_track_name = normalize_title(track_name)
        song = genius.search_song(normalized_track_name, artist_name)
        if song:
            lyrics_text = song.lyrics
            print(f"Lyrics fetched for '{track_name}' by {artist_name}.")
        else:
            print(f"Lyrics not found for '{track_name}' by {artist_name}. Attempting fuzzy match...")

            # SECOND ATTEMPT: FUZZY MATCH AGAINST THE ORIGINAL LIST, RESTRICTED TO THE SAME ARTIST
            same_artist = original_list_df['Artist'].str.lower() == artist_name.lower()
            artist_matches = original_list_df[same_artist]
            if artist_matches.empty:
                print(f"No matching artist found in original list for '{track_name}' by {artist_name}.")
            else:
                best_match = process.extractOne(
                    track_name,
                    artist_matches['Track Name'],
                    scorer=fuzz.token_sort_ratio
                )
                if not best_match or best_match[1] < FUZZY_MATCH_THRESHOLD:
                    print(f"No suitable fuzzy match found for '{track_name}' by {artist_name}.")
                else:
                    original_title = best_match[0]
                    print(f"Fuzzy match found: '{original_title}' with score {best_match[1]}")
                    song = genius.search_song(original_title, artist_name)
                    if song:
                        lyrics_text = song.lyrics
                        print(f"Lyrics fetched using fuzzy match '{original_title}'.")
                    else:
                        print(f"Lyrics not found for fuzzy match '{original_title}'.")
    except Exception as exc:
        lyrics_text = "Error fetching lyrics"
        print(f"Error fetching lyrics for '{track_name}' by {artist_name}: {exc}")

    # WRITE THE OUTCOME BACK INTO THE FULL DATASET AT THE SAME ROW LABEL
    df.loc[index, 'Lyrics'] = lyrics_text

    # DELAY TO AVOID RATE LIMIT
    time.sleep(0.5)

# SAVING THE UPDATED DATASET TO THE SAME CSV
df.to_csv(input_file, index=False)

print(f"Retried lyrics fetching completed. Dataset updated in '{input_file}'.")


Retrying lyrics fetching for 19 rows with errors...
Searching for "Toxic" by Britney Spears...
Done.
Lyrics fetched for 'Toxic' by Britney Spears.
Searching for "Umbrella" by Rihanna...
Done.
Lyrics fetched for 'Umbrella' by Rihanna.
Searching for "Shake It Off" by Taylor Swift...
Done.
Lyrics fetched for 'Shake It Off' by Taylor Swift.
Searching for "Love Yourself" by Justin Bieber...
Done.
Lyrics fetched for 'Love Yourself' by Justin Bieber.
Searching for "Into You" by Ariana Grande...
Done.
Lyrics fetched for 'Into You' by Ariana Grande.
Searching for "Blinding Lights" by The Weeknd...
Done.
Lyrics fetched for 'Blinding Lights' by The Weeknd.
Searching for "In the End" by Linkin Park...
Done.
Lyrics fetched for 'In the End' by Linkin Park.
Searching for "21 Questions" by 50 Cent...
Done.
Lyrics fetched for '21 Questions' by 50 Cent.
Searching for "Feel Good Inc." by Gorillaz...
Done.
Lyrics fetched for 'Feel Good Inc.' by Gorillaz.
Searching for "Started From the Bottom" by Drake...

# PART 3: CLEANING THE FETCHED LYRICS

In [29]:
#clean the lyrics
import pandas as pd
import re

# READ THE COMBINED SPOTIFY + GENIUS DATASET THAT THIS CELL CLEANS AND LATER OVERWRITES
input_file = "my_SPOTIFY_GENIUS_SONG_DATASET.csv"
spotify_df = pd.read_csv(input_file)

# FUNCTION TO CLEAN THE LYRICS BY REMOVING LINE LABELS AND ANNOTATIONS
def clean_lyrics(lyrics):
    """Strip the leading line, bracketed labels, a trailing "<digits>Embed" marker and blank lines.

    Empty or missing input is returned unchanged. For multi-line text the first
    line is always dropped (unconditionally) before the remaining clean-up runs.
    """
    if not lyrics or pd.isna(lyrics):
        return lyrics

    # DROP EVERYTHING UP TO AND INCLUDING THE FIRST LINE BREAK (ONLY WHEN THERE IS ONE)
    _first_line, line_break, remainder = lyrics.partition('\n')
    if line_break:
        lyrics = remainder.strip()

    # APPLIED IN ORDER: BRACKETED LABELS, TRAILING "<digits>Embed", RUNS OF NEWLINES
    substitutions = (
        (r'\[.*?\]', ''),
        (r'\d*Embed$', ''),
        (r'\n+', '\n'),
    )
    for pattern, replacement in substitutions:
        lyrics = re.sub(pattern, replacement, lyrics)

    return lyrics.strip()

# APPLYING clean_lyrics TO THE "Lyrics" COLUMN
# Missing values are passed through untouched. Converting them with str() first
# would turn NaN into the literal text "nan" and store it as if it were lyrics.
if 'Lyrics' in spotify_df.columns:
    spotify_df['Lyrics'] = spotify_df['Lyrics'].apply(
        lambda x: x if pd.isna(x) else clean_lyrics(str(x))
    )
    print("Lyrics cleaned successfully.")
else:
    print("Error: 'Lyrics' column is missing in the dataset.")

# OVERWRITE THE SAME CSV FILE WITH CLEANED LYRICS
spotify_df.to_csv(input_file, index=False)

spotify_df[['Track Name', 'Artist', 'Lyrics']].head(10)


Lyrics cleaned successfully.


Unnamed: 0,Track Name,Artist,Lyrics
0,Oops!...I Did It Again,Britney Spears,"Mmm, yeah\nYeah, yeah, yeah, yeah, yeah, yeah\..."
1,Bye Bye Bye - From Deadpool and Wolverine Soun...,*NSYNC,"Bye, bye, bye\nBye, bye\n♪\nBye, bye\n♪\nI, I'..."
2,What a Girl Wants,Christina Aguilera,"What a girl wants, what a girl needs\nWhatever..."
3,Candy,Mandy Moore,"Give it to me\nOoh, oh\nYeah, yeah, yeah, yeah..."
4,Shape of My Heart,Backstreet Boys,"Hmm, mmmh, yeah, yeah\nBaby, please try to for..."
5,I Think I'm in Love with You,Jessica Simpson,"Yeah, yeah, yeah, yeah\nOh-oh\nOh-oh-oh\nEvery..."
6,Lucky,Britney Spears,This is a story about a girl named Lucky\nEarl...
7,Give Me Just One Night (Una Noche),98º,You keep telling me you want me\nHold me close...
8,Reach,S Club,When the world leaves you feeling blue\nYou ca...
9,Halfway Around The World,A*Teens,(Halfway around the world)\nHalfway around the...


# PART 4: PREPROCESSING THE LYRICS COLUMN FOR NLP

In [31]:
# preprocess the lyrics for nlp
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# FETCH THE NLTK RESOURCES USED BELOW (TOKENIZER MODEL, STOP-WORD LIST, WORDNET)
for nltk_package in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(nltk_package)

# READ THE DATASET THAT CONTAINS THE CLEANED LYRICS
spotify_df = pd.read_csv("my_SPOTIFY_GENIUS_SONG_DATASET.csv")

# NLP HELPERS SHARED BY THE PREPROCESSING FUNCTION
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

# FUNCTION TO PREPROCESS LYRICS
def preprocess_lyrics(lyrics):
    """Lower-case, de-noise, tokenize, drop stop words and lemmatize a lyrics string.

    Relies on the module-level `word_tokenize`, `stop_words` and `lemmatizer`.
    Returns the surviving lemmas joined by single spaces.

    NOTE(review): punctuation is stripped before the stop-word check, so a
    contraction such as "don't" becomes "dont" and may no longer match the
    stop-word list — confirm this is intended.
    """
    text = lyrics.lower()

    # APPLIED IN ORDER: FILLER SYLLABLES (oh / yeah / na / la / uh / ah), THEN PUNCTUATION
    for pattern in (r'\b(oh|yeah|na|la|uh|ah)+\b', r'[^\w\s]'):
        text = re.sub(pattern, '', text)

    # KEEP NON-STOP-WORD TOKENS, LEMMATIZED
    kept_words = []
    for token in word_tokenize(text):
        if token in stop_words:
            continue
        kept_words.append(lemmatizer.lemmatize(token))

    return ' '.join(kept_words)

# BUILD THE 'Processed_Lyrics' COLUMN; NON-STRING ENTRIES BECOME AN EMPTY STRING
spotify_df['Processed_Lyrics'] = spotify_df['Lyrics'].map(
    lambda raw_lyrics: "" if not isinstance(raw_lyrics, str) else preprocess_lyrics(raw_lyrics)
)

# WRITE THE DATASET BACK TO THE SAME FILE
spotify_df.to_csv("my_SPOTIFY_GENIUS_SONG_DATASET.csv", index=False)

spotify_df[['Track Name', 'Artist','Lyrics', 'Processed_Lyrics']].head(10)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Nutan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Track Name,Artist,Lyrics,Processed_Lyrics
0,Oops!...I Did It Again,Britney Spears,"Mmm, yeah\nYeah, yeah, yeah, yeah, yeah, yeah\...",mmm think made believe friend baby might seem ...
1,Bye Bye Bye - From Deadpool and Wolverine Soun...,*NSYNC,"Bye, bye, bye\nBye, bye\n♪\nBye, bye\n♪\nI, I'...",bye bye bye bye bye bye bye im tonight youre p...
2,What a Girl Wants,Christina Aguilera,"What a girl wants, what a girl needs\nWhatever...",girl want girl need whatever make happy set fr...
3,Candy,Mandy Moore,"Give it to me\nOoh, oh\nYeah, yeah, yeah, yeah...",give ooh ooh give ooh ooh give im addicted lov...
4,Shape of My Heart,Backstreet Boys,"Hmm, mmmh, yeah, yeah\nBaby, please try to for...",hmm mmmh baby please try forgive stay dont put...
5,I Think I'm in Love with You,Jessica Simpson,"Yeah, yeah, yeah, yeah\nOh-oh\nOh-oh-oh\nEvery...",every time youre near baby get kinda crazy hea...
6,Lucky,Britney Spears,This is a story about a girl named Lucky\nEarl...,story girl named lucky early morning wake knoc...
7,Give Me Just One Night (Una Noche),98º,You keep telling me you want me\nHold me close...,keep telling want hold close night know deep i...
8,Reach,S Club,When the world leaves you feeling blue\nYou ca...,world leaf feeling blue count seems hope dream...
9,Halfway Around The World,A*Teens,(Halfway around the world)\nHalfway around the...,halfway around world halfway around world half...


# PART 5: SENTIMENT ANALYSIS OF LYRICS

In [33]:
#sentiment and sentiment score column
from textblob import TextBlob
import pandas as pd

# FUNCTION TO ANALYZE SENTIMENT AND PROVIDE SENTIMENT SCORE
def analyze_sentiment(text):
    """Return (label, polarity) for `text`, where polarity comes from TextBlob.

    The label is "Positive" for polarity above zero, "Negative" for polarity
    below zero and "Neutral" otherwise.
    """
    polarity = TextBlob(text).sentiment.polarity
    if polarity > 0:
        sentiment = "Positive"
    elif polarity < 0:
        sentiment = "Negative"
    else:
        sentiment = "Neutral"
    return sentiment, polarity

# SCORE EVERY PROCESSED LYRIC; NON-STRING ENTRIES GET AN EMPTY LABEL AND A SCORE OF 0
sentiment_columns = spotify_df['Processed_Lyrics'].apply(
    lambda lyric_text: pd.Series(("", 0) if not isinstance(lyric_text, str) else analyze_sentiment(lyric_text))
)
spotify_df[['Sentiment', 'Sentiment Score']] = sentiment_columns
spotify_df.to_csv("my_SPOTIFY_GENIUS_SONG_DATASET.csv", index=False)


spotify_df[['Track Name', 'Artist', 'Processed_Lyrics', 'Sentiment', 'Sentiment Score']].head(10)


Unnamed: 0,Track Name,Artist,Processed_Lyrics,Sentiment,Sentiment Score
0,Oops!...I Did It Again,Britney Spears,mmm think made believe friend baby might seem ...,Positive,0.133514
1,Bye Bye Bye - From Deadpool and Wolverine Soun...,*NSYNC,bye bye bye bye bye bye bye im tonight youre p...,Negative,-0.062693
2,What a Girl Wants,Christina Aguilera,girl want girl need whatever make happy set fr...,Positive,0.36138
3,Candy,Mandy Moore,give ooh ooh give ooh ooh give im addicted lov...,Positive,0.060714
4,Shape of My Heart,Backstreet Boys,hmm mmmh baby please try forgive stay dont put...,Positive,0.09
5,I Think I'm in Love with You,Jessica Simpson,every time youre near baby get kinda crazy hea...,Positive,0.203346
6,Lucky,Britney Spears,story girl named lucky early morning wake knoc...,Positive,0.217949
7,Give Me Just One Night (Una Noche),98º,keep telling want hold close night know deep i...,Negative,-0.343878
8,Reach,S Club,world leaf feeling blue count seems hope dream...,Positive,0.241615
9,Halfway Around The World,A*Teens,halfway around world halfway around world half...,Negative,-0.097436


# PART 6: KEYWORD EXTRACTION FROM LYRICS

In [35]:
from keybert import KeyBERT
from sentence_transformers import SentenceTransformer
import pandas as pd

# REUSE THE DATAFRAME FROM THE PREVIOUS STEPS (SAME OBJECT, NOT A COPY)
music_df = spotify_df

# KeyBERT KEYWORD EXTRACTOR BACKED BY THE 'all-MiniLM-L6-v2' EMBEDDING MODEL
model = KeyBERT('all-MiniLM-L6-v2')

# FUNCTION TO EXTRACT KEYWORDS USING KeyBERT
def extracting_keywords_with_keybert(text, top_n=5):
    """Return up to `top_n` keywords for `text` as one comma-separated string.

    Uses the module-level `model`; only the first element of each result it
    returns is kept.
    """
    extracted = model.extract_keywords(text, top_n=top_n)
    keyword_terms = (entry[0] for entry in extracted)
    return ", ".join(keyword_terms)

# RUN KeyBERT KEYWORD EXTRACTION ON THE 'Lyrics' COLUMN (MISSING LYRICS ARE TREATED AS EMPTY TEXT)
lyrics_text = music_df['Lyrics'].fillna("")
music_df['Extracted_Keywords'] = lyrics_text.map(extracting_keywords_with_keybert)

# WRITE THE DATASET, INCLUDING THE NEW COLUMN, BACK TO THE CSV FILE
music_df.to_csv("my_SPOTIFY_GENIUS_SONG_DATASET.csv", index=False)

# SHOW THE EXTRACTED KEYWORDS NEXT TO THE LYRICS
music_df[['Track Name', 'Lyrics', 'Extracted_Keywords']].head(10)

  from tqdm.autonotebook import tqdm, trange


Unnamed: 0,Track Name,Lyrics,Extracted_Keywords
0,Oops!...I Did It Again,"Mmm, yeah\nYeah, yeah, yeah, yeah, yeah, yeah\...","dreamin, wishin, fool, aw, love"
1,Bye Bye Bye - From Deadpool and Wolverine Soun...,"Bye, bye, bye\nBye, bye\n♪\nBye, bye\n♪\nI, I'...","byeyou, ain, leave, bye, baby"
2,What a Girl Wants,"What a girl wants, what a girl needs\nWhatever...","girl, christina, needs, love, wants"
3,Candy,"Give it to me\nOoh, oh\nYeah, yeah, yeah, yeah...","craving, sugar, candy, begging, addicted"
4,Shape of My Heart,"Hmm, mmmh, yeah, yeah\nBaby, please try to for...","thinkin, confession, lookin, stay, door"
5,I Think I'm in Love with You,"Yeah, yeah, yeah, yeah\nOh-oh\nOh-oh-oh\nEvery...","love, baby, tellin, think, mind"
6,Lucky,This is a story about a girl named Lucky\nEarl...,"actress, hollywood, tears, cries, wakes"
7,Give Me Just One Night (Una Noche),You keep telling me you want me\nHold me close...,"night, una, tu, noche, movimiento"
8,Reach,When the world leaves you feeling blue\nYou ca...,"rainbow, dreams, shining, ocean, stars"
9,Halfway Around The World,(Halfway around the world)\nHalfway around the...,"loving, baby, growin, love, world"


In [16]:
# Print a column / dtype / non-null summary of `df`.
# NOTE(review): this cell's execution count (In [16]) is lower than that of the cells above it,
# and its output lists a 'Dominant Topic' column that no visible cell creates — `df` presumably
# comes from kernel state not reproducible from the cells shown here; confirm before relying on it.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track Name          1490 non-null   object 
 1   Artist              1490 non-null   object 
 2   Album               1490 non-null   object 
 3   Release Year        1490 non-null   int64  
 4   Track Popularity    1490 non-null   int64  
 5   Lyrics              1490 non-null   object 
 6   Processed_Lyrics    1490 non-null   object 
 7   Sentiment           1490 non-null   object 
 8   Sentiment Score     1490 non-null   float64
 9   Extracted_Keywords  1490 non-null   object 
 10  Dominant Topic      1490 non-null   int64  
dtypes: float64(1), int64(3), object(7)
memory usage: 128.2+ KB
