# DATASET PREPROCESSING : SONGS DATASET

In [5]:
import pandas as pd
# Raw Spotify/Genius songs export that the rest of the notebook preprocesses.
RAW_DATASET_CSV = "my_SPOTIFY_GENIUS_SONG_DATASET.csv"
music_df = pd.read_csv(RAW_DATASET_CSV)

In [9]:
# Columns removed before preprocessing: the topic label, the raw emotion
# scores and every per-emotion normalized score. Each label is listed exactly
# once (the original list repeated 'emotion_positive_normalized').
COLUMNS_TO_DROP = [
    'Dominant Topic',
    'emotion_scores',
    'emotion_trust_normalized',
    'emotion_joy_normalized',
    'emotion_positive_normalized',
    'emotion_anger_normalized',
    'emotion_disgust_normalized',
    'emotion_fear_normalized',
    'emotion_negative_normalized',
    'emotion_sadness_normalized',
    'emotion_anticipation_normalized',
    'emotion_surprise_normalized',
]

# Reassign rather than mutate in place, so the statement reads as
# "music_df is now the frame without these columns".
music_df = music_df.drop(columns=COLUMNS_TO_DROP)
music_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1490 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track Name          1490 non-null   object 
 1   Artist              1490 non-null   object 
 2   Album               1490 non-null   object 
 3   Release Year        1490 non-null   int64  
 4   Track Popularity    1490 non-null   int64  
 5   Lyrics              1490 non-null   object 
 6   Processed_Lyrics    1490 non-null   object 
 7   Sentiment           1490 non-null   object 
 8   Sentiment Score     1490 non-null   float64
 9   Extracted_Keywords  1490 non-null   object 
dtypes: float64(1), int64(2), object(7)
memory usage: 116.5+ KB


### STEP 1: PREPROCESSING THE TEXT COLUMNS 

### The `Lyrics`, `Processed_Lyrics` and `Extracted_Keywords` columns were already processed when the dataset was created, so they are skipped in this step.

In [11]:
import pandas as pd
import re

# TEXT NORMALIZATION HELPER
def preprocess_text(text):
    """Return ``text`` lower-cased with surrounding whitespace removed.

    Values that are not strings are handed back unchanged. Punctuation and
    other special characters are left exactly as they are.
    """
    # Guard clause: anything that is not a string passes straight through.
    if not isinstance(text, str):
        return text
    return text.lower().strip()

# Run the same text clean-up over every short free-text column, one at a time.
for text_column in ('Track Name', 'Artist', 'Album', 'Sentiment'):
    music_df[text_column] = music_df[text_column].apply(preprocess_text)

# Write the intermediate result to disk, then preview the first rows.
music_df.to_csv('music_preprocess.csv', index=False)
music_df.head()


Unnamed: 0,Track Name,Artist,Album,Release Year,Track Popularity,Lyrics,Processed_Lyrics,Sentiment,Sentiment Score,Extracted_Keywords
0,oops!...i did it again,britney spears,oops!... i did it again,2000,79,"Mmm, yeah\nYeah, yeah, yeah, yeah, yeah, yeah\...",mmm think made believe friend baby might seem ...,positive,0.133514,"dreamin, wishin, fool, aw, love"
1,bye bye bye - from deadpool and wolverine soun...,*nsync,no strings attached,2000,85,"Bye, bye, bye\nBye, bye\n♪\nBye, bye\n♪\nI, I'...",bye bye bye bye bye bye bye im tonight youre p...,negative,-0.062693,"byeyou, ain, leave, bye, baby"
2,what a girl wants,christina aguilera,christina aguilera (expanded edition),1999,64,"What a girl wants, what a girl needs\nWhatever...",girl want girl need whatever make happy set fr...,positive,0.36138,"girl, christina, needs, love, wants"
3,candy,mandy moore,so real,1999,50,"Give it to me\nOoh, oh\nYeah, yeah, yeah, yeah...",give ooh ooh give ooh ooh give im addicted lov...,positive,0.060714,"craving, sugar, candy, begging, addicted"
4,shape of my heart,backstreet boys,black & blue,2000,70,"Hmm, mmmh, yeah, yeah\nBaby, please try to for...",hmm mmmh baby please try forgive stay dont put...,positive,0.09,"thinkin, confession, lookin, stay, door"


### STEP 2: NORMALIZING THE NUMERICAL COLUMNS

### The per-emotion score columns were already normalized when the dataset was created (and were dropped from `music_df` at the start of this notebook), so they are not part of this step.

In [13]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# Numeric columns that get min-max scaled.
numerical_columns_music = ["Track Popularity", "Sentiment Score"]

# Fit the scaler on those columns and overwrite them with the scaled values.
min_max_scaler = MinMaxScaler()
scaled_values = min_max_scaler.fit_transform(music_df[numerical_columns_music])
music_df[numerical_columns_music] = scaled_values

# Preview the rescaled columns.
music_df[numerical_columns_music].head()


Unnamed: 0,Track Popularity,Sentiment Score
0,0.88764,0.484433
1,0.955056,0.367688
2,0.719101,0.620015
3,0.561798,0.441116
4,0.786517,0.458541


In [65]:
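# NOTE: this keyword-reformatting step is disabled (every line below is commented out);
# the output shown under this cell appears to come from an earlier run.
#music_df['Extracted_Keywords'] = music_df['Extracted_Keywords'].apply(lambda x: ' '.join(x) if isinstance(x, list) else x)
# Replace commas with spaces in the Extracted_Keywords column
#music_df['Extracted_Keywords'] = music_df['Extracted_Keywords'].str.replace(',', ' ')

#music_df['Extracted_Keywords'].head()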

0             dreamin  wishin  fool  aw  love
1               byeyou  ain  leave  bye  baby
2         girl  christina  needs  love  wants
3    craving  sugar  candy  begging  addicted
4     thinkin  confession  lookin  stay  door
Name: Extracted_Keywords, dtype: object

### STEP 4 : CHECKING FOR DUPLICATES

In [15]:
# Columns that together identify a song: two rows sharing both values are
# treated as duplicates of each other.
DUPLICATE_KEY = ['Track Name', 'Artist']

# CHECKING THE NUMBER OF DUPLICATE ROWS
num_duplicates = music_df.duplicated(subset=DUPLICATE_KEY).sum()

print(f"Number of duplicate rows: {num_duplicates}")

# REMOVING THE DUPLICATES
# Drop on the same key that was used for the count above (the first occurrence
# is kept), so the number printed is exactly the number of rows removed.
# Without `subset`, only rows identical in every column would be dropped.
music_df = music_df.drop_duplicates(subset=DUPLICATE_KEY)


Number of duplicate rows: 24


### STEP 5 : DROPPING ROWS WITH EMPTY 'LYRICS'

In [17]:
import pandas as pd

# COUNTING ROWS WITH EMPTY Lyrics
# A row counts as empty when Lyrics is NaN or the empty string.
empty_lyrics_mask = music_df['Lyrics'].isna() | (music_df['Lyrics'] == '')
empty_lyrics_count = int(empty_lyrics_mask.sum())

print(f"Number of rows with empty or NaN Lyrics: {empty_lyrics_count}")

# Only report "nothing to drop" when that is actually the case; otherwise
# remove the offending rows so this step does what its title says.
if empty_lyrics_count:
    music_df = music_df.loc[~empty_lyrics_mask].copy()
    print(f"Dropped {empty_lyrics_count} rows with empty or NaN Lyrics")
else:
    print("No rows to drop")

Number of rows with empty or NaN Lyrics: 0
No rows to drop


In [19]:
# Show the row count, non-null counts and dtypes of the frame at this point in the notebook.
music_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1466 entries, 0 to 1489
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Track Name          1466 non-null   object 
 1   Artist              1466 non-null   object 
 2   Album               1466 non-null   object 
 3   Release Year        1466 non-null   int64  
 4   Track Popularity    1466 non-null   float64
 5   Lyrics              1466 non-null   object 
 6   Processed_Lyrics    1466 non-null   object 
 7   Sentiment           1466 non-null   object 
 8   Sentiment Score     1466 non-null   float64
 9   Extracted_Keywords  1466 non-null   object 
dtypes: float64(2), int64(1), object(7)
memory usage: 126.0+ KB


In [21]:
# Preview the first rows of the frame before it is written to disk.
music_df.head()


Unnamed: 0,Track Name,Artist,Album,Release Year,Track Popularity,Lyrics,Processed_Lyrics,Sentiment,Sentiment Score,Extracted_Keywords
0,oops!...i did it again,britney spears,oops!... i did it again,2000,0.88764,"Mmm, yeah\nYeah, yeah, yeah, yeah, yeah, yeah\...",mmm think made believe friend baby might seem ...,positive,0.484433,"dreamin, wishin, fool, aw, love"
1,bye bye bye - from deadpool and wolverine soun...,*nsync,no strings attached,2000,0.955056,"Bye, bye, bye\nBye, bye\n♪\nBye, bye\n♪\nI, I'...",bye bye bye bye bye bye bye im tonight youre p...,negative,0.367688,"byeyou, ain, leave, bye, baby"
2,what a girl wants,christina aguilera,christina aguilera (expanded edition),1999,0.719101,"What a girl wants, what a girl needs\nWhatever...",girl want girl need whatever make happy set fr...,positive,0.620015,"girl, christina, needs, love, wants"
3,candy,mandy moore,so real,1999,0.561798,"Give it to me\nOoh, oh\nYeah, yeah, yeah, yeah...",give ooh ooh give ooh ooh give im addicted lov...,positive,0.441116,"craving, sugar, candy, begging, addicted"
4,shape of my heart,backstreet boys,black & blue,2000,0.786517,"Hmm, mmmh, yeah, yeah\nBaby, please try to for...",hmm mmmh baby please try forgive stay dont put...,positive,0.458541,"thinkin, confession, lookin, stay, door"


In [23]:
# Final output of this notebook: the preprocessed songs table, saved without the index column.
PREPROCESSED_DATASET_CSV = 'my_preprocessed_SPOTIFY_GENIUS_SONG_DATASET.csv'
music_df.to_csv(PREPROCESSED_DATASET_CSV, index=False)