In [12]:
import pandas as pd
from pathlib import Path
import re
from sklearn.model_selection import train_test_split



In [2]:
path = Path.cwd()
print(path)

c:\Users\viggo\Documents\Cognitive Science Masters\1st. semester\Natural Language Processing\NLP_exam_project\NLP-Project-CogSci


In [5]:
#Loading in genius lyrics data from kaggle: https://www.kaggle.com/datasets/carlosgdcj/genius-song-lyrics-with-language-information/data
#To properly run these scripts on other machines, the data must be locally downloaded and it's path referred to by data_path
data_path = path.parents[0] / "NLP_data" / "song_lyrics.csv"
raw_genius_lyrics = pd.read_csv(data_path)

In [6]:
#Sanity check for if data fits online description of dataset
print(raw_genius_lyrics.head())
print(raw_genius_lyrics.info())

               title  tag     artist  year   views  \
0          Killa Cam  rap    Cam'ron  2004  173166   
1         Can I Live  rap      JAY-Z  1996  468624   
2  Forgive Me Father  rap   Fabolous  2003    4743   
3       Down and Out  rap    Cam'ron  2004  144404   
4             Fly In  rap  Lil Wayne  2005   78271   

                                       features  \
0                   {"Cam\\'ron","Opera Steve"}   
1                                            {}   
2                                            {}   
3  {"Cam\\'ron","Kanye West","Syleena Johnson"}   
4                                            {}   

                                              lyrics  id language_cld3  \
0  [Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...   1            en   
1  [Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...   3            en   
2  Maybe cause I'm eatin\nAnd these bastards fien...   4            en   
3  [Produced by Kanye West and Brian Miller]\n\n[...   5            en  

In [7]:
#Similar sanity check for the columns of the dataset
data_columns = raw_genius_lyrics.columns.tolist()
print(data_columns)

['title', 'tag', 'artist', 'year', 'views', 'features', 'lyrics', 'id', 'language_cld3', 'language_ft', 'language']


In [8]:
#Checking the most popular genres and what percentage of the data they constitute
raw_genius_lyrics["tag"].value_counts().head(10)
raw_genius_lyrics["tag"].value_counts(normalize=True) * 100


tag
pop        41.648432
rap        33.590348
rock       15.447755
rb          3.826047
misc        3.533789
country     1.953628
Name: proportion, dtype: float64

Given the largy disparity in the distribution of genres, i should keep this in mind so that a Naive classifier doesn't just score 42 % by guessing pop on everything. Look up some solutions for this that seem worthwhile

In [9]:
#Checking the most prevalent languages and what percentage of the data they constitute
raw_genius_lyrics["language"].value_counts().head(10)
raw_genius_lyrics["language"].value_counts(normalize=True) * 100


language
en    68.749809
es     5.611970
fr     3.859788
pt     3.421946
ru     3.383172
        ...    
mt     0.000102
uz     0.000082
tg     0.000061
bs     0.000020
gu     0.000020
Name: proportion, Length: 84, dtype: float64

In [10]:
lyrics_processed = raw_genius_lyrics.copy()

lyrics_processed = lyrics_processed[lyrics_processed["language"] == "en"]

lyrics_processed = lyrics_processed.reset_index(drop=True)

lyrics_processed["language"].value_counts().head()
print("Rows after English filter:", len(lyrics_processed))

Rows after English filter: 3374198


In [11]:
lyrics_processed["language"].value_counts().head(10)
lyrics_processed["language"].value_counts(normalize=True) * 100

language
en    100.0
Name: proportion, dtype: float64

In [14]:
#To optimise the time for the remainder of the preprocessing pipeline and eventual analysis, we create a copy of the data only including the relevant columns
data_short = lyrics_processed[["lyrics", "tag"]].copy()

data_short["lyrics"] = data_short["lyrics"].astype(str)

# empty or whitespace-only
data_short = data_short[data_short["lyrics"].str.strip().ne("")]

# remove duplicates (exact)
data_short = data_short.drop_duplicates(subset=["lyrics"])
data_short = data_short.reset_index(drop=True)

In [15]:
# 1) drop true missing values first (prevents NaN -> "nan" problems later)
data_short = data_short.dropna(subset=["lyrics"])

# 2) make sure lyrics are strings
data_short["lyrics"] = data_short["lyrics"].astype(str)

# 3) remove empty or whitespace-only lyrics
data_short = data_short[data_short["lyrics"].str.strip().ne("")]

# 4) remove exact duplicate lyrics (important to avoid train/test leakage)
data_short = data_short.drop_duplicates(subset=["lyrics"])

# 5) clean index after filtering
data_short = data_short.reset_index(drop=True)

In [16]:
import re
import unicodedata

BRACKET_TAG_RE = re.compile(r"\[.*?\]")  # removes anything in [ ... ]
WS_RE = re.compile(r"\s+")

def normalize_lyrics(text: str) -> str:
    # Ensure string
    s = str(text)

    # Normalize unicode (helps with odd characters)
    s = unicodedata.normalize("NFKC", s)

    # Normalize apostrophes (curly -> straight)
    s = s.replace("’", "'").replace("‘", "'")

    # Convert literal backslash-n sequences into real newlines
    # (handles cases where the text contains "\\n" as two characters)
    s = s.replace("\\r\\n", "\n").replace("\\n", "\n").replace("\\r", "\n")

    # Remove bracketed metadata (Genius structure tags)
    s = BRACKET_TAG_RE.sub(" ", s)

    # Lowercase (standard for classic NLP baselines)
    s = s.lower()

    # Collapse all whitespace (spaces, tabs, newlines) to single spaces
    s = WS_RE.sub(" ", s).strip()

    return s

data_short["lyrics_norm"] = data_short["lyrics"].map(normalize_lyrics)

data_short[["lyrics", "lyrics_norm"]].head(3)

Unnamed: 0,lyrics,lyrics_norm
0,"[Chorus: Opera Steve & Cam'ron]\nKilla Cam, Ki...","killa cam, killa cam, cam killa cam, killa cam..."
1,"[Produced by Irv Gotti]\n\n[Intro]\nYeah, hah,...","yeah, hah, yeah, roc-a-fella we invite you to ..."
2,Maybe cause I'm eatin\nAnd these bastards fien...,maybe cause i'm eatin and these bastards fiend...


In [None]:
X = data_short["lyrics_norm"]
y = data_short["tag"]

# 80% train, 20% temp
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=558,
    stratify=y
)

# split temp into 10% val, 10% test
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=558,
    stratify=y_temp
)