### Packages

In [None]:
import pandas as pd
import json 
import re
import os
import numpy as np
from os import path
import string

import nltk 
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer

# Download the NLTK resources used further down in this notebook
# (stop words, tokenizer data, POS tagger and WordNet).
for resource in ('stopwords', 'punkt_tab', 'averaged_perceptron_tagger_eng', 'wordnet'):
    nltk.download(resource)

### Data Loading 

In [None]:
# Read the JSON file produced by the Genius API scraping.
# The context manager closes the file handle as soon as parsing is finished,
# and the explicit UTF-8 encoding keeps non-ASCII characters in the lyrics
# intact regardless of the platform's default encoding.
with open("data/Lyrics_NoahKahan.json", "r", encoding="utf-8") as json_file:
    df = json.load(json_file)
df.keys()

In [None]:
# Turn the "songs" records into a DataFrame and parse release_date as datetimes.
df = pd.DataFrame(df['songs']).assign(
    release_date=lambda songs: pd.to_datetime(songs['release_date'])
)

In [None]:
# Take an explicit copy of the columns of interest so that adding a column
# below writes to an independent frame (no SettingWithCopyWarning).
df_clean = df[['title', 'lyrics', 'release_date']].copy()

# Add the album name to the dataframe.
# Each `album` entry is expected to be a dict with a "name" key; anything else
# (e.g. a missing album) yields None instead of being hidden by a bare except.
df_clean['Album'] = [
    album.get('name') if isinstance(album, dict) else None
    for album in df['album']
]

df_clean.head()

### Clean Lyrics

1. Remove the first few phrases before the actual lyrics 
2. Remove source-attribution lines such as 'Lyrics from Instagram Live' or 'Lyrics from live performance'.

In [None]:
def clean_lyrics(text):
    """Strip the leading header and source-attribution lines from a lyric string.

    Everything up to and including the first "Lyrics" marker (plus following
    whitespace) is discarded when the marker is present; otherwise the text is
    used as is. Attribution lines such as "Lyrics from Instagram Live" are then
    removed together with the newline(s) that end them.

    Returns the remaining text with surrounding whitespace stripped.
    """
    header = re.search(r'Lyrics\s*(.*)', text, re.DOTALL)
    if header:
        text = header.group(1).strip()

    # Applied one after another, in this order (Live, Instagram, TikTok, ...).
    attribution_patterns = (
        r'(?i)Lyrics from .*?Live.*?\n+',
        r'(?i)Lyrics from Instagram.*?\n+',
        r'(?i)Lyrics From TikTok Video.*?\n+',
        r'(?i)Lyrics from Youtube Short.*?\n+',
        r'(?i)Lyrics from X & TikTok Video.*?\n+',
    )
    for pattern in attribution_patterns:
        text = re.sub(pattern, '', text)

    return text.strip()

def clean_song_title(song):
    """Return the title without parenthesised suffixes such as '(Live)' or '(Acoustic)'.

    Every "(...)" segment, together with the whitespace in front of it, is
    removed and the result is stripped of surrounding whitespace.
    """
    without_parentheses = re.sub(r"\s*\(.*?\)", "", song)
    return without_parentheses.strip()

# Build a new frame rather than assigning columns into df_clean in place:
# df_clean originates from a column selection, and writing into it directly
# triggers pandas' SettingWithCopyWarning.
# After cleaning, only the first row for each (cleaned) title is kept.
df_clean = df_clean.assign(
    lyrics=df_clean["lyrics"].apply(clean_lyrics),
    title=df_clean["title"].apply(clean_song_title),
).drop_duplicates(subset=["title"], keep="first")

df_clean.head()

#### Clean Lyrics for NLTK Analysis

1. Convert to lowercase
2. Remove text inside square brackets (e.g. [Verse 1])
3. Remove newlines 
4. Remove punctuation, numbers, URLs
5. Remove stopwords as defined by NLTK

In [None]:
stop_words = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def clean_text(text):
    """Normalise a lyric string into space-separated, stop-word-free tokens.

    Steps: lowercase, drop "[...]" section markers, turn newlines into spaces,
    remove URLs, then per token remove punctuation and digits and discard
    stop words.

    Stop words are checked *before* apostrophes are stripped, so contractions
    such as "don't" are recognised (once reduced to "dont" they would no
    longer match the stop list). Typographic apostrophes are
    mapped to "'" first for the same reason.

    Returns the remaining tokens joined by single spaces.
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)
    text = text.replace('\n', ' ')
    # Remove URLs while they are still intact; the anchored patterns leave
    # ordinary words that merely contain "www" (e.g. "awwww") untouched.
    text = re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text)
    text = text.replace('\u2019', "'").replace('\u2018', "'")

    words = []
    for token in text.split():
        # Stop-word test on the token with only surrounding punctuation removed.
        if token.strip(string.punctuation) in stop_words:
            continue
        word = re.sub(r'\d+', '', token.translate(_PUNCT_TABLE))
        # A token can become empty or turn into a stop word once cleaned.
        if word and word not in stop_words:
            words.append(word)
    return " ".join(words)

# Add the normalised text as a new column. The raw `lyrics` column is kept on
# purpose: the sentence-style cleaning cell at the end of the notebook reads
# df_clean["lyrics"], and dropping the column here makes that cell fail with a
# KeyError when the notebook is run top to bottom.
df_clean = df_clean.assign(cleaned_lyrics=df_clean["lyrics"].apply(clean_text))

df_clean.head()

In [None]:
stop = set(stopwords.words("english"))

def get_wordnet_pos(word):
    """Tag a single word and translate the tag into a WordNet POS constant.

    The upper-cased first letter of the tag returned by nltk.pos_tag selects
    the constant that lemmatize() accepts: J -> ADJ, N -> NOUN, V -> VERB,
    R -> ADV. Any other tag falls back to NOUN.
    """
    tagged = nltk.pos_tag([word])
    initial = tagged[0][1][0].upper()
    if initial == "J":
        return wordnet.ADJ
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    # "N" and every unmapped tag share the noun default.
    return wordnet.NOUN

# Lemmatizer instance shared by every call to lemma()
lemmatizer = WordNetLemmatizer()

def lemma(text):
    """Lemmatize every token of `text` and drop the ones found in `stop`.

    Tokens come from nltk.word_tokenize; each is lemmatized with the POS
    returned by get_wordnet_pos, and lemmas that are stop words are skipped.
    Returns the surviving lemmas joined by single spaces.
    """
    kept = []
    for token in nltk.word_tokenize(text):
        base_form = lemmatizer.lemmatize(token, get_wordnet_pos(token))
        if base_form not in stop:
            kept.append(base_form)
    return ' '.join(kept)

# Replace the cleaned lyrics with their lemmatized, stop-word-free form.
df_clean['cleaned_lyrics'] = df_clean['cleaned_lyrics'].apply(lemma)
df_clean.head()


In [None]:
# Length of each cleaned lyric string as given by len(), i.e. a character count.
# NOTE(review): the column is named WordLength but this is not a word count — confirm intent.
df_clean['WordLength'] = np.array(list(map(len, df_clean['cleaned_lyrics'])))
df_clean.head()

In [None]:
# Export the processed lyrics. The raw `lyrics` text is not part of this file,
# so it is left out of the export whenever it is present on the frame
# (errors="ignore" makes this a no-op when the column is absent).
df_clean.drop(columns=["lyrics"], errors="ignore").to_csv("data/Lyrics_Noah_Kahan.csv", index=False)

#### Clean Lyrics for BERT Analysis

1. Remove text inside square brackets (e.g. [Verse 1])
2. Replace newlines with `. ` so that each lyric line reads as its own sentence

In [None]:
# NOTE: this redefines the `clean_lyrics` helper declared earlier in the
# notebook; once this cell has run, re-running the earlier cleaning cell
# without re-running its definition would pick up this version instead.
def clean_lyrics(text):
    """Turn multi-line lyrics into one string of sentence-like lines.

    "[...]" section markers are removed, whitespace inside each line is
    collapsed, and empty lines are dropped so that blank lines and removed
    markers do not leave stray ". ." sequences behind. Lines are then joined
    with ". "; a line that already ends in ".", "!" or "?" is followed by a
    single space only. The last line is left without an added period.

    Returns an empty string when nothing but markers/whitespace is present.
    """
    text = re.sub(r"\[.*?\]", "", text)
    lines = [" ".join(raw_line.split()) for raw_line in text.split("\n")]
    lines = [line for line in lines if line]
    if not lines:
        return ""
    # Terminate every line but the last, unless it already ends a sentence.
    leading = [line if line[-1] in ".!?" else line + "." for line in lines[:-1]]
    return " ".join(leading + lines[-1:])

# Sentence-style lyrics for the BERT export; this reads the raw `lyrics`
# column, which therefore has to be present on df_clean at this point.
df_clean = df_clean.assign(cleaned_lyrics_2=df_clean["lyrics"].apply(clean_lyrics))
df_clean.to_csv("data/Lyrics_Noah_Kahan_bert.csv", index=False)