In [None]:
import pandas as pd
import nltk
import re


In [None]:

from nltk.corpus import stopwords

from nltk import FreqDist
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

from sklearn.pipeline import Pipeline

from sklearn.naive_bayes import MultinomialNB

In [76]:
df = pd.read_csv('../data/lyrics_data.csv', index_col=0)


In [77]:
df

Unnamed: 0,genre,id,track,artist,album,release_date,release_year,length_ms,explicit,popularity,...,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics
0,pop,1V6gIisPpYqgFeWbMLI0bA,Heart Attack,['Demi Lovato'],Demi,2013-01-01,2013,210840,False,85,...,8,-4.802,1,0.1040,0.0738,0.000000,0.2390,0.502,173.968,104 ContributorsTranslationsTiếng ViệtFrançais...
1,pop,0K6iKgHPPhAb4Acmg9aD77,On My Love,"['Zara Larsson', 'David Guetta']",On My Love (Sped Up),2023-09-12,2023,222911,False,76,...,10,-4.989,0,0.0460,0.0724,0.002900,0.0881,0.316,123.069,21 ContributorsTranslationsDeutschEspañolOn My...
2,pop,4e4fqjx0Izh4svvTef1z7e,Meant to Be (feat. Florida Georgia Line),"['Bebe Rexha', 'Florida Georgia Line']",Expectations,2018-06-22,2018,164205,False,77,...,10,-6.610,1,0.0848,0.0476,0.000000,0.0646,0.589,153.995,5 Contributors​limykelsy’s listening log for 2...
3,pop,0HPD5WQqrq7wPWR7P7Dw1i,TiK ToK,['Kesha'],Animal (Expanded Edition),2010-01-01,2010,199693,False,86,...,2,-2.718,0,0.1420,0.0991,0.000000,0.2890,0.714,120.028,111 ContributorsTranslationsPortuguêsHrvatskiT...
4,pop,3zHq9ouUJQFQRf3cm1rRLu,"Love Me Like You Do - From ""Fifty Shades Of Grey""",['Ellie Goulding'],Delirium (Deluxe),2015-11-06,2015,252534,False,80,...,8,-6.646,1,0.0484,0.2470,0.000000,0.1250,0.275,189.857,63 Contributors2016 Nominees LyricsSong of the...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,country,6olKv2HP3XgBpvVxAswowe,Bartender,['Lady A'],747,2014-09-30,2014,198266,False,66,...,11,-3.858,0,0.0340,0.0198,0.000007,0.3270,0.654,101.011,15 ContributorsBartender Lyrics[Verse 1]\n8 o’...
2433,country,0DAjiINHQTnPnnY939Qyhw,Let It Rain,"['David Nail', 'Sarah Buxton']",Let It Rain,2011-01-01,2011,222253,False,51,...,4,-3.013,1,0.0405,0.3850,0.000001,0.2470,0.561,77.881,9 ContributorsLet It Rain Lyrics[Verse 1: Davi...
2434,country,2oHW5EW14gL2VG4q0gRh6j,Fine Wine,['Runaway June'],"Smoke, Wine & Whiskey",2024-01-26,2024,164413,False,44,...,9,-4.127,0,0.0395,0.5140,0.000000,0.1420,0.856,106.502,1 ContributorFine Wine Lyrics[Verse 1]\nI trie...
2435,country,6Ser4pIAKEoXok7eDJPRK7,The Devil I Know,['Ashley McBryde'],The Devil I Know,2023-09-08,2023,187880,False,57,...,2,-5.195,1,0.0437,0.0861,0.000000,0.1100,0.607,168.184,6 ContributorsThe Devil I Know Lyrics[Verse 1]...


In [78]:
# function to extract list of song sections
def get_sections(lyrics):
    # find all section headers enclosed in brackets
    headers = re.findall(r'\[.*?\]', lyrics)
    # extract values before colon
    sections = [re.findall(r'\[([^:]+)', header)[0] for header in headers]
    # remove non-letter characters and create list of unique values
    sections = [re.sub(r'[^a-zA-Z\s]', '', section) for section in sections]
    sections = [word for section in sections for word in section.split()]
    sections = list(set(sections))
    return sections

In [None]:
# function to remove headers, footers, and extra spaces
def clean_text(lyrics):
    lyrics = re.sub(r'^.*?Lyrics', '', lyrics)
    lyrics = re.sub(r'\d*Embed\b', '', lyrics)
    lyrics = re.sub(r'\s+', ' ', lyrics)
    lyrics = re.sub(r'\n\s*\n', '\n', lyrics)
    return lyrics

In [71]:
custom_sw = ["i'd", "i'm",
             'yeah', 'ah', 'oh']

sw = set(stopwords.words('english'))
sw.update(custom_sw)

In [72]:
# function to clean lyric text

def preprocess_lyrics(lyrics):
    
    # remove text before section header
    lyrics = re.sub(r'\[.*?\]', '', lyrics)

    # remove section headers inside brackets
    lyrics = re.sub(r'\[.*?\]', '', lyrics)

    # replace new line with space
    lyrics = lyrics.replace('\n', ' ')

    # remove numbers and special characters - keep only letters, apostrophes, dashes
    lyrics = re.sub(r'[^a-zA-Z\s\'-]', '', lyrics)

    # replace dashes with space
    lyrics = lyrics.replace('-', ' ')

    # lowercase all - or analyze based on capitalization such as names, cities, etc.
    lyrics = lyrics.lower()

    # remove stopwords - set custom list or don't remove
    lyrics = ' '.join([word for word in lyrics.split() if word not in sw])

    return lyrics


In [80]:
# extract song sections and apply basic text cleaning
df['sections'] = df['lyrics'].apply(get_sections)
df['lyrics'] = df['lyrics'].apply(clean_text)

In [82]:
df['lyrics_text'] = df['lyrics'].apply(preprocess_lyrics)
df

Unnamed: 0,genre,id,track,artist,album,release_date,release_year,length_ms,explicit,popularity,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics,sections,lyrics_text
0,pop,1V6gIisPpYqgFeWbMLI0bA,Heart Attack,['Demi Lovato'],Demi,2013-01-01,2013,210840,False,85,...,1,0.1040,0.0738,0.000000,0.2390,0.502,173.968,Puttin' my defenses up 'Cause I don't wanna f...,"[Chorus, Intro, Verse, PreChorus, Bridge]",puttin' defenses 'cause wanna fall love ever t...
1,pop,0K6iKgHPPhAb4Acmg9aD77,On My Love,"['Zara Larsson', 'David Guetta']",On My Love (Sped Up),2023-09-12,2023,222911,False,76,...,0,0.0460,0.0724,0.002900,0.0881,0.316,123.069,"(That's on my love) (That's on my love) No, I...","[Chorus, Intro, Verse, PreChorus, Drop]",that's love that's love need time make mind th...
2,pop,4e4fqjx0Izh4svvTef1z7e,Meant to Be (feat. Florida Georgia Line),"['Bebe Rexha', 'Florida Georgia Line']",Expectations,2018-06-22,2018,164205,False,77,...,1,0.0848,0.0476,0.000000,0.0646,0.589,153.995,January:Albums:Weezer (Teal Album) Bring Me th...,"[B, Deluxe, Tracks, Edition, feat, Chainz, Ray...",januaryalbumsweezer teal album bring horizon a...
3,pop,0HPD5WQqrq7wPWR7P7Dw1i,TiK ToK,['Kesha'],Animal (Expanded Edition),2010-01-01,2010,199693,False,86,...,0,0.1420,0.0991,0.000000,0.2890,0.714,120.028,Wake up in the morning feelin' like P. Diddy ...,"[Chorus, Verse, PreChorus, Break, Bridge]",wake morning feelin' like p diddy hey girl gra...
4,pop,3zHq9ouUJQFQRf3cm1rRLu,"Love Me Like You Do - From ""Fifty Shades Of Grey""",['Ellie Goulding'],Delirium (Deluxe),2015-11-06,2015,252534,False,80,...,1,0.0484,0.2470,0.000000,0.1250,0.275,189.857,Song of the YearKendrick Lamar - Alright Taylo...,[],song yearkendrick lamar alright taylor swift b...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2432,country,6olKv2HP3XgBpvVxAswowe,Bartender,['Lady A'],747,2014-09-30,2014,198266,False,66,...,0,0.0340,0.0198,0.000007,0.3270,0.654,101.011,8 o’clock on Friday night I’m still at home A...,"[Verse, Chorus, Bridge]",oclock friday night im still home girls keep b...
2433,country,0DAjiINHQTnPnnY939Qyhw,Let It Rain,"['David Nail', 'Sarah Buxton']",Let It Rain,2011-01-01,2011,222253,False,51,...,1,0.0405,0.3850,0.000001,0.2470,0.561,77.881,It's hard to find the perfect time to say som...,"[Verse, Chorus, Break, Instrumental]",hard find perfect time say something know gonn...
2434,country,2oHW5EW14gL2VG4q0gRh6j,Fine Wine,['Runaway June'],"Smoke, Wine & Whiskey",2024-01-26,2024,164413,False,44,...,0,0.0395,0.5140,0.000000,0.1420,0.856,106.502,I tried to get over him sober But I didn't ge...,"[Verse, Chorus, Bridge]",tried get sober get far gonna pour closure say...
2435,country,6Ser4pIAKEoXok7eDJPRK7,The Devil I Know,['Ashley McBryde'],The Devil I Know,2023-09-08,2023,187880,False,57,...,1,0.0437,0.0861,0.000000,0.1100,0.607,168.184,A little outside of Elizabethtown There's a l...,"[Verse, Chorus, PostChorus, PreChorus]",little outside elizabethtown there's little ba...


In [83]:
# drop rows that don't include song sections
song_sections = ['Intro','Chorus', 'Verse', 'Bridge', 'Break', 'Solo', 'Instrumental', 'Outro']
df = df[df['sections'].apply(lambda x: any(section in x for section in song_sections) if x else False)]


In [84]:
df

Unnamed: 0,genre,id,track,artist,album,release_date,release_year,length_ms,explicit,popularity,...,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,lyrics,sections,lyrics_text
0,pop,1V6gIisPpYqgFeWbMLI0bA,Heart Attack,['Demi Lovato'],Demi,2013-01-01,2013,210840,False,85,...,1,0.1040,0.0738,0.000000,0.2390,0.502,173.968,Puttin' my defenses up 'Cause I don't wanna f...,"[Chorus, Intro, Verse, PreChorus, Bridge]",puttin' defenses 'cause wanna fall love ever t...
1,pop,0K6iKgHPPhAb4Acmg9aD77,On My Love,"['Zara Larsson', 'David Guetta']",On My Love (Sped Up),2023-09-12,2023,222911,False,76,...,0,0.0460,0.0724,0.002900,0.0881,0.316,123.069,"(That's on my love) (That's on my love) No, I...","[Chorus, Intro, Verse, PreChorus, Drop]",that's love that's love need time make mind th...
3,pop,0HPD5WQqrq7wPWR7P7Dw1i,TiK ToK,['Kesha'],Animal (Expanded Edition),2010-01-01,2010,199693,False,86,...,0,0.1420,0.0991,0.000000,0.2890,0.714,120.028,Wake up in the morning feelin' like P. Diddy ...,"[Chorus, Verse, PreChorus, Break, Bridge]",wake morning feelin' like p diddy hey girl gra...
5,pop,7DSAEUvxU8FajXtRloy8M0,Flowers,['Miley Cyrus'],Endless Summer Vacation,2023-08-18,2023,200600,False,94,...,1,0.0633,0.0584,0.000070,0.0232,0.632,118.048,"We were good, we were gold Kinda dream that c...","[Chorus, Verse, PostChorus, PreChorus, Bridge]",good gold kinda dream can't sold right 'til bu...
6,pop,6FZDfxM3a3UCqtzo5pxSLZ,Without Me,['Halsey'],Manic,2020-01-17,2020,201660,True,84,...,1,0.0705,0.2970,0.000009,0.0936,0.533,136.041,Found you when your heart was broke I filled ...,"[Verse, Chorus, Bridge, PreChorus]",found heart broke filled cup overflowed took f...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2431,country,4mnuiEDq1ADXUTYWJxLo4B,Helluva Life,['Frankie Ballard'],Sunshine & Whiskey,2014-04-03,2014,231693,False,50,...,1,0.0277,0.0676,0.000000,0.1440,0.528,134.015,"Saturday night and a six pack, girl Big star ...","[Verse, Chorus, Bridge, Outro]",saturday night six pack girl big star shining ...
2432,country,6olKv2HP3XgBpvVxAswowe,Bartender,['Lady A'],747,2014-09-30,2014,198266,False,66,...,0,0.0340,0.0198,0.000007,0.3270,0.654,101.011,8 o’clock on Friday night I’m still at home A...,"[Verse, Chorus, Bridge]",oclock friday night im still home girls keep b...
2433,country,0DAjiINHQTnPnnY939Qyhw,Let It Rain,"['David Nail', 'Sarah Buxton']",Let It Rain,2011-01-01,2011,222253,False,51,...,1,0.0405,0.3850,0.000001,0.2470,0.561,77.881,It's hard to find the perfect time to say som...,"[Verse, Chorus, Break, Instrumental]",hard find perfect time say something know gonn...
2434,country,2oHW5EW14gL2VG4q0gRh6j,Fine Wine,['Runaway June'],"Smoke, Wine & Whiskey",2024-01-26,2024,164413,False,44,...,0,0.0395,0.5140,0.000000,0.1420,0.856,106.502,I tried to get over him sober But I didn't ge...,"[Verse, Chorus, Bridge]",tried get sober get far gonna pour closure say...


# modeling

In [85]:
# train test split
X = df['lyrics_text']
y = df['genre']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)

In [89]:
y_train.value_counts(normalize=True)

rock       0.312268
pop        0.254647
country    0.244734
hip hop    0.188352
Name: genre, dtype: float64

In [None]:
def get_metrics(model_name, model, X, y,):
    #create labels and empty df
    labels = ['country','dance_pop','hip_hop','rock']
    #set up model
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42, stratify=y)
    
    model.fit(X_train,y_train)
    preds = model.predict(X_test)
    train_score = model.score(X_train,y_train)
    
    #store results
    data = confusion_matrix(y_test, preds)
    results = {labels[i] : data[i][i]/sum(data[i]) for i in range(0, len(data))}
    
    result_df = pd.DataFrame(results, index=[0])
    result_df['model'] = model_name
    result_df['train_accuracy'] = train_score
    result_df['accuracy'] = model.score(X_test,y_test)
    return result_df[['model','train_accuracy','accuracy'] + labels]

In [None]:
pipe_cv = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('model', MultinomialNB())
])



In [None]:
counts = count_df.sum()
counts.sort_values(ascending=False)

In [None]:

# instantiate vectorizer
tfidf = TfidfVectorizer()

# Fit and transform the lyrics_cleaned column
lyrics_vectorized = tfidf.fit_transform(test_df['lyrics_cleaned'])

col_names = tfidf.get_feature_names_out()

# Convert the TF-IDF matrix to a DataFrame
tfidf_df = pd.DataFrame(lyrics_vectorized.toarray(), columns=col_names)

tfidf_df


In [None]:
word_counts = tfidf_df.sum()
word_counts.sort_values(ascending=False)

In [None]:
vector_df