In [80]:
import pandas as pd
import tiktoken
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicolas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/nicolas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/nicolas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [81]:
DATA_PATH = '../../data/'
filename = 'english_cleaned_lyrics.csv'
zipfile = 'english_cleaned_lyrics.zip'


In [82]:
df = pd.read_csv(DATA_PATH + filename)
df

Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics
0,0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...
1,1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...
2,2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...
3,3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...
4,4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...
...,...,...,...,...,...,...,...
218205,362232,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...
218206,362233,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...
218207,362234,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...
218208,362235,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...


In [83]:
print('   -------------------')
print("Number of artists:", len(df['artist'].unique()))
print('   -------------------')
print("Genres:", df['genre'].unique())
print('   -------------------')
years = df['year'].unique()
years.sort()
print("years:", years)
print('   -------------------')

   -------------------
Number of artists: 10431
   -------------------
Genres: ['Pop' 'Hip-Hop' 'Rock' 'Metal' 'Other' 'Country' 'Jazz' 'Electronic'
 'Folk' 'R&B' 'Indie']
   -------------------
years: [  67  112  702 1968 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979
 1980 1981 1982 1983 1984 1985 1986 1987 1988 1989 1990 1991 1992 1993
 1994 1995 1996 1997 1998 1999 2000 2001 2002 2003 2004 2005 2006 2007
 2008 2009 2010 2011 2012 2013 2014 2015 2016]
   -------------------


In [84]:
df = df[~(df['genre'] == 'Other')]

In [85]:
df[(df['year'] == 67) | (df['year'] == 112) | (df['year'] == 702)]


Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics
15554,27657,27657,star,702,clipse,Hip-Hop,You're my star It's such a wonder how you shin...
40894,69708,69708,anywhere-remix,112,dru-hill,Hip-Hop,Here we are all alone You and me privacy And w...
88365,147914,147914,it-s-over-now-remix,112,g-dep,Hip-Hop,What is this Numbers in your pocket I remember...
143439,238541,238541,come-see-me-remix,112,black-rob,Hip-Hop,Baby you can come see me cause I need you here...
190373,315540,315540,let-s-lurk,67,giggs,Hip-Hop,Verse 1 Still pulling up on smoke Skeng in my ...
201649,335205,335205,i-can-t-believe,112,faith-evans,Pop,I can't believe that love has gone away from ...


## Testing byte pair encoding with tiktoken on 1 sample

In [86]:
lyric = df['lyrics'][0]
preprocessed_lyric = lyric.lower()
preprocessed_lyric = ''.join(char for char in lyric if char not in string.punctuation)

stop_words = set(stopwords.words('english'))
filtered_lyric = [word for word in preprocessed_lyric.split() if word not in stop_words]
preprocessed_lyric = ' '.join(filtered_lyric)
print(f"lyrics before preprocessing {len(lyric)} words:", lyric)
print(f"lyrics after preprocessing {len(preprocessed_lyric)} words:", preprocessed_lyric)

lyrics before preprocessing 1913 words: Oh baby how you doing You know I'm gonna cut right to the chase Some women were made but me myself I like to think that I was created for a special purpose You know what's more special than you You feel me It's on baby let's get lost You don't need to call into work cause you're the boss For real want you to show me how you feel I consider myself lucky that's a big deal Why Well you got the key to my heart But you ain't gonna need it I'd rather you open up my body And show me secrets you didn't know was inside No need for me to lie It's too big it's too wide It's too strong it won't fit It's too much it's too tough He talk like this cause he can back it up He got a big ego such a huge ego I love his big ego it's too much He walk like this cause he can back it up Usually I'm humble right now I don't choose You can leave with me or you could have the blues Some call it arrogant I call it confident You decide when you find on what I'm working with D

In [87]:
def compare_encodings(example_string: str) -> None:
    """Prints a comparison of three string encodings."""
    # print the example string
    print(f'\nExample string: "{example_string}"')
    # for each encoding, print the # of tokens, the token integers, and the token bytes
    for encoding_name in ["r50k_base", "p50k_base", "cl100k_base", "gpt-4"]:
        if encoding_name == "gpt-4":
            encoding = tiktoken.encoding_for_model("gpt-4")
        else:
            encoding = tiktoken.get_encoding(encoding_name)

        token_integers = encoding.encode(example_string)
        num_tokens = len(token_integers)
        token_bytes = [encoding.decode_single_token_bytes(token) for token in token_integers]
        print()
        print(f"{encoding_name}: {num_tokens} tokens")
        print(f"token integers: {token_integers}")
        print(f"token bytes: {token_bytes}")

In [88]:
compare_encodings(lyric)


Example string: "Oh baby how you doing You know I'm gonna cut right to the chase Some women were made but me myself I like to think that I was created for a special purpose You know what's more special than you You feel me It's on baby let's get lost You don't need to call into work cause you're the boss For real want you to show me how you feel I consider myself lucky that's a big deal Why Well you got the key to my heart But you ain't gonna need it I'd rather you open up my body And show me secrets you didn't know was inside No need for me to lie It's too big it's too wide It's too strong it won't fit It's too much it's too tough He talk like this cause he can back it up He got a big ego such a huge ego I love his big ego it's too much He walk like this cause he can back it up Usually I'm humble right now I don't choose You can leave with me or you could have the blues Some call it arrogant I call it confident You decide when you find on what I'm working with Damn I know I'm killing

In [89]:
compare_encodings(df['lyrics'][2])


Example string: "If you search For tenderness It isn't hard to find You can have the love You need to live But if you look For truthfulness You might just As well be blind It always seems to be So hard to give Chorus Honesty Is such a lonely word Everyone is so untrue Honesty Is hardly ever heard And mostly What I need from you I can always Find someone To say They sympathize If I wear my heart Out on my sleeve But I don't want Some pretty face To tell me Pretty lies All I want Is someone To believe Chorus I can find a lover I can find a friend I can have security Until the bitter end Anyone can comfort me With promises again I know I know When I'm deep Inside of me Don't be Too concerned I won't ask For nothin' While I'm gone But when I want Sincerity Tell me where else Can I turn When You're the one That I depend upon Chorus "

r50k_base: 189 tokens
token integers: [1532, 345, 2989, 1114, 15403, 1108, 632, 2125, 470, 1327, 284, 1064, 921, 460, 423, 262, 1842, 921, 761, 284, 2107, 88

In [90]:
df[df['genre'] == 'Metal']

Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics
331,415,415,northern-hymn,2011,demonaz,Metal,Gods Of The Mountains Sky Forest And Seas Land...
332,416,416,under-the-great-fires,2011,demonaz,Metal,Rise gods of the fierous black burning skies R...
333,417,417,all-blackened-sky,2011,demonaz,Metal,A Dying Skyline Cold From Wind And Rain Autumn...
334,418,418,legends-of-fire-and-ice,2011,demonaz,Metal,Fiends of the gods to war we ride Over the bla...
335,419,419,ode-to-battle,2011,demonaz,Metal,Cold Is The Winter On Its Ground We Ride To Th...
...,...,...,...,...,...,...,...
218000,361954,361954,concrete-jungle,2007,drifter,Metal,In a time when mother earth Gets laid by fathe...
218001,361955,361955,principle-of-speed,2007,drifter,Metal,I made an experience It just happens once in a...
218002,361956,361956,so-much-blood,2007,drifter,Metal,I can hear the scream of despair It's shooting...
218003,361957,361957,the-elder,2007,drifter,Metal,High on the hill he was standing there The gua...


In [91]:
lemmatizer = WordNetLemmatizer()

def preprocess_lyric(lyric: str) -> str:
    """Preprocesses a lyric for tokenization."""
    lyric = lyric.lower()
    lyric = ''.join(char for char in lyric if char not in string.punctuation)
    filtered_lyric = [word for word in lyric.split() if word not in stop_words]
    filtered_lyric = [lemmatizer.lemmatize(word) for word in filtered_lyric]
    lyric = ' '.join(filtered_lyric)
    return lyric

In [92]:
df['lyrics-preprocessed'] = df['lyrics'].apply(preprocess_lyric)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['lyrics-preprocessed'] = df['lyrics'].apply(preprocess_lyric)


In [94]:
df['byte-pair-encoding'] = df['lyrics-preprocessed'].apply(lambda x: tiktoken.encoding_for_model("gpt-4").encode(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['byte-pair-encoding'] = df['lyrics-preprocessed'].apply(lambda x: tiktoken.encoding_for_model("gpt-4").encode(x))


In [95]:
df

Unnamed: 0.1,Unnamed: 0,index,song,year,artist,genre,lyrics,lyrics-preprocessed,byte-pair-encoding
0,0,0,ego-remix,2009,beyonce-knowles,Pop,Oh baby how you doing You know I'm gonna cut r...,oh baby know im gonna cut right chase woman ma...,"[2319, 8945, 1440, 737, 16926, 4018, 1314, 335..."
1,1,1,then-tell-me,2009,beyonce-knowles,Pop,playin everything so easy it's like you seem s...,playin everything easy like seem sure still wa...,"[1387, 258, 4395, 4228, 1093, 2873, 2771, 2103..."
2,2,2,honesty,2009,beyonce-knowles,Pop,If you search For tenderness It isn't hard to ...,search tenderness isnt hard find love need liv...,"[1874, 8541, 29668, 70058, 2653, 1505, 3021, 1..."
3,3,3,you-are-my-rock,2009,beyonce-knowles,Pop,Oh oh oh I oh oh oh I If I wrote a book about ...,oh oh oh oh oh oh wrote book stand title book ...,"[2319, 14346, 14346, 14346, 14346, 14346, 6267..."
4,4,4,black-culture,2009,beyonce-knowles,Pop,Party the people the people the party it's pop...,party people people party popping sitting arou...,"[34057, 1274, 1274, 4717, 50949, 11961, 2212, ..."
...,...,...,...,...,...,...,...,...,...
218205,362232,362232,who-am-i-drinking-tonight,2012,edens-edge,Country,I gotta say Boy after only just a couple of da...,gotta say boy couple date youre hand outright ...,"[70, 22983, 2019, 8334, 5743, 2457, 71175, 145..."
218206,362233,362233,liar,2012,edens-edge,Country,I helped you find her diamond ring You made me...,helped find diamond ring made try everything t...,"[8823, 291, 1505, 22996, 10264, 1903, 1456, 43..."
218207,362234,362234,last-supper,2012,edens-edge,Country,Look at the couple in the corner booth Looks a...,look couple corner booth look lot like shes lo...,"[7349, 5743, 9309, 36506, 1427, 2763, 1093, 55..."
218208,362235,362235,christ-alone-live-in-studio,2012,edens-edge,Country,When I fly off this mortal earth And I'm measu...,fly mortal earth im measured depth girth fathe...,"[22374, 49972, 9578, 737, 17303, 8149, 342, 49..."


In [97]:
df = df[['artist', 'genre', 'year', 'lyrics', 'lyrics-preprocessed', 'byte-pair-encoding']]
df.to_csv(DATA_PATH + 'lyrics-preprocessed.csv', index=False)