In [1]:
import pandas as pd
from transformers import GPT2Tokenizer

In [35]:
# Load the pretrained GPT-2 tokenizer from the "gpt2" checkpoint
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token
# (presumably because this tokenizer defines no pad token of its own — confirm)
tokenizer.pad_token = tokenizer.eos_token

In [37]:
# Maximum sequence length for GPT-2; token lists longer than this are truncated
# by tokenize_lyrics. 1024 matches the model limit quoted in the tokenizer's
# warning in the output further down ("1072 > 1024").
max_length = 1024

In [39]:
# Read the combined CSV file (relative path, resolved against the working directory)
combined_file = 'combined_artists.csv'
# Later cells read the 'Lyric' column from this frame
df = pd.read_csv(combined_file)

In [41]:
# Helper used to build the tokenized-lyrics column
def tokenize_lyrics(text):
    """Encode one lyric into a list of GPT-2 token IDs, capped at ``max_length``.

    Relies on the notebook-level ``tokenizer`` and ``max_length`` globals.

    Args:
        text: The lyric to encode. Anything that is not a ``str`` (NaN, None,
            numbers, lists, arrays, ...) is treated as a missing lyric.

    Returns:
        list: Token IDs produced by ``tokenizer.encode``, cut to the first
        ``max_length`` entries; an empty list for non-string input.
    """
    # A plain isinstance check already covers NaN/None (neither is a str) and,
    # unlike `pd.isna(text) or ...`, cannot raise on list/array values, for
    # which pd.isna returns an element-wise array with no single truth value.
    if not isinstance(text, str):
        return []

    # Encode the full text first so the original length can be reported.
    # The tokenizer may warn that the sequence exceeds the model limit; the
    # truncation below is what keeps the stored IDs within it.
    tokens = tokenizer.encode(text)

    # Truncate tokens if they exceed the model's maximum length
    if len(tokens) > max_length:
        print(f"Truncating a lyric from {len(tokens)} tokens to {max_length}")
        tokens = tokens[:max_length]

    return tokens

In [43]:
# Apply the tokenization function to each lyric
# (one Python-level call per row, so runtime grows with the number of lyrics)
print("Tokenizing lyrics... this may take a moment")
# Stores one list of token IDs per row; non-string lyrics yield an empty list
df['Tokenized_Lyrics'] = df['Lyric'].apply(tokenize_lyrics)

Tokenizing lyrics... this may take a moment


Token indices sequence length is longer than the specified maximum sequence length for this model (1072 > 1024). Running this sequence through the model will result in indexing errors


Truncating a lyric from 1072 tokens to 1024
Truncating a lyric from 1075 tokens to 1024
Truncating a lyric from 1215 tokens to 1024
Truncating a lyric from 1222 tokens to 1024
Truncating a lyric from 1408 tokens to 1024
Truncating a lyric from 1192 tokens to 1024
Truncating a lyric from 1077 tokens to 1024
Truncating a lyric from 1036 tokens to 1024
Truncating a lyric from 1075 tokens to 1024
Truncating a lyric from 1083 tokens to 1024
Truncating a lyric from 1080 tokens to 1024
Truncating a lyric from 1055 tokens to 1024
Truncating a lyric from 1215 tokens to 1024
Truncating a lyric from 1143 tokens to 1024
Truncating a lyric from 1050 tokens to 1024
Truncating a lyric from 1036 tokens to 1024
Truncating a lyric from 1177 tokens to 1024
Truncating a lyric from 1280 tokens to 1024
Truncating a lyric from 1118 tokens to 1024
Truncating a lyric from 1351 tokens to 1024
Truncating a lyric from 1043 tokens to 1024
Truncating a lyric from 1679 tokens to 1024
Truncating a lyric from 1189 tok

In [44]:
# Save the updated DataFrame to a new CSV
# NOTE(review): the Tokenized_Lyrics column holds Python lists, which CSV can
# only store as text; a consumer will presumably need to parse them back
# (e.g. ast.literal_eval) after pd.read_csv — confirm downstream expects this.
output_file = 'combined_artists_tokenized.csv'
# index=False leaves the DataFrame index out of the written file
df.to_csv(output_file, index=False)

print(f"Tokenized data saved to: {output_file}")

Tokenized data saved to: combined_artists_tokenized.csv


In [45]:
# Preview the first rows to confirm the Tokenized_Lyrics column was added
df.head()

Unnamed: 0,Artist,Title,Album,Year,Date,Lyric,Tokenized_Lyrics
0,Dua Lipa,New Rules,Dua Lipa,2017.0,2017-06-02,one one one one one talkin' in my sleep at n...,"[505, 530, 530, 530, 530, 220, 220, 1561, 259,..."
1,Dua Lipa,Don’t Start Now,Future Nostalgia,2019.0,2019-11-01,if you don't wanna see me did a full 80 craz...,"[361, 345, 836, 470, 18869, 766, 502, 220, 220..."
2,Dua Lipa,IDGAF,Dua Lipa,2017.0,2017-06-02,you call me all friendly tellin' me how much y...,"[5832, 869, 502, 477, 8030, 1560, 259, 6, 502,..."
3,Dua Lipa,Blow Your Mind (Mwah),Dua Lipa,2016.0,2016-08-26,i know it's hot i know we've got something tha...,"[72, 760, 340, 338, 3024, 1312, 760, 356, 1053..."
4,Dua Lipa,Be the One,Dua Lipa,2015.0,2015-10-30,i see the moon i see the moon i see the moon o...,"[72, 766, 262, 8824, 1312, 766, 262, 8824, 131..."


In [46]:
# Re-encode every lyric without truncation to learn how many exceed max_length
# (non-string entries count as length 0, so they never register as truncated)
original_lengths = df['Lyric'].apply(
    lambda lyric: len(tokenizer.encode(lyric)) if isinstance(lyric, str) else 0
)
# Summing the boolean mask counts the rows that are over the limit
truncated_count = (original_lengths > max_length).sum()
print(f"\nNumber of lyrics that required truncation: {truncated_count}")


Number of lyrics that required truncation: 487


In [50]:
# Spot-check the tokenization on the first three rows
print("\nSample of tokenized data:")
sample_df = df[['Lyric', 'Tokenized_Lyrics']].head(3)
for lyric, token_ids in zip(sample_df['Lyric'], sample_df['Tokenized_Lyrics']):
    preview = token_ids[:10]
    print(f"\nOriginal: {lyric}")
    print(f"Tokens: {preview}... (total: {len(token_ids)})")
    # Nothing to decode for rows whose token list is empty
    if not token_ids:
        continue
    # Round-trip the ten-token preview back to text
    decoded = tokenizer.decode(preview)
    print(f"Decoded sample: {decoded}...")


Sample of tokenized data:

Original: one one one one one   talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times  refrain my love he makes me feel like nobody else nobody else but my love he doesn't love me so i tell myself i tell myself  pre one don't pick up the phone you know he's only callin' 'cause he's drunk and alone two don't let him in you'll have to kick him out again three don't be his friend you know you're gonna wake up in his bed in the morning and if you're under him you ain't gettin' over him   i got new rules i count 'em i got new rules i count 'em i gotta tell them to myself i got new rules i count 'em i gotta tell them to myself   i keep pushin' forwards but he keeps pullin' me backwards nowhere to turn no way nowhere to turn no now i'm standin' back from it i finally see the pattern i never learn i never learn  refrain but my love he doesn't love me so i tel

In [51]:
# Show the first rows of the frame again (same call as the earlier preview cell)
df.head()

Unnamed: 0,Artist,Title,Album,Year,Date,Lyric,Tokenized_Lyrics
0,Dua Lipa,New Rules,Dua Lipa,2017.0,2017-06-02,one one one one one talkin' in my sleep at n...,"[505, 530, 530, 530, 530, 220, 220, 1561, 259,..."
1,Dua Lipa,Don’t Start Now,Future Nostalgia,2019.0,2019-11-01,if you don't wanna see me did a full 80 craz...,"[361, 345, 836, 470, 18869, 766, 502, 220, 220..."
2,Dua Lipa,IDGAF,Dua Lipa,2017.0,2017-06-02,you call me all friendly tellin' me how much y...,"[5832, 869, 502, 477, 8030, 1560, 259, 6, 502,..."
3,Dua Lipa,Blow Your Mind (Mwah),Dua Lipa,2016.0,2016-08-26,i know it's hot i know we've got something tha...,"[72, 760, 340, 338, 3024, 1312, 760, 356, 1053..."
4,Dua Lipa,Be the One,Dua Lipa,2015.0,2015-10-30,i see the moon i see the moon i see the moon o...,"[72, 766, 262, 8824, 1312, 766, 262, 8824, 131..."


In [52]:
# Inspect the raw lyric text of the row at position 12
df["Lyric"].iloc[12]

"in the beginning god created heaven and earth for what it's worth i think that he might've created you first just my opinion your body is the one paradise that i wanna fly to every day and every night  pre i've been sick and tired of running chasing all of the flashing lights these late nights don't mean nothing so i just wanna apologize i'm sorry so sorry i'm sorry yeah   i need your love and i'm dying for the rush 'cause my heart ain't got enough i need your touch this is getting serious tell me that it's not the end of us how can we go back to the beginning how can we go back to the beginning without you i've got no air to breathe in how can we go back to the beginning   don't matter what's written we can start all over again all over again oh how can i get you all over my skin my deep intuition tells me that i'm doing you wrong if i don't come home just say you forgive me and don't let me go  pre i've been sick and tired of running chasing all of the flashing lights these late nig

In [53]:
# Inspect the full list of token IDs stored for the row at position 12
df["Tokenized_Lyrics"].iloc[12]

[259,
 262,
 3726,
 5770,
 2727,
 9538,
 290,
 4534,
 329,
 644,
 340,
 338,
 2861,
 1312,
 892,
 326,
 339,
 1244,
 1053,
 2727,
 345,
 717,
 655,
 616,
 4459,
 534,
 1767,
 318,
 262,
 530,
 31354,
 326,
 1312,
 18869,
 6129,
 284,
 790,
 1110,
 290,
 790,
 1755,
 220,
 662,
 1312,
 1053,
 587,
 6639,
 290,
 10032,
 286,
 2491,
 20023,
 477,
 286,
 262,
 25293,
 7588,
 777,
 2739,
 12513,
 836,
 470,
 1612,
 2147,
 523,
 1312,
 655,
 18869,
 16521,
 1312,
 1101,
 7926,
 523,
 7926,
 1312,
 1101,
 7926,
 10194,
 220,
 220,
 1312,
 761,
 534,
 1842,
 290,
 1312,
 1101,
 9950,
 329,
 262,
 10484,
 705,
 25587,
 616,
 2612,
 18959,
 470,
 1392,
 1576,
 1312,
 761,
 534,
 3638,
 428,
 318,
 1972,
 2726,
 1560,
 502,
 326,
 340,
 338,
 407,
 262,
 886,
 286,
 514,
 703,
 460,
 356,
 467,
 736,
 284,
 262,
 3726,
 703,
 460,
 356,
 467,
 736,
 284,
 262,
 3726,
 1231,
 345,
 1312,
 1053,
 1392,
 645,
 1633,
 284,
 18044,
 287,
 703,
 460,
 356,
 467,
 736,
 284,
 262,
 3726,
 220,
 220,
 83

In [61]:
# Show the first row in full: the lyric, its leading token IDs, and the decoded text
sample_df = df[['Lyric', 'Tokenized_Lyrics']].head(1)
for row_label, sample in sample_df.iterrows():
    lyric_text = sample['Lyric']
    ids = sample['Tokenized_Lyrics']
    print(f"\nOriginal: {lyric_text}")
    print(f"Tokens: {ids[:10]}... (total: {len(ids)})")
    # Decode the complete token list (not only the ten-token preview) back to text
    if ids:
        decoded = tokenizer.decode(ids)
        print(f"\nDecoded sample: {decoded}...")


Original: one one one one one   talkin' in my sleep at night makin' myself crazy out of my mind out of my mind wrote it down and read it out hopin' it would save me too many times too many times  refrain my love he makes me feel like nobody else nobody else but my love he doesn't love me so i tell myself i tell myself  pre one don't pick up the phone you know he's only callin' 'cause he's drunk and alone two don't let him in you'll have to kick him out again three don't be his friend you know you're gonna wake up in his bed in the morning and if you're under him you ain't gettin' over him   i got new rules i count 'em i got new rules i count 'em i gotta tell them to myself i got new rules i count 'em i gotta tell them to myself   i keep pushin' forwards but he keeps pullin' me backwards nowhere to turn no way nowhere to turn no now i'm standin' back from it i finally see the pattern i never learn i never learn  refrain but my love he doesn't love me so i tell myself i tell myself i do