# Chords and Lyrics Dataset

## Dataset loading

The dataset that I will be using is one found in Kaggle called [Chords and Lyrics Dataset](https://www.kaggle.com/datasets/eitanbentora/chords-and-lyrics-dataset).

In this dataset, each song has the chords in a dictionary where the key is the line number in the original song and the value is the chords in the line.

In [2]:
import ast
import os
import re
from collections import Counter

import pandas as pd

In [3]:
# Root data directory and the location of the raw Kaggle CSV inside it.
data_path = "data"
raw_data_path = os.path.join(data_path, "chords-and-lyrics/chords_and_lyrics.csv")

In [4]:
# Load the raw dataset, using the first CSV column as the row index.
raw_data = pd.read_csv(raw_data_path, index_col=0)

In [5]:
# Preview the first rows to check that the columns were parsed as expected.
raw_data.head()

Unnamed: 0,artist_name,song_name,chords&lyrics,chords,lyrics,tabs,lang,artist_id,followers,genres,popularity,name_e_chords
0,Justin Bieber,"10,000 Hours",\nCapo on 3rd fret\n\t \t\t \r\n\r\nVerse 1:\...,{3: 'G G/B ...,"{0: '\nCapo on 3rd fret\n\t \t\t ', 1: '', 2:...","{89: ""I-----, I'm gonna love you ""}",en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
1,Justin Bieber,2 Much,\n\t \t\t\r\nIntro: F#m7 D2 \r\n\r\nVerse 1:...,"{1: 'Intro: F#m7 D2 ', 4: 'F#m7 ', 8: 'D2 ', ...","{0: '\n\t \t\t', 2: '', 3: 'Verse 1:', 5: ' ...",{},en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
2,Justin Bieber,2u (feat. David Guetta),\n\t \t\t\r\nEm D C ...,{1: 'Em D C C...,"{0: '\n\t \t\t', 2: ""No limit in the sky that...",{},en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
3,Justin Bieber,All Around The World,\nCapo on 4th fret\n\t \t\t\r\n\r\n \r\n\r\n ...,"{4: ' Intro: Em Bm Am C (2x) ', 6: ' ...","{0: '\nCapo on 4th fret\n\t \t\t', 1: '', 2: ...",{},en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber
4,Justin Bieber,All Around The World (acoustic),\n\t \t\t\r\n\r\nIntro: Gm - Dm - C - C x2 ...,"{2: 'Intro: Gm - Dm - C - C x2 ', 5: ' ...","{0: '\n\t \t\t', 1: '', 3: '', 4: 'verse 1: '...",{28: 'hide this tab e|--------------10--------...,en,1uNFoZAHBGtllmzznpCI3s,44606973.0,"['canadian pop', 'pop', 'post-teen pop']",100,justin-bieber


In [6]:
# Count rows before and after dropping exact duplicate rows so we can report
# how many were removed. Note that raw_data is rebound to the deduplicated frame.
total_rows = len(raw_data)
raw_data = raw_data.drop_duplicates()
deduplicated_rows = len(raw_data)

print(f"Found {total_rows - deduplicated_rows} duplicated rows")

Found 0 duplicted rows


## Data processing

Process the chords so we have a string of chords for each song

In [7]:
def process_chords(chords_str):
    """Flatten a song's chord dictionary into a single chord string.

    Args:
        chords_str: String holding a Python dict literal whose values are the
            chord text of individual lines (e.g. "{1: 'Intro: F#m7 D2 ', 4: 'F#m7 '}").

    Returns:
        The dict values joined with single spaces, in dict order, with every
        run of consecutive spaces collapsed to one space. Only the space
        character is collapsed: tabs, newlines and leading/trailing spaces
        are left as they are.
    """
    line_to_chords = ast.literal_eval(chords_str)
    joined = " ".join(line_to_chords.values())
    return re.sub(' +', ' ', joined)

In [8]:
# Sanity-check the helper on the row labelled 0 (labels come from the CSV's first column).
process_chords(raw_data["chords"][0])

'G G/B C G G G/B C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G G G/B C G G G/B C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G Bridge: Bm C Bm C G Em C G G Em C G G Em C G G Em C G G Em C G G Em C G G Em (C) C G (G) G Em C G '

In [9]:
# Add the flattened chord string of every song as a new column (this mutates raw_data).
raw_data["chords_str"] = raw_data["chords"].apply(process_chords)

In [10]:
# Eyeball a few processed chord strings; the fixed seed keeps the displayed
# sample identical across "Restart & Run All".
raw_data["chords_str"].sample(10, random_state=42).values.tolist()

['\n\t \t\tIntrd: D4 D D4 D D4 D D4 D D4 A C G C/9 D A C G D A C G C/9 D A C G C/9 D A C G C/9 D A C G C/9 D A C G C/9 D Solo: D A C G C/9 D A C G C/9 D A C G C/9 D Final: D A C G C/9 D D4 D D4 D ',
 'Intro D G F#m Bm Em D G/A D D G/A D G A D Bm Gm F Em D G A D Bm C Am G/A D G D Bm Em A D G A D D G A D G A D Bm Gm F Em D G A D Bm C Am A D G D Bm Em A D D G D Bm Em A D G ',
 '\n\t \t\tA B/A D E7 F° F#m A B/A D E7 F° F#m C E7 F7+ Bb7 C E7 F7+ Bb7 A B/A D E7 F° F#m A B/A D E7 F° F#m C E7 F7+ Bb7 C E7 F7+ Bb7 A B/A D E7 F° F#m A B/A D E7 F° F#m C E7 F7+ Bb7 C E7 F7+ Bb7 ( A B/A D E7 F° F#m ) C E7 F7+ Bb7 C E7 F7+ Bb7 A B/A D E7 F° F#m A B/A D E7 F° F#m D E7 F° F#m D E7 F° F#m ',
 '\n\t \t\t B D#m C#m F# C#m F# B D#m C#m F# B F# B F# E B G#m F# D#m B F# B G#m F# B C#m F# D#m B C#m F# D#m B ',
 'Intro x3 Bm D Em G Bm D Em D Bm D G Bm D G D Bm D G Bm D G D Bm D G Bm D G D Bm D G Bm D G D Bm G D Em Bm G D Em Bm G D Em G A G Em Bm D G Em Bm D G Em Bm D G Em Bm D Bm D Em G Bm D Em D Bm D G Bm D 

In [11]:
# Keep only the columns used from here on; .copy() detaches the result from raw_data.
data = raw_data[["genres", "artist_name", "song_name", "chords_str"]].copy()

In [12]:
# Number of rows in the trimmed dataset.
len(data)

135783

In [13]:
# Show a random but reproducible selection of rows (fixed seed).
data.sample(10, random_state=42)

Unnamed: 0,genres,artist_name,song_name,chords_str
11254,"['alternative metal', 'german metal', 'industr...",Rammstein,Rammstein - Amour,Am\t\t\t\t Em\t\t\t\t\t Am\t Em Am\t\t\t\t Em\...
82715,"['adult standards', 'brill building pop', 'lou...",Fats Domino,Who Cares,Intro: C C G C F C G C G C F C G C F C F C F D...
87003,"['bubblegum dance', 'dance pop', 'electropop',...",Girls Aloud,I Think We're Alone Now,\n\t \t\tD C E D C E A C E C E C E C A C A C E...
34021,"['downtempo', 'dream pop', 'indietronica']",The xx,Crystalised,\n\t \t\tEm Bm A Em Bm A B:--0-------------2/0...
115193,"['album rock', 'art rock', 'blues rock', 'clas...",Slade,Everyday,Intro: Em / G C / Em / Am / G G D/F# Em Em/D C...
68597,"['argentine rock', 'latin alternative', 'latin...",Divididos,"Casitas Inundadas, A Votar",B G A E (x2) B G A E B G A E B G A E B G A E B...
50323,"['argentine rock', 'latin alternative', 'latin...",Charly García,Blues Del Levante,\n\t \t\tIntro: A5 D5 A5 E5 D5 A5 D5 A5 D5 A5 ...
132698,"['corrido', 'nuevo regional mexicano', 'region...",Ariel Camacho y Los Plebes Del Rancho,El 011,G C D G G C D G C D7 G G C D G G C D G C D7 G...
4642,"['glam rock', 'mellow gold', 'piano rock', 'so...",Elton John,Step into Christmas,\n\t \t\tIntro: D A D A 2X D D7 G D D7 G Am7 G...
96380,"['argentine telepop', 'cumbia pop']",Karina,Noches Mágicas,D A G D G D A D D A Bm F#m G D A D


In [14]:
# Persist the trimmed dataset in the same folder as the raw CSV.
# index=False leaves the row labels out of the written file.
data_output_path = os.path.join(data_path, "chords-and-lyrics/dataset_aggregated.csv")
data.to_csv(data_output_path, header=True, index=False)

## Sanitization

In [16]:
# Count how often each whitespace-separated token occurs across all chord strings.
# Updating the Counter song by song avoids first materialising every token in one
# huge list, and keeps `tokens` bound to a single type (Counter) throughout.
tokens = Counter()
for s in data["chords_str"]:
    tokens.update(s.split())

In [17]:
# Number of distinct whitespace-separated tokens.
len(tokens)

92308

In [18]:
# Tokens ordered from most to least frequent.
# NOTE(review): with no argument this lists every distinct token; pass n to limit the output.
tokens.most_common()

[('G', 1043017),
 ('C', 857737),
 ('D', 804452),
 ('A', 639699),
 ('F', 517998),
 ('E', 452061),
 ('Am', 442944),
 ('Em', 397510),
 ('Bm', 246770),
 ('B', 219164),
 ('Dm', 209361),
 ('Bb', 169027),
 ('F#m', 163611),
 ('F#', 124769),
 ('C#m', 105605),
 ('Gm', 103030),
 ('A7', 90634),
 ('E7', 78733),
 ('Eb', 76972),
 ('Cm', 76744),
 ('Am7', 76049),
 ('Em7', 72264),
 ('B7', 68418),
 ('D7', 67494),
 ('G7', 64856),
 ('C#', 63144),
 ('Fm', 55801),
 ('G#', 55197),
 ('G#m', 54412),
 ('Ab', 51819),
 ('Bm7', 47675),
 ('Dm7', 47504),
 ('Intro:', 41863),
 ('|', 38263),
 ('C7', 34828),
 ('-', 34546),
 ('D#', 34252),
 ('A#', 33875),
 ('D/F#', 31275),
 ('G/B', 28990),
 ('F#m7', 27306),
 ('Bbm', 27022),
 ('D9', 26001),
 ('Gm7', 25196),
 ('C9', 24453),
 ('Db', 23783),
 ('F#7', 23684),
 ('C#m7', 22146),
 ('D#m', 20956),
 ('Cadd9', 20126),
 (')', 19892),
 ('(', 19813),
 ('A9', 18001),
 ('F7', 17448),
 ('Cm7', 16348),
 ('A5', 16010),
 ('Intro', 15130),
 ('Ebm', 14739),
 ('D5', 14623),
 ('G5', 13489),
 ('C

In [68]:
# Pull one example chord string containing the literal text "dim" (regex=False).
# .loc selects rows and column in one step, and the fixed seed keeps the example stable.
data.loc[data["chords_str"].str.contains("dim", regex=False), "chords_str"].sample(1, random_state=42).iloc[0]

'\n\t \t\tIntro: Em7 A9 Em7 A7/9- D7M B7 Em7 A7/13- Em7 A7/13- D7M A7/9 Fdim D7M Bm7 E7 Em7 A9 Em7 A7/9- D7M B7 Em7 A7/13- Em7 A7/13- D7M A7/9 Fdim D7M Bm7 E7 Em7 A9 A7/9- D D7M D7 G G6 Fdim D A7 A7/13+ D7 G G6 Fdim D B7 Em7 A7/13- Em7 A7/13- D7M A7/9 Fdim D7M Bm7 E7 Em7 A7 Em7 Edim D A7 Gdim D7M Interlude: Em7 A7/13- D7M A7/9 Fdim D6 Bm7 E7 Em7 A9 A7/9- D D7M D7 G G6 Fdim D A7 A7/13+ D7 G G6 Fdim D B7 Em7 A7/13- Em7 A7/13- D7M A7/9 Fdim D7M Bm7 E7 Em7 A7 Em7 Edim D A7 Gdim D7M '

Here we have a list of special tokens that have been found. Each token has been obtained by splitting the chord strings on whitespace so that they can be inspected quickly.

We can see special tokens that are used for different things:

- Special characters: for example |, () and {} might be used to enclose a group of chords to create a structure.
- Written annotations: there are written notes that tell you how to do something explicitly
- Structure parts: for example labeling the 'intro' or 'solo' part
- Repetition tokens: for example x2 that tells you to repeat the last part two times
- There are also annotations on how to play the chords (chord fingering): for example E|--------------18b\\13p11-13-11-------------|

We can also see that different chords may be represented in different notations.

For starters we will use the GPT-2 tokenizer.
Later on, we can try to train our own tokenizer.
Also, as part of that second step, we can later remove some of the tokens, such as the chord fingering.

In [67]:
# Non-chord tokens observed in the chord strings, grouped by the role they play.
# Every entry must be followed by a comma: adjacent string literals are silently
# concatenated by Python, which would merge two tokens into one bogus entry.
# Some entries appear more than once; they are kept as found.
special_tokens = [
    # Special characters
    "|", "||", "-", "(", ")", ":", "?", "%", ",", "*", "Q", "}", "{",
    "//", "\\", "...", "..", ".", "S", "^", "~", "--", ":||", "->",
    "\n\t", "\t\t", "\t\t\t",

    # Written annotations and other words
    "hide", "this", "tab",
    "repeats", "for", "the", "rest",
    "Repeat", "repeat",
    "(repeat", "and", "fade)",
    "sinner?",
    "I", "hear", "voices",
    "and", "a",
    "Som", "que", "mexa", "e", "estremeça", "com",
    "até", "o", "final",

    # Structure parts
    "Intro:", "INTRO", "Introdução:", "(intro)", "(Intro)", "intro", "intro:", "(intro", "Introdução", "Intro.:", "Intr.:", "Introd:", "intro)", "Introd.:",
    "Bridge:", "BRIDGE", "BRIDGE:", "(bridge)", "(Bridge)", "bridge",
    "Riff", "riff", "(Riff", "(riff)", "RIFF",
    "Solo", "Solo:", "SOLO", "solo", "SOLO:", "(solo)", "solo:", "(Solo)", "solo", "(solo)",
    "Interlude:", "Interlude", "INTERLUDE:",
    "Instrumental:", "Instrumental", "Musica:",
    "Final", "Final:", "FINAL:",
    "Outro:", "OUTRO:", "Outro",
    "Break", "Break:", "BREAK:", "break",
    "Verse", "Verse:",
    "Guitar", "Guit.",
    "Chorus", "Chorus:",
    "(stop)",

    # Repetition
    "(2x)", "x2", "(x2)", "2x", "X2", "(2X)", "2X", "2x)", "(X2)", "2x:", "(2)",
    "(4x)", "x4", "(x4)", "4x",
    "(3x)", "3x", "(x3)", "x3", "(x3)", "3x",
    "x8",
    "1", "2", "2:", "4", "1:", "3", "2)", "1)", "X", "x",

    # Chord fingering
    "x32013", "xx0235",

    # Tablature lines
    "E|------------------------------------------|",
    "E|----------------------------------------------------|",
    "A|------------------------------------------|",
    "E|---------------------------------------------------------------------------|",
    "D|------------------------------------------|",
    "A|---------------------------------------------------------------------------|",
    "D|---------------------------------------------------------------------------|",
    "A|----------------------------------------------------|",
    "D|----------------------------------------------------|",
    "E|---------------------------------------------------",
    "G|---------------------------------------------------------------------------|",
    "G|------------------------------------------|",
    "B|------------------------------------------|",
    "B|-------------------------6p5h6p5-6/8-5-8--|",
]