In [85]:
import pandas as pd
import re
from collections import Counter

In [None]:
# Load the chord-progression table from disk into a DataFrame.

df = pd.read_csv(filepath_or_buffer='./chordonomicon.csv')  # Shortened version of real chordonomicon

# Preview the first five rows; kept as the bare last expression so the notebook renders it.
df.head(n=5)

Unnamed: 0,chords
0,<intro_1> C <verse_1> F C E7 Amin C F C G7 C F...
1,<intro_1> E D A/Cs E D A/Cs <verse_1> E D A/Cs...
2,<intro_1> D Dmaj7 D Dmaj7 <verse_1> Emin A D G...
3,<intro_1> C <verse_1> G C G C <chorus_1> F Dmi...
4,<intro_1> C G C G <verse_1> C G C G C Bmin Emi...


In [87]:
# Matches only a trailing slash-chord bass note: "/" followed by a note letter
# A-G and an optional accidental ('b' = flat, 's' = sharp), e.g. the "/Cs" in "A/Cs".
# The earlier pattern r"/[^/]*$" removed *anything* after the last slash, which
# truncated slash-bearing chord qualities ("C6/9" -> "C6", "Amin6/9" -> "Amin6")
# even though '6/9' and 'min6/9' are modifiers the validation step accepts.
# Anchoring on a note name also makes this cell idempotent: re-running it no
# longer strips a further "/..." segment each time.
INVERSION_BASS = re.compile(r"/[A-G][bs]?$")

# Remove inversions token by token, then re-join with single spaces.
df['chords'] = df['chords'].apply(
    lambda s: ' '.join(INVERSION_BASS.sub("", chord) for chord in s.split())
)

# Drop section tags such as "<verse_1>" (non-greedy, so each tag is removed
# separately) and trim the leading/trailing whitespace left behind.
all_songs = df['chords'].str.replace(r'<.*?>', '', regex=True).str.strip() # Remove verse notations

In [None]:
# Write the tag-free chord strings to './cutonomicon' (note: no file extension).
# A later cell writes the validated songs to this same path.
all_songs.to_csv(path_or_buf='./cutonomicon')

In [105]:
def root_checker(chord: str) -> str:
    """Return the root-note prefix of a chord symbol.

    The root is the first character, plus the second character when that
    character is an accidental: 'b' (flat) or 's' (sharp). An 's' that is
    immediately followed by 'u' is treated as the start of a "sus" quality
    rather than a sharp, so "Csus4" has root "C" while "Cssus4" has root "Cs".

    Args:
        chord: Chord symbol such as "C", "Bbmaj7", "Fsmin" or "Dsus2".

    Returns:
        The one- or two-character root, or '' when ``chord`` is empty
        (previously an empty string raised IndexError).
    """
    if not chord:
        return ''

    # Slicing (instead of indexing) yields '' past the end of the string, so
    # one- and two-character chords need no separate length checks.
    accidental = chord[1:2]

    if accidental == 'b':
        return chord[:2]

    # 's' is a sharp unless it begins "su..." (i.e. a sus chord on a natural root).
    if accidental == 's' and chord[2:3] != 'u':
        return chord[:2]

    return chord[0]  # Natural root: just the note letter

# Chord qualities (the part of a chord symbol after its root) that the
# validation loop accepts. '' stands for a bare root such as "C".
# Order and contents are kept exactly as authored; 'maj' and 'min' each
# appear twice, which is harmless for membership tests.
all_modifiers = [
    '', 'min', 'maj', 'maj7', '5', 'm11', 'maj9', '7b5', '9(#11)',
    '9(#5)', 'aug', 'dim', '6/9', '11', '6(#11)', '7(#9)', '13(#11)', 'dim7',
    'm7b5', 'sus2', 'm(maj7)', 'maj', 'min9', '7', '7(#5)', '+(#11)', 'min(maj9)',
    '7(b13)', '9', '7(#11)', 'add9', '13(b9)', '13', 'm13', '7sus4', 'maj13',
    'min', 'min6', '9b5', 'min7', '6', '13(#9)', 'sus4', '7(b9)', 'min6/9',
]

# Root-note spellings: a letter alone, or followed by 'b' (flat) / 's' (sharp).
all_notes = [
    "Ab", "A", "As",
    "Bb", "B",
    "C", "Cs",
    "Db", "D", "Ds",
    "Eb", "E",
    "F", "Fs",
    "Gb", "G", "Gs",
]

# Keep only the songs in which every chord, once its root is removed, leaves a
# quality listed in all_modifiers. all() stops at the first unknown chord, so a
# song is rejected as soon as one unrecognised chord is seen. A song with no
# chords at all is kept (nothing to reject).
valid_songs = [
    song
    for song in all_songs
    if all(
        token[len(root_checker(token)):] in all_modifiers  # quality = token minus its root prefix
        for token in song.split()
    )
]


In [108]:
# Collect the validated songs into a single-column frame named 'chords'.
cutonomicon = pd.DataFrame(data=valid_songs, columns=['chords'])

# Write the filtered data set to './cutonomicon' (no file extension) -- the same
# path the earlier all_songs export used.
cutonomicon.to_csv(path_or_buf='./cutonomicon')

# Plain-text summary of the result (truncated by pandas for long frames).
print(cutonomicon)

                                                  chords
0      C  F C E7 Amin C F C G7 C F C E7 Amin C F G7 C...
1      E D A E D A  E D A E D A E D A E D A C  E G D ...
2      D Dmaj7 D Dmaj7  Emin A D G Emin A D G Emin A ...
3      C  G C G C  F Dmin G Dmin G C  G C  F Dmin G D...
4      C G C G  C G C G C Bmin Emin Amin D  G C D G C...
...                                                  ...
18197  C F C F G C F C  F C F G C C7  F Fs G C C7 F G...
18198  Fs E  Csmin E Csmin A C E A C E  Csmin E Csmin...
18199  E A Bmin A D E A Bmin A D E A Bmin  A D E A Bm...
18200  Bmin G Bmin Fsmin Fsmin7  Bmin Fsmin Bmin Fsmi...
18201  Emin Amin Emin D Emin D Emin D C D C D Emin  D...

[18202 rows x 1 columns]
