In [1]:
import numpy as np
import pandas as pd
import glob
import re

In [2]:
datadir = "../data/McGill-Billboard/"
# To process the whole corpus instead of a single song, match every
# four-digit song folder:
# filepaths = sorted(glob.glob(datadir+'[0-9][0-9][0-9][0-9]/salami_chords.txt'))
# glob returns matches in filesystem order, which is not guaranteed to be
# stable; sorting makes an index into filepaths pick the same file every run.
filepaths = sorted(glob.glob(datadir+'1256/salami_chords.txt'))

In [75]:
fileidx = 0
# Fail with a readable message instead of a bare IndexError when the glob
# pattern matched nothing (wrong working directory, missing dataset, ...).
if not filepaths:
    raise FileNotFoundError(
        "no salami_chords.txt file matched under {!r}".format(datadir))
filepath = filepaths[fileidx]

def ExtractFile(filepath):
    """Read a text file and return its lines.

    Parameters
    ----------
    filepath : str
        Path of the file to read (opened in text mode with the default
        encoding).

    Returns
    -------
    list of str
        One entry per line, each keeping its trailing newline.
    """
    # The context manager closes the handle even if reading fails; the
    # previous version referenced f.close without calling it.
    with open(filepath, 'r') as f:
        return f.readlines()

def ExtractMeta(raw):
    """Pull title, artist, metre and tonic out of the first four lines.

    Line ``i`` of ``raw`` is split on the i-th header prefix or a newline and
    the piece following the first separator is kept.

    Parameters
    ----------
    raw : list of str
        File lines; only indices 0-3 are read.

    Returns
    -------
    tuple of str
        ``(title, artist, metre, tonic)``.
    """
    header_patterns = (
        '# title: |\n',
        '# artist: |\n',
        '# metre: |\n',
        '# tonic: |\n',
    )
    return tuple(
        re.split(pattern, raw[line_no])[1]
        for line_no, pattern in enumerate(header_patterns)
    )

# Load the selected annotation file, then unpack its four header fields.
raw = ExtractFile(filepath)
title, artist, metre, tonic = ExtractMeta(raw)

In [76]:
def ExtractPhrases(raw):
    """Split the annotation rows of a file into lists of bar strings.

    The first six and the last two entries of ``raw`` are dropped before
    parsing (NOTE(review): presumably the header block and the closing rows --
    confirm against the file layout).  For every remaining row, the text
    after the first tab is split on '|'; whatever precedes the first '|' is
    discarded.  Rows without a tab or without any '|' are skipped.

    Parameters
    ----------
    raw : list of str
        Lines of the file, as returned by ``ExtractFile``.

    Returns
    -------
    list of list of str
        One list per kept row holding the (unstripped) text between
        consecutive '|' characters.  When the text after the final '|' has a
        repeat marker at character positions 1-2, that two-character marker
        is appended as the last element.
    """
    phrases = list()
    for row in raw[6:-2]:
        fields = row.split('\t')
        if len(fields) < 2:
            # No tab means no annotation column; skip the row rather than
            # crash with an IndexError.
            continue

        # Plain str.split avoids the regex (and the invalid '\|' escape the
        # non-raw pattern string used to contain).
        segments = fields[1].split('|')[1:]
        if not segments:
            continue

        phrase = segments[:-1]

        # check for repeat
        # NOTE(review): only two characters are inspected, so a multi-digit
        # marker such as "x12" is captured as "x1" -- confirm whether
        # multi-digit repeat counts can occur in the data.
        tail = segments[-1][1:3]
        if IsRepeat(tail):
            phrase.append(tail)

        phrases.append(phrase)
    return phrases

def CleanPhrases(phrases):
    """Return a copy of ``phrases`` with every entry stripped of surrounding whitespace.

    Parameters
    ----------
    phrases : list of list of str

    Returns
    -------
    list of list of str
        Same nesting as the input, with ``str.strip`` applied to each entry.
    """
    return [[entry.strip() for entry in phrase] for phrase in phrases]
    
def FormatChords(phrases_clean):
    """Turn cleaned phrases into one flat list of bars, each a list of chords.

    Every entry is passed through ``RemoveNoise``; phrases that end up
    entirely empty are dropped.  If the last entry of a phrase is a repeat
    marker (``IsRepeat``), the marker is removed and the remaining bars are
    repeated as many times as the marker's number says.

    Parameters
    ----------
    phrases_clean : list of list of str
        Phrases whose entries are whitespace-stripped bar strings, optionally
        followed by a trailing repeat marker such as ``'x2'``.

    Returns
    -------
    list of list of str
        One list per bar containing the whitespace-separated chord labels of
        that bar (an empty list for a bar that held only noise).
    """
    bars = list()

    for phrase in phrases_clean:
        # clean non-chords
        phrase_chord = [RemoveNoise(chord) for chord in phrase]

        # remove blank (noise-only) lines
        if not any(phrase_chord): continue

        # check for repeat
        marker = phrase_chord[-1]
        if IsRepeat(marker):
            # Read the whole digit run so 'x12' means 12 repeats; taking only
            # the last character would give 2.
            n_repeat = int(re.match(r'x(\d+)', marker).group(1))
            phrase_chord = phrase_chord[:-1] * n_repeat

        # extend keeps the result flat without the quadratic sum(list, [])
        bars.extend(phrase_chord)

    # split chords into bars
    return [bar.split() for bar in bars]

def IsRepeat(str):
    """Tell whether ``str`` starts with a repeat marker: 'x' followed by a digit.

    The parameter name shadows the builtin ``str`` inside this function; it is
    kept so keyword callers continue to work.

    Returns
    -------
    bool
    """
    return bool(re.match(r'x\d', str))

def RemoveNoise(chord):
    """Strip non-chord symbols from a bar string and normalise its spacing.

    Removes every '.', '*', 'N' and every parenthesised ``digits/digits``
    group, then collapses the remaining whitespace to single spaces.

    Parameters
    ----------
    chord : str

    Returns
    -------
    str
        The cleaned text; empty when nothing but noise was present.
    """
    return ' '.join(re.sub(r'\.|\*|\(\d+/\d+\)|N', '', chord).split())

# Run the parsing pipeline on the loaded file:
# annotation rows -> per-row bar strings -> stripped strings -> flat list of bars.
phrases = ExtractPhrases(raw)
phrases_clean = CleanPhrases(phrases)
# chords is a list with one entry per bar, each a list of chord labels.
chords = FormatChords(phrases_clean)

In [77]:
def CreateIndex(chords):
    """Build, for each bar, a list repeating that bar's position once per chord.

    Parameters
    ----------
    chords : list of list
        One inner list per bar.

    Returns
    -------
    list of list of int
        Entry ``i`` is ``[i] * len(chords[i])``; empty bars give empty lists.
    """
    return [[bar_no] * len(bar) for bar_no, bar in enumerate(chords)]
# Bar number for every chord, nested the same way as chords.
chords_index = CreateIndex(chords)

In [78]:
def Raw2Basic(chord_raw):
    """Reduce a ``root:quality/bass`` chord label to ``root:basic_quality``.

    The bass note (everything from the first '/' of the quality) is dropped
    and the quality is simplified:

    * starts with 'maj' -> 'maj'
    * starts with 'min' -> 'min'
    * starts with 'sus' -> its first four characters (e.g. 'sus4')
    * starts with '1' or '5' -> '15'
    * anything else is kept unchanged

    NOTE(review): the '1' rule also catches qualities such as '11' and '13';
    confirm that collapsing those to '15' is intended.

    Parameters
    ----------
    chord_raw : str

    Returns
    -------
    str
        The simplified label.  A label without ':' is returned as its bare
        root, matching how ``Raw2Root`` and ``Abs2Rel`` treat such labels
        (previously this raised IndexError).
    """
    parts = chord_raw.split(':')
    root = parts[0]
    if len(parts) < 2:
        # No quality to simplify.
        return root

    func = parts[1].split('/')[0]

    # startswith is safe on an empty quality, where func[0] would raise.
    if func.startswith('maj'):
        func = 'maj'
    elif func.startswith('min'):
        func = 'min'
    elif func.startswith('sus'):
        func = func[:4]
    elif func.startswith(('1', '5')):
        func = '15'

    return root + ':' + func

def Raw2Root(chord_raw):
    """Return the part of a chord label that precedes its first ':'.

    A label without ':' is returned unchanged.
    """
    root, _, _ = chord_raw.partition(':')
    return root

In [79]:
# One row per chord, tagged with the index of the bar it belongs to.
df_chords = pd.DataFrame()
# Flatten the per-bar lists with comprehensions: sum(list_of_lists, [])
# copies the growing accumulator at every step and is quadratic.
df_chords['bar'] = [bar_no for bar in chords_index for bar_no in bar]
df_chords['chords'] = [chord for bar in chords for chord in bar]
df_chords['chords_basic'] = df_chords['chords'].apply(Raw2Basic)
df_chords['chords_root'] = df_chords['chords'].apply(Raw2Root)
df_chords.iloc[10:30]

Unnamed: 0,bar,chords,chords_basic,chords_root
10,10,C:min7,C:min,C
11,11,C:min,C:min,C
12,12,Bb:5/9,Bb:15,Bb
13,13,Bb:5/9,Bb:15,Bb
14,13,C:5,C:15,C
15,14,Bb:5/9,Bb:15,Bb
16,15,Bb:5/9,Bb:15,Bb
17,15,C:5,C:15,C
18,16,C:maj,C:maj,C
19,17,C:maj,C:maj,C


In [80]:
def GetEnharmonic(note):
    """Map a note name to the spelling used by the sharp-based chromatic scale.

    Flats, plus 'E#' and 'B#', are translated to their enharmonic equivalent;
    any other value is returned unchanged.

    Parameters
    ----------
    note : str

    Returns
    -------
    str
    """
    respelled = {
        'Db': 'C#',
        'Eb': 'D#',
        'E#': 'F',
        'Fb': 'E',
        'Gb': 'F#',
        'Ab': 'G#',
        'Bb': 'A#',
        'B#': 'C',
        'Cb': 'B',
    }
    return respelled.get(note, note)

def Abs2RelRoot(chord_root, tonic):
    """Express a chord root as a scale degree relative to ``tonic``.

    Both notes are first normalised with ``GetEnharmonic``; the result is the
    degree label sitting as many semitones above 'I' as the root sits above
    the tonic.

    Parameters
    ----------
    chord_root : str
    tonic : str

    Returns
    -------
    str
        One of 'I', 'I#', 'II', ..., 'VII'.

    Raises
    ------
    ValueError
        If either normalised note is not one of the twelve chromatic names.
    """
    pitch_names = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    degree_names = ['I', 'I#', 'II', 'II#', 'III', 'IV', 'IV#', 'V', 'V#', 'VI', 'VI#', 'VII']

    tonic_name = GetEnharmonic(tonic)
    root_name = GetEnharmonic(chord_root)

    tonic_idx = pitch_names.index(tonic_name)
    root_idx = pitch_names.index(root_name)

    # Wrap explicitly when the root lies below the tonic in the list.
    semitones_up = (root_idx - tonic_idx) % len(degree_names)
    return degree_names[semitones_up]

def Abs2Rel(chord, tonic):
    """Rewrite a chord label so that its root is relative to ``tonic``.

    The label is split on ':'; the first piece is converted with
    ``Abs2RelRoot`` and, when present, the second piece is re-attached after
    a ':'.  Any further ':'-separated pieces are dropped.

    Parameters
    ----------
    chord : str
    tonic : str

    Returns
    -------
    str
    """
    pieces = chord.split(':')
    suffix = ':' + pieces[1] if len(pieces) > 1 else ''
    return Abs2RelRoot(pieces[0], tonic) + suffix

# Spot check: Bb sits two semitones above Ab, so its root maps to degree II.
Abs2Rel('Bb:maj7', 'Ab')

'II:maj7'

In [81]:
# Add tonic-relative versions of the three chord columns.
df_chords['chords_rel'] = df_chords['chords'].apply(Abs2Rel, tonic=tonic)
df_chords['chords_basic_rel'] = df_chords['chords_basic'].apply(Abs2Rel, tonic=tonic)
# Bare roots contain no ':', so Abs2Rel returns only the relative root here.
df_chords['chords_root_rel'] = df_chords['chords_root'].apply(Abs2Rel, tonic=tonic)
# Preview rows at positions 10-29.
df_chords.iloc[10:30]

Unnamed: 0,bar,chords,chords_basic,chords_root,chords_rel,chords_basic_rel,chords_root_rel
10,10,C:min7,C:min,C,I:min7,I:min,I
11,11,C:min,C:min,C,I:min,I:min,I
12,12,Bb:5/9,Bb:15,Bb,VI#:5/9,VI#:15,VI#
13,13,Bb:5/9,Bb:15,Bb,VI#:5/9,VI#:15,VI#
14,13,C:5,C:15,C,I:5,I:15,I
15,14,Bb:5/9,Bb:15,Bb,VI#:5/9,VI#:15,VI#
16,15,Bb:5/9,Bb:15,Bb,VI#:5/9,VI#:15,VI#
17,15,C:5,C:15,C,I:5,I:15,I
18,16,C:maj,C:maj,C,I:maj,I:maj,I
19,17,C:maj,C:maj,C,I:maj,I:maj,I
