## 1. Clean Wilding Chords Dataset
To train word2vec, we must first extract the chord sequences from the dataset. The original `.txt` corpus can be downloaded [here](http://jazzparser.granroth-wilding.co.uk/attachments/JazzCorpus/chord_corpus.txt).

In [17]:
from pathlib import Path
import re

In [6]:
# Location of the raw corpus text file. The path is relative, so it is
# resolved against the current working directory.
# NOTE(review): assumes the downloaded corpus was saved under this name -- confirm.
CORPUS_FILEPATH = Path('wilding-corpus.txt')

In [7]:
# Read the whole corpus into memory. The with-block guarantees the file
# handle is closed, instead of leaving that to the garbage collector.
with open(CORPUS_FILEPATH, 'r') as corpus_file:
    corpus = corpus_file.read()

In [8]:
# Constants shared by the parsing and transposition helpers.

# Prefix of the line that states a sequence's key.
MAIN_KEY_STRING = "Main key: "
# Mode word that marks a key as major.
MAJOR = 'major'
# The twelve root names in ascending semitone order, spelled with flats only.
KEYS = [
    "A", "Bb", "B",
    "C", "Db", "D",
    "Eb", "E", "F",
    "Gb", "G", "Ab",
]

In [19]:
def parse_chord_symbol(symbol):
    ''' Split a chord symbol into the root and quality.

    The root is the first character plus an optional flat sign ('b');
    everything after it is the quality. A symbol with no quality text is
    reported with quality "M", for natural and flat roots alike
    (e.g. "C" -> ("C", "M") and "Bb" -> ("Bb", "M")).

    NOTE: a sharp sign ('#') is not recognized as part of the root; it is
    left at the start of the quality.

    Returns a (root, quality) tuple.
    Raises ValueError if symbol is empty.
    '''
    if not symbol:
        raise ValueError("cannot parse an empty chord symbol")
    # Slicing (rather than indexing) keeps one-character symbols safe.
    root_length = 2 if symbol[1:2] == 'b' else 1
    root, quality = symbol[:root_length], symbol[root_length:]
    # A bare root gets the same "M" quality whether or not it carries a flat.
    return root, quality or "M"

In [20]:
def transpose_chord(symbol, amount):
    ''' Shift one chord symbol by `amount` positions within KEYS.

    The symbol is split with parse_chord_symbol, its root is moved along
    KEYS (wrapping around at either end), and the quality is re-attached
    unchanged. KEYS.index raises ValueError if the root is not in KEYS.
    '''
    root, quality = parse_chord_symbol(symbol)
    # Python's modulo is non-negative here, so negative amounts wrap too.
    shifted_index = (KEYS.index(root) + amount) % len(KEYS)
    return f"{KEYS[shifted_index]}{quality}"

def transpose_chords(symbols, amount):
    ''' Transpose every chord in `symbols` by `amount`; returns a new list. '''
    transposed = []
    for chord in symbols:
        transposed.append(transpose_chord(chord, amount))
    return transposed

In [21]:
def parse_corpus(corpus_txt):
    ''' Parse the raw txt file into a list of chord sequences.

    The text is cut at every "Chords for '<title>'" header; anything before
    the first header is discarded. Within each section the first line must
    start with MAIN_KEY_STRING, and the chords are read from the third line
    (the second line is ignored). Sections whose key is not major have
    their chords shifted by -3 positions with transpose_chords.

    Returns a list holding one list of chord symbols per section.
    Raises ValueError when a section does not have the expected layout.
    '''
    # Collapse runs of spaces into a single space.
    corpus_txt = re.sub(' +', ' ', corpus_txt)
    sequence_separator = re.compile("Chords for '.*?'")
    # Element 0 is whatever precedes the first header, so it is dropped.
    sequences = [s.strip() for s in sequence_separator.split(corpus_txt)[1:]]
    chords_corpus = []
    for number, sequence in enumerate(sequences, start=1):
        sequence_lines = sequence.split('\n')
        header = sequence_lines[0]
        # Explicit checks rather than assert: asserts are removed under -O.
        if not header.startswith(MAIN_KEY_STRING):
            raise ValueError(
                f"sequence {number}: expected a line starting with "
                f"{MAIN_KEY_STRING!r}, got {header!r}")
        key = header[len(MAIN_KEY_STRING):].split()
        if not key:
            raise ValueError(
                f"sequence {number}: no key given in {header!r}")
        if len(sequence_lines) < 3:
            raise ValueError(
                f"sequence {number}: expected at least 3 lines, "
                f"got {len(sequence_lines)}")
        # A key with no mode word after the root is treated as major.
        is_major = len(key) == 1 or key[1] == MAJOR
        chords = sequence_lines[2].split()
        if not is_major:
            chords = transpose_chords(chords, -3)
        chords_corpus.append(chords)
    return chords_corpus

In [23]:
def write_chords_to_file(all_chords, filepath='chords.txt'):
    ''' Save chords to file (space-separated and line-separated).

    all_chords: iterable of chord sequences, each an iterable of strings.
        Every sequence becomes one line, with its chords joined by spaces.
    filepath: destination file; defaults to 'chords.txt' in the current
        working directory. An existing file is overwritten.
    '''
    with open(filepath, 'w') as f:
        f.writelines(' '.join(chords) + '\n' for chords in all_chords)

In [24]:
# Parse the raw corpus text into a list of chord lists, one per sequence
chords_corpus = parse_corpus(corpus)

Let's take a look at some chords from the first tune:

In [27]:
# Show the first ten chords of the first parsed sequence
chords_corpus[0][:10]

['Am', 'Dm7', 'E7', 'Am', 'A7', 'Dm7', 'G7', 'CM7', 'FM7', 'B%7']

In [25]:
# Save the parsed corpus to disk, one space-separated sequence per line
write_chords_to_file(chords_corpus)