In [1]:
import glob, os
import numpy as np
import re
import matplotlib.pyplot as plt
import matplotlib
from collections import Counter
import copy

In [2]:
# Dataset locations, relative to the notebook's working directory.
cleaned_path = '../dataset/cleaned/'
raw_path = '../dataset/McGill-Billboard/'

# Every .txt file found anywhere below raw_path.
# glob returns matches in a file-system-dependent order, and a later cell joins
# the parsed files to the rows of index.csv purely by position, so sort the
# paths to make that order reproducible across machines and runs.
# NOTE(review): a lexicographic sort matches numeric song order only if the
# directory names are zero-padded -- confirm against the dataset layout.
files = sorted(glob.glob(raw_path + '/**/*.txt', recursive=True))

In [3]:
# Convert letter keys into numeric values (semitones above A); the shift
# relative to a song's tonic is applied by the caller, not here.
def key2num(string):
    """Replace every note name in ``string`` with its pitch-class number.

    The numbering counts semitones upward from A: A=0, A#/Bb=1, B=2, C=3,
    ..., G=10, G#/Ab=11. This is a plain text substitution over the whole
    string, so *every* uppercase A-G (with an optional trailing ``#`` or
    ``b``) is rewritten, wherever it appears.

    Parameters
    ----------
    string : str
        Text containing note names.

    Returns
    -------
    str
        The same text with note names replaced by decimal numbers.
    """
    # Order matters: sharps and flats are substituted before the bare letters
    # so that e.g. "Ab" becomes "11" rather than "0b".
    substitutions = (
        ('A#', '1'), ('Ab', '11'),
        ('B#', '3'), ('Bb', '1'),
        ('C#', '4'), ('Cb', '2'),
        ('D#', '6'), ('Db', '4'),
        ('E#', '8'), ('Eb', '6'),
        ('F#', '9'), ('Fb', '7'),
        ('G#', '11'), ('Gb', '9'),
        ('A', '0'),
        ('B', '2'),
        ('C', '3'),
        ('D', '5'),
        ('E', '7'),
        ('F', '8'),
        ('G', '10'),
    )
    converted = string
    for note_name, number in substitutions:
        converted = converted.replace(note_name, number)
    return converted

# Raw data import and preprocessing

In [4]:
# Parse every annotation file into two parallel NumPy object arrays, one row
# per file:
#   all_data[i]     = (index, tonic, list of raw "root:quality" chord strings)
#   all_num_data[i] = (index, tonic, (n, 2) array of [root, quality]) where
#                     root is the chord root in semitones above the tonic.
# The arrays are sized from the files actually found rather than a hard-coded
# count, so a different number of files neither overflows the arrays nor
# leaves unfilled (None) rows behind.
all_data = np.empty((len(files), 3), dtype=object)
all_num_data = np.empty((len(files), 3), dtype=object)

for i, path in enumerate(files):

    with open(path, 'r') as myfile:
        # Flatten the file onto a single space-separated line so the regexes
        # below can treat a space as the only token separator.
        data = myfile.read().replace('\n', ' ').replace('\t', ' ')
    # Same text with every note name replaced by its pitch-class number.
    num_data = key2num(data)

    # The first run of digits in the path is used as the song index.
    index = re.findall(r'\d+', path)[0]
    chord_list = re.findall("[^ ]+:[^ ]+", data)
    # Only the first "tonic:" declaration in the file is used.
    tonic = re.findall("tonic: [^ ]+", data)[0].replace("tonic: ", "")

    num_tonic = int(re.findall("tonic: [^ ]+", num_data)[0].replace("tonic: ", ""))
    # Chord roots: the digits before each ':' shifted so that the tonic is 0.
    num_chord_list1 = re.findall("[0-9]+:", num_data)
    num_chord_list1 = [(int(element.replace(":", ""))-num_tonic)%12 for element in num_chord_list1]
    # Chord qualities: whatever follows each ':' up to the next space.
    num_chord_list2 = re.findall(":[^ ]+", num_data)
    num_chord_list2 = [element.replace(":", "") for element in num_chord_list2]

    # The two lists come from independent regexes; fail with a message that
    # names the file if they do not pair up one-to-one.
    if len(num_chord_list1) != len(num_chord_list2):
        raise ValueError(
            "%s: found %d chord roots but %d chord qualities"
            % (path, len(num_chord_list1), len(num_chord_list2)))

    all_data[i] =  (index, tonic, chord_list)
    # dstack(...)[0] pairs the two lists column-wise into an (n, 2) array.
    all_num_data[i] = (index, tonic, np.dstack((num_chord_list1, num_chord_list2))[0])

In [5]:
import csv

# Load the song index table into an object array of strings.
# newline='' is what the csv module asks for when it is handed a file object,
# so that line breaks embedded inside quoted fields are parsed correctly.
with open('../dataset/index.csv', newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    csv_list = np.asarray(list(csv_reader), dtype=object)
# Keep only columns 0, 1, 4 and 5 of every row (row 0 is still included here;
# it is presumably the header, which a later cell skips with csv_list[1:]).
csv_list = np.take(csv_list, [0,1,4,5], axis=1 )

In [7]:
# Build the cleaned table described in milestone 2.
# The four index.csv columns (first row skipped) are placed side by side with
# the three parsed-annotation columns; rows are matched purely by position.
merged_columns = np.concatenate((csv_list[1:], all_num_data), axis=1)
# Column 4 of the merged table is the index parsed from the file path; every
# other column is kept.
kept_columns = [0,1,2,3,5,6]
clean_data = np.take(merged_columns, kept_columns, axis=1)
np.save('../dataset/clean_data.npy', clean_data)

In [8]:
# Now we remove the chords that are twice the same in a row
def _collapse_root_runs(chords):
    """Keep one chord from each run of consecutive chords with the same root.

    Parameters
    ----------
    chords : sequence of [root, quality] pairs
        The chord sequence of one song; only element 0 (the root) of each
        entry is compared.

    Returns
    -------
    numpy.ndarray
        The last chord of every run, in order. The final chord of the song is
        always kept: the previous version compared each chord with its
        successor and skipped the last index entirely, which silently dropped
        the song's last run.
    """
    # NOTE(review): only the root is compared, so two different chords built
    # on the same root (same chord[0], different chord[1]) are treated as
    # duplicates -- confirm that this is intended.
    kept = []
    last_idx = len(chords) - 1
    for idx, chord in enumerate(chords):
        if idx == last_idx or chord[0] != chords[idx + 1][0]:
            kept.append(chord)
    return np.array(kept)


# Work on a deep copy so clean_data itself is left untouched.
removed_duplicates = copy.deepcopy(clean_data)

for ele in removed_duplicates:
    # Column 5 holds the song's chord array.
    ele[5] = _collapse_root_runs(ele[5])

np.save('../dataset/removed_duplicates.npy', removed_duplicates)