# Implementing the `MiniBach` model - Part 1

This notebook is for readers who are curious about how a collection of raw musical scores is processed into a dataset for the `MiniBach` architecture. If you are already comfortable with `music21` and can carry out these steps yourself, you may want to skip this notebook.

The generated output of this script will be stored in the `dataset.csv` file.


In [3]:
import music21
import pandas as pd
import os

In [4]:
# Folder that is listed and parsed file-by-file further down.
dataset_path = os.path.join('bach-370-chorales', 'kern')

# Maps the position of a part inside a parsed score to the CSV column
# (voice name) that part is written to.
part_indexes = dict(enumerate(('soprano', 'alto', 'tenor', 'bass')))

def make_measure_chunks(lst, n):
    """Return every window of n consecutive items of lst, as a list.

    The windows overlap: each one starts a single position after the
    previous one, so a list of length L produces L - n + 1 windows.
    When lst is shorter than n the result is empty.
    """
    window_count = len(lst) - n + 1
    windows = []
    for start in range(window_count):
        windows.append(lst[start:start + n])
    return windows


# Build dataset.csv. Every chorale is cut into overlapping windows of four
# measures; each window becomes 64 CSV rows (one per 0.25 offset step), with
# one column per voice plus a column naming the source file and window.
#
# newline='' is needed because the handle is handed to DataFrame.to_csv,
# which writes its own line terminators; without it, platforms that
# translate '\n' on write would end up with doubled carriage returns.
with open('dataset.csv', 'w', newline='') as fd:
    fd.write('sixteenth,soprano,alto,tenor,bass,file\n')
    for f in sorted(os.listdir(dataset_path)):
        print(f)
        filepath = os.path.join(dataset_path, f)
        # parse the file with music21
        s = music21.converter.parse(filepath)
        # discard pieces whose first time signature is not 4/4
        # (pieces without any time signature are kept)
        timeSignature = s.flat.getElementsByClass('TimeSignature')
        if timeSignature and timeSignature[0].ratioString != '4/4':
            continue

        # The CSV header has exactly one column per entry of part_indexes.
        # A score with more parts would fail the part_indexes lookup and one
        # with fewer would produce rows that do not match the header, so
        # such scores are skipped.
        score_parts = list(s.parts)
        if len(score_parts) != len(part_indexes):
            print(f'  skipped: expected {len(part_indexes)} parts, '
                  f'found {len(score_parts)}')
            continue

        # NOTE(review): assumes s.parts yields the voices in the order
        # soprano, alto, tenor, bass -- confirm for the kern files.
        parts = {}
        for part_id, part in enumerate(score_parts):
            # All measures of this part except those numbered 0
            measures = [mm for mm in part.getElementsByClass('Measure') if mm.number != 0]
            # Overlapping windows of 4 consecutive measures
            measure_chunks = make_measure_chunks(measures, 4)
            part_chunks = []
            for chunk in measure_chunks:
                # 64 slots keyed by offset 0, 0.25, ..., 15.75; '--' marks a
                # slot where no event starts.
                chunk_encoding = {(offset / 4.0): '--' for offset in range(0, 64)}
                for measure_id, measure in enumerate(chunk):
                    for ev in measure:
                        # Offset of the event inside the window, counting
                        # 4.0 per measure.
                        offs = 4.0 * measure_id + ev.offset
                        if offs not in chunk_encoding:
                            # Event does not start on one of the 64 slots
                            continue
                        if isinstance(ev, music21.chord.Chord):
                            # Only the first note of a chord is kept
                            chunk_encoding[offs] = ev[0].nameWithOctave
                        elif isinstance(ev, music21.note.Note):
                            chunk_encoding[offs] = ev.nameWithOctave
                        elif isinstance(ev, music21.note.Rest):
                            chunk_encoding[offs] = 'Rest'
                # dicts keep insertion order, so values are in slot order
                part_chunks.append(list(chunk_encoding.values()))
            parts[part_indexes[part_id]] = part_chunks

        # Write one 64-row table per window. The DataFrame index (0..63) is
        # written as the first column, matching 'sixteenth' in the header.
        for chunk_id in range(len(parts['soprano'])):
            dfdict = {part: chunks[chunk_id] for part, chunks in parts.items()}
            dfdict['file'] = f'{f}_chunk_{chunk_id}'
            pd.DataFrame(dfdict).to_csv(fd, header=False)

chor001.krn
chor002.krn
chor003.krn
chor004.krn
chor005.krn
chor006.krn
chor007.krn
chor008.krn
chor009.krn
chor010.krn
chor011.krn
chor012.krn
chor013.krn
chor014.krn
chor015.krn
chor016.krn
chor017.krn
chor018.krn
chor019.krn
chor020.krn
chor021.krn
chor022.krn
chor023.krn
chor024.krn
chor025.krn
chor026.krn
chor027.krn
chor028.krn
chor029.krn
chor030.krn
chor031.krn
chor032.krn
chor033.krn
chor034.krn
chor035.krn
chor036.krn
chor037.krn
chor038.krn
chor039.krn
chor040.krn
chor041.krn
chor042.krn
chor043.krn
chor044.krn
chor045.krn
chor046.krn
chor047.krn
chor048.krn
chor049.krn
chor050.krn
chor051.krn
chor052.krn
chor053.krn
chor054.krn
chor055.krn
chor056.krn
chor057.krn
chor058.krn
chor059.krn
chor060.krn
chor061.krn
chor062.krn
chor063.krn
chor064.krn
chor065.krn
chor066.krn
chor067.krn
chor068.krn
chor069.krn
chor070.krn
chor071.krn
chor072.krn
chor073.krn
chor074.krn
chor075.krn
chor076.krn
chor077.krn
chor078.krn
chor079.krn
chor080.krn
chor081.krn
chor082.krn
chor083.krn
chor