# Notebook for filling the Extracted_Repns directory. This notebook uses the utility functions from utils/computeMDE to calculate QANON and MDE representations, and saves them in the Extracted_Repns directory

This notebook sets up the output directories in the following layout:
In `Extracted_Repns`, each dataset gets its own QANON folder and MDE folder, named with the dataset suffix (e.g. `QANON_C` and `MDE_C` for Chopin43).

In the QANON folder, there is a folder for:
1. QANON reps as numpy (reps)
2. QANON reps converted to a sequence of strings (text)
3. QANON reps converted to a sequence of strings and combined into one file (full)

In the MDE folder there is a folder for:
1. MDE reps as numpy (reps)
2. MDE reps converted to a sequence of strings (text)
3. MDE reps converted to a sequence of strings and combined into one file (full)
4. Saving the LM models (model)
5. Saving the best LM model (best)
6. Saving the hand configurations dictionary (dict)
7. Saving the tokenizer for the LM (tokenizer)

### Imports

In [15]:
import numpy as np
import matplotlib.pyplot as plt
from mido import MidiFile, tick2second
from pretty_midi import PrettyMIDI
import pickle
import os
from os import path
import time
import pathlib

from utils.computeMDE import *

### Choose the dataset and Setup the directories

In [16]:
# CHOOSE THE DATASET
#
# Dataset must be one of the keys of dataset_map (defined in the next cell):
#   1. 'Chopin43'
#   2. 'Maestro'
#   3. 'ChopinAndHannds'
# Any other spelling (e.g. 'Chopin and Hannds') raises a KeyError in the
# dataset_map lookup.
# The same string is also used as the sub-directory name under ./Datasets/.

Dataset = 'Chopin43'

In [17]:
# Short suffix appended to the output directory names for each supported dataset.
dataset_map = {
    "Chopin43": '_C',
    "ChopinAndHannds": '_CH',
    "Maestro": '_M',
}

# Suffix for the chosen dataset; an unsupported Dataset value raises KeyError here.
Key = dataset_map[Dataset]

In [18]:
#Setup the Directories as needed

midiDir = './Datasets/' + Dataset
QANONDir = './Extracted_Repns/QANON' + Key
MDEDir = './Extracted_Repns/MDE' + Key



qanonRepDir = QANONDir + '/reps'
qanonTextDir = QANONDir + '/text'
qanonCorpusDir = QANONDir + '/full'


handConf_Savedir = MDEDir + '/dict'

MDERepDir = MDEDir + '/reps'
modelDir = MDEDir + '/model'
tokenizerDir = MDEDir + '/tokenizer'
bestmodelDir = MDEDir + '/best'
MDETextDir = MDEDir + '/text'
MDECorpusDir = MDEDir + '/full'

paths = [QANONDir, MDEDir, handConf_Savedir, qanonTextDir, qanonCorpusDir,
         qanonRepDir, MDERepDir, modelDir, tokenizerDir, bestmodelDir, MDETextDir, MDECorpusDir]



for path in paths:
    isExist = os.path.exists(path)

    if not isExist:

      # Create a new directory because it does not exist 
      os.makedirs(path)

### Calculate the QANON representations

Make a list of all midi files in the dataset

In [19]:
# Recursively collect every file under midiDir whose name contains ".mid".
# Each entry is the containing directory joined to the file name with '/'.
midiFiles = [
    root + '/' + file
    for root, dirs, files in os.walk(midiDir)
    for file in files
    if ".mid" in file
]

In [20]:
# Report how many MIDI files were found. The bare last expression is shown as
# the cell's output value, right after the printed label.
print("The dataset has this many samples:")
len(midiFiles)

The dataset has this many samples:


43

In [21]:
# Compute the QANON representations with the computeMDE utilities and write
# them into the QANON reps directory.
#
# NOTE: This may take approx. 10-20 minutes for a large dataset like Maestro

# The Maestro dataset is processed with both hands; every other dataset is
# processed for the left hand only.
use_both_hands = 'Maestro' in midiDir

if not use_both_hands:
    print("Calculating QANONS for LEFT hand ONLY")
    print("By default, this augments with pitch shifting since the left hand datasets are small")
    calculateLeftHandQanons(midiFiles, qanonRepDir)
else:
    print("Calculating QANONS for BOTH hands")
    calculateBothHandQanons(midiFiles, qanonRepDir)

Calculating QANONS for LEFT hand ONLY
By default, this augments with pitch shifting since the left hand datasets are small


Create a list of the QANON representations and the original midi files

In [22]:
# Rebuild the list of source MIDI files (every file under midiDir whose name
# contains '.mid').
midiFiles = []
for root, dirs, files in os.walk(midiDir):
    for file in files:
        if '.mid' in file:
            midiFiles.append(root + '/' + file)

# Collect the saved QANON arrays. The extension test is applied to the file
# name itself rather than to the whole path, so a directory name can never
# cause a non-.npy file to be picked up.
QANONFiles = []
for root, dirs, files in os.walk(QANONDir):
    for file in files:
        if file.endswith('.npy'):
            QANONFiles.append(root + '/' + file)

# os.walk yields entries in arbitrary, filesystem-dependent order. Later cells
# name their outputs by position in QANONFiles (m_0.txt, m_1.txt, ...) and write
# corpus lines in list order, so sort to make every run produce the same result.
midiFiles.sort()
QANONFiles.sort()

In [24]:
# Compare the number of input MIDI files with the number of QANON arrays found.
n_midi_files = len(midiFiles)
n_qanon_files = len(QANONFiles)
print("There were {} midi files and now are {} QANON representations".format(n_midi_files, n_qanon_files))
print("These numbers can be different if the dataset is augmented with pitch shifts")

There were 43 midi files and now are 301 QANON representations
These numbers can be different if the dataset is augmented with pitch shifts


In [25]:
#Populate the QANON text and full corpus directories

with open(QANONDir + '/full/QANON.txt', 'w') as f:
    for x in range(len(QANONFiles)):
        name = pathlib.Path(QANONFiles[x]).name.replace('npy', 'txt')
        txt = ''
        with open(QANONDir + '/text/{}'.format(name), 'w') as g:
            q = np.load(QANONFiles[x])
            for c in range(q.shape[1]):

                column = list(q[:, c])
                col = column
                listCol = [str(int(x)) for x in column]
                strCol = ''.join(listCol)
                txt = txt + strCol + ' '
            g.write(txt)
            f.write(txt)
            f.write('\n')


## Generate MDE representations for the selected dataset

In [26]:
#build the hand configurations dictionary
hcount=0
if 'Maestro' in midiDir:
    maximum_hand_spread = 88 #For both hands, any spread is allowed
else:
    maximum_hand_spread = 18 #Hand configurations bigger than this size are set as UNK
hands = {}

for x in range(len(QANONFiles)):
    q = np.load(QANONFiles[x])
    for c in range(q.shape[1]):
        column = q[:, c]
        o,p,h,c = getMDE(column, c)
        
        if h not in hands and len(h)<maximum_hand_spread:
            hands[h] = hcount
            hcount+=1
            
#Save it to file
with open(handConf_Savedir + '/handConf_dict', 'wb') as handle:
    pickle.dump(hands, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [27]:
# Size of the hand-configuration dictionary built above.
n_hand_configurations = len(hands)
print("There are {} unique hand configurations".format(n_hand_configurations))

There are 109 unique hand configurations


In [28]:
# Populate the MDE representations. Each column of a QANON array becomes one
# row [o, 's', p, 's', hc], where o, p and h come from getMDE and hc is the id
# of hand configuration h shifted up by 1, so that 0 is reserved for
# configurations that are not in the `hands` dictionary.
#
# The previous version set hc = 0 for unknown configurations and then stored
# hc + 1, so unknowns were written as 1 and collided with the first known
# configuration. The lookup below keeps 0 for unknowns, matching the encoding
# used when the MDE text files are written.
#
# NOTE(review): the arrays are saved directly under MDEDir, not under MDERepDir
# (MDEDir + '/reps'), which is created during directory setup but never used.
# Location left unchanged here - confirm which path downstream code reads.

for qanon_path in QANONFiles:
    q = np.load(qanon_path)
    piece = []
    for col_idx in range(q.shape[1]):
        o, p, h, _ignored = getMDE(q[:, col_idx], col_idx)
        # Known configuration -> id + 1; unknown configuration -> 0.
        hc = hands.get(h, -1) + 1
        piece.append([o, 's', p, 's', hc])
    piece = np.array(piece)

    # Reuse the QANON file name for the MDE array. The local is not called
    # `path`, to avoid overwriting the `path` name imported from `os`.
    name = pathlib.PurePath(qanon_path).name
    np.save(MDEDir + '/' + name, piece)

In [29]:
#Populate the MDE text and corpus directories


# Populate the MDE text and corpus directories.
#
# Each column of a QANON array becomes one token "<o>s<p>s<hc>", where o, p and
# h come from getMDE and hc is the hand-configuration id shifted up by 1
# (0 = configuration not in the `hands` dictionary). Tokens are written
# space-terminated. The line for the x-th entry of QANONFiles goes to
# text/m_<x>.txt and, followed by a newline, to the combined corpus file
# full/MDE.txt.

with open(MDEDir + '/full/MDE.txt', 'w') as corpus_file:
    for x, qanon_path in enumerate(QANONFiles):
        # Load first so a failed load does not leave an empty text file behind.
        q = np.load(qanon_path)

        tokens = []
        for col_idx in range(q.shape[1]):
            o, p, h, _ignored = getMDE(q[:, col_idx], col_idx)
            # Explicit lookup instead of a bare `except:`, which would also
            # hide unrelated errors and swallow KeyboardInterrupt.
            hc = hands.get(h, -1) + 1
            tokens.append('{}s{}s{}'.format(int(o), int(p), int(hc)))
        txt = ''.join(token + ' ' for token in tokens)

        with open(MDEDir + '/text/m_{}.txt'.format(x), 'w') as piece_file:
            piece_file.write(txt)

        corpus_file.write(txt)
        corpus_file.write('\n')