In [None]:
# this file shows what the symbolic music data looks like 
# as well as the tokenization process we use for finetuning on CoLA (single sentence)
# and MRPC (double sentence) datasets

!git clone https://github.com/wazenmai/MIDI-BERT.git
%cd MIDI-BERT
!pip install -r requirements.txt
%cd MidiBERT
%cd CP

In [2]:
import numpy as np
import random
import pickle
import os
# Directory holding the CP-format arrays, relative to the current working directory.
root = '../../data/CP'
# pop909: dataset containing Pop music; used for melody classification.
dataset = 'pop909'

# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects --
# only load .npy files that come from a trusted source.
data = np.load(
    os.path.join(root, f'{dataset}_train.npy'),
    allow_pickle=True,
)

print("This is what the symbolic music data looks like")
print(data)
# Each row of the printed array is one token: [Bar, Position, Pitch, Duration].
# Number of possible combinations for each token type: [4,18,88,66].
# The vectors at the end of the training set are all [ 2 16 86 64], which is the pad token.

This is what the symbolic music data looks like
[[[ 0  0 55  3]
  [ 1  0 24 13]
  [ 1  2 54  3]
  ...
  [ 1  3 43  5]
  [ 1  5 57  1]
  [ 1  5 31  3]]

 [[ 1  6 38  3]
  [ 1  6 36  1]
  [ 1  7 43  3]
  ...
  [ 1  9 33  3]
  [ 1 10 38  1]
  [ 1 11 38  5]]

 [[ 1 11 42  5]
  [ 1 12 43  1]
  [ 1 14 50  9]
  ...
  [ 1  0 50 11]
  [ 1  0 23 13]
  [ 1  0 35  3]]

 ...

 [[ 0  0 18 11]
  [ 1  0 30  3]
  [ 1  0 37  5]
  ...
  [ 1 11 41  1]
  [ 1 11 44  1]
  [ 1 12 37  3]]

 [[ 1 13 54  3]
  [ 1 13 42  1]
  [ 1 13 46  1]
  ...
  [ 1  0 39  7]
  [ 1  0 42  7]
  [ 1  0 46  1]]

 [[ 1  0 47  7]
  [ 1  1 27  1]
  [ 1  2 56  1]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]]


In [None]:
# show the tokenization process for CoLA
!pip install datasets==1.11.0

from datasets import load_dataset

max_seq_length = 512
task =  "cola"
dataset = load_dataset("glue", task)


In [4]:
import pandas as pd

# Pull every row of the training split into a DataFrame for inspection.
df = pd.DataFrame(dataset["train"][:])

n_train = len(df)
print('Size of training set: {:,}\n'.format(n_train))

# Preview ten randomly drawn rows (unseeded, so the selection differs between runs).
df.sample(n=10)

Size of training set: 8,551



Unnamed: 0,sentence,label,idx
4359,There is likely to be no student absent.,1,4359
6589,A woman hit a girl who was pregnant.,1,6589
4306,Tom believes Stephen to be irritating.,1,4306
6311,Brandon read every book that Megan did.,1,6311
0,"Our friends won't buy this analysis, let alone...",1,0
8075,Peter is some disgruntled old pigs in those di...,0,8075
2227,This knife doesn't cut.,1,2227
7205,it is the tall man come from the back that Mar...,0,7205
7487,John asked whether Bill left.,1,7487
7492,Mary sent a book to Bill.,1,7492


In [5]:
import torch
from transformers import BertTokenizer

# Raw sentences and their labels from the CoLA training frame.
sentences = df.sentence.values
labels = df.label.values

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

# One (1, max_seq_length) tensor of token IDs per sentence.
input_ids = []

for sent in sentences:
    # `encode` returns the token IDs directly (a tensor here, because of return_tensors='pt').
    encoded = tokenizer.encode(
        sent,
        max_length=max_seq_length,    # target length for padding / truncation
        add_special_tokens=False,     # no [CLS]/[SEP]; [PAD] is the only special token
        truncation=True,              # cut sentences longer than max_length
        padding='max_length',         # replaces the deprecated pad_to_max_length=True
        return_tensors='pt',          # return pytorch tensors
    )
    input_ids.append(encoded)

# Stack the per-sentence rows into a single (num_sentences, max_seq_length) tensor.
input_ids = torch.cat(input_ids, dim=0)
labels = torch.tensor(labels)

# Show the first example before and after tokenization.
print('Original: ', sentences[0])
print('Tokenized: ', tokenizer.convert_ids_to_tokens(input_ids[0]))
print('Token IDs:', input_ids[0])

Downloading:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/436k [00:00<?, ?B/s]



Original:  Our friends won't buy this analysis, let alone the next one we propose.
Tokenized:  ['Our', 'friends', 'won', "'", 't', 'buy', 'this', 'analysis', ',', 'let', 'alone', 'the', 'next', 'one', 'we', 'propose', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[P

In [6]:
# now convert the token IDs to 4 dimensional vectors

x = input_ids.numpy()
y_train = labels.numpy()

# token types: [Bar, Position, Pitch, Duration]
# Number of possible combinations for each token type: [4,18,88,66]

#PAD:       [ 2 16 86 64]
#MASK:      [ 3 17 87 65]

#The total number of possible combinations (excluding pad and mask) is thus 2*16*86*64 = 176,128
#To constrain this closer to the vocabulary size of English BERT (roughly 30,000),
#we don't map to zero values in the Bar column since they correspond to the beginning of a musical bar
#We also don't map to values of 32 or more in the Duration column, since duration is measured in 32nd notes,
#and there are very few data points in the pretraining sets that have a duration greater than 32
#This leaves 1*16*86*32 = 44,032 distinct vectors.

# The mapping below is a mixed-radix decomposition (bases 16, 86, 32), so it is only
# unique for token IDs below 16*86*32; fail loudly if that assumption is violated.
assert x.size == 0 or (x.min() >= 0 and x.max() < 16 * 86 * 32), \
    "token IDs fall outside the range that maps to unique 4D vectors"

# Integer array allocated directly, so no float round-trip is needed.
X_train = np.empty((x.shape[0], max_seq_length, 4), dtype=int)

X_train[:, :, 0] = 1                        # Bar is always 1
X_train[:, :, 1] = x % 16                   # Position: 0-15
X_train[:, :, 2] = (x // 16) % 86           # Pitch:    0-85
X_train[:, :, 3] = (x // (16 * 86)) % 32    # Duration: 0-31

# The pad token (ID 0) is initially mapped to [1 0 0 0],
# but we want it mapped to the music PAD vector [ 2 16 86 64].
# A single boolean-mask assignment replaces the per-element Python loop.
X_train[x == 0] = [2, 16, 86, 64]

print('X_train shape: {}, y_train shape: {}'.format(X_train.shape,y_train.shape))
print (" original ids")
print (x)
print (" 4d vectors")
print (X_train)

X_train shape: (8551, 512, 4), y_train shape: (8551,)
 original ids
[[ 3458  2053  1281 ...     0     0     0]
 [ 1448  1167 23563 ...     0     0     0]
 [ 1448  1167 23563 ...     0     0     0]
 ...
 [ 1135  1110  3123 ...     0     0     0]
 [  146  1125  1103 ...     0     0     0]
 [ 1327  1155  1225 ...     0     0     0]]
 4d vectors
[[[ 1  2 44  2]
  [ 1  5 42  1]
  [ 1  1 80  0]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  8  4  1]
  [ 1 15 72  0]
  [ 1 11 10 17]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  8  4  1]
  [ 1 15 72  0]
  [ 1 11 10 17]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 ...

 [[ 1 15 70  0]
  [ 1  6 69  0]
  [ 1  3 23  2]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  2  9  0]
  [ 1  5 70  0]
  [ 1 15 68  0]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1 15 82  0]
  [ 1  3 72  0]
  [ 1  9 76  0]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]]


In [7]:
# now demonstrate the tokenization process for MRPC

import pandas as pd

# Same sequence length as before; "mrpc" is the sentence-pair task.
max_seq_length = 512
task = "mrpc"
dataset = load_dataset("glue", task)

# Pull every row of the training split into a DataFrame for inspection.
df = pd.DataFrame(dataset["train"][:])

n_train = len(df)
print('Size of training set: {:,}\n'.format(n_train))

# Preview ten randomly drawn rows (unseeded, so the selection differs between runs).
df.sample(n=10)

Downloading and preparing dataset glue/mrpc (download: 1.43 MiB, generated: 1.43 MiB, post-processed: Unknown size, total: 2.85 MiB) to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad...


Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

Downloading: 0.00B [00:00, ?B/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

0 examples [00:00, ? examples/s]

Dataset glue downloaded and prepared to /root/.cache/huggingface/datasets/glue/mrpc/1.0.0/dacbe3125aa31d7f70367a07a8a9e72a5a0bfeb5fc42e75c9db75b96da6053ad. Subsequent calls will reuse this data.
Size of training set: 3,668



Unnamed: 0,sentence1,sentence2,label,idx
3081,"Aquila is short of cash , and Avon bondholders...",Avon bondholders have no recourse to Midlands ...,1,3424
732,A base configuration with a 2.0GHz Intel Celer...,"A base configuration with a 2.4GHz Pentium 4 ,...",1,822
1474,"And now it 's anything he wants to say , "" Ale...","And now it 's anything he wants to say , "" con...",1,1642
1389,The seal will ultimately be used to identify D...,The seal will ultimately be used on department...,1,1546
665,Bremer said one initiative is to launch a US $...,Bremer said he would launch a $ 70-million pro...,1,748
1401,Water management officials in Florida were wor...,Water management officials in Florida were wor...,0,1560
897,"Under the proposal , Slocan shareholders will ...","Under a proposed plan of arrangement , Slocan ...",1,1000
3385,His lack of co-operation was allowing other Je...,His lack of co-operation was allowing other JI...,0,3768
1485,One of the Oregon species was acclimated to a ...,One of the Oregon species was acclimated to a ...,1,1654
1545,"The victims included seven Lebanese , four Egy...",State television said the dead included seven ...,0,1718


In [8]:
# Each MRPC example is a pair of sentences plus a label.
sentences1 = df.sentence1.values
sentences2 = df.sentence2.values
# (num_examples, 2) array of strings: column 0 is sentence1, column 1 is sentence2.
sentences = df[["sentence1", "sentence2"]].values.astype("str")
labels = df.label.values

from transformers import BertTokenizer

import torch

# One (1, max_seq_length) tensor of token IDs per sentence pair.
input_ids = []
# NOTE(review): never filled in this cell; kept in case a later cell references it.
attention_masks = []

# `tokenizer` is the bert-base-cased tokenizer created in the CoLA section.
for sent in sentences:
    # `encode` returns the token IDs directly (a tensor here, because of return_tensors='pt').
    encoded = tokenizer.encode(
        sent[0], sent[1],             # the two sentences are encoded as one sequence
        max_length=max_seq_length,    # target length for padding / truncation
        add_special_tokens=False,     # no [CLS]/[SEP]; [PAD] is the only special token
        truncation=True,              # cut pairs longer than max_length
        padding='max_length',         # replaces the deprecated pad_to_max_length=True
        return_tensors='pt',          # return pytorch tensors
    )
    input_ids.append(encoded)
    # We don't use segment encodings

# Stack the per-pair rows into a single (num_examples, max_seq_length) tensor.
input_ids = torch.cat(input_ids, dim=0)
labels = torch.tensor(labels)

# Show the first example before and after tokenization.
print(' Original 1: ', sentences1[0])
print(' Original 2: ', sentences2[0])
print('Tokenized: ', tokenizer.convert_ids_to_tokens(input_ids[0]))
print('Token IDs:', input_ids[0])



 Original 1:  Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .
 Original 2:  Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .
Tokenized:  ['Am', '##ro', '##zi', 'accused', 'his', 'brother', ',', 'whom', 'he', 'called', '"', 'the', 'witness', '"', ',', 'of', 'deliberately', 'di', '##sto', '##rting', 'his', 'evidence', '.', 'Re', '##fer', '##ring', 'to', 'him', 'as', 'only', '"', 'the', 'witness', '"', ',', 'Am', '##ro', '##zi', 'accused', 'his', 'brother', 'of', 'deliberately', 'di', '##sto', '##rting', 'his', 'evidence', '.', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]', '[PAD]',

In [9]:
# convert the MRPC token IDs to 4 dimensional vectors

x = input_ids.numpy()
y_train = labels.numpy()

# token types: [Bar, Position, Pitch, Duration]
# Number of possible combinations for each token type: [4,18,88,66]

#PAD:       [ 2 16 86 64]
#MASK:      [ 3 17 87 65]

#The total number of possible combinations (excluding pad and mask) is thus 2*16*86*64 = 176,128
#To constrain this closer to the vocabulary size of English BERT (roughly 30,000),
#we don't map to zero values in the Bar column since they correspond to the beginning of a musical bar
#We also don't map to values of 32 or more in the Duration column, since duration is measured in 32nd notes,
#and there are very few data points in the pretraining sets that have a duration greater than 32
#This leaves 1*16*86*32 = 44,032 distinct vectors.

# The mapping below is a mixed-radix decomposition (bases 16, 86, 32), so it is only
# unique for token IDs below 16*86*32; fail loudly if that assumption is violated.
assert x.size == 0 or (x.min() >= 0 and x.max() < 16 * 86 * 32), \
    "token IDs fall outside the range that maps to unique 4D vectors"

# Integer array allocated directly, so no float round-trip is needed.
X_train = np.empty((x.shape[0], max_seq_length, 4), dtype=int)

X_train[:, :, 0] = 1                        # Bar is always 1
X_train[:, :, 1] = x % 16                   # Position: 0-15
X_train[:, :, 2] = (x // 16) % 86           # Pitch:    0-85
X_train[:, :, 3] = (x // (16 * 86)) % 32    # Duration: 0-31

# The pad token (ID 0) is initially mapped to [1 0 0 0],
# but we want it mapped to the music PAD vector [ 2 16 86 64].
# A single boolean-mask assignment replaces the per-element Python loop.
X_train[x == 0] = [2, 16, 86, 64]

print('X_train shape: {}, y_train shape: {}'.format(X_train.shape,y_train.shape))
print (" original ids")
print (x)
print (" 4d vectors")
print (X_train)

X_train shape: (3668, 512, 4), y_train shape: (3668,)
 original ids
[[ 7277  2180  5303 ...     0     0     0]
 [10684  2599  9717 ...     0     0     0]
 [ 1220  1125  1502 ...     0     0     0]
 ...
 [  107  1284  1138 ...     0     0     0]
 [ 1109  1136  5783 ...     0     0     0]
 [ 1109  1476   118 ...     0     0     0]]
 4d vectors
[[[ 1 13 24  5]
  [ 1  4 50  1]
  [ 1  7 73  3]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1 12 65  7]
  [ 1  7 76  1]
  [ 1  5  5  7]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  4 76  0]
  [ 1  5 70  0]
  [ 1 14  7  1]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 ...

 [[ 1 11  6  0]
  [ 1  4 80  0]
  [ 1  2 71  0]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  5 69  0]
  [ 1  0 71  0]
  [ 1  7 17  4]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]

 [[ 1  5 69  0]
  [ 1  4  6  1]
  [ 1  6  7  0]
  ...
  [ 2 16 86 64]
  [ 2 16 86 64]
  [ 2 16 86 64]]]
