In [4]:
# tokenize with BPE
from miditok import REMI
from pathlib import Path

# Create the tokenizer and list the MIDI file paths.
# NOTE(review): sos_eos=True presumably adds start/end-of-sequence tokens to the
# vocabulary -- confirm against the installed miditok version.
tokenizer = REMI(sos_eos=True)
# Recursive glob; the '*.mid*' pattern picks up both .mid and .midi files.
midi_paths = list(Path('data/new_grand_midi').glob('**/*.mid*'))

# Validation callback used to discard MIDIs we do not want
def midi_valid(midi) -> bool:
    """Tell whether ``midi`` should be kept for tokenization.

    A file is rejected when any of its time-signature changes has a numerator
    other than 4, or when ``max_tick`` is below ``10 * ticks_per_beat``
    (i.e. the file is shorter than 10 beats).

    Args:
        midi: Object exposing ``time_signature_changes`` (items with a
            ``numerator``), ``max_tick`` and ``ticks_per_beat``.

    Returns:
        ``True`` if the file passes both checks, ``False`` otherwise.
    """
    for signature in midi.time_signature_changes:
        if signature.numerator != 4:
            return False  # not 4 beats per bar
    too_short = midi.max_tick < 10 * midi.ticks_per_beat
    return not too_short

# Convert the MIDI files to tokens saved as JSON files.
# midi_valid is passed as the validation callback, so rejected files are left out.
tokenizer.tokenize_midi_dataset(        
    midi_paths,
    Path('data/new_grand_midi_noBPE'),
    midi_valid
)

# Learn the vocabulary with BPE from the token files written above.
# NOTE(review): 512 is presumably the target vocabulary size -- confirm against
# the miditok documentation. The recorded run of this step took about 7.5 hours.
tokenizer.learn_bpe(
    'data/new_grand_midi_noBPE',
    512,
    'data/new_grand_midi_BPE'
)

# Re-encode the tokenized pieces with the learned BPE vocabulary
# (reads the noBPE folder, writes to the BPE folder).
tokenizer.apply_bpe_to_dataset(Path('data/new_grand_midi_noBPE'), Path('data/new_grand_midi_BPE'))


Tokenizing MIDIs (data/new_grand_midi_noBPE): 100%|██████████| 7227/7227 [32:00<00:00,  3.76it/s]  
Loading token files: 100%|██████████| 7227/7227 [01:12<00:00, 99.46it/s] 
Learning byte pair encoding: 100%|██████████| 292/292 [7:32:48<00:00, 93.04s/it, seq_len_variation=-25.02, avg_nb_token_combs=2.04, max_nb_token_combs=3]   


Mean of original lengths: 13456.084567474049
Mean length after BPE: 10089.323875432527
Variation from original: -25.02 %


Applying BPE to dataset: 100%|██████████| 7227/7227 [2:37:02<00:00,  1.30s/it]   


In [None]:
from miditok import REMI
from pathlib import Path

In [None]:
# tokenize without BPE
from miditok import REMI
from pathlib import Path

# Create the tokenizer and list the MIDI file paths.
# The previous line read `REMI(special_tokens=)`, which is a SyntaxError and
# kept this cell from running at all.
# NOTE(review): the token list is an assumption -- it keeps begin/end-of-sequence
# tokens in the vocabulary, mirroring the `sos_eos=True` used by the other cells.
# On miditok releases that predate the `special_tokens` argument, use
# `REMI(sos_eos=True)` instead.
tokenizer = REMI(special_tokens=['PAD', 'BOS', 'EOS'])
# Recursive glob; the '*.mid*' pattern picks up both .mid and .midi files.
midi_paths = list(Path('Final_Project/Dataset').glob('**/*.mid*'))

# Validation callback used to discard MIDIs we do not want
def midi_valid(midi) -> bool:
    """Tell whether ``midi`` should be kept for tokenization.

    A file is rejected when any of its time-signature changes has a numerator
    other than 4, or when ``max_tick`` is below ``10 * ticks_per_beat``
    (i.e. the file is shorter than 10 beats).

    Args:
        midi: Object exposing ``time_signature_changes`` (items with a
            ``numerator``), ``max_tick`` and ``ticks_per_beat``.

    Returns:
        ``True`` if the file passes both checks, ``False`` otherwise.
    """
    for signature in midi.time_signature_changes:
        if signature.numerator != 4:
            return False  # not 4 beats per bar
    too_short = midi.max_tick < 10 * midi.ticks_per_beat
    return not too_short

# Convert the MIDI files to tokens saved as JSON files.
# midi_valid is passed as the validation callback, so rejected files are left out.
tokenizer.tokenize_midi_dataset(        
    midi_paths,
    Path('Final_Project/combined_noBPE3'),
    midi_valid
)

In [1]:
from miditok import REMI
from pathlib import Path

# Create the tokenizer (no MIDI paths needed here: this cell only works on
# token files that were already written to disk).
tokenizer = REMI(sos_eos=True)

# Learn the vocabulary with BPE.
# NOTE(review): this reads 'Final_Project/combined_noBPE', while the tokenizing
# cell in this notebook writes to 'Final_Project/combined_noBPE3' -- confirm
# which folder is the intended input.
# NOTE(review): 512 is presumably the target vocabulary size -- confirm against
# the miditok documentation. The recorded run of this step took about 1h25m.
tokenizer.learn_bpe(
    'Final_Project/combined_noBPE',
    512,
    'Final_Project/combined_BPE'
)

# Re-encode the tokenized pieces with the learned BPE vocabulary
# (reads the noBPE folder, writes to the BPE folder).
tokenizer.apply_bpe_to_dataset(Path('Final_Project/combined_noBPE'), Path('Final_Project/combined_BPE'))

Loading token files: 100%|██████████| 1355/1355 [00:04<00:00, 296.29it/s]
Learning byte pair encoding: 100%|██████████| 292/292 [1:24:43<00:00, 17.41s/it, seq_len_variation=-30.13, avg_nb_token_combs=2.50, max_nb_token_combs=3]


Mean of original lengths: 18138.439696760855
Mean length after BPE: 12673.77050310131
Variation from original: -30.13 %


Applying BPE to dataset: 100%|██████████| 1355/1355 [18:36<00:00,  1.21it/s]


In [33]:
import json
import numpy as np
import os
# Folder whose .json token files get converted below (the non-BPE set, going by the folder name).
path = 'clean/data/SMALL_2M/tokens_small_noBPE/'
def json_to_nparray(path):
    """Convert every ``.json`` file in ``path`` into a ``.npy`` file next to it.

    Each JSON document is loaded, wrapped with ``np.array`` and written with
    ``np.save`` under the same base name (``foo.json`` -> ``foo.npy``).
    Directory entries that do not end in ``.json`` are ignored.

    Args:
        path: Directory containing the JSON files (``str`` or path-like).
            A trailing separator is optional.
    """
    for filename in os.listdir(path):
        if not filename.endswith('.json'):
            continue
        # os.path.join instead of `path + filename`, so the directory does not
        # have to end with a separator and may be a pathlib.Path.
        json_file = os.path.join(path, filename)
        with open(json_file) as f:
            data = json.load(f)
        # A JSON object (dict) becomes a 0-d object array, which np.save stores
        # via pickle; readers must pass allow_pickle=True to np.load.
        # np.save appends the '.npy' extension to the stripped name.
        np.save(json_file[:-len('.json')], np.array(data))
# Convert the .json token files in `path` to .npy files in the same folder.
json_to_nparray(path)

In [None]:
import numpy as np
import os

def get_tokens(path, dtype=float):
    """Load every ``.npy`` piece in ``path`` and flatten them into one token array.

    Each ``.npy`` file is expected to hold a pickled dict with a ``'tokens'``
    entry. The pieces are shuffled with ``np.random.shuffle`` (global NumPy
    RNG; call ``np.random.seed`` first for a reproducible order) and each
    piece is written out as ``1, <tokens...>, 2``.

    Args:
        path: Directory containing the ``.npy`` files (``str`` or path-like).
            A trailing separator is optional.
        dtype: dtype of the returned array. Defaults to ``float`` (float64),
            which is what the earlier ``np.append``-based implementation
            produced; pass an integer dtype to get integer token ids.

    Returns:
        Tuple ``(data, piece_counter)``: the 1-D array holding all pieces
        back to back, and the number of pieces it contains.
    """
    # SECURITY: allow_pickle=True unpickles the file contents; only point this
    # at .npy files you produced yourself.
    pieces = [
        np.load(os.path.join(path, filename), allow_pickle=True)[()]
        for filename in os.listdir(path)
        if filename.endswith('.npy')
    ]
    np.random.shuffle(pieces)

    # Collect the chunks and concatenate once: np.append inside a loop copies
    # the whole array on every call, which is quadratic in the output size.
    chunks = []
    for piece in pieces:
        chunks.append(np.array([1], dtype=dtype))  # start-of-piece marker
        chunks.append(np.asarray(piece.get('tokens'), dtype=dtype).ravel())
        chunks.append(np.array([2], dtype=dtype))  # end-of-piece marker

    data = np.concatenate(chunks) if chunks else np.array((), dtype=dtype)
    return data, len(pieces)

# Build the shuffled, flattened token array from the .npy pieces in the BPE folder.
path = 'clean/data/SMALL_2M/tokens_small_BPE/'
data, piece_counter = get_tokens(path)

In [37]:
def get_tokens(path, dtype=float):
    """Load every ``.npy`` piece in ``path`` and flatten them into one token array.

    Each ``.npy`` file is expected to hold a pickled dict with a ``'tokens'``
    entry. The pieces are shuffled with ``np.random.shuffle`` (global NumPy
    RNG; call ``np.random.seed`` first for a reproducible order) and each
    piece is written out as ``1, <tokens...>, 2``.

    Args:
        path: Directory containing the ``.npy`` files (``str`` or path-like).
            A trailing separator is optional.
        dtype: dtype of the returned array. Defaults to ``float`` (float64),
            which is what the earlier ``np.append``-based implementation
            produced; pass an integer dtype to get integer token ids.

    Returns:
        Tuple ``(data, piece_counter)``: the 1-D array holding all pieces
        back to back, and the number of pieces it contains.
    """
    # SECURITY: allow_pickle=True unpickles the file contents; only point this
    # at .npy files you produced yourself.
    pieces = [
        np.load(os.path.join(path, filename), allow_pickle=True)[()]
        for filename in os.listdir(path)
        if filename.endswith('.npy')
    ]
    np.random.shuffle(pieces)

    # Collect the chunks and concatenate once. Appending token by token with
    # np.append copies the whole array on every call, which is quadratic in
    # the number of tokens.
    chunks = []
    for piece in pieces:
        chunks.append(np.array([1], dtype=dtype))  # start-of-piece marker
        chunks.append(np.asarray(piece.get('tokens'), dtype=dtype).ravel())
        chunks.append(np.array([2], dtype=dtype))  # end-of-piece marker

    data = np.concatenate(chunks) if chunks else np.array((), dtype=dtype)
    return data, len(pieces)

# Build the shuffled, flattened token array from the .npy pieces in the noBPE folder.
path = 'clean/data/SMALL_2M/tokens_small_noBPE/'
data, piece_counter = get_tokens(path)

In [38]:
# Length of the flattened array: all tokens plus the 1/2 markers added around each piece.
data.shape


(2701932,)

In [39]:
# Persist the flattened token array; np.save appends the '.npy' extension to this name.
np.save('clean/data/SMALL_2M/SMALL_2M_noBPE', data)