In [25]:
import os, miditoolkit
import numpy as np
import os, pickle
import collections
import shutil
import subprocess

In [26]:
# Position bins: every multiple of 30 (64 values) merged with every
# multiple of 40 (48 values), then sorted ascending.
double_positions_bins = {30 * step for step in range(64)}
triplet_positions_bins = {40 * step for step in range(48)}
positions_bins = sorted(double_positions_bins.union(triplet_positions_bins))  # union of both grids

# Duration bins; default resolution is 480 ticks per beat.
# Multiples of 30 from 30 up to 7680, plus five extra values that are not on that grid.
double_duration = set(range(30, 30 * 257, 30))
triplet_duration = {40, 80, 160, 320, 640}
duration_bins = sorted(double_duration.union(triplet_duration))

In [27]:
# Sanity check against the pretrained BART tokenizer: look at how the
# '<pad>' token is handled by the "facebook/bart-base" vocabulary.
from transformers import BartTokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
# The encode() result is not assigned; only the cell's last expression is displayed.
tokenizer.encode("<pad>")
# Decoding id 1 is the value the notebook displays for this cell ('<pad>').
tokenizer.decode(1)

'<pad>'

In [28]:
def _reset_output_dir(path):
    """Leave ``path`` as a freshly created, empty directory.

    Anything already at ``path`` is deleted first. This is done with the
    standard library instead of ``rm -rf`` through a shell, so characters
    such as ``$``, quotes or backticks in the path are treated literally.
    """
    if os.path.islink(path) or os.path.isfile(path):
        # rmtree refuses links and plain files; remove them directly.
        os.remove(path)
    elif os.path.isdir(path):
        shutil.rmtree(path)
    os.makedirs(path)


def build_dict(save_path, positions=None, durations=None):
    """Build the per-class token vocabularies and pickle them to disk.

    Each vocabulary class ('Phrase', 'CPProsody', 'Prosody', 'Strength',
    'Length', 'Token', 'Bar', 'Pos', 'Pitch', 'Dur') starts with the three
    special tokens '<s>', '<pad>', '</s>' (indices 0, 1, 2) followed by the
    class-specific tokens. A token's index is its position in that list.

    WARNING: ``save_path`` is wiped and recreated on every call.

    Args:
        save_path: Directory that will receive ``music_dict.pkl``. Existing
            content at this path is deleted.
        positions: Iterable of position values used for the ``Pos_<value>``
            tokens. Defaults to the module-level ``positions_bins``.
        durations: Iterable of duration values used for the ``Dur_<value>``
            tokens. Defaults to the module-level ``duration_bins``.

    Returns:
        A tuple ``(event2word, word2event)``. ``event2word[cls]`` maps
        token -> index and ``word2event[cls]`` maps index -> token for every
        vocabulary class ``cls``. The same tuple is what gets pickled.
    """
    if positions is None:
        positions = positions_bins
    if durations is None:
        durations = duration_bins

    # Start from an empty output directory.
    _reset_output_dir(save_path)

    special_tokens = [
        '<s>',    # start of sequence
        '<pad>',
        '</s>',   # end of sequence
    ]
    strengths = ['<strong>', '<substrong>', '<weak>']
    lengths = ['<long>', '<short>']
    prosody = [
        '<strong, long>',
        '<strong, short>',
        '<substrong, long>',
        '<substrong, short>',
        '<weak, long>',
        '<weak, short>',
    ]

    bar_tokens = [f"Bar_{i}" for i in range(256)]
    pos_tokens = [f"Pos_{pos}" for pos in positions]
    pitch_tokens = [f"Pitch_{pitch}" for pitch in range(128)]
    dur_tokens = [f"Dur_{dur}" for dur in durations]

    # Insertion order of the classes and of the tokens inside each class is
    # significant: it determines the printed order and every token index.
    melody_dict = {
        # phrase boundary flags
        'Phrase': special_tokens + ['<true>', '<false>'],
        # combined (strength, length) tags
        'CPProsody': special_tokens + prosody,
        # prosody template: strength tags, length tags, sentence marker
        'Prosody': special_tokens + strengths + lengths + ['<sent>'],
        'Strength': special_tokens + strengths,
        'Length': special_tokens + lengths,
        # flat vocabulary: one un-numbered 'Bar' token plus all pos/pitch/dur tokens
        'Token': (special_tokens + ['Bar'] + pos_tokens + pitch_tokens
                  + dur_tokens + ['<sent>']),
        'Bar': special_tokens + bar_tokens,
        'Pos': special_tokens + pos_tokens,
        'Pitch': special_tokens + pitch_tokens,
        'Dur': special_tokens + dur_tokens,
    }

    for k, v in melody_dict.items():
        print(f"{k:<15s} : {v}\n")

    # token <-> index lookup tables, one pair per vocabulary class
    event2word, word2event = {}, {}
    for cls, tokens in melody_dict.items():
        event2word[cls] = {token: idx for idx, token in enumerate(tokens)}
        word2event[cls] = dict(enumerate(tokens))

    # Use a context manager so the file is flushed and closed deterministically.
    with open(os.path.join(save_path, 'music_dict.pkl'), 'wb') as fh:
        pickle.dump((event2word, word2event), fh)

    # summary of vocabulary sizes
    print('Melody Dict [class size]')
    for key in melody_dict:
        print('> {:20s} : {}'.format(key, len(event2word[key])))

    return event2word, word2event

In [29]:
build_dict('/data1/qihao/cs6207/octuple/binary')

Phrase          : ['<s>', '<pad>', '</s>', '<true>', '<false>']

CPProsody       : ['<s>', '<pad>', '</s>', '<strong, long>', '<strong, short>', '<substrong, long>', '<substrong, short>', '<weak, long>', '<weak, short>']

Prosody         : ['<s>', '<pad>', '</s>', '<strong>', '<substrong>', '<weak>', '<long>', '<short>', '<sent>']

Strength        : ['<s>', '<pad>', '</s>', '<strong>', '<substrong>', '<weak>']

Length          : ['<s>', '<pad>', '</s>', '<long>', '<short>']

Token           : ['<s>', '<pad>', '</s>', 'Bar', 'Pos_0', 'Pos_30', 'Pos_40', 'Pos_60', 'Pos_80', 'Pos_90', 'Pos_120', 'Pos_150', 'Pos_160', 'Pos_180', 'Pos_200', 'Pos_210', 'Pos_240', 'Pos_270', 'Pos_280', 'Pos_300', 'Pos_320', 'Pos_330', 'Pos_360', 'Pos_390', 'Pos_400', 'Pos_420', 'Pos_440', 'Pos_450', 'Pos_480', 'Pos_510', 'Pos_520', 'Pos_540', 'Pos_560', 'Pos_570', 'Pos_600', 'Pos_630', 'Pos_640', 'Pos_660', 'Pos_680', 'Pos_690', 'Pos_720', 'Pos_750', 'Pos_760', 'Pos_780', 'Pos_800', 'Pos_810', 'Pos_840', 'Pos

({'Phrase': {'<s>': 0, '<pad>': 1, '</s>': 2, '<true>': 3, '<false>': 4},
  'CPProsody': {'<s>': 0,
   '<pad>': 1,
   '</s>': 2,
   '<strong, long>': 3,
   '<strong, short>': 4,
   '<substrong, long>': 5,
   '<substrong, short>': 6,
   '<weak, long>': 7,
   '<weak, short>': 8},
  'Prosody': {'<s>': 0,
   '<pad>': 1,
   '</s>': 2,
   '<strong>': 3,
   '<substrong>': 4,
   '<weak>': 5,
   '<long>': 6,
   '<short>': 7,
   '<sent>': 8},
  'Strength': {'<s>': 0,
   '<pad>': 1,
   '</s>': 2,
   '<strong>': 3,
   '<substrong>': 4,
   '<weak>': 5},
  'Length': {'<s>': 0, '<pad>': 1, '</s>': 2, '<long>': 3, '<short>': 4},
  'Token': {'<s>': 0,
   '<pad>': 1,
   '</s>': 2,
   'Bar': 3,
   'Pos_0': 4,
   'Pos_30': 5,
   'Pos_40': 6,
   'Pos_60': 7,
   'Pos_80': 8,
   'Pos_90': 9,
   'Pos_120': 10,
   'Pos_150': 11,
   'Pos_160': 12,
   'Pos_180': 13,
   'Pos_200': 14,
   'Pos_210': 15,
   'Pos_240': 16,
   'Pos_270': 17,
   'Pos_280': 18,
   'Pos_300': 19,
   'Pos_320': 20,
   'Pos_330': 21,
   '

In [94]:
# Train a character-level BPE tokenizer on the prosody-template tokens.
#
# Inactive alternative vocabulary (combined strength/length tags), kept for reference:
#   '<pad>', '<s>', '</s>', '<unk>',
#   '<strong,long>', '<substrong,long>', '<weak,long>',
#   '<strong,short>', '<substrong,short>', '<weak,short>',
#   '<stop>'
#
# NOTE(review): `tokenizers` is not imported in any cell visible here, and this
# cell's execution count is out of sequence — confirm the import exists
# elsewhere, otherwise this fails on Restart & Run All.
templ_vocab = (
    ['<pad>', '<s>', '</s>', '<unk>']          # control tokens
    + ['<strong>', '<substrong>', '<weak>']    # strength tags
    + ['<long>', '<short>']                    # length tags
    + ['<stop>']
)
templ_tokenizer = tokenizers.CharBPETokenizer()
templ_tokenizer.train_from_iterator(templ_vocab)




