# LOAD RAW Text Seq and Init Vocab

In [None]:
import sys

from tqdm import tqdm
import json
import torch

# Corpus to read (one sequence per line); the commented line is an
# alternative, smaller input path.
raw_data_path = '/workspace/pj/data/corpus/raw_corpus_bpe.txt'
# raw_data_path = '/workspace/data/corpus/first_5_lines_bpe.txt'

# Load the starting chord vocabulary from its JSON file.
vocab_path = '/workspace/pj/data/vocabs/chord.json'
with open(vocab_path) as file:
    vocab = json.load(file)

print("Initial Vocabulary:", vocab)


In [None]:
# Read the corpus into memory, one whitespace-stripped line per entry.
raw_data = []
with open(raw_data_path) as f:
    raw_data.extend(
        line.strip() for line in tqdm(f, desc="reading original txt file...")
    )


def extract_chord_seq_list(toks, ratio=4):
    """Collect the chord tokens from a token stream laid out in fixed-size groups.

    The stream is read in groups of ``ratio`` tokens. Only the first token of
    each group is inspected; it is kept when it starts with ``'h'`` or ``'H'``.

    Args:
        toks: Either a whitespace-separated token string or an already-split
            sequence of token strings.
        ratio: Number of tokens per group, i.e. the stride between inspected
            tokens.

    Returns:
        The kept tokens, in their original order. A trailing group that has
        fewer than ``ratio`` tokens is still inspected.

    Raises:
        ValueError: If ``ratio`` is 0 (raised by ``range``).
    """
    if isinstance(toks, str):
        toks = toks.split()

    # Index the leading token of each group directly instead of unpacking a
    # fixed four-token slice: the other tokens are never used, and unpacking
    # fails on a short trailing group or whenever ratio != 4.
    return [
        toks[start]
        for start in range(0, len(toks), ratio)
        if toks[start].startswith(('h', 'H'))
    ]

In [None]:
# Extract only the chord sequences (at most the first 2000 corpus lines).

chord_seq = []
for data in tqdm(raw_data):
    # Stop as soon as 2000 sequences have been collected.
    if len(chord_seq) == 2000:
        break
    chord_seq.append(extract_chord_seq_list(data))
cnt = len(chord_seq)  # number of corpus lines actually processed
print(len(chord_seq))
print(chord_seq[0])

# Init Vocab & Dict

In [None]:
init_dict = {"<eos>": 1,
    "<pad>": 0,
    "<bos>": 2,
    "HC+": 134,
    "HC/o7": 135,
    "HCD7": 3,
    "HCM": 4,
    "HCM7": 5,
    "HCm": 6,
    "HCm7": 7,
    "HCo": 8,
    "HCo7": 9,
    "HCsus2": 10,
    "HCsus4": 11,
    "Hd+": 12,
    "Hd/o7": 13,
    "HdD7": 14,
    "HdM": 15,
    "HdM7": 16,
    "Hdm": 17,
    "Hdm7": 18,
    "Hdo": 19,
    "Hdo7": 20,
    "Hdsus2": 21,
    "Hdsus4": 22,
    "HD+": 23,
    "HD/o7": 24,
    "HDD7": 25,
    "HDM": 26,
    "HDM7": 27,
    "HDm": 28,
    "HDm7": 29,
    "HDo": 30,
    "HDo7": 31,
    "HDsus2": 32,
    "HDsus4": 33,
    "He+": 34,
    "He/o7": 35,
    "HeD7": 36,
    "HeM": 37,
    "HeM7": 38,
    "Hem": 39,
    "Hem7": 40,
    "Heo": 41,
    "Heo7": 42,
    "Hesus2": 43,
    "Hesus4": 44,
    "HE+": 45,
    "HE/o7": 46,
    "HED7": 47,
    "HEM": 48,
    "HEM7": 49,
    "HEm": 50,
    "HEm7": 51,
    "HEo": 52,
    "HEo7": 53,
    "HEsus2": 54,
    "HEsus4": 55,
    "HF+": 56,
    "HF/o7": 57,
    "HFD7": 58,
    "HFM": 59,
    "HFM7": 60,
    "HFm": 61,
    "HFm7": 62,
    "HFo": 63,
    "HFo7": 64,
    "HFsus2": 65,
    "HFsus4": 66,
    "Hg+": 67,
    "Hg/o7": 68,
    "HgD7": 69,
    "HgM": 70,
    "HgM7": 71,
    "Hgm": 72,
    "Hgm7": 73,
    "Hgo": 74,
    "Hgo7": 75,
    "Hgsus2": 76,
    "Hgsus4": 77,
    "HG+": 78,
    "HG/o7": 79,
    "HGD7": 80,
    "HGM": 81,
    "HGM7": 82,
    "HGm": 83,
    "HGm7": 84,
    "HGo": 85,
    "HGo7": 86,
    "HGsus2": 87,
    "HGsus4": 88,
    "Ha+": 89,
    "Ha/o7": 90,
    "HaD7": 91,
    "HaM": 92,
    "HaM7": 93,
    "Ham": 94,
    "Ham7": 95,
    "Hao": 96,
    "Hao7": 97,
    "Hasus2": 98,
    "Hasus4": 99,
    "HA+": 100,
    "HA/o7": 101,
    "HAD7": 102,
    "HAM": 103,
    "HAM7": 104,
    "HAm": 105,
    "HAm7": 106,
    "HAo": 107,
    "HAo7": 108,
    "HAsus2": 109,
    "HAsus4": 110,
    "Hb+": 111,
    "Hb/o7": 112,
    "HbD7": 113,
    "HbM": 114,
    "HbM7": 115,
    "Hbm": 116,
    "Hbm7": 117,
    "Hbo": 118,
    "Hbo7": 119,
    "Hbsus2": 120,
    "Hbsus4": 121,
    "HB+": 122,
    "HB/o7": 123,
    "HBD7": 124,
    "HBM": 125,
    "HBM7": 126,
    "HBm": 127,
    "HBm7": 128,
    "HBo": 129,
    "HBo7": 130,
    "HBsus2": 131,
    "HBsus4": 132,
    "HNA": 133
}

# Overwrite every value with 0; the indices written in the literal above are
# discarded.
for token_name in init_dict:
    init_dict[token_name] = 0

print(init_dict)

In [None]:
# The starting vocabulary is the list of token names, in dict order.
init_vocabs = list(init_dict)
print(init_vocabs)
print(len(init_vocabs))

In [None]:
# Count every pair of adjacent chords, keyed by the concatenation of the two
# token strings.
bpe_memory = {}
for c_seq in tqdm(chord_seq):
    for left, right in zip(c_seq, c_seq[1:]):
        adj_chord = left + right
        bpe_memory[adj_chord] = bpe_memory.get(adj_chord, 0) + 1

# Re-order from most to least frequent; the sort is stable, so ties keep
# their first-seen order.
bpe_memory = dict(
    sorted(bpe_memory.items(), key=lambda pair_count: pair_count[1], reverse=True)
)
print(bpe_memory)

# The first key is now the most frequent pair: the next vocabulary entry.
new_vocab = list(bpe_memory)[0]
print(new_vocab)

if new_vocab not in init_vocabs:
    init_vocabs.append(new_vocab)
else:
    print("ERROR EXIST")


In [None]:
# Show the current vocabulary and its size.
print(init_vocabs)
print(len(init_vocabs))

In [None]:
# Merge every occurrence of the new vocabulary entry in the existing sequences.
print(chord_seq[0])
update_data = []
for c_seq in chord_seq:
    update_chord = []
    seq_len = len(c_seq)
    idx = 0
    # Greedy left-to-right scan. Starting at index 0 (rather than comparing
    # from index 1 backwards) means a one-chord sequence is copied through
    # instead of being dropped.
    while idx < seq_len:
        if idx + 1 < seq_len and c_seq[idx] + c_seq[idx + 1] == new_vocab:
            # The two adjacent chords form the new entry: emit it once and
            # skip both of them.
            update_chord.append(new_vocab)
            idx += 2
        else:
            update_chord.append(c_seq[idx])
            idx += 1
    update_data.append(update_chord)

print(update_data[0])

In [None]:
# Continue from the merged sequences and recount adjacent pairs, keyed by the
# concatenation of the two token strings.
chord_seq = update_data
bpe_memory = {}
for c_seq in tqdm(chord_seq):
    for left, right in zip(c_seq, c_seq[1:]):
        adj_chord = left + right
        bpe_memory[adj_chord] = bpe_memory.get(adj_chord, 0) + 1

# Re-order from most to least frequent; the sort is stable, so ties keep
# their first-seen order.
bpe_memory = dict(
    sorted(bpe_memory.items(), key=lambda pair_count: pair_count[1], reverse=True)
)
print(bpe_memory)

# The first key is now the most frequent pair: the next vocabulary entry.
new_vocab = list(bpe_memory)[0]
print(new_vocab)

if new_vocab not in init_vocabs:
    init_vocabs.append(new_vocab)
else:
    print("ERROR EXIST")


In [None]:
# Show the current vocabulary and its size.
print(init_vocabs)
print(len(init_vocabs))

In [None]:
# Toy check of the greedy merge: fuse each adjacent 'a', 'b' into 'ab',
# scanning left to right.
test = ['a','a','a','b','a','b','a','a','c','a','a','a','b','a']
new_test = []
idx = 0

while idx < len(test):
    if idx + 1 < len(test) and test[idx] + test[idx + 1] == 'ab':
        # Pair found: emit the merged token and skip both elements.
        new_test.append('ab')
        idx += 2
    else:
        new_test.append(test[idx])
        idx += 1
print(new_test)

In [None]:
# NOTE(review): a stray, unfinished assignment (`init_dict = Di`) was removed
# here. `Di` is not defined anywhere in view, so the line raised NameError and
# stopped this cell before the helpers below were defined.

from collections import defaultdict, Counter

def get_stats(vocab):
    """Count adjacent symbol pairs over a frequency-weighted vocabulary.

    Args:
        vocab: Mapping from a whitespace-separated symbol string to its
            frequency.

    Returns:
        A ``defaultdict(int)`` keyed by ``(left, right)`` symbol tuples. Each
        adjacent occurrence of a pair contributes the frequency of the word
        it appears in.
    """
    pair_counts = defaultdict(int)
    for word, freq in vocab.items():
        pieces = word.split()
        # Pair every symbol with its right-hand neighbour.
        for left, right in zip(pieces, pieces[1:]):
            pair_counts[left, right] += freq
    return pair_counts

def merge_vocab(pair, vocab):
    """Merge one symbol pair in every word of a frequency vocabulary.

    Args:
        pair: Two symbols ``(left, right)`` to fuse into ``left + right``.
        vocab: Mapping from a space-separated symbol string to its frequency.

    Returns:
        A new dict in which every whole-symbol occurrence of ``left right``
        is replaced by the fused symbol. If two words become identical after
        merging, their frequencies are added together.
    """
    import re  # local import so the function does not depend on cell order

    bigram = ' '.join(pair)
    replacement = ''.join(pair)
    # Match the bigram only when it is bounded by whitespace or the string
    # edges. A plain substring replace would also fire inside longer symbols,
    # e.g. turning "a bc" into "abc" for the pair ('a', 'b').
    pattern = re.compile(r'(?<!\S)' + re.escape(bigram) + r'(?!\S)')

    new_vocab = {}
    for word, freq in vocab.items():
        # A callable replacement keeps any backslashes in the symbols literal.
        new_word = pattern.sub(lambda _match: replacement, word)
        # Sum on collision instead of overwriting an earlier count.
        new_vocab[new_word] = new_vocab.get(new_word, 0) + freq
    return new_vocab

# Function to update vocabulary with new tokens
def update_vocab(vocab_dict, new_tokens):
    """Append unseen tokens to an id -> token vocabulary, in place.

    Args:
        vocab_dict: Mapping whose keys are integer ids written as strings and
            whose values are tokens. It is modified in place.
        new_tokens: Iterable of candidate tokens. Ids are handed out in
            iteration order, so pass an ordered iterable for reproducible ids.

    Returns:
        The same ``vocab_dict`` object. Each token not already among its
        values is stored under the next free id (``str(largest id + 1)``).
        An empty vocabulary starts numbering at ``"0"``.

    Raises:
        ValueError: If an existing key cannot be converted with ``int``.
    """
    # default=-1 lets an empty vocabulary start at id 0 instead of making
    # max() fail on an empty sequence.
    max_key = max((int(key) for key in vocab_dict), default=-1)
    for token in new_tokens:
        if token not in vocab_dict.values():
            max_key += 1
            vocab_dict[str(max_key)] = token
    return vocab_dict


In [None]:
# Example sequences
# NOTE(review): `flat_seq` is not defined in the visible part of this
# notebook; confirm where it comes from.
sequences = flat_seq

# Create a frequency dictionary from sequences
freq_dict = Counter()
for seq in sequences:
    for chord in seq:
        # Split chord into characters with spaces
        split_chord = ' '.join(chord)
        freq_dict[split_chord] += 1

# Apply BPE
num_merges = 20  # Number of merges you want to perform
for _ in tqdm(range(num_merges)):
    pairs = get_stats(freq_dict)
    if not pairs:
        # Nothing left to merge: every word is a single symbol.
        break
    best = max(pairs, key=pairs.get)
    freq_dict = merge_vocab(best, freq_dict)

# Extract new tokens from the merged vocabulary
new_tokens = set()
for chord in freq_dict.keys():
    new_tokens.update(chord.split())

# Update the initial vocabulary with new tokens.
# Sorted so ids are assigned in the same order on every run; iterating the
# set directly depends on string hashing, which can differ between
# interpreter runs.
vocab = update_vocab(vocab, sorted(new_tokens))
# print(vocab)

# Save the updated vocabulary back to a file
# with open('updated_chord.json', 'w') as f:
#     json.dump(vocab, f, indent=4)

print("Updated Vocabulary:", vocab)


In [None]:
def map_to_new_vocab(sequence, vocab, *, verbose=True):
    """Map the original sequence to the new BPE vocabulary indices.

    Args:
        sequence: Collection of token sequences; it must support ``len``.
        vocab: Iterable of tokens. A token's index is its position in
            iteration order (for a dict, that means its keys).
        verbose: When true (the default), print the debugging trace of the
            inputs, the lookup table and each sequence.

    Returns:
        A tuple ``(mapped_sequences, token_to_index)``: the sequences with
        every token replaced by its index, and the token -> index table.

    Raises:
        KeyError: If a sequence contains a token that is not in ``vocab``.
    """
    if verbose:
        print("Seq")
        print(len(sequence))
        # Guarded so an empty input does not fail on sequence[0].
        if len(sequence) > 0:
            print(sequence[0])
        print("Vocab")
        print(len(vocab))
        print(type(vocab))
        print(vocab)
        print("T to Idx")

    token_to_index = {token: idx for idx, token in enumerate(vocab)}
    if verbose:
        print(token_to_index)

    mapped_sequences = []
    for seq in sequence:
        if verbose:
            print("Inside for")
            print(seq)
        mapped_sequences.append([token_to_index[token] for token in seq])
    return mapped_sequences, token_to_index


# NOTE(review): `encoded_sequences` is not defined in the visible part of this
# notebook; confirm where it comes from before running this cell.
mapped_sequences, token_to_index = map_to_new_vocab(encoded_sequences, vocab)

In [None]:
# Scratch check of merging two dicts; the right-hand dict wins on shared keys
# (none are shared here).
a = {'HC/o7': 0, 'HB/o7': 1, 'HBsus4': 2}
b = {'asdf': 2, 'ad': 5}
c = {**a, **b}
print(c)

In [None]:
# Create the desired JSON format
# NOTE(review): the tokens in `vocab` are never used here; every entry is a
# generated "CHORD<idx>" placeholder. Confirm that this is intended.
index_to_token = {idx: f"CHORD{idx}" for idx in range(len(vocab))}
token_to_index = {label: idx for idx, label in index_to_token.items()}

# Placeholder -> index entries first, then index -> placeholder entries.
combined_vocab = dict(token_to_index)
combined_vocab.update(index_to_token)

# Save to JSON (the integer keys are written out as strings by json).
with open("/workspace/data/vocabs/bpe_chord_vocabs.json", "w") as f:
    json.dump(combined_vocab, f, indent=4)

print("\nCombined Vocabulary JSON:")
print(json.dumps(combined_vocab, indent=4))

In [None]:
def combine_dicts_remove_duplicates(dict1, dict2):
    """Merge two dicts, then drop every key both map to an equal value.

    Args:
        dict1: First mapping.
        dict2: Second mapping; its value wins when a key appears in both
            with different values.

    Returns:
        A new dict holding the merged entries, minus every key for which
        ``dict1[key] == dict2[key]``. Values are compared with ``==``, so
        unhashable values such as lists or dicts are supported.
    """
    # Combine the dictionaries (dict2 overrides dict1 on shared keys).
    merged = {**dict1, **dict2}

    # Keep everything except the entries that are identical in both inputs.
    return {
        key: value
        for key, value in merged.items()
        if not (key in dict1 and key in dict2 and dict1[key] == dict2[key])
    }


dict1 = {'a': 1, 'b': 7, 'c': 3}
dict2 = {'b': 2, 'c': 4, 'd': 5}

combined_dict = combine_dicts_remove_duplicates(dict1, dict2)
print(combined_dict)
# Output: {'a': 1, 'b': 2, 'c': 4, 'd': 5}
# (no key has the same value in both inputs, so nothing is removed)


In [None]:
# Load the saved BPE chord vocabulary and display it.
bpe_path = '/workspace/pj/exp/bpe_chord_vocab_1000.json'
with open(bpe_path) as file:
    bpe_vocab = json.load(file)
print(bpe_vocab)

In [None]:
# Number of entries in the loaded BPE vocabulary.
print(len(bpe_vocab))

# Create Vocab JSON

In [105]:
import json

# Load the token list and keep only its first 1000 entries.
bpe_path = '/workspace/pj/exp/bpe_chord_vocab_20000.json'
with open(bpe_path) as file:
    tokens = json.load(file)
tokens = tokens[:1000]

# Forward mapping: each token -> its position in the list.
vocab = dict(zip(tokens, range(len(tokens))))

# Reverse mapping (position -> token), stored in the same dictionary.
vocab.update(enumerate(tokens))

# Write both mappings out as one JSON object.
with open('/workspace/pj/data/vocabs/chord_bpe_1000.json', 'w') as json_file:
    json.dump(vocab, json_file, indent=4)

print("Vocabulary JSON file created successfully.")


Vocabulary JSON file created successfully.
