<a href="https://colab.research.google.com/github/nirkoren1/Chord_Transformer/blob/master/chord_tokenizers.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pickle5
!pip uninstall -y -q tensorflow keras tensorflow-estimator tensorflow-text
!pip install -q -U tensorflow-text tensorflow

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pickle5
import tensorflow_text as text
from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
import tensorflow as tf
import re
import pathlib

In [None]:
# Load the pickled song table from Google Drive and display it.
# NOTE(review): unpickling can execute arbitrary code, so only load a pickle
# file that comes from a trusted source.
data_path = "/content/drive/MyDrive/Colab Notebooks/data.pickle"
with open(data_path, mode='rb') as pickle_file:
  df = pickle5.load(pickle_file)
df

Unnamed: 0,genres,popularity,chords,training_data
0,"[canadian pop, pop, post-teen pop]",100,"[G, G/B, B, C, G, G, G/B, B, C, G, G, Em, C, G...","[<start-genres>, canadian pop, pop, post-teen ..."
1,"[canadian pop, pop, post-teen pop]",100,"[F#m7, D2, F#m7, D2, F#m7, D2, E, F#m7, A/C#, ...","[<start-genres>, canadian pop, pop, post-teen ..."
2,"[canadian pop, pop, post-teen pop]",100,"[Em, D, C, C, D, Em, Em, D, C, C, D, Em, Em, D...","[<start-genres>, canadian pop, pop, post-teen ..."
3,"[canadian pop, pop, post-teen pop]",100,"[Em, Bm, Am, C, Em, Bm, Am, C, Em, Bm, Am, C, ...","[<start-genres>, canadian pop, pop, post-teen ..."
4,"[canadian pop, pop, post-teen pop]",100,"[Gm, Dm, C, C, Gm, Dm, C, C, Gm, Dm, C, C, Gm,...","[<start-genres>, canadian pop, pop, post-teen ..."
...,...,...,...,...
135777,"[classic rock, folk, folk rock, melancholia, m...",76,"[C, Am, F, G, C, Am, F, G, C, Am, F, G, C, Am,...","[<start-genres>, classic rock, folk, folk rock..."
135778,"[classic rock, folk, folk rock, melancholia, m...",76,"[B, E, B, E, B, E, F#, E, B, F#, E, B, E, B, E...","[<start-genres>, classic rock, folk, folk rock..."
135780,"[classic rock, folk, folk rock, melancholia, m...",76,"[Dm, C, Dm, F, Bb, F, Bb, F, Bb, F, Dm, F, C, ...","[<start-genres>, classic rock, folk, folk rock..."
135781,"[classic rock, folk, folk rock, melancholia, m...",76,"[G, Em, C, G, G, Am, C, D, G, Em, C, G, G, Am,...","[<start-genres>, classic rock, folk, folk rock..."


In [None]:
# Collapse each row's list of tokens into one space-separated string, giving
# one string per row for the chords and one per row for the genres.
# `str.join` replaces the manual "append a space, then cut the last character"
# loop: it yields the same text (an empty list still gives "") without the
# repeated string re-allocation, and iterating the columns directly avoids
# building a Series per row with `iterrows()`.
train_ch = [" ".join(chords) for chords in df["chords"]]
train_ge = [" ".join(genres) for genres in df["genres"]]

In [None]:
# Turn both lists of strings into `tf.data` datasets of scalar strings.
# NOTE(review): `train_ch` / `train_ge` are rebound from lists to datasets
# here, so this cell presumably cannot be re-run without first re-running the
# cell that builds the lists — confirm before re-executing out of order.
train_ch, train_ge = (
    tf.data.Dataset.from_tensor_slices(joined_rows)
    for joined_rows in (train_ch, train_ge)
)
train_ch

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.string, name=None)>

In [None]:
# Settings shared by both vocabularies: the tokenizer lower-cases its input and
# these four reserved tokens are required in every vocabulary.
bert_tokenizer_params = {"lower_case": True}
reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"]

# Vocabulary-learning arguments for the chord strings.
bert_vocab_args_chords = {
    # The target vocabulary size
    "vocab_size": 1000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

# Vocabulary-learning arguments for the genre strings (larger target size).
bert_vocab_args_genres = {
    # The target vocabulary size
    "vocab_size": 8000,
    # Reserved tokens that must be included in the vocabulary
    "reserved_tokens": reserved_tokens,
    # Arguments for `text.BertTokenizer`
    "bert_tokenizer_params": bert_tokenizer_params,
    # Arguments for `wordpiece_vocab.wordpiece_tokenizer_learner_lib.learn`
    "learn_params": {},
}

In [None]:
# Learn the genre vocabulary from the genre strings, fed to the learner in
# batches of 1000 with up to 2 batches prefetched.
ge_batches = train_ge.batch(1000).prefetch(2)
ge_vocab = bert_vocab.bert_vocab_from_dataset(ge_batches, **bert_vocab_args_genres)

In [None]:
def write_vocab_file(filepath, vocab):
  """Write a vocabulary to a text file, one token per line.

  Args:
    filepath: Destination path; an existing file is overwritten.
    vocab: Iterable of tokens, written in iteration order.
  """
  # Pin the encoding so the file's bytes do not depend on the platform's
  # default locale encoding.
  with open(filepath, 'w', encoding='utf-8') as f:
    for token in vocab:
      print(token, file=f)
# Persist the learned genre vocabulary next to the notebook.
write_vocab_file(filepath='ge_vocab.txt', vocab=ge_vocab)

In [None]:
# Learn the chord vocabulary from the chord strings, fed to the learner in
# batches of 1000 with up to 2 batches prefetched.
ch_batches = train_ch.batch(1000).prefetch(2)
ch_vocab = bert_vocab.bert_vocab_from_dataset(ch_batches, **bert_vocab_args_chords)

In [None]:
# Persist the learned chord vocabulary next to the notebook.
write_vocab_file(filepath='ch_vocab.txt', vocab=ch_vocab)

In [None]:
# Build one BertTokenizer per vocabulary file (genres first, then chords),
# both with the shared tokenizer parameters.
ge_tokenizer, ch_tokenizer = (
    text.BertTokenizer(vocab_file, **bert_tokenizer_params)
    for vocab_file in ('ge_vocab.txt', 'ch_vocab.txt')
)

In [None]:
# Positions of the "[START]" and "[END]" markers inside `reserved_tokens`.
# NOTE(review): these are used as token ids below, which assumes the reserved
# tokens occupy the first vocabulary slots in this order — confirm.
START = tf.argmax(tf.math.equal(tf.constant(reserved_tokens), "[START]"))
END = tf.argmax(tf.math.equal(tf.constant(reserved_tokens), "[END]"))

def add_start_end(ragged):
  """Surround every row of `ragged` with the START and END ids.

  Args:
    ragged: Tensor of token ids exposing `bounding_shape()`, with one row per
      sequence.

  Returns:
    The concatenation along axis 1 of a START column, `ragged`, and an END
    column.
  """
  n_rows = ragged.bounding_shape()[0]
  column_shape = [n_rows, 1]
  start_column = tf.fill(column_shape, START)
  end_column = tf.fill(column_shape, END)
  return tf.concat([start_column, ragged, end_column], axis=1)


In [None]:
def cleanup_text(reserved_tokens, token_txt):
  """Remove reserved tokens from token strings and join the rest with spaces.

  Args:
    reserved_tokens: Iterable of reserved token strings; every one except
      "[UNK]" is removed from the output.
    token_txt: String tensor of tokens; the last axis is joined into a single
      string.

  Returns:
    A string tensor with the surviving tokens of each row joined by ' '.
  """
  # "[UNK]" is deliberately kept; every other reserved token is escaped so it
  # is matched literally by the regular expression.
  droppable = (tok for tok in reserved_tokens if tok != "[UNK]")
  drop_pattern = "|".join(map(re.escape, droppable))

  # Mask out the cells that are exactly one of the droppable tokens.
  is_reserved = tf.strings.regex_full_match(token_txt, drop_pattern)
  kept_tokens = tf.ragged.boolean_mask(token_txt, ~is_reserved)

  # Join them into strings.
  return tf.strings.reduce_join(kept_tokens, separator=' ', axis=-1)


In [None]:
class CustomTokenizer(tf.Module):
  """Exportable wrapper around a `text.BertTokenizer` built from a vocab file.

  Exposes `tokenize`, `detokenize`, `lookup` and three `get_*` accessors as
  `tf.function`s, and traces them in `__init__` for the input signatures
  listed there.
  """

  def __init__(self, reserved_tokens, vocab_path):
    """Build the tokenizer and trace the exported functions.

    Args:
      reserved_tokens: List of reserved token strings; dropped (except
        "[UNK]") when detokenizing and returned by `get_reserved_tokens`.
      vocab_path: Path to a text file holding one vocabulary token per line.
    """
    # NOTE(review): `super().__init__()` is not called here; confirm nothing
    # relies on attributes that `tf.Module.__init__` would set.
    self.tokenizer = text.BertTokenizer(vocab_path, lower_case=True)
    self._reserved_tokens = reserved_tokens
    self._vocab_path = tf.saved_model.Asset(vocab_path)

    # Read explicitly as UTF-8 so the in-memory vocabulary does not depend on
    # the platform's default locale encoding.
    vocab = pathlib.Path(vocab_path).read_text(encoding="utf-8").splitlines()
    self.vocab = tf.Variable(vocab)

    ## Create the signatures for export:

    # Include a tokenize signature for a batch of strings.
    self.tokenize.get_concrete_function(
        tf.TensorSpec(shape=[None], dtype=tf.string))

    # Include `detokenize` and `lookup` signatures for:
    #   * `Tensors` with shape [batch, tokens]
    #   * `RaggedTensors` with shape [batch, tokens]
    self.detokenize.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.detokenize.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    self.lookup.get_concrete_function(
        tf.TensorSpec(shape=[None, None], dtype=tf.int64))
    self.lookup.get_concrete_function(
        tf.RaggedTensorSpec(shape=[None, None], dtype=tf.int64))

    # These `get_*` methods take no arguments
    self.get_vocab_size.get_concrete_function()
    self.get_vocab_path.get_concrete_function()
    self.get_reserved_tokens.get_concrete_function()

  @tf.function
  def tokenize(self, strings):
    """Tokenize a batch of strings and wrap each row with start/end ids.

    Args:
      strings: String tensor of shape [batch].

    Returns:
      The tokenizer output with its last two axes merged, passed through
      `add_start_end`.
    """
    enc = self.tokenizer.tokenize(strings)
    # Merge the `word` and `word-piece` axes.
    enc = enc.merge_dims(-2,-1)
    enc = add_start_end(enc)
    return enc

  @tf.function
  def detokenize(self, tokenized):
    """Convert token ids back to text, dropping reserved tokens.

    Args:
      tokenized: int64 tensor or ragged tensor of token ids, [batch, tokens].

    Returns:
      The result of `cleanup_text` applied to the detokenized words.
    """
    words = self.tokenizer.detokenize(tokenized)
    return cleanup_text(self._reserved_tokens, words)

  @tf.function
  def lookup(self, token_ids):
    """Return the vocabulary strings at the given token ids."""
    return tf.gather(self.vocab, token_ids)

  @tf.function
  def get_vocab_size(self):
    """Return the number of entries in the vocabulary."""
    return tf.shape(self.vocab)[0]

  @tf.function
  def get_vocab_path(self):
    """Return the `tf.saved_model.Asset` that wraps the vocabulary file path."""
    return self._vocab_path

  @tf.function
  def get_reserved_tokens(self):
    """Return the reserved tokens as a constant string tensor."""
    return tf.constant(self._reserved_tokens)


In [None]:
# Group the genre (`ge`) and chord (`ch`) tokenizers under one module so they
# can be handled as a single object.
tokenizers = tf.Module()
tokenizers.ge = CustomTokenizer(reserved_tokens=reserved_tokens, vocab_path='ge_vocab.txt')
tokenizers.ch = CustomTokenizer(reserved_tokens=reserved_tokens, vocab_path='ch_vocab.txt')

In [None]:
# Export both tokenizers as a single SavedModel on Google Drive, then print
# the chord and genre vocabulary sizes.
# NOTE(review): the directory component 'Transformer_project ' ends with a
# space — confirm this matches the actual Drive folder name.
model_name = '/content/drive/MyDrive/Colab Notebooks/Transformer_project /ted_hrlr_translate_ge_ch_converter'
tf.saved_model.save(obj=tokenizers, export_dir=model_name)
for exported_tokenizer in (tokenizers.ch, tokenizers.ge):
  print(exported_tokenizer.get_vocab_size())

tf.Tensor(174, shape=(), dtype=int32)
tf.Tensor(976, shape=(), dtype=int32)
