# Model Use: *Transformer*

# 1. Environement Setup

In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Embedding, LayerNormalization, Dropout, MultiHeadAttention, TimeDistributed, Masking
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import pad_sequences, to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import SparseCategoricalCrossentropy
from transformers import TFAutoModel, AutoTokenizer


import ast
import gc

file_path = 'normalized_output.csv'




  from .autonotebook import tqdm as notebook_tqdm


# 2. Data extraction

In [19]:
# class PositionalEncoding(tf.keras.layers.Layer):
#     def __init__(self, position, d_model):
#         super(PositionalEncoding, self).__init__()
#         self.pos_encoding = self.positional_encoding(position, d_model)

#     def get_angles(self, position, i, d_model):
#         # Ensure all operations are done using float32 data type
#         position = tf.cast(position, dtype=tf.float32)
#         i = tf.cast(i, dtype=tf.float32)
#         d_model = tf.cast(d_model, dtype=tf.float32)
#         angle_rates = 1 / tf.pow(10000, (2 * (i // 2)) / d_model)
#         return position * angle_rates

#     def positional_encoding(self, position, d_model):
#         angle_rads = self.get_angles(
#             position=tf.range(position, dtype=tf.float32)[:, tf.newaxis],
#             i=tf.range(d_model, dtype=tf.float32)[tf.newaxis, :],
#             d_model=d_model)
#         # apply sin to even indices in the array; 2i
#         sines = tf.math.sin(angle_rads[:, 0::2])
#         # apply cos to odd indices in the array; 2i+1
#         cosines = tf.math.cos(angle_rads[:, 1::2])
#         pos_encoding = tf.concat([sines, cosines], axis=-1)
#         pos_encoding = pos_encoding[tf.newaxis, ...]
#         return tf.cast(pos_encoding, tf.float32)

#     def call(self, inputs):
#         # Ensure inputs are float32 before adding positional encoding
#         inputs = tf.cast(inputs, tf.float32)
#         return inputs + self.pos_encoding[:, :tf.shape(inputs)[1], :]


In [2]:
# Load the data
data = pd.read_csv(file_path)

# Convert string representations of lists to actual lists
data['ph_seq_encoded'] = data['ph_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['ph_dur'] = data['ph_dur'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['f0_seq'] = data['f0_seq'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else x)  # Handle NaN for f0_seq
data['note_seq_encoded'] = data['note_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Normalize the float arrays
scaler_ph_dur = MinMaxScaler(feature_range=(0, 1))
# Normalize ph_dur assuming they are already in the correct format
data['ph_dur'] = [scaler_ph_dur.fit_transform(np.array(seq).reshape(-1, 1)).flatten() for seq in data['ph_dur']]

# Only normalize f0_seq if it's not NaN
scaler_f0_seq = MinMaxScaler(feature_range=(0, 1))
data['f0_seq'] = [scaler_f0_seq.fit_transform(np.array(seq).reshape(-1, 1)).flatten() if seq is not np.nan else np.nan for seq in data['f0_seq']]

# Find the maximum sequence length across all sequence columns
max_sequence_length = max(
    max(data['ph_seq_encoded'].apply(len)),
    max(data['ph_dur'].apply(len)),
    max([len(seq) for seq in data['f0_seq'] if seq is not np.nan]),  # Only consider non-NaN sequences
    max(data['note_seq_encoded'].apply(len))
)

# Pad the sequences
data['ph_seq_encoded'] = pad_sequences(data['ph_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()
data['ph_dur'] = pad_sequences(data['ph_dur'], maxlen=max_sequence_length, padding='post', dtype='float').tolist()
data['f0_seq'] = [pad_sequences([seq], maxlen=max_sequence_length, padding='post', dtype='float').flatten() if seq is not np.nan else np.full(max_sequence_length, np.nan) for seq in data['f0_seq']]
data['note_seq_encoded'] = pad_sequences(data['note_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()

# Flatten all sequences into a single list
all_ph_seq = [item for sublist in data['ph_seq_encoded'] for item in sublist]

# Convert to numpy array and reshape to be 2D
all_ph_seq_array = np.array(all_ph_seq).reshape(-1, 1)
del all_ph_seq

# Determine the number of categories for notes
num_note_categories = max(data['note_seq_encoded'].apply(max)) + 1  # Assuming the sequences are zero-indexed

# Determine the number of unique phonemes and notes for embeddings
num_phonemes = len(np.unique(all_ph_seq_array))
num_notes = num_note_categories  # from your previous code

# Prepare the dataset for the transformer
X_phonemes = np.array(data['ph_seq_encoded'].tolist())
y_ph_dur = np.array(data['ph_dur'].tolist())
y_notes = np.array(data['note_seq_encoded'].tolist())
y_f0_seq = np.array(data['f0_seq'].tolist())

# Ensure that all arrays have three dimensions
y_ph_dur = y_ph_dur[..., np.newaxis]  # shape (num_sequences, sequence_length, 1)

# Split into inputs and targets
X_train, X_test, y_ph_dur_train, y_ph_dur_test, y_notes_train, y_notes_test, y_f0_seq_train, y_f0_seq_test = train_test_split(
    X_phonemes, y_ph_dur, y_notes, y_f0_seq, test_size=0.2, random_state=42
)

# Convert f0_seq to a masked array for handling NaN values in the training data
mask_value = -999  # This should be a value that doesn't naturally occur in the dataset
y_f0_seq_train_masked = np.array([np.where(np.isnan(seq), mask_value, seq) for seq in y_f0_seq_train])
y_f0_seq_test_masked = np.array([np.where(np.isnan(seq), mask_value, seq) for seq in y_f0_seq_test])


In [None]:
# Free some memory here
# del all_ph_seq_array
# del all_ph_seq
# del data
# del y_ph_dur
# gc.collect()

# y_ph_dur_train

# 3. Model Training

In [5]:
from transformers import TFBertModel, BertTokenizer
from tqdm import tqdm  # for a progress bar

# Load the data
d = pd.read_csv(file_path)

# Convert string representations of lists to actual lists
d['ph_seq_encoded'] = d['ph_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
# Initialize the BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Define your input placeholders
# Adjust these based on how you preprocess your data for BERT
input_ids = tf.keras.Input(shape=(max_sequence_length,), dtype='int32', name="input_ids")
attention_mask = tf.keras.Input(shape=(max_sequence_length,), dtype='int32', name="attention_mask")

# Define the BERT model as part of your Keras model
bert_output = bert_model(input_ids, attention_mask=attention_mask)[0]

# Continue with your custom layers
transformer_output = tf.keras.layers.Dense(64, activation='relu')(bert_output)

# Output layers for each prediction
ph_dur_output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='linear'), name='ph_dur_output')(transformer_output)
f0_seq_output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(1, activation='linear'), name='f0_seq_output')(transformer_output)
note_output = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(num_notes), name='note_output')(transformer_output)

# Compile the model
model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=[ph_dur_output, f0_seq_output, note_output])
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss={
        'ph_dur_output': 'mean_squared_error',
        'f0_seq_output': 'mean_squared_error',
        'note_output': tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
    },
    metrics=['accuracy']
)

# Create a TensorFlow session
with tf.compat.v1.Session() as sess:
    # Initialize all variables
    sess.run(tf.compat.v1.global_variables_initializer())

    # Define the number of epochs for training
    num_epochs = 10
    # Your training loop here
    for epoch in range(num_epochs):
        print("Epoch:", epoch)

        # Shuffle your training data if necessary
        # Example: X_train, y_ph_dur_train, y_f0_seq_train, y_notes_train = shuffle_data(X_train, y_ph_dur_train, y_f0_seq_train, y_notes_train)

        # Iterate over batches of data
        for i in tqdm(range(0, len(X_train), 32)):
            # Extract batch of text data
            batch_texts = d['ph_seq_encoded'][i:i + 32]

            # Tokenize the text batch
            encoding = tokenizer.batch_encode_plus(batch_texts, 
                                                max_length=max_sequence_length, 
                                                padding='max_length', 
                                                truncation=True,
                                                return_tensors='tf')

            # Extract input IDs and attention masks from the encoding
            input_ids = encoding['input_ids']
            attention_masks = encoding['attention_mask']

            # Extract corresponding targets for the batch
            batch_ph_dur = y_ph_dur_train[i:i + 32]
            batch_f0_seq = y_f0_seq_train[i:i + 32]
            batch_notes = y_notes_train[i:i + 32]

            # Train on the batch
            model.train_on_batch([input_ids, attention_masks], [batch_ph_dur, batch_f0_seq, batch_notes])
    # Save the trained model
    saver = tf.compat.v1.train.Saver()
    saver.save(sess, 'model_transformer.ckpt')

# import joblib

# # Save the scalers and encoders
# joblib.dump(scaler_ph_dur, 'scaler_ph_dur.pkl')
# joblib.dump(scaler_f0_seq, 'scaler_f0_seq.pkl')
# joblib.dump(encoder, 'ph_seq_encoder.pkl')

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

Epoch: 0


  0%|          | 0/101 [00:00<?, ?it/s]


ValueError: too many values to unpack (expected 2)

# 4. Validation
Put user input and get output

### 4.1 Input

In [None]:
import json

# Load the encoding dictionary from the ph_token_to_int.json file
with open('ph_token_to_int.json', 'r') as file:
    ph_token_to_int = json.load(file)

# Your input sequence
input_sequence = "AP n ei f a g e n a j i f u y u a p u AP n ei f a g e n a l e y u d ao en AP"

# Split the input sequence into individual phonemes
input_phonemes = input_sequence.split()

# Convert the phonemes to their corresponding integers using the encoding dictionary
encoded_sequence = [ph_token_to_int[phoneme] for phoneme in input_phonemes]

# Convert the sequence to a numpy array and pad it to the right length
new_ph_seq_encoded = np.array([encoded_sequence])  # wrapping in a list to create a batch dimension
new_ph_seq_encoded_padded = pad_sequences(new_ph_seq_encoded, maxlen=max_sequence_length, padding='post')

print(encoded_sequence)


### 4.2 Inference

In [None]:
predictions = model.predict(user_input_padded)


### 4.3 Decoding

In [None]:
predicted_ph_dur, predicted_f0_seq, predicted_note_seq_encoded = predictions

predicted_ph_dur_2d = predicted_ph_dur.reshape(-1, 1)

# Apply inverse transformation
decoded_ph_dur_2d = scaler_ph_dur.inverse_transform(predicted_ph_dur_2d)
# print(predicted_ph_dur_2d)

# Reshape it back to the original shape if needed
decoded_ph_dur = decoded_ph_dur_2d.reshape(-1, max_sequence_length)



predicted_f0_seq_2d = predicted_f0_seq.reshape(-1, 1)

# Reverse normalization for 'f0_seq', if it's not all NaNs
if not np.isnan(y_f0_seq_train).all():
    decoded_f0_seq_2d = scaler_f0_seq.inverse_transform(predicted_f0_seq_2d)
else:
    decoded_f0_seq_2d = None  # or a placeholder value if f0_seq was not predicted
    
decoded_f0_seq = decoded_f0_seq_2d.reshape(-1, max_sequence_length)

# Convert predicted probabilities for 'note_seq_encoded' back to category indices
decoded_note_seq_encoded = np.argmax(predicted_note_seq_encoded, axis=-1)  # If the last dimension contains the category probabilities

# If the decoded sequences are padded, you may want to trim the padding off. For example:
trim_padding = lambda seq, mask: seq[:np.where(seq == mask)[0][0] if np.where(seq == mask)[0].size > 0 else None]
# Now you can trim the padding if your original sequences were padded
decoded_ph_dur_trimmed = [seq[seq != mask_value] for seq in decoded_ph_dur]
if decoded_f0_seq is not None:
    decoded_f0_seq_trimmed = [trim_padding(seq, mask_value) for seq in decoded_f0_seq]
decoded_note_seq_encoded_trimmed = [trim_padding(seq, mask_value) for seq in decoded_note_seq_encoded]

# Now you can print or return the decoded predictions
print("Decoded ph_dur:", decoded_ph_dur_trimmed)
print(len(decoded_ph_dur_trimmed[0]))
if decoded_f0_seq is not None:
    print("Decoded f0_seq:", decoded_f0_seq_trimmed)
    print(len(decoded_f0_seq_trimmed[0]))
print("Decoded note_seq_encoded:", decoded_note_seq_encoded_trimmed)
print(len(decoded_note_seq_encoded_trimmed[0]))

### 4.4 Removing zeros

In [None]:
f0_timestep = 0.005
f0_constant = 205.1
def remove_trailing_zeros(sequence):
    # Find the last non-zero element in the sequence
    last_non_zero = -1
    for i in range(len(sequence) - 1, -1, -1):
        if sequence[i] != 0:
            last_non_zero = i
            break
    # Slice the sequence to remove trailing zeros
    return sequence[:last_non_zero + 1] if last_non_zero != -1 else sequence

# Apply this function to each of your predicted sequences
decoded_ph_dur_no_zeros = remove_trailing_zeros(decoded_ph_dur_trimmed[0])
decoded_f0_seq_no_zeros = remove_trailing_zeros(decoded_f0_seq_trimmed[0])
decoded_note_seq_encoded_no_zeros = remove_trailing_zeros(decoded_note_seq_encoded_trimmed[0])

if len(decoded_ph_dur_no_zeros) > len(input_phonemes):
        # Truncate the sequence if it's longer
    decoded_ph_dur_no_zeros = decoded_ph_dur_no_zeros[:len(input_phonemes)]
elif len(decoded_ph_dur_no_zeros) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    decoded_ph_dur_no_zeros = np.pad(decoded_ph_dur_no_zeros, (0, len(input_phonemes) - len(decoded_ph_dur_no_zeros)), 'constant')

total_time = sum(decoded_ph_dur_no_zeros)

print(total_time)
f0_size = int(total_time / f0_timestep)
print(f0_size)
# Replace non-positive values with the specified constant
decoded_f0_seq_no_zeros = [x if x > 0 else f0_constant for x in decoded_f0_seq_no_zeros]

# Pad sequence with the constant value if it's shorter than the target length
if len(decoded_f0_seq_no_zeros) < f0_size:
    decoded_f0_seq_no_zeros = np.pad(decoded_f0_seq_no_zeros, (0, f0_size - len(decoded_f0_seq_no_zeros)), 'constant', constant_values=(f0_constant,))

# Or truncate if it's longer than the target length (just for safety)
elif len(decoded_f0_seq_no_zeros) > f0_size:
    decoded_f0_seq_no_zeros = decoded_f0_seq_no_zeros[:f0_size]

# Now you have your sequences with trailing zeros removed
print("Decoded ph_dur without trailing zeros:", decoded_ph_dur_no_zeros)
print(len(decoded_ph_dur_no_zeros))
if decoded_f0_seq_no_zeros is not None:
    print("Decoded f0_seq without trailing zeros:", decoded_f0_seq_no_zeros)
    print(len(decoded_f0_seq_no_zeros))
print("Decoded note_seq_encoded without trailing zeros:", decoded_note_seq_encoded_no_zeros)
print(len(decoded_note_seq_encoded_no_zeros))
print(len(input_phonemes))

### 4.5 Make it .ds file

In [None]:
# use input_phonemes

# Load the token-to-int mappings from the JSON files
def load_mapping(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        token_to_int = json.load(f)
    # Invert the dictionary to create an int-to-token mapping
    int_to_token = {v: k for k, v in token_to_int.items()}
    return int_to_token

# Decoding function using the mappings
def decode_predictions(prediction_integers, mapping):
    return [mapping.get(i, 'Unknown') for i in prediction_integers]

note_int_to_token = load_mapping('note_token_to_int.json')

predicted_note_seq_integers = decoded_note_seq_encoded_no_zeros

decoded_note_seq = decode_predictions(predicted_note_seq_integers, note_int_to_token)

# Print or return the decoded sequences
print("Decoded Phonetic Sequence:", input_phonemes)
print("Decoded Note Sequence:", decoded_note_seq)

file = {
    'ph_seq': input_phonemes,
    'ph_dur': decoded_ph_dur_no_zeros,
    'note_seq': decoded_note_seq,
    'f0_seq': decoded_f0_seq_no_zeros
}

with open('rnn_output.ds', 'w') as json_file:
    json.dump(file, json_file, indent=4)