# Model Use: *Transformer*

# 1. Environement Setup

In [4]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Layer, MultiHeadAttention, Dropout, LayerNormalization, Dense


file_path = 'normalized_output.csv'

# 2. Data processing

In [3]:
# Load the data
data = pd.read_csv(file_path)

# Function to convert string representation of a list to an actual list
def convert_string_to_list(string):
    return np.fromstring(string[1:-1], sep=',')  # Assumes the format "[1, 2, 3]"

# Assuming 'ph_seq_encoded' and 'note_seq_encoded' are strings representing lists
X = np.array([convert_string_to_list(row) for row in data['ph_seq_encoded']])
y_note = np.array([convert_string_to_list(row) for row in data['note_seq_encoded']])
y_dur = np.array([convert_string_to_list(row) for row in data['ph_dur']])

# Reshape the arrays
X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))
y_note_reshaped = y_note.reshape((y_note.shape[0], y_note.shape[1], 1))
y_dur_reshaped = y_dur.reshape((y_dur.shape[0], y_dur.shape[1], 1))

# print(X_reshaped[0].dtype, y_note_reshaped[0].dtype, y_dur_reshaped[0].dtype)

# Split the data into training and test sets
X_train, X_test, y_note_train, y_note_test, y_dur_train, y_dur_test = train_test_split(
    X_reshaped, y_note_reshaped, y_note_reshaped, test_size=0.3, random_state=42)

# 3. Model Training

In [10]:
class TransformerEncoder(Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerEncoder, self).__init__()
        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim),]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    
def create_transformer_model(input_shape, num_heads, ff_dim, num_transformer_blocks, y1_shape, y2_shape):
    inputs = Input(shape=input_shape)
    x = inputs
    for _ in range(num_transformer_blocks):
        x = TransformerEncoder(ff_dim=ff_dim, num_heads=num_heads, embed_dim=input_shape[-1])(x)
    
    # Define two different output layers
    output1 = Dense(y1_shape[-1], activation='softmax')(x)  # Adjust based on your output shape and type for y1
    output2 = Dense(y2_shape[-1], activation='linear')(x)  # Adjust for y2

    model = Model(inputs=inputs, outputs=[output1, output2])
    return model

# Define model parameters and create the model
model = create_transformer_model(input_shape=X_train.shape[1:], num_heads=4, ff_dim=32, num_transformer_blocks=2, y1_shape=y_note_train.shape[1:], y2_shape=y_dur_train.shape[1:])

In [12]:
model.compile(optimizer='adam', loss=['categorical_crossentropy', 'mean_squared_error'], metrics=['accuracy'])

# Train the model
history = model.fit(X_train, [y_note_train, y_dur_train], validation_data=(X_test, [y_note_train, y_dur_train]), epochs=10, batch_size=32)

Epoch 1/10




  return dispatch_target(*args, **kwargs)




ValueError: Data cardinality is ambiguous:
  x sizes: 1207
  y sizes: 2815, 2815
Make sure all arrays contain the same number of samples.

# 4. Validation
Put user input and get output

### 4.1 Input

In [None]:
import json

# Load the encoding dictionary from the ph_token_to_int.json file
with open('ph_token_to_int.json', 'r') as file:
    ph_token_to_int = json.load(file)

# Your input sequence
input_sequence = "AP n ei f a g e n a j i f u y u a p u AP n ei f a g e n a l e y u d ao en AP"

# Split the input sequence into individual phonemes
input_phonemes = input_sequence.split()

# Convert the phonemes to their corresponding integers using the encoding dictionary
encoded_sequence = [ph_token_to_int[phoneme] for phoneme in input_phonemes]

# Convert the sequence to a numpy array and pad it to the right length
new_ph_seq_encoded = np.array([encoded_sequence])  # wrapping in a list to create a batch dimension
new_ph_seq_encoded_padded = pad_sequences(new_ph_seq_encoded, maxlen=max_sequence_length, padding='post')

print(encoded_sequence)


### 4.2 Inference

In [None]:
predictions = model.predict(user_input_padded)


### 4.3 Decoding

In [None]:
predicted_ph_dur, predicted_f0_seq, predicted_note_seq_encoded = predictions

predicted_ph_dur_2d = predicted_ph_dur.reshape(-1, 1)

# Apply inverse transformation
decoded_ph_dur_2d = scaler_ph_dur.inverse_transform(predicted_ph_dur_2d)
# print(predicted_ph_dur_2d)

# Reshape it back to the original shape if needed
decoded_ph_dur = decoded_ph_dur_2d.reshape(-1, max_sequence_length)



predicted_f0_seq_2d = predicted_f0_seq.reshape(-1, 1)

# Reverse normalization for 'f0_seq', if it's not all NaNs
if not np.isnan(y_f0_seq_train).all():
    decoded_f0_seq_2d = scaler_f0_seq.inverse_transform(predicted_f0_seq_2d)
else:
    decoded_f0_seq_2d = None  # or a placeholder value if f0_seq was not predicted
    
decoded_f0_seq = decoded_f0_seq_2d.reshape(-1, max_sequence_length)

# Convert predicted probabilities for 'note_seq_encoded' back to category indices
decoded_note_seq_encoded = np.argmax(predicted_note_seq_encoded, axis=-1)  # If the last dimension contains the category probabilities

# If the decoded sequences are padded, you may want to trim the padding off. For example:
trim_padding = lambda seq, mask: seq[:np.where(seq == mask)[0][0] if np.where(seq == mask)[0].size > 0 else None]
# Now you can trim the padding if your original sequences were padded
decoded_ph_dur_trimmed = [seq[seq != mask_value] for seq in decoded_ph_dur]
if decoded_f0_seq is not None:
    decoded_f0_seq_trimmed = [trim_padding(seq, mask_value) for seq in decoded_f0_seq]
decoded_note_seq_encoded_trimmed = [trim_padding(seq, mask_value) for seq in decoded_note_seq_encoded]

# Now you can print or return the decoded predictions
print("Decoded ph_dur:", decoded_ph_dur_trimmed)
print(len(decoded_ph_dur_trimmed[0]))
if decoded_f0_seq is not None:
    print("Decoded f0_seq:", decoded_f0_seq_trimmed)
    print(len(decoded_f0_seq_trimmed[0]))
print("Decoded note_seq_encoded:", decoded_note_seq_encoded_trimmed)
print(len(decoded_note_seq_encoded_trimmed[0]))

### 4.4 Removing zeros

In [None]:
f0_timestep = 0.005
f0_constant = 205.1
def remove_trailing_zeros(sequence):
    # Find the last non-zero element in the sequence
    last_non_zero = -1
    for i in range(len(sequence) - 1, -1, -1):
        if sequence[i] != 0:
            last_non_zero = i
            break
    # Slice the sequence to remove trailing zeros
    return sequence[:last_non_zero + 1] if last_non_zero != -1 else sequence

# Apply this function to each of your predicted sequences
decoded_ph_dur_no_zeros = remove_trailing_zeros(decoded_ph_dur_trimmed[0])
decoded_f0_seq_no_zeros = remove_trailing_zeros(decoded_f0_seq_trimmed[0])
decoded_note_seq_encoded_no_zeros = remove_trailing_zeros(decoded_note_seq_encoded_trimmed[0])

if len(decoded_ph_dur_no_zeros) > len(input_phonemes):
        # Truncate the sequence if it's longer
    decoded_ph_dur_no_zeros = decoded_ph_dur_no_zeros[:len(input_phonemes)]
elif len(decoded_ph_dur_no_zeros) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    decoded_ph_dur_no_zeros = np.pad(decoded_ph_dur_no_zeros, (0, len(input_phonemes) - len(decoded_ph_dur_no_zeros)), 'constant')

total_time = sum(decoded_ph_dur_no_zeros)

print(total_time)
f0_size = int(total_time / f0_timestep)
print(f0_size)
# Replace non-positive values with the specified constant
decoded_f0_seq_no_zeros = [x if x > 0 else f0_constant for x in decoded_f0_seq_no_zeros]

# Pad sequence with the constant value if it's shorter than the target length
if len(decoded_f0_seq_no_zeros) < f0_size:
    decoded_f0_seq_no_zeros = np.pad(decoded_f0_seq_no_zeros, (0, f0_size - len(decoded_f0_seq_no_zeros)), 'constant', constant_values=(f0_constant,))

# Or truncate if it's longer than the target length (just for safety)
elif len(decoded_f0_seq_no_zeros) > f0_size:
    decoded_f0_seq_no_zeros = decoded_f0_seq_no_zeros[:f0_size]

# Now you have your sequences with trailing zeros removed
print("Decoded ph_dur without trailing zeros:", decoded_ph_dur_no_zeros)
print(len(decoded_ph_dur_no_zeros))
if decoded_f0_seq_no_zeros is not None:
    print("Decoded f0_seq without trailing zeros:", decoded_f0_seq_no_zeros)
    print(len(decoded_f0_seq_no_zeros))
print("Decoded note_seq_encoded without trailing zeros:", decoded_note_seq_encoded_no_zeros)
print(len(decoded_note_seq_encoded_no_zeros))
print(len(input_phonemes))

### 4.5 Make it .ds file

In [None]:
# use input_phonemes

# Load the token-to-int mappings from the JSON files
def load_mapping(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        token_to_int = json.load(f)
    # Invert the dictionary to create an int-to-token mapping
    int_to_token = {v: k for k, v in token_to_int.items()}
    return int_to_token

# Decoding function using the mappings
def decode_predictions(prediction_integers, mapping):
    return [mapping.get(i, 'Unknown') for i in prediction_integers]

note_int_to_token = load_mapping('note_token_to_int.json')

predicted_note_seq_integers = decoded_note_seq_encoded_no_zeros

decoded_note_seq = decode_predictions(predicted_note_seq_integers, note_int_to_token)

# Print or return the decoded sequences
print("Decoded Phonetic Sequence:", input_phonemes)
print("Decoded Note Sequence:", decoded_note_seq)

file = {
    'ph_seq': input_phonemes,
    'ph_dur': decoded_ph_dur_no_zeros,
    'note_seq': decoded_note_seq,
    'f0_seq': decoded_f0_seq_no_zeros
}

with open('rnn_output.ds', 'w') as json_file:
    json.dump(file, json_file, indent=4)