# Model Use: *RNN w/ LSTM Self-Supervised*

# 1. Environement Setup

In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split

file_path = 'normalized_output.csv'

# 2. Data Processing

In [6]:
# Load the data
data = pd.read_csv(file_path)

# Function to convert string representation of a list to an actual list
def convert_string_to_list(string):
    return np.fromstring(string[1:-1], sep=',')  # Assumes the format "[1, 2, 3]"

# Assuming 'ph_seq_encoded' and 'note_seq_encoded' are strings representing lists
X = np.array([convert_string_to_list(row) for row in data['ph_seq_encoded']])
y_note = np.array([convert_string_to_list(row) for row in data['note_seq_encoded']])
y_dur = np.array([convert_string_to_list(row) for row in data['ph_dur']])

# Reshape the arrays
X_reshaped = X.reshape((X.shape[0], X.shape[1], 1))
y_note_reshaped = y_note.reshape((y_note.shape[0], y_note.shape[1], 1))
y_dur_reshaped = y_dur.reshape((y_dur.shape[0], y_dur.shape[1], 1))

# print(X_reshaped[0].dtype, y_note_reshaped[0].dtype, y_dur_reshaped[0].dtype)

# Split the data into training and test sets
X_train, X_test, y_note_train, y_note_test, y_dur_train, y_dur_test = train_test_split(
    X_reshaped, y_note_reshaped, y_note_reshaped, test_size=0.3, random_state=42)

# 3. Model Training

In [7]:
# Define the RNN model
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(128, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])),
    tf.keras.layers.LSTM(64),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(y_note_train.shape[1])
])

# Compile the model
model.compile(optimizer='adam', loss='mse', metrics=['accuracy'])

# Train the model
history = model.fit(X_train, y_note_train, epochs=10, batch_size=32, validation_data=(X_test, y_note_test))

# Evaluate the model
test_loss, test_acc = model.evaluate(X_test, y_note_test)
print(f"Test Loss: {test_loss}, Test Accuracy: {test_acc}")





Epoch 1/10




ValueError: in user code:

    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\training.py", line 1384, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\training.py", line 1373, in run_step  **
        outputs = model.train_step(data)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\training.py", line 1155, in train_step
        return self.compute_metrics(x, y, y_pred, sample_weight)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\training.py", line 1249, in compute_metrics
        self.compiled_metrics.update_state(y, y_pred, sample_weight)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\engine\compile_utils.py", line 620, in update_state
        metric_obj.update_state(y_t, y_p, sample_weight=mask)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\utils\metrics_utils.py", line 77, in decorated
        result = update_state_fn(*args, **kwargs)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\metrics\base_metric.py", line 140, in update_state_fn
        return ag_update_state(*args, **kwargs)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\metrics\base_metric.py", line 723, in update_state  **
        matches = ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\metrics\accuracy_metrics.py", line 459, in sparse_categorical_accuracy
        matches = metrics_utils.sparse_categorical_matches(y_true, y_pred)
    File "c:\Users\Niko\anaconda3\envs\ProjectVivy\lib\site-packages\keras\src\utils\metrics_utils.py", line 961, in sparse_categorical_matches
        y_true = tf.squeeze(y_true, [-1])

    ValueError: Can not squeeze dim[1], expected a dimension of 1, got 283 for '{{node Squeeze}} = Squeeze[T=DT_FLOAT, squeeze_dims=[-1]](remove_squeezable_dimensions/Squeeze)' with input shapes: [?,283].


In [None]:
# Save the model
model.save('model_self_supervised.h5')

# 4. Validation
Put user input and get output

### 4.1 Input

In [77]:
import json

# Load the encoding dictionary from the ph_token_to_int.json file
with open('ph_token_to_int.json', 'r') as file:
    ph_token_to_int = json.load(file)

# Your input sequence
input_sequence = "AP n ei f a g e n a j i f u y u a p u AP n ei f a g e n a l e y u d ao en AP"

# Split the input sequence into individual phonemes
input_phonemes = input_sequence.split()

# Convert the phonemes to their corresponding integers using the encoding dictionary
encoded_sequence = [ph_token_to_int[phoneme] for phoneme in input_phonemes]


# Reshape the sequence to be 2D as expected by the encoder
encoded_sequence_array = np.array(encoded_sequence).reshape(-1, 1)

# One-hot encode the sequence using the loaded encoder
encoded_sequence_onehot = encoder.transform(encoded_sequence_array)

padded_sequence = pad_sequences([encoded_sequence_onehot], maxlen=4449, padding='post', dtype='float')

if padded_sequence.shape[2] != 123:
    # Handle error: The one-hot encoded sequence does not match the number of features the model expects
    # This may involve adding columns of zeros or otherwise adjusting the sequence to match the expected number of features
    # For example:
    new_shape = (padded_sequence.shape[0], 4449, 123)
    new_padded_sequence = np.zeros(new_shape)
    new_padded_sequence[:, :, :65] = padded_sequence  # Assuming the first 65 features match
    padded_sequence = new_padded_sequence


### 4.2 Inference

In [120]:
predictions = model.predict(padded_sequence)




### 4.3 Decoding

In [121]:
# predicted_ph_dur, predicted_f0_seq, predicted_note_seq_encoded = predictions
note_seq_pred = np.argmax(predictions[..., -num_note_categories:], axis=-1)  # Replace num_note_categories with the actual number

ph_dur_pred_scaled = predictions[..., 1]  # Replace some_ph_dur_index with the actual index/indices
note_seq_pred_scaled = predictions[..., 2]  # Replace some_ph_dur_index with the actual index/indices
f0_seq_pred_scaled = predictions[..., 3]  # Replace some_f0_seq_index with the actual index/indices

# Reshape the predictions to match the scaler's expected input
ph_dur_pred_scaled = ph_dur_pred_scaled.reshape(-1, 1)
note_seq_pred_scaled = note_seq_pred_scaled.reshape(-1, 1)
f0_seq_pred_scaled = f0_seq_pred_scaled.reshape(-1, 1)

# Use the scaler's inverse_transform method
ph_dur_pred = scaler_ph_dur.inverse_transform(ph_dur_pred_scaled).flatten()
f0_seq_pred = scaler_f0_seq.inverse_transform(f0_seq_pred_scaled).flatten()

note_seq_pred = np.argmax(predictions[..., -num_note_categories:], axis=-1)  # Use the actual number of note categories

# # Now you can print or return the decoded predictions
# print("Decoded ph_dur:", decoded_ph_dur_trimmed)
# print(len(decoded_ph_dur_trimmed[0]))
# if decoded_f0_seq is not None:
#     print("Decoded f0_seq:", decoded_f0_seq_trimmed)
#     print(len(decoded_f0_seq_trimmed[0]))
# print("Decoded note_seq_encoded:", decoded_note_seq_encoded_trimmed)
# print(len(decoded_note_seq_encoded_trimmed[0]))
# Print the decoded sequences
print('Decoded Note Sequence:', note_seq_pred)
print(len(note_seq_pred[0]))
print('Inversed ph_dur:', ph_dur_pred)
print(len(ph_dur_pred))
print('Inversed f0_seq:', f0_seq_pred)
print(len(f0_seq_pred))

Decoded Note Sequence: [[38 20 20 ... 20 20 20]]
4449
Inversed ph_dur: [0.06198255 0.06043197 0.05869877 ... 0.044998   0.044998   0.044998  ]
4449
Inversed f0_seq: [232.87163 232.66498 232.30423 ... 229.8     229.8     229.8    ]
4449


### 4.4 Removing zeros

In [123]:
f0_timestep = 0.005
f0_constant = 205.1

note_seq_pred = note_seq_pred[0]

def remove_trailing_zeros(sequence):
    # Find the last non-zero element in the sequence
    last_non_zero = -1
    for i in range(len(sequence) - 1, -1, -1):
        if sequence[i] != 0:
            last_non_zero = i
            break
    # Slice the sequence to remove trailing zeros
    return sequence[:last_non_zero + 1] if last_non_zero != -1 else sequence

# Apply this function to each of your predicted sequences
# decoded_ph_dur_no_zeros = remove_trailing_zeros(ph_dur_pred)
# decoded_f0_seq_no_zeros = remove_trailing_zeros(f0_seq_pred)
# decoded_note_seq_encoded_no_zeros = remove_trailing_zeros(note_seq_pred)

if len(ph_dur_pred) > len(input_phonemes):
        # Truncate the sequence if it's longer
    ph_dur_pred = ph_dur_pred[:len(input_phonemes)]
elif len(ph_dur_pred) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    ph_dur_pred = np.pad(ph_dur_pred, (0, len(input_phonemes) - len(ph_dur_pred)), 'constant')
    
if len(note_seq_pred) > len(input_phonemes):
    # Truncate the sequence if it's longer
    note_seq_pred = note_seq_pred[:len(input_phonemes)]
elif len(note_seq_pred) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    note_seq_pred = np.pad(note_seq_pred, (0, len(input_phonemes) - len(ph_dur_pred)), 'constant')


total_time = sum(ph_dur_pred)

print(total_time)
f0_size = int(total_time / f0_timestep)
print('f0 size', f0_size)
# Replace non-positive values with the specified constant
decoded_f0_seq_no_zeros = [x if x > 0 else f0_constant for x in f0_seq_pred]

# Pad sequence with the constant value if it's shorter than the target length
if len(decoded_f0_seq_no_zeros) < f0_size:
    decoded_f0_seq_no_zeros = np.pad(decoded_f0_seq_no_zeros, (0, f0_size - len(decoded_f0_seq_no_zeros)), 'constant', constant_values=(f0_constant,))

# Or truncate if it's longer than the target length (just for safety)
elif len(decoded_f0_seq_no_zeros) > f0_size:
    decoded_f0_seq_no_zeros = decoded_f0_seq_no_zeros[:f0_size]

# Now you have your sequences with trailing zeros removed
print("Decoded ph_dur without trailing zeros:", ph_dur_pred)
print(len(ph_dur_pred))
if decoded_f0_seq_no_zeros is not None:
    print("Decoded f0_seq without trailing zeros:", decoded_f0_seq_no_zeros)
    print(len(decoded_f0_seq_no_zeros))
print("Decoded note_seq_encoded without trailing zeros:", note_seq_pred)
print(len(note_seq_pred))
print(len(input_phonemes))

1.6352285332977772
f0 size 327
Decoded ph_dur without trailing zeros: [0.06198255 0.06043197 0.05869877 0.05523891 0.04858048 0.04534776
 0.04500405 0.04499801 0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998  ]
35
Decoded f0_seq without trailing zeros: [232.87163, 232.66498, 232.30423, 231.74814, 230.56653, 229.8877, 229.80173, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 

### 4.5 Make it .ds file

In [125]:
# use input_phonemes

# Load the token-to-int mappings from the JSON files
def load_mapping(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        token_to_int = json.load(f)
    # Invert the dictionary to create an int-to-token mapping
    int_to_token = {v: k for k, v in token_to_int.items()}
    return int_to_token

# Decoding function using the mappings
def decode_predictions(prediction_integers, mapping):
    return [mapping.get(i, 'Unknown') for i in prediction_integers]

note_int_to_token = load_mapping('note_token_to_int.json')

predicted_note_seq_integers = note_seq_pred

decoded_note_seq = decode_predictions(predicted_note_seq_integers, note_int_to_token)

# Print or return the decoded sequences
print("Decoded Phonetic Sequence:", input_phonemes)
print("Decoded Note Sequence:", decoded_note_seq)

ph_dur = ' '.join(map(str, ph_dur_pred))
note_seq = ' '.join(map(str, decoded_note_seq))
f0_seq = ' '.join(map(str, decoded_f0_seq_no_zeros))

file = {
    'ph_seq': input_sequence,
    'ph_dur': ph_dur,
    'note_seq': note_seq,
    'f0_seq': f0_seq,
    'f0_timestep': f0_timestep
}

with open('rnn_output.ds', 'w') as json_file:
    json.dump(file, json_file, indent=4)

Decoded Phonetic Sequence: ['AP', 'n', 'ei', 'f', 'a', 'g', 'e', 'n', 'a', 'j', 'i', 'f', 'u', 'y', 'u', 'a', 'p', 'u', 'AP', 'n', 'ei', 'f', 'a', 'g', 'e', 'n', 'a', 'l', 'e', 'y', 'u', 'd', 'ao', 'en', 'AP']
Decoded Note Sequence: ['G#3', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4']
