# Model Use: *RNN w/ LSTM*

# 1. Environement Setup

In [6]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Masking
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical

import ast
import gc

file_path = 'normalized_output.csv'

# 2. Data extraction

In [10]:
# Load the data
data = pd.read_csv(file_path)

# Convert string representations of lists to actual lists
data['ph_seq_encoded'] = data['ph_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['ph_dur'] = data['ph_dur'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['f0_seq'] = data['f0_seq'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else x)  # Handle NaN for f0_seq
data['note_seq_encoded'] = data['note_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Normalize the float arrays
scaler_ph_dur = MinMaxScaler(feature_range=(0, 1))
# Normalize ph_dur assuming they are already in the correct format
data['ph_dur'] = [scaler_ph_dur.fit_transform(np.array(seq).reshape(-1, 1)).flatten() for seq in data['ph_dur']]

# Only normalize f0_seq if it's not NaN
scaler_f0_seq = MinMaxScaler(feature_range=(0, 1))
data['f0_seq'] = [scaler_f0_seq.fit_transform(np.array(seq).reshape(-1, 1)).flatten() if seq is not np.nan else np.nan for seq in data['f0_seq']]

# Find the maximum sequence length across all sequence columns
max_sequence_length = max(
    max(data['ph_seq_encoded'].apply(len)),
    max(data['ph_dur'].apply(len)),
    max([len(seq) for seq in data['f0_seq'] if seq is not np.nan]),  # Only consider non-NaN sequences
    max(data['note_seq_encoded'].apply(len))
)

# Pad the sequences
data['ph_seq_encoded'] = pad_sequences(data['ph_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()
data['ph_dur'] = pad_sequences(data['ph_dur'], maxlen=max_sequence_length, padding='post', dtype='float').tolist()
data['f0_seq'] = [pad_sequences([seq], maxlen=max_sequence_length, padding='post', dtype='float').flatten() if seq is not np.nan else np.full(max_sequence_length, np.nan) for seq in data['f0_seq']]
data['note_seq_encoded'] = pad_sequences(data['note_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()

# Flatten all sequences into a single list
all_ph_seq = [item for sublist in data['ph_seq_encoded'] for item in sublist]

# Convert to numpy array and reshape to be 2D
all_ph_seq_array = np.array(all_ph_seq).reshape(-1, 1)
del all_ph_seq

# Initialize and fit the OneHotEncoder on all sequences at once
encoder = OneHotEncoder(sparse=False)
encoder.fit(all_ph_seq_array)

# Now transform each sequence individually and store the transformed arrays
data['ph_seq_encoded'] = [encoder.transform(np.array(seq).reshape(-1, 1)) for seq in data['ph_seq_encoded']]

# Determine the number of categories for notes
num_note_categories = max(data['note_seq_encoded'].apply(max)) + 1  # Assuming the sequences are zero-indexed

# One-hot encode the note_seq_encoded
data['note_seq_encoded'] = [to_categorical(seq, num_classes=num_note_categories) for seq in data['note_seq_encoded']]

padded_note_seqs = np.array([pad_sequences([seq], maxlen=max_sequence_length, padding='post', value=0)[0] for seq in data['note_seq_encoded']])
data['note_seq_encoded'] = list(padded_note_seqs)

# After padding and encoding, convert to a numpy array
X = np.array(data['ph_seq_encoded'].tolist())
y_ph_dur = np.array(data['ph_dur'].tolist())
y_note_seq_encoded = np.array(data['note_seq_encoded'].tolist())

# Ensure that all arrays have three dimensions
y_ph_dur = y_ph_dur[..., np.newaxis]  # shape (num_sequences, sequence_length, 1)

# Handle missing f0_seq data
mask_value = -1  # Define a mask value that does not appear in the data
y_f0_seq = np.array([np.full(max_sequence_length, mask_value) if np.isnan(seq).all() else seq for seq in data['f0_seq']])

# If y_f0_seq is 2D, also convert it to 3D
if y_f0_seq.ndim == 2:
    y_f0_seq = y_f0_seq[..., np.newaxis]  # shape (num_sequences, sequence_length, 1)

# Check the shapes of the arrays to make sure they are all 3D and can be concatenated
print(f"X shape: {X.shape}")
print(f"y_ph_dur shape: {y_ph_dur.shape}")
print(f"y_note_seq_encoded shape: {y_note_seq_encoded.shape}")
print(f"y_f0_seq shape: {y_f0_seq.shape}")

# Concatenate your inputs along the last axis to create a single input array for each sequence
X_combined = np.concatenate([X, y_ph_dur, y_note_seq_encoded, y_f0_seq], axis=-1)


# Shift the combined input to create the input and target pairs
X_input = X_combined[:, :-1, :]  # All but the last time step
X_target = X_combined[:, 1:, :]  # All but the first time step




X shape: (4022, 4450, 65)
y_ph_dur shape: (4022, 4450, 1)
y_note_seq_encoded shape: (4022, 4450, 56)
y_f0_seq shape: (4022, 4450, 1)


In [None]:
# Free some memory here
del all_ph_seq_array
del padded_note_seqs
del all_ph_seq
del data
del X
del y_f0_seq
del y_ph_dur
del y_note_seq_encoded
del X_combined
gc.collect()

# 3. Model Training

In [23]:
# Model architecture
input_layer = Input(shape=(X_input.shape[1], X_input.shape[2]))
lstm_layer = LSTM(128, return_sequences=True)(input_layer)
output_layer = Dense(X_input.shape[2], activation='softmax')(lstm_layer)

model = Model(inputs=input_layer, outputs=output_layer)
model.compile(optimizer=Adam(learning_rate=0.001), loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model on the input and target sequences
model.fit(X_input, X_target, epochs=5, batch_size=64)

# Save the model
model.save('model_self_supervised.h5')

import joblib

# # Save the scalers and encoders
# joblib.dump(scaler_ph_dur, 'scaler_ph_dur.pkl')
# joblib.dump(scaler_f0_seq, 'scaler_f0_seq.pkl')
# joblib.dump(encoder, 'ph_seq_encoder.pkl')

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


  saving_api.save_model(


# 4. Validation
Put user input and get output

### 4.1 Input

In [77]:
import json

# Load the encoding dictionary from the ph_token_to_int.json file
with open('ph_token_to_int.json', 'r') as file:
    ph_token_to_int = json.load(file)

# Your input sequence
input_sequence = "AP n ei f a g e n a j i f u y u a p u AP n ei f a g e n a l e y u d ao en AP"

# Split the input sequence into individual phonemes
input_phonemes = input_sequence.split()

# Convert the phonemes to their corresponding integers using the encoding dictionary
encoded_sequence = [ph_token_to_int[phoneme] for phoneme in input_phonemes]


# Reshape the sequence to be 2D as expected by the encoder
encoded_sequence_array = np.array(encoded_sequence).reshape(-1, 1)

# One-hot encode the sequence using the loaded encoder
encoded_sequence_onehot = encoder.transform(encoded_sequence_array)

padded_sequence = pad_sequences([encoded_sequence_onehot], maxlen=4449, padding='post', dtype='float')

if padded_sequence.shape[2] != 123:
    # Handle error: The one-hot encoded sequence does not match the number of features the model expects
    # This may involve adding columns of zeros or otherwise adjusting the sequence to match the expected number of features
    # For example:
    new_shape = (padded_sequence.shape[0], 4449, 123)
    new_padded_sequence = np.zeros(new_shape)
    new_padded_sequence[:, :, :65] = padded_sequence  # Assuming the first 65 features match
    padded_sequence = new_padded_sequence


### 4.2 Inference

In [120]:
predictions = model.predict(padded_sequence)




### 4.3 Decoding

In [121]:
# predicted_ph_dur, predicted_f0_seq, predicted_note_seq_encoded = predictions
note_seq_pred = np.argmax(predictions[..., -num_note_categories:], axis=-1)  # Replace num_note_categories with the actual number

ph_dur_pred_scaled = predictions[..., 1]  # Replace some_ph_dur_index with the actual index/indices
note_seq_pred_scaled = predictions[..., 2]  # Replace some_ph_dur_index with the actual index/indices
f0_seq_pred_scaled = predictions[..., 3]  # Replace some_f0_seq_index with the actual index/indices

# Reshape the predictions to match the scaler's expected input
ph_dur_pred_scaled = ph_dur_pred_scaled.reshape(-1, 1)
note_seq_pred_scaled = note_seq_pred_scaled.reshape(-1, 1)
f0_seq_pred_scaled = f0_seq_pred_scaled.reshape(-1, 1)

# Use the scaler's inverse_transform method
ph_dur_pred = scaler_ph_dur.inverse_transform(ph_dur_pred_scaled).flatten()
f0_seq_pred = scaler_f0_seq.inverse_transform(f0_seq_pred_scaled).flatten()

note_seq_pred = np.argmax(predictions[..., -num_note_categories:], axis=-1)  # Use the actual number of note categories

# # Now you can print or return the decoded predictions
# print("Decoded ph_dur:", decoded_ph_dur_trimmed)
# print(len(decoded_ph_dur_trimmed[0]))
# if decoded_f0_seq is not None:
#     print("Decoded f0_seq:", decoded_f0_seq_trimmed)
#     print(len(decoded_f0_seq_trimmed[0]))
# print("Decoded note_seq_encoded:", decoded_note_seq_encoded_trimmed)
# print(len(decoded_note_seq_encoded_trimmed[0]))
# Print the decoded sequences
print('Decoded Note Sequence:', note_seq_pred)
print(len(note_seq_pred[0]))
print('Inversed ph_dur:', ph_dur_pred)
print(len(ph_dur_pred))
print('Inversed f0_seq:', f0_seq_pred)
print(len(f0_seq_pred))

Decoded Note Sequence: [[38 20 20 ... 20 20 20]]
4449
Inversed ph_dur: [0.06198255 0.06043197 0.05869877 ... 0.044998   0.044998   0.044998  ]
4449
Inversed f0_seq: [232.87163 232.66498 232.30423 ... 229.8     229.8     229.8    ]
4449


### 4.4 Removing zeros

In [123]:
f0_timestep = 0.005
f0_constant = 205.1

note_seq_pred = note_seq_pred[0]

def remove_trailing_zeros(sequence):
    # Find the last non-zero element in the sequence
    last_non_zero = -1
    for i in range(len(sequence) - 1, -1, -1):
        if sequence[i] != 0:
            last_non_zero = i
            break
    # Slice the sequence to remove trailing zeros
    return sequence[:last_non_zero + 1] if last_non_zero != -1 else sequence

# Apply this function to each of your predicted sequences
# decoded_ph_dur_no_zeros = remove_trailing_zeros(ph_dur_pred)
# decoded_f0_seq_no_zeros = remove_trailing_zeros(f0_seq_pred)
# decoded_note_seq_encoded_no_zeros = remove_trailing_zeros(note_seq_pred)

if len(ph_dur_pred) > len(input_phonemes):
        # Truncate the sequence if it's longer
    ph_dur_pred = ph_dur_pred[:len(input_phonemes)]
elif len(ph_dur_pred) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    ph_dur_pred = np.pad(ph_dur_pred, (0, len(input_phonemes) - len(ph_dur_pred)), 'constant')
    
if len(note_seq_pred) > len(input_phonemes):
    # Truncate the sequence if it's longer
    note_seq_pred = note_seq_pred[:len(input_phonemes)]
elif len(note_seq_pred) < len(input_phonemes):
    # Pad the sequence with zeros if it's shorter
    note_seq_pred = np.pad(note_seq_pred, (0, len(input_phonemes) - len(ph_dur_pred)), 'constant')


total_time = sum(ph_dur_pred)

print(total_time)
f0_size = int(total_time / f0_timestep)
print('f0 size', f0_size)
# Replace non-positive values with the specified constant
decoded_f0_seq_no_zeros = [x if x > 0 else f0_constant for x in f0_seq_pred]

# Pad sequence with the constant value if it's shorter than the target length
if len(decoded_f0_seq_no_zeros) < f0_size:
    decoded_f0_seq_no_zeros = np.pad(decoded_f0_seq_no_zeros, (0, f0_size - len(decoded_f0_seq_no_zeros)), 'constant', constant_values=(f0_constant,))

# Or truncate if it's longer than the target length (just for safety)
elif len(decoded_f0_seq_no_zeros) > f0_size:
    decoded_f0_seq_no_zeros = decoded_f0_seq_no_zeros[:f0_size]

# Now you have your sequences with trailing zeros removed
print("Decoded ph_dur without trailing zeros:", ph_dur_pred)
print(len(ph_dur_pred))
if decoded_f0_seq_no_zeros is not None:
    print("Decoded f0_seq without trailing zeros:", decoded_f0_seq_no_zeros)
    print(len(decoded_f0_seq_no_zeros))
print("Decoded note_seq_encoded without trailing zeros:", note_seq_pred)
print(len(note_seq_pred))
print(len(input_phonemes))

1.6352285332977772
f0 size 327
Decoded ph_dur without trailing zeros: [0.06198255 0.06043197 0.05869877 0.05523891 0.04858048 0.04534776
 0.04500405 0.04499801 0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998   0.044998
 0.044998   0.044998   0.044998   0.044998   0.044998  ]
35
Decoded f0_seq without trailing zeros: [232.87163, 232.66498, 232.30423, 231.74814, 230.56653, 229.8877, 229.80173, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 229.8, 

### 4.5 Make it .ds file

In [125]:
# use input_phonemes

# Load the token-to-int mappings from the JSON files
def load_mapping(file_path):
    with open(file_path, 'r', encoding='utf-8') as f:
        token_to_int = json.load(f)
    # Invert the dictionary to create an int-to-token mapping
    int_to_token = {v: k for k, v in token_to_int.items()}
    return int_to_token

# Decoding function using the mappings
def decode_predictions(prediction_integers, mapping):
    return [mapping.get(i, 'Unknown') for i in prediction_integers]

note_int_to_token = load_mapping('note_token_to_int.json')

predicted_note_seq_integers = note_seq_pred

decoded_note_seq = decode_predictions(predicted_note_seq_integers, note_int_to_token)

# Print or return the decoded sequences
print("Decoded Phonetic Sequence:", input_phonemes)
print("Decoded Note Sequence:", decoded_note_seq)

ph_dur = ' '.join(map(str, ph_dur_pred))
note_seq = ' '.join(map(str, decoded_note_seq))
f0_seq = ' '.join(map(str, decoded_f0_seq_no_zeros))

file = {
    'ph_seq': input_sequence,
    'ph_dur': ph_dur,
    'note_seq': note_seq,
    'f0_seq': f0_seq,
    'f0_timestep': f0_timestep
}

with open('rnn_output.ds', 'w') as json_file:
    json.dump(file, json_file, indent=4)

Decoded Phonetic Sequence: ['AP', 'n', 'ei', 'f', 'a', 'g', 'e', 'n', 'a', 'j', 'i', 'f', 'u', 'y', 'u', 'a', 'p', 'u', 'AP', 'n', 'ei', 'f', 'a', 'g', 'e', 'n', 'a', 'l', 'e', 'y', 'u', 'd', 'ao', 'en', 'AP']
Decoded Note Sequence: ['G#3', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4', 'F#4']
