# Model Use: *RNN w/ LSTM*

# 1. Environement Setup

In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Masking
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import OneHotEncoder
from tensorflow.keras.utils import to_categorical

import ast

file_path = 'normalized_output.csv'




# 2. Data extraction

In [2]:
# Load the data
data = pd.read_csv(file_path)

# Convert string representations of lists to actual lists
data['ph_seq_encoded'] = data['ph_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['ph_dur'] = data['ph_dur'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
data['f0_seq'] = data['f0_seq'].apply(lambda x: ast.literal_eval(x) if pd.notnull(x) else x)  # Handle NaN for f0_seq
data['note_seq_encoded'] = data['note_seq_encoded'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

# Normalize the float arrays
scaler_ph_dur = MinMaxScaler(feature_range=(0, 1))
# Normalize ph_dur assuming they are already in the correct format
data['ph_dur'] = [scaler_ph_dur.fit_transform(np.array(seq).reshape(-1, 1)).flatten() for seq in data['ph_dur']]

# Only normalize f0_seq if it's not NaN
scaler_f0_seq = MinMaxScaler(feature_range=(0, 1))
data['f0_seq'] = [scaler_f0_seq.fit_transform(np.array(seq).reshape(-1, 1)).flatten() if seq is not np.nan else np.nan for seq in data['f0_seq']]

# Find the maximum sequence length across all sequence columns
max_sequence_length = max(
    max(data['ph_seq_encoded'].apply(len)),
    max(data['ph_dur'].apply(len)),
    max([len(seq) for seq in data['f0_seq'] if seq is not np.nan]),  # Only consider non-NaN sequences
    max(data['note_seq_encoded'].apply(len))
)

# Pad the sequences
data['ph_seq_encoded'] = pad_sequences(data['ph_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()
data['ph_dur'] = pad_sequences(data['ph_dur'], maxlen=max_sequence_length, padding='post', dtype='float').tolist()
data['f0_seq'] = [pad_sequences([seq], maxlen=max_sequence_length, padding='post', dtype='float').flatten() if seq is not np.nan else np.full(max_sequence_length, np.nan) for seq in data['f0_seq']]
data['note_seq_encoded'] = pad_sequences(data['note_seq_encoded'], maxlen=max_sequence_length, padding='post').tolist()

# Flatten all sequences into a single list
all_ph_seq = [item for sublist in data['ph_seq_encoded'] for item in sublist]

# Convert to numpy array and reshape to be 2D
all_ph_seq_array = np.array(all_ph_seq).reshape(-1, 1)

# Initialize and fit the OneHotEncoder on all sequences at once
encoder = OneHotEncoder(sparse=False)
encoder.fit(all_ph_seq_array)

# Now transform each sequence individually and store the transformed arrays
data['ph_seq_encoded'] = [encoder.transform(np.array(seq).reshape(-1, 1)) for seq in data['ph_seq_encoded']]

# Determine the number of categories for notes
num_note_categories = max(data['note_seq_encoded'].apply(max)) + 1  # Assuming the sequences are zero-indexed

# One-hot encode the note_seq_encoded
data['note_seq_encoded'] = [to_categorical(seq, num_classes=num_note_categories) for seq in data['note_seq_encoded']]

padded_note_seqs = np.array([pad_sequences([seq], maxlen=max_sequence_length, padding='post', value=0)[0] for seq in data['note_seq_encoded']])
data['note_seq_encoded'] = list(padded_note_seqs)

# After padding and encoding, convert to a numpy array
X = np.array(data['ph_seq_encoded'].tolist())
y_ph_dur = np.array(data['ph_dur'].tolist())
y_note_seq_encoded = np.array(data['note_seq_encoded'].tolist())

# Handle missing f0_seq data
mask_value = -1  # Define a mask value that does not appear in the data
data['f0_seq'] = np.array([np.full(max_sequence_length, mask_value) if np.isnan(seq).all() else seq for seq in data['f0_seq']])
y_f0_seq = np.array(data['f0_seq'].tolist())

# Split the data into training and test sets
X_train, X_test, y_ph_dur_train, y_ph_dur_test, y_f0_seq_train, y_f0_seq_test, y_note_seq_encoded_train, y_note_seq_encoded_test = train_test_split(
    X, y_ph_dur, y_f0_seq, y_note_seq_encoded, test_size=0.2, random_state=42
)



# 3. Model Training

In [3]:
# Define the model with input layer and output layers
input_layer = Input(shape=(X_train.shape[1], X_train.shape[2]))
lstm_layer = LSTM(128, return_sequences=True)(input_layer)

# Output layers for each prediction
ph_dur_output = Dense(1, activation='linear', name='ph_dur_output')(lstm_layer)
f0_seq_output = Dense(1, activation='linear', name='f0_seq_output')(lstm_layer)
note_seq_output = Dense(y_note_seq_encoded_train.shape[2], activation='softmax', name='note_seq_output')(lstm_layer)

model = Model(inputs=input_layer, outputs=[ph_dur_output, f0_seq_output, note_seq_output])

# Compile the model
model.compile(optimizer=Adam(learning_rate=0.001), loss={'ph_dur_output': 'mse', 'f0_seq_output': 'mse', 'note_seq_output': 'categorical_crossentropy'}, metrics=['accuracy'])

# Make sure your target arrays are in the correct shape before training
y_note_seq_encoded_train = np.array([np.array(lst) if isinstance(lst, list) else lst for lst in y_note_seq_encoded_train])
# Add an extra dimension to ph_dur and f0_seq if they are not already in the shape (num_samples, 1)
# y_ph_dur_train = np.expand_dims(y_ph_dur_train, axis=-1)
# y_f0_seq_train = np.expand_dims(y_f0_seq_train, axis=-1)

# Train the model
model.fit(X_train, [y_ph_dur_train, y_f0_seq_train, y_note_seq_encoded_train], epochs=10, batch_size=64, validation_split=0.2)

# Save the model and the scalers/encoders
model.save('model.h5')

import joblib

# Save the scalers and encoders
joblib.dump(scaler_ph_dur, 'scaler_ph_dur.pkl')
joblib.dump(scaler_f0_seq, 'scaler_f0_seq.pkl')
joblib.dump(encoder, 'ph_seq_encoder.pkl')


Epoch 1/10



# 4. Validation
Put user input and get output

### 4.1 Input

In [93]:
import json

# Load the encoding dictionary from the ph_token_to_int.json file
with open('ph_token_to_int.json', 'r') as file:
    ph_token_to_int = json.load(file)

# Your input sequence
input_sequence = "n ei f a g e n a j i f u y u a p u AP n ei f a g e n a l e y u d ao en AP"

# Split the input sequence into individual phonemes
input_phonemes = input_sequence.split()

# Convert the phonemes to their corresponding integers using the encoding dictionary
encoded_sequence = [ph_token_to_int[phoneme] for phoneme in input_phonemes]

# Convert the sequence to a numpy array and pad it to the right length
new_ph_seq_encoded = np.array([encoded_sequence])  # wrapping in a list to create a batch dimension
new_ph_seq_encoded_padded = pad_sequences(new_ph_seq_encoded, maxlen=max_sequence_length, padding='post')

input = np.expand_dims(new_ph_seq_encoded_padded, axis=1)


### 4.2 Inference

In [94]:
predicted_ph_dur, predicted_f0_seq, predicted_note_seq_encoded = model.predict(input)



### 4.3 Decoding

In [96]:
print(predicted_ph_dur, '\n', predicted_f0_seq, '\n', predicted_note_seq_encoded)

predicted_f0_seq = scaler_f0_seq.inverse_transform(predicted_f0_seq)


[[[nan]]] 
 [[[nan]]] 
 [[[nan nan nan ... nan nan nan]]]


ValueError: Found array with dim 3. None expected <= 2.