<a href="https://colab.research.google.com/github/mohammadreza-mohammadi94/Deep-Learning-Projects/blob/main/Melody-Sequence-Prediciton(Seq2Seq)/melody_seq_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Libraries Installation

In [1]:
!pip install -q tensorflow pandas music21

# Import Libs

In [17]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, TimeDistributed, Bidirectional, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
import random
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

# Configuration

In [19]:
INPUT_SEQ_LENGTH = 8     # Length of the sequence the model observes (Input Length)
OUTPUT_SEQ_LENGTH = 8    # Length of the sequence the model predicts (Output Length)
VOCAB_SIZE = 10          # Total unique notes/chords + special tokens
EMBEDDING_DIM = 32
LSTM_UNITS = 64
EPOCHS = 50
BATCH_SIZE = 32
PATIENCE = 10            # For EarlyStopping

# --- Special Token IDs ---
PAD_ID = 0 # Padding Token
SOS_ID = 1 # Start of Sequence Token (used in Decoder Input)
EOS_ID = 2 # End of Sequence Token (used to stop generation)
# Actual notes/chords are IDs 3 to 9 (VOCAB_SIZE - 3)

# Data Preparation

In [30]:
def create_synthetic_data(num_samples, input_len, output_len):
    """Generates sequences where Target[t] = Input[t+1] (Many-to-Many Aligned)."""
    X_raw, Y_raw = [], []

    for _ in range(num_samples):
        # Create a sequence of actual notes (IDs 3 to 9)
        seq_len = random.randint(input_len, input_len + 3)
        sequence = np.random.randint(3, VOCAB_SIZE, size=seq_len)

        # Input X: The observed sequence (of length input_len)
        input_seq = sequence[:input_len]

        # Target Y: The next note (t+1) for each step in the input
        target_seq_full = np.roll(sequence, -1)
        target_seq = target_seq_full[:output_len]

        X_raw.append(input_seq)
        Y_raw.append(target_seq)

    # Padding for X and final Target Y
    X_padded = pad_sequences(X_raw, maxlen=input_len, padding='post', dtype='int32')
    Y_target_padded = pad_sequences(Y_raw, maxlen=output_len, padding='post', value=EOS_ID)

    # Teacher Forcing Input: Shifted Target with SOS at the beginning
    Y_decoder_input = np.zeros((num_samples, output_len), dtype=np.int32)
    Y_decoder_input[:, 0] = SOS_ID
    Y_decoder_input[:, 1:] = Y_target_padded[:, :-1]

    return X_padded, Y_decoder_input, Y_target_padded

# Generate Data
X_train, Y_dec_input, Y_target = create_synthetic_data(
    5000, INPUT_SEQ_LENGTH, OUTPUT_SEQ_LENGTH)

# Split for validation
X_train, X_val, Y_target_train, Y_target_val = train_test_split(
    X_train, Y_target, test_size=0.15, random_state=42
)

# --- CRITICAL FIX: Generate the corresponding Decoder Input for the Validation Set ---
# The validation input for the decoder must mirror the structure of the training decoder input.
Y_dec_val_input = np.zeros_like(Y_target_val)
Y_dec_val_input[:, 0] = SOS_ID
Y_dec_val_input[:, 1:] = Y_target_val[:, :-1]


print(f"Encoder Input Shape (X_train): {X_train.shape}")
print(f"Decoder Input Shape (Y_dec_in): {Y_dec_input.shape}")
print(f"Decoder Target Shape (Y_target): {Y_target.shape}")
print(f"Example Target (First Sample): {Y_target[0]}")

Encoder Input Shape (X_train): (4250, 8)
Decoder Input Shape (Y_dec_in): (5000, 8)
Decoder Target Shape (Y_target): (5000, 8)
Example Target (First Sample): [6 8 8 5 9 9 7 7]


# Seq2Seq Model

In [31]:
# --- ENCODER ---
encoder_inputs = Input(shape=(INPUT_SEQ_LENGTH,), name='encoder_input')
encoder_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(encoder_inputs)
# Encoder returns ONLY the final state (Context Vector)
encoder_lstm = LSTM(LSTM_UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_lstm(encoder_emb) # LSTM returns h and c states

# Keep only the states for the decoder
encoder_states = [state_h, state_c]


# --- DECODER ---
decoder_inputs = Input(shape=(OUTPUT_SEQ_LENGTH,), name='decoder_input')
decoder_emb = Embedding(VOCAB_SIZE, EMBEDDING_DIM)(decoder_inputs)


# We set up our decoder to return full sequences, and to not return states.
# The states are set by using the "initial_state" argument
decoder_lstm = LSTM(LSTM_UNITS, return_sequences=True, return_state=True)
decoder_outputs, _, _ = decoder_lstm(decoder_emb, initial_state=encoder_states)


# Final Output Layer: Applies Dense to EACH of the 8 timesteps
output_layer = TimeDistributed(Dense(VOCAB_SIZE, activation='softmax'))(decoder_outputs)

# --- Define and Compile the final Model ---
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=output_layer)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

model.summary()

In [32]:
callbacks = [
    ModelCheckpoint('best_music_model.keras', save_best_only=True, monitor='val_loss', mode='min'),
    EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
]

history = model.fit(
    [X_train, Y_dec_input[:len(X_train)]], # Use the subset of Y_dec_input that corresponds to X_train
    Y_target_train, # Target for the training set
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    # Pass the complete tuple for validation data: ([Encoder_Val_Input, Decoder_Val_Input], Target_Val_Output)
    validation_data=([X_val, Y_dec_val_input], Y_target_val),
    callbacks=callbacks,
    verbose=1
)

Epoch 1/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 18ms/step - accuracy: 0.1676 - loss: 2.0660 - val_accuracy: 0.2920 - val_loss: 1.7722
Epoch 2/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 18ms/step - accuracy: 0.3136 - loss: 1.7233 - val_accuracy: 0.3783 - val_loss: 1.5822
Epoch 3/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 15ms/step - accuracy: 0.3817 - loss: 1.5459 - val_accuracy: 0.4117 - val_loss: 1.4502
Epoch 4/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.4274 - loss: 1.3973 - val_accuracy: 0.4573 - val_loss: 1.3067
Epoch 5/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 12ms/step - accuracy: 0.4693 - loss: 1.2722 - val_accuracy: 0.4883 - val_loss: 1.2019
Epoch 6/50
[1m133/133[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - accuracy: 0.5118 - loss: 1.1530 - val_accuracy: 0.5327 - val_loss: 1.0980
Epoch 7/50
[1m133/133