In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import io # Keep for robustness, though direct file read is now primary

# --- 1. Global Constants (Adjust as needed) ---
# Window sizes are consumed by create_sequences / the example cell; the remaining
# values are read as module globals by build_transformer_model.
# NOTE: EMBED_DIM must be divisible by NUM_HEADS, otherwise MultiHeadSelfAttention
# raises a ValueError at construction time.
LOOK_BACK = 1300      # Length (in time steps) of each input window fed to the model
PREDICT_AHEAD = 10    # Number of future time steps predicted per input window
EMBED_DIM = 128       # Width each time step is projected to before the Transformer block
NUM_HEADS = 8         # Number of attention heads in MultiHeadSelfAttention
FF_DIM = 256          # Hidden layer size of the feed-forward network inside TransformerBlock
DROPOUT_RATE = 0.2    # Dropout rate applied inside TransformerBlock for regularization


In [None]:
# --- 2. Data Loading and Preprocessing (UPDATED) ---
def load_and_preprocess_data(csv_file_path):
    """
    Loads data from a CSV file path, separates input and target features, and scales them.

    The CSV must contain a ``date`` column plus every column listed in
    ``input_features`` and ``target_feature`` below. Missing values are filled
    forward first and then backward (so leading gaps are filled as well) before
    each group of columns is min-max scaled to the range [0, 1].

    Note:
        Both scalers are fitted on every row of the file. If part of the data is
        later held out for evaluation, the scalers have already seen it.

    Args:
        csv_file_path (str): The path to the CSV file.

    Returns:
        tuple: A tuple containing:
            - numpy.ndarray: The scaled input features data (X).
            - numpy.ndarray: The scaled target feature data (Y).
            - sklearn.preprocessing.MinMaxScaler: Scaler fitted on input features.
            - sklearn.preprocessing.MinMaxScaler: Scaler fitted on target feature.
            - list: List of input feature names.
            - list: List of target feature names.

    Raises:
        KeyError: If the CSV lacks the ``date`` column or any required feature column.
        ValueError: If a required feature column contains no values at all
            (it cannot be filled and would propagate NaNs into training).
    """
    print(f"Loading data from {csv_file_path}...")
    df = pd.read_csv(csv_file_path)

    # Define input features (used by the model to predict)
    # 'Closing_Price' is deliberately excluded here because it is the target
    input_features = ['volume', 'RSI', 'MACD', 'MACD_Signal', 'Momentum', 'OBV', 'ATR', 'Revenue_Growth', 'EPS_Growth', 'ROE']

    # Define the single target feature to be predicted
    target_feature = ['Closing_Price']

    # Fail early with one message naming every absent column instead of
    # surfacing a less specific KeyError from a later indexing step.
    required_columns = ['date'] + input_features + target_feature
    missing_columns = [name for name in required_columns if name not in df.columns]
    if missing_columns:
        raise KeyError(
            f"CSV file {csv_file_path!r} is missing required column(s): {missing_columns}"
        )

    df['date'] = pd.to_datetime(df['date'])
    df = df.set_index('date')

    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in recent pandas.
    df_input_features = df[input_features].ffill().bfill()
    print(f"Input DataFrame shape: {df_input_features.shape}")

    df_target_feature = df[target_feature].ffill().bfill()
    print(f"Target DataFrame shape: {df_target_feature.shape}")

    # A column that is still NaN after filling both ways had no values to begin with.
    empty_columns = [
        name
        for frame in (df_input_features, df_target_feature)
        for name in frame.columns
        if len(frame) > 0 and frame[name].isna().all()
    ]
    if empty_columns:
        raise ValueError(f"Column(s) contain no values and cannot be filled: {empty_columns}")

    # Separate scalers so predictions can be inverse-transformed with the
    # target scaler alone.
    scaler_input = MinMaxScaler(feature_range=(0, 1))
    scaler_output = MinMaxScaler(feature_range=(0, 1))

    scaled_input_data = scaler_input.fit_transform(df_input_features)
    scaled_target_data = scaler_output.fit_transform(df_target_feature)

    return scaled_input_data, scaled_target_data, scaler_input, scaler_output, input_features, target_feature


In [None]:
# --- 3. Sequence Creation (UPDATED) ---
def create_sequences(input_data, target_data, look_back, predict_ahead):
    """
    Creates input (X) and target (Y) sequences for time series prediction.

    Window ``i`` pairs ``input_data[i : i + look_back]`` with the target rows
    that immediately follow it,
    ``target_data[i + look_back : i + look_back + predict_ahead]``.

    Only rows present in BOTH arrays are used, so a target array shorter than
    the input array yields fewer (but always complete) windows rather than
    truncated target slices.

    Args:
        input_data (numpy.ndarray): The scaled input features data.
        target_data (numpy.ndarray): The scaled target feature data.
        look_back (int): Number of past time steps to use as input.
        predict_ahead (int): Number of future time steps to predict.

    Returns:
        tuple: A tuple containing:
            - numpy.ndarray: Input sequences (X), shape
              (num_windows, look_back, *input_data.shape[1:]).
            - numpy.ndarray: Target sequences (Y), shape
              (num_windows, predict_ahead, *target_data.shape[1:]).
            When there are too few rows for a single window, both arrays have
            ``num_windows == 0`` but keep their remaining dimensions.

    Raises:
        ValueError: If ``look_back`` or ``predict_ahead`` is negative.
    """
    if look_back < 0 or predict_ahead < 0:
        raise ValueError(
            f"look_back and predict_ahead must be non-negative, got {look_back} and {predict_ahead}"
        )

    input_data = np.asarray(input_data)
    target_data = np.asarray(target_data)

    # Every window needs look_back input rows followed by predict_ahead target
    # rows, so the shorter of the two arrays limits how many windows fit.
    usable_rows = min(len(input_data), len(target_data))
    num_windows = usable_rows - look_back - predict_ahead + 1

    if num_windows <= 0:
        # Keep the trailing dimensions so callers can still inspect .shape.
        empty_x = np.empty((0, look_back) + input_data.shape[1:], dtype=input_data.dtype)
        empty_y = np.empty((0, predict_ahead) + target_data.shape[1:], dtype=target_data.dtype)
        return empty_x, empty_y

    X = [input_data[start:start + look_back] for start in range(num_windows)]
    Y = [
        target_data[start + look_back:start + look_back + predict_ahead]
        for start in range(num_windows)
    ]
    return np.array(X), np.array(Y)


In [None]:
# --- 4. Transformer Model Components ---

class MultiHeadSelfAttention(layers.Layer):
    """
    Multi-Head Self-Attention mechanism as described in the Transformer paper.

    The input is projected to queries, keys and values of width ``embed_dim``,
    split into ``num_heads`` heads of width ``embed_dim // num_heads``, attended
    with scaled dot-product attention, and the heads are merged again through a
    final dense layer.

    Args:
        embed_dim (int): Width of the input/output feature dimension.
        num_heads (int): Number of attention heads; must divide ``embed_dim``.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """
    def __init__(self, embed_dim, num_heads=8, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        """Scaled dot-product attention; returns (attended values, attention weights)."""
        score = tf.matmul(query, key, transpose_b=True)
        # Cast the scale to the score dtype so the division also works when the
        # layer computes in a dtype other than float32.
        dim_key = tf.cast(tf.shape(key)[-1], score.dtype)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        """Reshape (batch, seq, embed_dim) -> (batch, num_heads, seq, projection_dim)."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Apply self-attention to ``inputs``; the output has last dimension ``embed_dim``."""
        batch_size = tf.shape(inputs)[0]
        query = self.separate_heads(self.query_dense(inputs), batch_size)
        key = self.separate_heads(self.key_dense(inputs), batch_size)
        value = self.separate_heads(self.value_dense(inputs), batch_size)
        attention, _ = self.attention(query, key, value)
        # Move the head axis back next to the feature axis before merging heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config


class TransformerBlock(layers.Layer):
    """
    A single Transformer block combining Multi-Head Attention and a Feed-Forward Network.

    Each sub-layer (attention, then feed-forward) is followed by dropout, a
    residual connection and layer normalization.

    Args:
        embed_dim (int): Width of the input/output feature dimension.
        num_heads (int): Number of attention heads.
        ff_dim (int): Hidden width of the feed-forward network.
        rate (float): Dropout rate applied after each sub-layer.
    """
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Keep the constructor arguments so get_config can report them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block; ``training`` is forwarded to the dropout layers."""
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update(
            {
                "embed_dim": self.embed_dim,
                "num_heads": self.num_heads,
                "ff_dim": self.ff_dim,
                "rate": self.rate,
            }
        )
        return config


class PositionalEmbedding(layers.Layer):
    """
    Positional Embedding layer to add temporal information to input embeddings.

    The inputs are multiplied by ``sqrt(embed_dim)`` and a learned embedding of
    each time-step index is added.

    Args:
        sequence_length (int): Number of positions in the embedding table. Inputs
            longer than this have position indices outside the table.
        embed_dim (int): Width of the embeddings; must match the last dimension
            of the inputs.
    """
    def __init__(self, sequence_length, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.sequence_length = sequence_length
        self.embed_dim = embed_dim
        self.position_embedding = layers.Embedding(
            input_dim=sequence_length, output_dim=embed_dim
        )
        # A plain Python float (rather than a float32 tensor built in __init__)
        # multiplies cleanly with inputs of any floating dtype.
        self.scale = float(embed_dim) ** 0.5

    def call(self, inputs):
        """Scale ``inputs`` of shape (batch, sequence, embed_dim) and add position embeddings."""
        length = tf.shape(inputs)[1]
        positions = tf.range(start=0, limit=length, delta=1)
        embedded_positions = self.position_embedding(positions)
        # embedded_positions is (sequence, embed_dim) and broadcasts over the batch axis.
        return inputs * self.scale + embedded_positions

    def get_config(self):
        """Return the constructor arguments so the layer can be serialized and re-created."""
        config = super().get_config()
        config.update(
            {"sequence_length": self.sequence_length, "embed_dim": self.embed_dim}
        )
        return config


def build_transformer_model(input_shape, output_sequence_length, output_features_count,
                            embed_dim=None, num_heads=None, ff_dim=None,
                            dropout_rate=None, num_blocks=1):
    """
    Builds the Transformer model for time series prediction.

    Architecture: per-time-step dense projection to ``embed_dim`` -> positional
    embedding -> ``num_blocks`` Transformer block(s) -> flatten -> dense ->
    reshape to (output_sequence_length, output_features_count).

    The returned model is NOT compiled; the caller chooses optimizer and loss.

    Args:
        input_shape (tuple): Shape of the input sequences (look_back, num_input_features).
        output_sequence_length (int): The length of the output sequence (predict_ahead).
        output_features_count (int): The number of features in the output.
        embed_dim (int, optional): Projection width. Defaults to the module constant EMBED_DIM.
        num_heads (int, optional): Attention heads. Defaults to NUM_HEADS.
        ff_dim (int, optional): Feed-forward hidden width. Defaults to FF_DIM.
        dropout_rate (float, optional): Dropout rate. Defaults to DROPOUT_RATE.
        num_blocks (int): Number of stacked Transformer blocks (default 1).

    Returns:
        keras.Model: The uncompiled Transformer model.
    """
    # Resolve defaults at call time so edits to the module constants are honoured.
    embed_dim = EMBED_DIM if embed_dim is None else embed_dim
    num_heads = NUM_HEADS if num_heads is None else num_heads
    ff_dim = FF_DIM if ff_dim is None else ff_dim
    dropout_rate = DROPOUT_RATE if dropout_rate is None else dropout_rate

    inputs = layers.Input(shape=input_shape)  # (None, look_back, num_input_features)

    # Project the raw features of every time step to embed_dim so they can be
    # summed with the positional embeddings.
    x = layers.TimeDistributed(layers.Dense(embed_dim))(inputs)  # (None, look_back, embed_dim)

    x = PositionalEmbedding(input_shape[0], embed_dim)(x)  # (None, look_back, embed_dim)

    for _ in range(num_blocks):
        x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x)  # shape unchanged

    # Collapse all time steps into one vector per sample for the prediction head.
    x = layers.Flatten()(x)  # (None, look_back * embed_dim)

    # One output unit per (future step, target feature) pair.
    outputs = layers.Dense(output_sequence_length * output_features_count)(x)

    # (None, output_sequence_length, output_features_count)
    outputs = layers.Reshape((output_sequence_length, output_features_count))(outputs)

    model = keras.Model(inputs=inputs, outputs=outputs)
    return model


In [None]:
# --- 5. Model Training ---
def train_transformer_ts(X_scaled, Y_scaled, input_features_count, output_features_count, look_back, predict_ahead, epochs=50, batch_size=32, learning_rate=0.001, verbose=0):
    """
    Trains the Transformer time series prediction model.

    Builds a fresh model via ``build_transformer_model``, compiles it with the
    Adam optimizer and mean-squared-error loss, and fits it on the given sequences.

    Args:
        X_scaled (numpy.ndarray): The scaled input sequences.
        Y_scaled (numpy.ndarray): The scaled target sequences.
        input_features_count (int): Number of features in the input data.
        output_features_count (int): Number of features in the output (target) data.
        look_back (int): The sequence length for inputs.
        predict_ahead (int): The sequence length for outputs (predictions).
        epochs (int): Number of training epochs.
        batch_size (int): Batch size for training.
        learning_rate (float): Learning rate passed to the Adam optimizer.
        verbose (int): Verbosity forwarded to ``model.fit`` (0 = silent, 1 = progress bar).

    Returns:
        keras.Model: The trained Transformer model.
    """
    model = build_transformer_model(
        input_shape=(look_back, input_features_count),
        output_sequence_length=predict_ahead,
        output_features_count=output_features_count
    )

    model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse")

    print(f"Training Transformer model for {epochs} epochs...")
    model.fit(X_scaled, Y_scaled, epochs=epochs, batch_size=batch_size, verbose=verbose)
    print("Training complete.")
    return model


In [None]:
# --- 6. Prediction Function (UPDATED) ---
def predict_future_values(model, last_input_sequence_scaled, scaler_output):
    """
    Predicts future values using the trained Transformer model.

    Args:
        model (keras.Model): The trained Transformer model.
        last_input_sequence_scaled (numpy.ndarray): Scaled input data, either a
            single sequence of shape (look_back, num_input_features) or a batch
            of shape (batch, look_back, num_input_features).
        scaler_output (sklearn.preprocessing.MinMaxScaler): The scaler that was
            fitted on the target feature(s); used to undo the scaling.

    Returns:
        numpy.ndarray: The inverse-transformed predicted future values.
            Shape (predict_ahead, num_targets) for a single sequence, or
            (batch, predict_ahead, num_targets) when more than one sequence
            was supplied.
    """
    sequences = np.asarray(last_input_sequence_scaled)
    # The model expects a leading batch axis.
    if sequences.ndim == 2:
        sequences = np.expand_dims(sequences, axis=0)

    # Shape: (batch, predict_ahead, num_targets)
    predicted_scaled = np.asarray(model.predict(sequences, verbose=0))
    batch, steps, num_targets = predicted_scaled.shape

    # The scaler works on 2-D (rows, features) data, so fold batch and time
    # together, invert the scaling, then restore the original layout.
    flat_scaled = predicted_scaled.reshape(batch * steps, num_targets)
    flat_original = np.asarray(scaler_output.inverse_transform(flat_scaled))
    predicted_original = flat_original.reshape(batch, steps, num_targets)

    # Preserve the historical 2-D return shape for the single-sequence case.
    if batch == 1:
        return predicted_original[0]
    return predicted_original


In [None]:
# --- Example Usage (UPDATED) ---

csv_file = "data/AAPL_market_data.csv"

# Seed NumPy and TensorFlow so repeated runs start from the same random state.
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

# 1. Load and preprocess data (returns separate input and target data/scalers)
scaled_input_data, scaled_target_data, scaler_input, scaler_output, input_features_list, target_features_list = load_and_preprocess_data(csv_file)

# Derive feature counts from the preprocessed arrays rather than hard-coding them
NUM_INPUT_FEATURES = scaled_input_data.shape[1]   # Number of features in X
NUM_TARGET_FEATURES = scaled_target_data.shape[1] # Number of features in Y

# 2. Create sequences
# X: (samples, LOOK_BACK, NUM_INPUT_FEATURES)
# Y: (samples, PREDICT_AHEAD, NUM_TARGET_FEATURES)
X, Y = create_sequences(scaled_input_data, scaled_target_data, LOOK_BACK, PREDICT_AHEAD)

# 3. Train the model
if X.shape[0] > 0: # At least one training window could be built
    model = train_transformer_ts(
        X_scaled=X,
        Y_scaled=Y,
        input_features_count=NUM_INPUT_FEATURES,
        output_features_count=NUM_TARGET_FEATURES,
        look_back=LOOK_BACK,
        predict_ahead=PREDICT_AHEAD,
        epochs=1,  # Set a small number of epochs for demonstration
        batch_size=1
    )

    # 4. Make a prediction from the most recent LOOK_BACK rows of input data
    last_input_sequence_scaled = scaled_input_data[-LOOK_BACK:]

    if last_input_sequence_scaled.shape[0] == LOOK_BACK:
        predicted_future_values = predict_future_values(model, last_input_sequence_scaled, scaler_output)
        print(f"\nPredicted Future Values (original scale) for next {PREDICT_AHEAD} steps for '{target_features_list[0]}':")

        # Only the date column is needed to find the last known date, so avoid
        # re-reading and re-parsing every other column of the CSV.
        date_column = pd.to_datetime(pd.read_csv(csv_file, usecols=['date'])['date'])
        last_known_date = date_column.iloc[-1]

        # Generate future dates (business days) starting the day after the last row
        future_dates = pd.date_range(start=last_known_date + pd.Timedelta(days=1),
                                     periods=PREDICT_AHEAD,
                                     freq='B') # 'B' for business day frequency

        # Pair each prediction with its date for readability
        predicted_df = pd.DataFrame(predicted_future_values, columns=target_features_list)
        predicted_df.insert(0, 'date', future_dates)
        print(predicted_df)
    else:
        print("Not enough data to create the last input sequence for prediction.")
else:
    print("Not enough data to create sequences for training and prediction. Adjust LOOK_BACK or PREDICT_AHEAD constants or provide more data.")
