<a href="https://colab.research.google.com/github/minhaz1172/Deep-Learning/blob/main/Building_Advanced_Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**implement and experiment with advanced Transformer models using Keras.**

# Import Necessary Liabraries

In [21]:
import numpy as np
import pandas as pd
import tensorflow as tf # Deep learning framework for building and training neural network models.
import requests    # For making HTTP requests (might be used later to fetch online data).
from sklearn.preprocessing import MinMaxScaler # Utility for scaling data to the range [0, 1].
from tensorflow.keras.layers import Layer,Dense,LayerNormalization,Dropout
# These Keras layers are used to build custom neural network components (like in a Transformer).

#  setup the environment to generate random synthetic stock price data data to use as dataset

In [22]:
np.random.seed(42) # NumPy's random number generator.
# To ensure reproducibility—every time you run the code, the same random numbers will be generated, so the results are consistent.
data_length=2000 # Defines the number of data points.2000 means you're simulating stock prices for 2000 time steps.

trend=np.linspace(100,200,data_length) # np.linspace(start, end, num) returns num equally spaced values from start to end. So, this line generates 2000 points starting at 100 and ending at 200, forming a straight upward trend.
 # np.random.normal(mean, std, size) generates random values from a normal (Gaussian) distribution.

# mean=0: Noise is centered around zero, so it doesn’t add a bias.

# std=2: Standard deviation of 2 adds moderate variation.

# size=data_length: Generates 2000 noise values, one for each dat
noise=np.random.normal(0,2,data_length)

syntheic_data=trend+noise
#Combines the trend with random noise to simulate a more realistic stock price behavior.
# Result: A dataset (synthetic_data) that looks like a stock price gradually rising over time, but with small random ups and downs (just like real market data).

# create a dataframe and save as stock_pricees.csv

In [23]:
data=pd.DataFrame(syntheic_data,columns=['Close']) #: Names the single column as "Close"
data.to_csv('stock_prices.csv',index=False) # index=False: Ensures the DataFrame index (row numbers) is not included as a separate column in the CSV file.
print("Synthetic stock_prices.csv created and loaded")

Synthetic stock_prices.csv created and loaded


# Load the dataset

In [24]:
data=pd.read_csv('stock_prices.csv')
data=data[['Close']].values # Select only the 'Close' column and convert it to a NumPy array for easier numerical processing.

# Normalize the data and fit the scalar to data

In [25]:
# Instantiate the MinMaxScaler to transform data values into the range between 0 and 1.
scaler=MinMaxScaler(feature_range=(0,1))
data=scaler.fit_transform(data) ## Fit the scaler to the data and transform it, which normalizes the stock price values.


# Prepare data for training

In [26]:
# Defines a function to prepare the data for time series prediction.
# time_step: The number of past time steps to use as input features (X) to predict the next value (Y).

def create_dataset(data, time_step=1):
    """
    This function creates the dataset in a sliding window fashion.
    For each index 'i', it takes 'time_step' consecutive values as input (X)
    and the next immediate value as the corresponding label (Y).

    Example:
    Input: [10, 20, 30] → Output: 40
    Input: [20, 30, 40] → Output: 50
    Input: [30, 40, 50] → Output: 60

    It transforms a 1D sequence into multiple input-output pairs that
    a machine learning or deep learning model can learn from.
    """
    X, Y = [], []  # Initialize empty lists to store input sequences and their corresponding outputs.

    for i in range(len(data) - time_step - 1):
        a = data[i:(i + time_step), 0]         # Extract a window of 'time_step' values from 'data' starting at index 'i'
        X.append(a)                            # Append this extracted sequence to the input list (X)
        Y.append(data[i + time_step, 0])       # Append the next value as the target (Y)

    return np.array(X), np.array(Y)            # Convert the lists into NumPy arrays for model input


# Define the length of the time sequence to consider when predicting the next stock price
time_step = 100

# Generate the training features (X) and labels (Y) using the 'create_dataset' function
X, Y = create_dataset(data, time_step)  # X: shape (samples, 100); Y: next value for each input sequence

# Reshape X to be [samples, time steps, features]
X = X.reshape(
    X.shape[0],    # Number of samples (windows)
    X.shape[1],    # Time steps per sample
    1              # One feature (e.g., closing price)
)

# Now X is 3D, as required by LSTM or similar deep learning models
print("Shape of X:",X.shape) ## Expected: (Number of samples, time_step, 1)
print("Shape of Y:",Y.shape) ## Expected: (Number of samples,)

Shape of X: (1899, 100, 1)
Shape of Y: (1899,)


# MultiheadSelfAttention

In [29]:
from tensorflow.keras.layers import Layer, Dense
import tensorflow as tf

class MultiHeadSelfAttention(Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()  # Initialize the parent Layer class

        self.embed_dim = embed_dim                     # Total embedding dimension of the model (e.g., 512)
        self.num_heads = num_heads                     # Number of attention heads (e.g., 8)
        self.projection_dim = embed_dim // num_heads   # Dimensionality of each head (e.g., 64 if 512/8)

        # 💡 Dense layers to generate queries, keys, and values from the input
        self.query_dense = Dense(embed_dim)            # Linear layer to create the query matrix
        self.key_dense = Dense(embed_dim)
        self.value_dense = Dense(embed_dim)            # Linear layer to create the value matrix

        # 💡 After attention, all heads are concatenated and projected back to original embed_dim
        self.combine_heads = Dense(embed_dim)          # Linear layer to combine outputs from all heads

    # The attention() method implements the core attention mechanism.
    def attention(self, query, key, value):
        # Compute the raw attention scores by taking the dot product of query and the transpose of key
        score = tf.matmul(query, key, transpose_b=True)

        # Scale the scores by the square root of the key dimension (to stabilize gradients)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)

        # Apply softmax to obtain attention weights (probabilities that sum to 1)
        weights = tf.nn.softmax(scaled_score, axis=-1)

        # Multiply the weights with the value vectors to get the final output
        output = tf.matmul(weights, value)

        return output, weights  # Return both the attention output and the weights

    # Helper method to split the last dimension into multiple heads.
    def split_heads(self, x, batch_size):
        # Reshape the tensor to [batch_size, sequence_length, num_heads, projection_dim].
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        # Transpose to [batch_size, num_heads, sequence_length, projection_dim]
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # Get the current batch size from the input.
        batch_size = tf.shape(inputs)[0]

        # Project the inputs into query, key, and value
        query = self.query_dense(inputs)  # (batch_size, sequence_length, embed_dim)
        key = self.key_dense(inputs)      # Same shape as query
        value = self.value_dense(inputs)  # Same shape as query

        # Split the query, key, and value into multiple heads
        query = self.split_heads(query, batch_size)  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.split_heads(key, batch_size)
        value = self.split_heads(value, batch_size)

        # Compute attention: apply dot-product attention
        attention, _ = self.attention(query, key, value)

        # Transpose back to (batch_size, sequence_length, num_heads, projection_dim)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        # Concatenate all heads back together
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))

        # Final dense layer to combine all heads’ outputs
        output = self.combine_heads(concat_attention)

        return output
