In [None]:
from tensorflow.keras import layers, models, callbacks
from tensorflow.keras import backend as K

# ---------------------
# 5. Transformer Model Implementation and Training
# ---------------------

# Define Positional Encoding
def get_positional_encoding(sequence_length, d_model):
    """Build the sinusoidal positional-encoding table as a float32 tensor.

    Args:
        sequence_length: Number of positions (rows of the table).
        d_model: Width of each position vector (columns of the table).

    Returns:
        A ``tf.float32`` tensor of shape ``(1, sequence_length, d_model)``
        holding the sine of the angle in even columns and the cosine in
        odd columns.
    """
    positions = np.arange(sequence_length)[:, np.newaxis]
    dims = np.arange(d_model)[np.newaxis, :]
    angles = get_angles(positions, dims, d_model)

    # Even columns (2i) carry the sine, odd columns (2i+1) the cosine.
    is_even_column = (dims % 2) == 0
    table = np.where(is_even_column, np.sin(angles), np.cos(angles))

    # Prepend a size-1 leading axis before converting to a tensor.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

def get_angles(pos, i, d_model):
    """Return the positional-encoding angle for each (position, dimension) pair.

    Computes ``pos * 1 / 10000 ** (2 * (i // 2) / d_model)``.

    Args:
        pos: Position index or array of position indices.
        i: Embedding-dimension index or array of dimension indices.
        d_model: Embedding width used to scale the exponent.

    Returns:
        ``pos`` multiplied by the per-dimension rates (NumPy broadcasting
        applies when ``pos`` and ``i`` are arrays).
    """
    # Floor division makes dimensions 2k and 2k+1 share the same exponent.
    pair_index = i // 2
    exponent = (2 * pair_index) / np.float32(d_model)
    rates = 1 / np.power(10000, exponent)
    return pos * rates

# Define the Transformer Block
class TransformerBlock(layers.Layer):
    """Self-attention followed by a feed-forward network, each wrapped in
    dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Size of the last axis of the inputs; the feed-forward
            network projects back to this width so the residual sums line up.
        num_heads: Number of attention heads. Each head uses
            ``key_dim = embed_dim // num_heads``.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied to the attention and feed-forward outputs.
        **kwargs: Forwarded to ``layers.Layer`` (for example ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim // num_heads)
        self.ffn = models.Sequential([
            layers.Dense(ff_dim, activation='relu'),
            layers.Dense(embed_dim),
        ])

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value for self-attention.
            training: Forwarded to the dropout layers. Defaults to ``None`` so
                the layer can be invoked as ``block(x)`` without the caller
                (or the framework) having to supply the flag.

        Returns:
            The layer-normalized output of the second residual connection.
        """
        # Self-attention: the sequence attends to itself.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)

        # Position-wise feed-forward network with its own residual path.
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

# Define the Token and Position Embedding
class TokenAndPositionEmbedding(layers.Layer):
    """Project input features to ``embed_dim`` and add a fixed positional encoding.

    Args:
        sequence_length: Number of positions precomputed in the encoding table.
        vocab_size: Accepted but not used by this layer.
        embed_dim: Output width of the Dense projection and of the encoding table.
    """

    def __init__(self, sequence_length, vocab_size, embed_dim):
        super().__init__()
        # A Dense projection stands in for a token-embedding lookup here.
        self.token_emb = layers.Dense(embed_dim)
        # Precomputed table from get_positional_encoding; not a trainable layer.
        self.pos_emb = get_positional_encoding(sequence_length, embed_dim)

    def call(self, x):
        """Return the projected inputs plus the positional encoding."""
        projected = self.token_emb(x)
        # Slice the table to the number of steps actually present on axis 1.
        steps = tf.shape(projected)[1]
        return projected + self.pos_emb[:, :steps, :]

# Build the Transformer Model
def build_transformer_model(sequence_length, feature_dim, embed_dim, num_heads, ff_dim, num_layers, dropout_rate=0.1):
    """Build a Transformer-encoder regression model with a single output head.

    Args:
        sequence_length: Number of time steps in each input sample.
        feature_dim: Number of features per time step.
        embed_dim: Width the features are projected to before the encoder.
        num_heads: Number of attention heads in every Transformer block.
        ff_dim: Hidden width of each block's feed-forward network and of the
            regression head's hidden layer.
        num_layers: Number of stacked Transformer blocks.
        dropout_rate: Dropout rate used inside the blocks and before the head.

    Returns:
        An uncompiled Keras ``Model`` mapping inputs of shape
        ``(sequence_length, feature_dim)`` to one output produced by the layer
        named ``"usage_minutes_output"``.
    """
    inputs = layers.Input(shape=(sequence_length, feature_dim))
    x = TokenAndPositionEmbedding(sequence_length, feature_dim, embed_dim)(inputs)

    for _ in range(num_layers):
        x = TransformerBlock(embed_dim, num_heads, ff_dim, dropout_rate)(x)

    # Collapse the time axis, then regularize before the regression head.
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dropout(dropout_rate)(x)

    usage_minutes_head = layers.Dense(ff_dim, activation="relu")(x)
    # Exactly one layer may carry this name: Keras rejects duplicate layer
    # names when the Model is constructed, and the loss/metric dictionaries
    # passed to compile()/fit() are keyed by it.
    usage_minutes_output = layers.Dense(1, activation="linear", name="usage_minutes_output")(usage_minutes_head)

    return models.Model(inputs=inputs, outputs=[usage_minutes_output])



# Define model hyperparameters
EMBED_DIM = 64  # Embedding size for each time step
NUM_HEADS = 4  # Number of attention heads
FF_DIM = 128  # Hidden layer size in the feed-forward network
NUM_LAYERS = 1  # Number of Transformer blocks
DROPOUT_RATE = 0.1  # Dropout rate inside the blocks and before the head

# Derive both input dimensions from the data instead of hard-coding the
# sequence length: the model's Input layer is (sequence_length, feature_dim),
# so it has to match axes 1 and 2 of X.
SEQUENCE_LENGTH = X.shape[1]  # Number of time steps per sample
feature_dim = X.shape[2]  # Number of features in input data

# Build the model
transformer_model = build_transformer_model(
    sequence_length=SEQUENCE_LENGTH,
    feature_dim=feature_dim,
    embed_dim=EMBED_DIM,
    num_heads=NUM_HEADS,
    ff_dim=FF_DIM,
    num_layers=NUM_LAYERS,
    dropout_rate=DROPOUT_RATE
)


        
        
        
        
        
        

In [None]:
# Configure training: Adam optimizer with MSE loss and MAE metric.
# Each dictionary is keyed by the name of the model's output layer.
# (A second 'priority_score_output' head is currently disabled.)
transformer_model.compile(
    optimizer='adam',
    loss=dict(usage_minutes_output='mse'),
    loss_weights=dict(usage_minutes_output=1.0),
    metrics=dict(usage_minutes_output='mae'),
)

# Print the layer-by-layer summary
transformer_model.summary()


In [None]:
from tensorflow.keras.callbacks import TensorBoard
import datetime

# Write TensorBoard logs to a timestamped directory so that each run gets
# its own folder under logs/model/transformer/.
log_dir = "logs/model/transformer/" + datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
tensorboard_callback = TensorBoard(log_dir=log_dir, histogram_freq=1)

# Sanity-check the validation targets by shape instead of dumping the whole array.
print(y_val_usage_minutes.shape)


In [None]:

# Fit the model for 50 epochs with batches of 64 samples, evaluating on
# (X_val, y_val_usage_minutes). Targets are keyed by the output layer's name.
# (A second 'priority_score_output' target is currently disabled.)
history = transformer_model.fit(
    x=X_train,
    y=dict(usage_minutes_output=y_train_usage_minutes),
    validation_data=(X_val, dict(usage_minutes_output=y_val_usage_minutes)),
    batch_size=64,
    epochs=50,
    callbacks=[tensorboard_callback],
    verbose=1,
)

In [None]:
# Loss curves: training vs. validation, one point per epoch
plt.figure(figsize=(10, 5))
for _metric_key, _curve_label in (
    ('loss', 'Training Loss (MSE)'),
    ('val_loss', 'Validation Loss (MSE)'),
):
    plt.plot(history.history[_metric_key], label=_curve_label)
plt.title('Training and Validation Loss Over Epochs')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend()
plt.show()

In [None]:
# Run inference on the validation inputs
usage_pred = transformer_model.predict(x=X_val)

In [None]:
# Show the shape of the predictions, followed by a separator line
print(usage_pred.shape, '**********', sep='\n')

In [None]:
# ================================
# 6. Model Evaluation
# ================================

# Restore the previously saved scalers from disk
usage_minutes_scaler = joblib.load('usage_minutes_scaler.pkl')
timestampScaler = joblib.load('time_scaler.pkl')

# Undo the scaling on predictions and targets; the 1-D inputs are reshaped
# into a single column before being passed to inverse_transform.
usage_minutes_predicted = usage_minutes_scaler.inverse_transform(usage_pred)
actual_usage_minutes = usage_minutes_scaler.inverse_transform(
    y_val_usage_minutes.reshape((-1, 1))
)
actual_timestamp = timestampScaler.inverse_transform(
    timestamps_val2.reshape((-1, 1))
)



In [None]:
# Score the inverse-transformed predictions against the actual usage minutes.
# MAE: mean absolute error.
mae = mean_absolute_error(actual_usage_minutes, usage_minutes_predicted)
# RMSE: square root of the mean squared error.
usage_rmse = np.sqrt(mean_squared_error(actual_usage_minutes, usage_minutes_predicted))
# R2: share of variance explained by the predictions.
usage_r2 = r2_score(actual_usage_minutes, usage_minutes_predicted)

# Report one value per line, in the order MAE, RMSE, R2
print(mae, usage_rmse, usage_r2, sep='\n')