In [2]:
# Import necessary libraries
import numpy as np
import json
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import (
    LSTM, Dense, Dropout, BatchNormalization, 
    Input, Attention, Flatten, Permute, Multiply, Lambda
)
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt

# Configurations and Constants

In [3]:
# --- Data and model artifact locations ---
# Preprocessed data directory.
AI_READY_DATA_DIR = "ai_ready_data"
# Trained Keras model file.
MODEL_SAVE_PATH = "lstm_location_predictor.keras"
# ONNX model file.
ONNX_MODEL_SAVE_PATH = "lstm_location_predictor.onnx"

# --- LSTM model hyperparameters ---
# Units in the first LSTM layer.
LSTM_UNITS_1 = 128
# Units in the second LSTM layer (the layer is only built when this is > 0).
LSTM_UNITS_2 = 64
# Units in the first Dense layer.
DENSE_UNITS_1 = 128
# Units in the second Dense layer (the layer is only built when this is > 0).
DENSE_UNITS_2 = 64
# Dropout rate used for regularization.
DROPOUT_RATE = 0.3
# Learning rate handed to the Adam optimizer.
LEARNING_RATE = 0.001
# Number of samples per gradient update.
BATCH_SIZE = 64
# Upper bound on training epochs (EarlyStopping will monitor).
EPOCHS = 50
# Toggle for the attention mechanism (listed as "optional" in the proposal).
USE_ATTENTION = False

# Load the Preprocessed Data

In [4]:
def _load_optional_array(path):
    """Load a ``.npy`` file, or return an empty array if it is missing or zero bytes."""
    if os.path.exists(path) and os.path.getsize(path) > 0:
        return np.load(path)
    return np.array([])


def load_data(data_dir):
    """Load the train/validation/test arrays and the grid-id lookup table.

    Args:
        data_dir: Directory containing ``X_train.npy``, ``y_train.npy`` and
            ``grid_id_to_index.json`` (all required), plus the optional
            ``X_val.npy``, ``y_val.npy``, ``X_test.npy`` and ``y_test.npy``.

    Returns:
        The tuple ``(X_train, y_train, X_val, y_val, X_test, y_test,
        grid_id_to_index)``. Any validation/test file that is missing or
        zero bytes long is returned as an empty array (``size == 0``).

    Raises:
        FileNotFoundError: If a required file does not exist.
        Exception: Any other loading error, re-raised after a message is
            printed.
    """
    try:
        X_train = np.load(os.path.join(data_dir, "X_train.npy"))
        y_train = np.load(os.path.join(data_dir, "y_train.npy"))

        # Validation/test sets may be absent or empty if the data split was small.
        X_val = _load_optional_array(os.path.join(data_dir, "X_val.npy"))
        y_val = _load_optional_array(os.path.join(data_dir, "y_val.npy"))
        X_test = _load_optional_array(os.path.join(data_dir, "X_test.npy"))
        y_test = _load_optional_array(os.path.join(data_dir, "y_test.npy"))

        # Explicit encoding so the result does not depend on the platform default.
        with open(os.path.join(data_dir, "grid_id_to_index.json"), "r", encoding="utf-8") as f:
            grid_id_to_index = json.load(f)

        print("Data loaded successfully.")
        print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
        if X_val.size > 0:
            print(f"X_val shape: {X_val.shape}, y_val shape: {y_val.shape}")
        if X_test.size > 0:
            print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

        return X_train, y_train, X_val, y_val, X_test, y_test, grid_id_to_index
    except FileNotFoundError as e:
        print(f"Error: Data file not found. {e}")
        # Re-raise instead of calling exit(): inside a Jupyter kernel exit()
        # does not raise, so the function would fall through and return None,
        # and in a plain script it would report success (status 0) on failure.
        raise
    except Exception as e:
        print(f"An error occurred while loading data: {e}")
        raise

# Define Model Architecture

In [None]:
def build_model(sequence_length, num_features, num_classes):
    """Build and compile the LSTM classifier.

    The layer stack is controlled by the module-level hyperparameters:

    * ``USE_ATTENTION`` is False and ``LSTM_UNITS_2 > 0``: two stacked LSTMs.
    * ``USE_ATTENTION`` is False and ``LSTM_UNITS_2 <= 0``: a single LSTM.
    * ``USE_ATTENTION`` is True: one LSTM followed by temporal attention that
      collapses the sequence into a single vector. The second LSTM is skipped
      (with a printed warning when ``LSTM_UNITS_2 > 0``), because the
      attention output is no longer a sequence.

    Each LSTM/Dense block is followed by batch normalization and dropout.
    The second Dense block is only added when ``DENSE_UNITS_2 > 0``.

    Args:
        sequence_length: Number of timesteps in each input sequence.
        num_features: Number of features per timestep.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled Keras ``Model`` (Adam optimizer, sparse categorical
        cross-entropy loss for integer targets, accuracy metric).
    """
    print("Building LSTM model...")

    inputs = Input(shape=(sequence_length, num_features))

    # First LSTM layer. It must emit the full sequence whenever a later layer
    # (second LSTM or attention) consumes per-timestep outputs.
    x = LSTM(LSTM_UNITS_1, return_sequences=(LSTM_UNITS_2 > 0 or USE_ATTENTION))(inputs)
    x = BatchNormalization()(x)
    x = Dropout(DROPOUT_RATE)(x)

    if USE_ATTENTION:
        # Temporal attention over the LSTM outputs.
        # x has shape (batch, sequence_length, LSTM_UNITS_1).
        # One unnormalized score per timestep -> (batch, sequence_length, 1).
        attention_scores = Dense(1, name='attention_score')(x)
        # Normalize across the time axis so the weights of a sequence sum to 1.
        attention_weights = tf.keras.layers.Softmax(axis=1, name='attention_vec')(attention_scores)
        # Weighted sum over timesteps: contracting axis 1 of both tensors
        # yields (batch, 1, LSTM_UNITS_1); Flatten drops the singleton axis.
        context = tf.keras.layers.Dot(axes=1, name='attention_context')([attention_weights, x])
        x = Flatten()(context)

        if LSTM_UNITS_2 > 0:
            # The attention output is 2D, so a second LSTM (which needs a 3D
            # sequence input) cannot follow it; attention feeds the Dense head.
            print("Warning: Using Attention with a second LSTM layer requires careful architecture. Current setup assumes Attention is followed by Dense layers.")
    elif LSTM_UNITS_2 > 0:
        # No attention: standard stacked LSTM. The last recurrent layer
        # returns only its final state for the Dense head.
        x = LSTM(LSTM_UNITS_2, return_sequences=False)(x)
        x = BatchNormalization()(x)
        x = Dropout(DROPOUT_RATE)(x)

    # Dense layers
    x = Dense(DENSE_UNITS_1, activation='relu')(x)
    x = BatchNormalization()(x)
    x = Dropout(DROPOUT_RATE)(x)

    if DENSE_UNITS_2 > 0:
        x = Dense(DENSE_UNITS_2, activation='relu')(x)
        x = BatchNormalization()(x)
        x = Dropout(DROPOUT_RATE)(x)

    # Output layer
    outputs = Dense(num_classes, activation='softmax')(x)

    model = Model(inputs=inputs, outputs=outputs)

    # Compile the model
    optimizer = Adam(learning_rate=LEARNING_RATE)
    model.compile(optimizer=optimizer,
                  loss='sparse_categorical_crossentropy',  # Use this for integer targets
                  metrics=['accuracy'])

    print("Model built and compiled successfully.")
    model.summary()
    return model