In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
import joblib
import os

print("--- LSTM Model Training Script ---")

# --- Configuration ---
N_PAST = 72
N_FUTURE = 72
DATA_PATH = "/Users/manishwagle/Desktop/Air Quality Intelligence/data/processed/processed_data.csv"
MODEL_DIR = 'models'

os.makedirs(MODEL_DIR, exist_ok=True)
LSTM_MODEL_PATH = os.path.join(MODEL_DIR, 'lstm_model.keras')
SCALER_PATH = os.path.join(MODEL_DIR, 'scaler.pkl')

# --- 1. Load and Prepare Data ---
print(f"Loading data from {DATA_PATH}...")
try:
    df = pd.read_csv(DATA_PATH, parse_dates=['Datetime'], index_col='Datetime')
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_PATH}.")
    exit()

if 'AQI' in df.columns:
    cols = ['AQI'] + [col for col in df.columns if col != 'AQI']
    df = df[cols]
else:
    print("Error: 'AQI' column not found.")
    exit()

print("Original data shape:", df.shape)

# --- 2. Scale the Data ---
print("Scaling data using MinMaxScaler...")
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)

joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved to {SCALER_PATH}")

# --- 3. Create Sequences ---
def create_sequences(data, n_past, n_future):
    X, y = [], []
    for i in range(len(data) - n_past - n_future + 1):
        X.append(data[i : i + n_past])
        y.append(data[i + n_past : i + n_past + n_future, 0])
    return np.array(X), np.array(y)

print(f"Creating sequences...")
X, y = create_sequences(scaled_data, N_PAST, N_FUTURE)
print("Shape of X (input sequences):", X.shape)
print("Shape of y (output sequences):", y.shape)

# We will use 80% of the sequences for training and 20% for validation.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData split into training and validation sets:")
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

# --- 4. Build and Train the LSTM Model ---
print("\nBuilding the LSTM model...")
# 
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(100, activation='tanh', input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(50, activation='tanh', return_sequences=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(N_FUTURE)
])

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

# EarlyStopping will monitor the validation loss and stop training if it doesn't improve for 5 straight epochs.
# It will also restore the best weights found during training.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

print("\nTraining the model with Early Stopping...")
# We now train for more epochs, knowing Early Stopping will find the best one.
# We also provide the validation data to monitor performance.
history = model.fit(
    X_train, y_train,
    epochs=50,  # Train for up to 50 epochs
    batch_size=32,
    validation_data=(X_val, y_val), # Provide validation data
    callbacks=[early_stopping],     # Use the early stopping callback
    verbose=1
)
print("Model training complete.")

# --- 5. Save the Trained Model ---
model.save(LSTM_MODEL_PATH)
print(f"Best model saved to {LSTM_MODEL_PATH}")
print("\n--- Script Finished Successfully ---")


--- LSTM Model Training Script ---
Loading data from /Users/manishwagle/Desktop/Air Quality Intelligence/data/processed/processed_data.csv...
Original data shape: (27337, 19)
Scaling data using MinMaxScaler...
Scaler saved to models/scaler.pkl
Creating sequences...
Shape of X (input sequences): (27194, 72, 19)
Shape of y (output sequences): (27194, 72)

Data split into training and validation sets:
X_train shape: (21755, 72, 19)
X_val shape: (5439, 72, 19)

Building the LSTM model...


  super().__init__(**kwargs)



Training the model with Early Stopping...
Epoch 1/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 39ms/step - loss: 0.0186 - val_loss: 0.0089
Epoch 2/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0099 - val_loss: 0.0078
Epoch 3/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0082 - val_loss: 0.0066
Epoch 4/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 0.0073 - val_loss: 0.0064
Epoch 5/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 39ms/step - loss: 0.0065 - val_loss: 0.0057
Epoch 6/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 0.0059 - val_loss: 0.0050
Epoch 7/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 0.0054 - val_loss: 0.0048
Epoch 8/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0049 

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib
import os

print("--- LSTM Model Training Script ---")

# --- Configuration ---
N_PAST = 72
N_FUTURE = 72
DATA_PATH = "/Users/manishwagle/Desktop/Air Quality Intelligence/data/processed/processed_data.csv"
MODEL_DIR = 'models'

os.makedirs(MODEL_DIR, exist_ok=True)
LSTM_MODEL_PATH = os.path.join(MODEL_DIR, 'lstm_model.keras')
SCALER_PATH = os.path.join(MODEL_DIR, 'scaler.pkl')

# --- 1. Load and Prepare Data ---
print(f"Loading data from {DATA_PATH}...")
try:
    df = pd.read_csv(DATA_PATH, parse_dates=['Datetime'], index_col='Datetime')
except FileNotFoundError:
    print(f"Error: Data file not found at {DATA_PATH}.")
    exit()

if 'AQI' in df.columns:
    cols = ['AQI'] + [col for col in df.columns if col != 'AQI']
    df = df[cols]
else:
    print("Error: 'AQI' column not found.")
    exit()

print("Original data shape:", df.shape)

# --- 2. Scale the Data ---
print("Scaling data using MinMaxScaler...")
scaler = MinMaxScaler(feature_range=(0, 1))
scaled_data = scaler.fit_transform(df)

joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved to {SCALER_PATH}")

# --- 3. Create Sequences ---
def create_sequences(data, n_past, n_future):
    X, y = [], []
    for i in range(len(data) - n_past - n_future + 1):
        X.append(data[i : i + n_past])
        y.append(data[i + n_past : i + n_past + n_future, 0])
    return np.array(X), np.array(y)

print(f"Creating sequences...")
X, y = create_sequences(scaled_data, N_PAST, N_FUTURE)
print("Shape of X (input sequences):", X.shape)
print("Shape of y (output sequences):", y.shape)

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
print("\nData split into training and validation sets:")
print("X_train shape:", X_train.shape)
print("X_val shape:", X_val.shape)

# --- 4. Build and Train the LSTM Model ---
print("\nBuilding the LSTM model...")
model = tf.keras.models.Sequential([
    tf.keras.layers.LSTM(100, activation='tanh', input_shape=(X.shape[1], X.shape[2]), return_sequences=True),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.LSTM(50, activation='tanh', return_sequences=False),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(N_FUTURE)
])

model.compile(optimizer='adam', loss='mean_squared_error')
model.summary()

early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

print("\nTraining the model with Early Stopping...")
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping],
    verbose=1
)
print("Model training complete.")

# --- 5. Evaluate and Save the Model ---
print("\nEvaluating model performance on the validation set...")

# Make predictions on the validation data
y_pred_scaled = model.predict(X_val)

# The scaler was fitted on a multi-feature dataset. To inverse transform our
# single-feature predictions (AQI), we need to create a dummy array that
# matches the original shape the scaler was fitted on.
n_features = scaled_data.shape[1]

# --- Inverse transform the predictions ---
y_pred_dummy = np.zeros((y_pred_scaled.shape[0] * N_FUTURE, n_features))
y_pred_dummy[:, 0] = y_pred_scaled.flatten()
y_pred_inv = scaler.inverse_transform(y_pred_dummy)[:, 0]
y_pred_inv = y_pred_inv.reshape(y_pred_scaled.shape)

# --- Inverse transform the actual values ---
y_val_dummy = np.zeros((y_val.shape[0] * N_FUTURE, n_features))
y_val_dummy[:, 0] = y_val.flatten()
y_val_inv = scaler.inverse_transform(y_val_dummy)[:, 0]
y_val_inv = y_val_inv.reshape(y_val.shape)

# --- Calculate Metrics ---
# Note: These metrics are averaged over all 72 future time steps.
mae = mean_absolute_error(y_val_inv, y_pred_inv)
mse = mean_squared_error(y_val_inv, y_pred_inv)
rmse = np.sqrt(mse)
r2 = r2_score(y_val_inv, y_pred_inv)

print(f"  -> Validation MAE (Mean Absolute Error):      {mae:.4f}")
print(f"  -> Validation MSE (Mean Squared Error):       {mse:.4f}")
print(f"  -> Validation RMSE (Root Mean Squared Error): {rmse:.4f}")
print(f"  -> Validation R² (R-squared):                 {r2:.4f}")

# --- 6. Save the Trained Model ---
#model.save(LSTM_MODEL_PATH)
#print(f"\nBest model saved to {LSTM_MODEL_PATH}")
#print("\n--- Script Finished Successfully ---")

--- LSTM Model Training Script ---
Loading data from /Users/manishwagle/Desktop/Air Quality Intelligence/data/processed/processed_data.csv...
Original data shape: (27337, 19)
Scaling data using MinMaxScaler...
Scaler saved to models/scaler.pkl
Creating sequences...
Shape of X (input sequences): (27194, 72, 19)
Shape of y (output sequences): (27194, 72)

Data split into training and validation sets:
X_train shape: (21755, 72, 19)
X_val shape: (5439, 72, 19)

Building the LSTM model...


  super().__init__(**kwargs)



Training the model with Early Stopping...
Epoch 1/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 39ms/step - loss: 0.0192 - val_loss: 0.0087
Epoch 2/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 0.0098 - val_loss: 0.0073
Epoch 3/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0083 - val_loss: 0.0071
Epoch 4/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 37ms/step - loss: 0.0075 - val_loss: 0.0065
Epoch 5/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 37ms/step - loss: 0.0068 - val_loss: 0.0061
Epoch 6/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0064 - val_loss: 0.0057
Epoch 7/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 38ms/step - loss: 0.0058 - val_loss: 0.0051
Epoch 8/50
[1m680/680[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 39ms/step - loss: 0.0052 