In [61]:
import utils as ui
from sklearn.preprocessing import MinMaxScaler,RobustScaler,StandardScaler
import pandas as pd
import numpy as np
from keras.layers import Bidirectional,Dropout,LSTM,Dense,LeakyReLU
from keras.callbacks import ModelCheckpoint,EarlyStopping
from keras.models import load_model
from keras.models import Model,Sequential
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

In [62]:
df_train= ui.load_csv('Datasets\With Imputation\PPCA Imputation\Ghaziabad_train_PPCA.csv')
df_test= ui.load_csv('Datasets\With Imputation\PPCA Imputation\Ghaziabad_test_PPCA.csv')

In [63]:
features_to_select = ['PM2.5', 'PM10', 'NO2', 'NH3', 'SO2', 'CO', 'Ozone', 'Temp', 'RH','WS','WD']

In [64]:
df_train[features_to_select],scaler=ui.scaling(df_train,features_to_scale=features_to_select,scaler=RobustScaler(),s=True,r=True)
df_test[features_to_select]=ui.scaling(df_test,features_to_scale=features_to_select,scaler=RobustScaler())

Max value of scaled 'PM2.5': 7.42344660129255
Min value of scaled 'PM2.5': -0.7135677136358749
count    28742.000000
mean         0.344936
std          1.037664
min         -0.713568
25%         -0.343425
50%          0.000000
75%          0.656575
max          7.423447
Name: PM2.5, dtype: float64


In [65]:
df_train_final, df_val = train_test_split(df_train, test_size=0.2, shuffle=False)

In [66]:
X_train, y_train = ui.data_formating(df_train_final)
X_val, y_val = ui.data_formating(df_val)
X_test, y_test = ui.data_formating(df_test)

X shape == (22945, 48, 14).
Y shape == (22945, 1).
X shape == (5701, 48, 14).
Y shape == (5701, 1).
X shape == (5147, 48, 14).
Y shape == (5147, 1).


In [67]:
def LSTM_Model(X,Y):

    model = Sequential()
    model.add(Bidirectional(LSTM(64, input_shape=(X.shape[1],X.shape[2]), return_sequences=  False)))
    model.add(LeakyReLU(0.69))
    model.add(Dropout(0.2))
    model.add(Dense(Y.shape[1],activation='linear'))
    model.compile(optimizer='adam', loss='mse')

    cp=ModelCheckpoint('models/model_epoch_{epoch:02d}.h5',save_best_only=True)
    early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

    # fit the model
    history = model.fit(X,Y, epochs=15, batch_size=24, validation_split=0.2,shuffle=False, verbose=1,callbacks=[cp,early_stopping])
    # history = model.fit(X,Y, epochs=15, batch_size=48, validation_split=0.1, verbose=1,callbacks=[cp,early_stopping])
    model.summary()

    # Print final training and validation loss
    print("Final training loss:", history.history['loss'][-1])
    print("Final validation loss:", history.history['val_loss'][-1])

    ui.plot_loss(history)

    ask = input("Do You Want To Continue? (Yes/No) ").strip().lower()
    i=0
    while ask=='yes':

        cp=ModelCheckpoint('models/model_epoch_'+ str(i) +'_{epoch:02d}.h5',save_best_only=True)

        # Continue training
        history = model.fit(X, Y, epochs=5, batch_size=24, validation_split=0.2,shuffle=False, verbose=1, callbacks=[cp, early_stopping])
        # history = model.fit(X, Y, epochs=5, batch_size=48, validation_split=0.1, verbose=1, callbacks=[cp, early_stopping])
        
        # Print final training and validation loss
        print("Final training loss:", history.history['loss'][-1])
        print("Final validation loss:", history.history['val_loss'][-1])

        ui.plot_loss(history)
        
        # Ask again
        ask = input("Do You Want To Continue? (Yes/No) ").strip().lower()
        i=i+1

    return model

In [68]:
def XGB_Model(X,Y):

    X = X.reshape(X.shape[0], -1)

    model = XGBRegressor(
        n_estimators=100,
        max_depth=3,
        learning_rate=0.1,
        verbosity=0,
        random_state=42
    )

    X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=40,shuffle=False)

    # Train the model
    model.fit(X_train, y_train)

    # Make predictions
    predictions = model.predict(X_test)

    # Calculate MSE
    mse = mean_squared_error(y_test, predictions, squared=False)
    print("Mean Squared Error:", mse)

    # Calculating the RMSE
    rmse = sqrt(mse)
    print("Root Mean Squared Error:", rmse)

    return model


In [69]:
lstm_model = LSTM_Model(X_train, y_train)
xgb_model = XGB_Model(X_train, y_train)  # make XGB_Model accept X, y args

Epoch 1/15


Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_4 (Bidirectio  (None, 128)              40448     
 nal)                                                            
                                                                 
 leaky_re_lu_4 (LeakyReLU)   (None, 128)               0         
                                                                 
 dropout_4 (Dropout)         (None, 128)               0         
                                                                 
 dense_4 (Dense)             (None, 1)                 129       
                                                                 
Total params: 40,577
Trainable params: 40,577
Non-trainable params: 0
______________________

Mean Squared Error: 0.22883917169907694
Root Mean Squared Error: 0.47837137424711873


In [70]:
# Predictions for validation set
lstm_val_preds = lstm_model.predict(X_val)
xgb_val_preds = xgb_model.predict(X_val.reshape(X_val.shape[0], -1))

# Predictions for test set
lstm_test_preds = lstm_model.predict(X_test)
xgb_test_preds = xgb_model.predict(X_test.reshape(X_test.shape[0], -1))




In [71]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from math import sqrt
import numpy as np

# Stack predictions for ensemble training and testing
X_ens_val = np.column_stack((lstm_val_preds, xgb_val_preds))
X_ens_test = np.column_stack((lstm_test_preds, xgb_test_preds))

# Train ensemble model on validation set
ensemble_model = LinearRegression()
ensemble_model.fit(X_ens_val, y_val)

# Predict on test set using ensemble
ensemble_test_preds = ensemble_model.predict(X_ens_test)
# ensemble_test_preds = (X_ens_test[:, 0]*0.6 + 0.2*X_ens_test[:, 1])

# Repeat predictions and y_test to match scaler input shape (11 features)
lstm_unscaled = scaler.inverse_transform(np.repeat(lstm_test_preds.reshape(-1, 1), 11, axis=1))[:, 0]
xgb_unscaled = scaler.inverse_transform(np.repeat(xgb_test_preds.reshape(-1, 1), 11, axis=1))[:, 0]
ensemble_unscaled = scaler.inverse_transform(np.repeat(ensemble_test_preds.reshape(-1, 1), 11, axis=1))[:, 0]
y_test_unscaled = scaler.inverse_transform(np.repeat(y_test.reshape(-1, 1), 11, axis=1))[:, 0]

# Evaluate all models on original scale
lstm_rmse = sqrt(mean_squared_error(y_test_unscaled, lstm_unscaled))
xgb_rmse = sqrt(mean_squared_error(y_test_unscaled, xgb_unscaled))
ensemble_rmse = sqrt(mean_squared_error(y_test_unscaled, ensemble_unscaled))

lstm_mae = mean_absolute_error(y_test_unscaled, lstm_unscaled)
xgb_mae = mean_absolute_error(y_test_unscaled, xgb_unscaled)
ensemble_mae = mean_absolute_error(y_test_unscaled, ensemble_unscaled)

lstm_r2 = r2_score(y_test_unscaled, lstm_unscaled)
xgb_r2 = r2_score(y_test_unscaled, xgb_unscaled)
ensemble_r2 = r2_score(y_test_unscaled, ensemble_unscaled)

# 📊 Print results
print("📊 RMSE, MAE, and R² Comparison on Unscaled Test Set:")
print(f"{'Model':<12} {'RMSE':>10} {'MAE':>10} {'R²':>10}")
print(f"{'LSTM':<12} {lstm_rmse:10.4f} {lstm_mae:10.4f} {lstm_r2:10.4f}")
print(f"{'XGBoost':<12} {xgb_rmse:10.4f} {xgb_mae:10.4f} {xgb_r2:10.4f}")
print(f"{'Ensemble':<12} {ensemble_rmse:10.4f} {ensemble_mae:10.4f} {ensemble_r2:10.4f}")


📊 RMSE, MAE, and R² Comparison on Unscaled Test Set:
Model              RMSE        MAE         R²
LSTM            50.0456    25.4377     0.8889
XGBoost         50.7625    25.5765     0.8857
Ensemble        48.8598    24.7524     0.8941


In [None]:
print(f"\n📌 Ensemble Weights:")
print(f"   LSTM Weight   = {ensemble_model.coef_[0][0]:.4f}")
print(f"   XGBoost Weight= {ensemble_model.coef_[0][1]:.4f}")


📌 Ensemble Weights:
   LSTM Weight   = 0.4449
   XGBoost Weight= 0.5690
