# PM2.5 Forecasting - Optimized LSTM Model

This notebook implements a time series forecasting model for PM2.5 concentrations. It combines a stacked LSTM architecture with feature engineering (lagged and rolling-mean PM2.5 values) and denormalization of predictions, with the goal of improving RMSE on the Kaggle leaderboard.

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from math import sqrt


In [None]:
# Load the competition files. For the train and test frames the 'datetime'
# column is parsed into timestamps and promoted to the index; the sample
# submission is read as-is and later used as the output template.
train = pd.read_csv('train.csv', parse_dates=['datetime']).set_index('datetime')
test = pd.read_csv('test.csv', parse_dates=['datetime']).set_index('datetime')
sample_submission = pd.read_csv('sample_submission.csv')


In [None]:
# Fill gaps in the target by interpolating along the datetime index, then
# discard any rows whose target is still missing afterwards.
train['pm2.5'] = train['pm2.5'].interpolate(method='time')
train = train.dropna(subset=['pm2.5'])

# Autoregressive features built from the target itself: the previous row's
# value and a 3-row rolling mean (which includes the current row). The first
# rows have no complete history, so the NaNs they produce are dropped --
# along with any row that has a NaN in any other column.
train = train.assign(**{
    'pm2.5_lag1': train['pm2.5'].shift(1),
    'pm2.5_roll3': train['pm2.5'].rolling(window=3).mean(),
}).dropna()

# Model inputs: weather readings, wind-direction indicator columns, and the
# two pm2.5-derived features created above.
features = (
    ['DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir']
    + ['cbwd_NW', 'cbwd_SE', 'cbwd_cv']
    + ['pm2.5_lag1', 'pm2.5_roll3']
)

# Min-max scale the inputs only. The target is appended as the LAST column
# in its original units, so the model is trained to predict raw pm2.5.
scaler = MinMaxScaler().fit(train[features])
scaled_features = scaler.transform(train[features])
combined = np.column_stack([scaled_features, train['pm2.5'].to_numpy()])


In [None]:
def create_sequences(data, seq_length=48):
    """Slice a 2-D array into overlapping input windows and next-step targets.

    The last column of ``data`` is treated as the target; every other column
    is an input feature. Window ``i`` holds the features of rows
    ``i .. i + seq_length - 1`` and is paired with the target value found in
    row ``i + seq_length`` (the row immediately after the window).

    Args:
        data: Array-like of shape ``(n_rows, n_features + 1)``.
        seq_length: Number of consecutive rows per window. Must be >= 1.

    Returns:
        A tuple ``(X, y)`` where ``X`` has shape
        ``(n_windows, seq_length, n_features)`` and ``y`` has shape
        ``(n_windows,)``, with ``n_windows = max(n_rows - seq_length, 0)``.
        When there are too few rows for a single window, both arrays are
        empty but keep these ranks, so callers can still read ``X.shape[2]``.

    Raises:
        ValueError: If ``seq_length`` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be >= 1, got {seq_length}")

    data = np.asarray(data)
    n_windows = len(data) - seq_length
    n_features = max(data.shape[1] - 1, 0)

    if n_windows <= 0:
        # Not enough rows for even one (window, target) pair: return empty
        # arrays of the correct rank instead of shape-(0,) arrays.
        return (
            np.empty((0, seq_length, n_features), dtype=data.dtype),
            np.empty((0,), dtype=data.dtype),
        )

    X = np.stack([data[start:start + seq_length, :-1] for start in range(n_windows)])
    # The target for window i is row i + seq_length, i.e. every row from
    # seq_length onwards; copy so the result does not alias `data`.
    y = data[seq_length:, -1].copy()
    return X, y

# Window length in rows. The test cell slices its windows with this same
# variable, so it is passed explicitly here instead of relying on the
# function's default: changing it in one place then changes it everywhere.
seq_length = 48
X, y = create_sequences(combined, seq_length=seq_length)

# Hold out the last 20% of windows (in row order, no shuffling) for
# validation; the first 80% are used for training.
split = int(0.8 * len(X))
X_train, X_val = X[:split], X[split:]
y_train, y_val = y[:split], y[split:]


In [None]:
# Training callbacks, both watching the validation loss:
#   es -- stop after 7 epochs without improvement and restore the best weights
#   lr -- multiply the learning rate by 0.5 after 3 epochs without improvement
es = EarlyStopping(monitor='val_loss', patience=7, restore_best_weights=True)
lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3)

# Two stacked LSTM layers (128 then 64 units), each followed by 30% dropout,
# feeding a single-unit Dense output. The first LSTM returns the full sequence
# so the second one receives one vector per time step.
model = Sequential()
model.add(LSTM(128, return_sequences=True, input_shape=X_train.shape[1:]))
model.add(Dropout(0.3))
model.add(LSTM(64))
model.add(Dropout(0.3))
model.add(Dense(1))

model.compile(optimizer='adam', loss='mse')

history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=50,
    batch_size=64,
    callbacks=[es, lr],
)


In [None]:
# Score the held-out windows: RMSE is the square root of the mean squared
# error between the validation targets and the model's predictions.
val_preds = model.predict(X_val)
rmse = sqrt(mean_squared_error(y_val, val_preds))
print("Validation RMSE:", rmse)

# Overlay the first 200 validation targets and predictions for a visual check.
fig, ax = plt.subplots(figsize=(10, 4))
for series, label in ((y_val, 'Actual'), (val_preds, 'Predicted')):
    ax.plot(series[:200], label=label)
ax.legend()
ax.set_title("Sample Validation Predictions")
plt.show()


In [None]:
# --- Forecast the test period and write the submission ----------------------
#
# Differences from a plain "scale test, window it, predict" pass, and why:
#
# 1. 'pm2.5_lag1' and 'pm2.5_roll3' are only ever created on `train`, because
#    they are derived from the target. They are therefore rebuilt here one
#    step at a time from the model's own predictions (recursive forecasting).
# 2. The window that predicts the first test row is made of the last
#    `seq_length` training rows, so every test row gets a prediction.
#    NOTE(review): this assumes the training rows immediately precede the
#    test rows in time -- confirm against the data.
# 3. The model was trained on the raw (unscaled) pm2.5 column, so its outputs
#    are already in the original units and must NOT be inverse-transformed.
#
# NOTE(review): a NaN in any test feature would propagate through every later
# recursive step -- confirm the test features are complete.
derived_cols = ['pm2.5_lag1', 'pm2.5_roll3']
missing_cols = [
    col for col in features
    if col not in derived_cols and col not in test.columns
]
if missing_cols:
    raise KeyError(f"test data is missing feature columns: {missing_cols}")

lag_idx = features.index('pm2.5_lag1')
roll_idx = features.index('pm2.5_roll3')

# reindex() keeps the column order of `features`; the derived columns come out
# as NaN placeholders, which MinMaxScaler.transform passes through unchanged.
test_scaled = scaler.transform(test.reindex(columns=features))

# Scaled context = last observed training window followed by the test rows.
# The window for test row `step` is context[step : step + seq_length].
train_tail = scaler.transform(train[features].iloc[-seq_length:])
context = np.vstack([train_tail, test_scaled]).astype('float32')

# Last two known pm2.5 values, needed for the lag-1 and 3-row rolling mean.
prev2, prev1 = train['pm2.5'].iloc[-2:].tolist()

test_preds = np.empty(len(test_scaled), dtype=float)
for step in range(len(test_scaled)):
    window = context[step:step + seq_length]
    # Direct call instead of model.predict(): much cheaper for the
    # one-sample-at-a-time calls this loop has to make.
    pred = float(np.asarray(model(window[np.newaxis], training=False))[0, 0])
    test_preds[step] = pred

    # Now that this row has a pm2.5 estimate, fill in its derived features
    # using the fitted scaling (x_scaled = x * scale_ + min_) so that later
    # windows containing this row are complete.
    row = context[step + seq_length]
    row[lag_idx] = prev1 * scaler.scale_[lag_idx] + scaler.min_[lag_idx]
    row[roll_idx] = (
        (prev2 + prev1 + pred) / 3 * scaler.scale_[roll_idx]
        + scaler.min_[roll_idx]
    )
    prev2, prev1 = prev1, pred

sample_submission['pm2.5'] = test_preds[:len(sample_submission)]

sample_submission.to_csv('submission.csv', index=False)
sample_submission.head()
