# Beijing Air Quality Forecasting Starter Notebook

In [10]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import tensorflow as tf

In [11]:
# Load the datasets
# train.csv and test.csv are read from the `data/` folder next to this notebook;
# adjust the two relative paths below if your files live somewhere else.
train, test = map(pd.read_csv, ('data/train.csv', 'data/test.csv'))


# Explore the training data

In this section, explore your dataset with appropriate statistics and visualisations to understand your data better. Ensure that you explain the output of every code cell and what it entails.

In [12]:
# Preview the first five rows to see which columns exist and what their values look like.
print("Training Data Overview:")
train.head(n=5)

Training Data Overview:


Unnamed: 0,No,DEWP,TEMP,PRES,Iws,Is,Ir,datetime,cbwd_NW,cbwd_SE,cbwd_cv,pm2.5
0,1,-1.580878,-1.92225,0.443328,-0.441894,-0.069353,-0.137667,2010-01-01 00:00:00,1.448138,-0.732019,-0.522096,
1,2,-1.580878,-2.004228,0.345943,-0.379306,-0.069353,-0.137667,2010-01-01 01:00:00,1.448138,-0.732019,-0.522096,
2,3,-1.580878,-1.92225,0.248559,-0.343514,-0.069353,-0.137667,2010-01-01 02:00:00,1.448138,-0.732019,-0.522096,
3,4,-1.580878,-2.168183,0.248559,-0.280926,-0.069353,-0.137667,2010-01-01 03:00:00,1.448138,-0.732019,-0.522096,
4,5,-1.511594,-2.004228,0.151174,-0.218339,-0.069353,-0.137667,2010-01-01 04:00:00,1.448138,-0.732019,-0.522096,


In [13]:
# Display the column labels of the training frame.
train.columns

Index(['No', 'DEWP', 'TEMP', 'PRES', 'Iws', 'Is', 'Ir', 'datetime', 'cbwd_NW',
       'cbwd_SE', 'cbwd_cv', 'pm2.5'],
      dtype='object')

# Handle missing values


- Check the dataset for missing values and decide how to handle them.
- In this example, missing values are filled with each column's mean, computed separately for the training and test sets. You can experiment with other strategies.

In [14]:
import numpy as np
import pandas as pd

# -------------------------------
# 0. Datetime index, mean imputation, float32 cast
# -------------------------------
def _index_impute_cast(frame):
    """Index `frame` by its parsed 'datetime' column, mean-fill NaNs and cast to float32."""
    indexed = frame.assign(datetime=pd.to_datetime(frame['datetime'])).set_index('datetime')
    # Every gap is filled with the mean of its own column within this frame.
    return indexed.fillna(indexed.mean()).astype(np.float32)

train = _index_impute_cast(train)
test = _index_impute_cast(test)

# -------------------------------
# 1. Time features
# -------------------------------
def create_time_features(df):
    """Add sine/cosine encodings of hour, day-of-week and month to `df`.

    The calendar fields are read from the frame's datetime index. Six columns
    are written onto `df` itself (``hour_sin``, ``hour_cos``, ``dow_sin``,
    ``dow_cos``, ``month_sin``, ``month_cos``) and the same object is returned.
    """
    stamps = df.index
    # (column prefix, raw calendar values, length of one full cycle)
    cycles = (
        ('hour', stamps.hour.values, 24),
        ('dow', stamps.dayofweek.values, 7),
        ('month', stamps.month.values, 12),
    )
    for prefix, values, period in cycles:
        angle = 2*np.pi*values/period
        df[f'{prefix}_sin'] = np.sin(angle)
        df[f'{prefix}_cos'] = np.cos(angle)
    return df

# Add the cyclical calendar columns to both frames. The function writes the
# columns onto the frame it receives and returns that same frame.
train = create_time_features(train)
test = create_time_features(test)

# -------------------------------
# 2. Prepare sequences for LSTM
# -------------------------------
SEQUENCE_LENGTH = 12  # number of consecutive rows in each input window
TARGET = 'pm2.5'      # name of the column to predict

def create_sequences(X, y=None, seq_len=SEQUENCE_LENGTH):
    """Slice a 2-D feature matrix into overlapping windows for a sequence model.

    Window ``i`` holds rows ``i .. i+seq_len-1`` of ``X`` and, when ``y`` is
    given, is paired with ``y[i+seq_len]`` (the value of the row right after
    the window).

    Parameters
    ----------
    X : array-like of shape (n_rows, n_features)
    y : array-like of length >= n_rows, optional
    seq_len : int
        Number of rows per window.

    Returns
    -------
    X_seq : float32 ndarray of shape (n_rows - seq_len, seq_len, n_features)
    y_seq : float32 ndarray of shape (n_rows - seq_len,), or None when ``y`` is None

    When ``X`` has too few rows for a single window both outputs are empty
    arrays (``y_seq`` is still None only when ``y`` is None), so callers get a
    consistent return type.

    Raises
    ------
    IndexError
        If ``y`` is given but has fewer entries than ``X`` has rows.
    """
    X = np.asarray(X)
    n_rows, n_features = X.shape
    n_samples = n_rows - seq_len

    if n_samples <= 0:
        empty_X = np.zeros((0, seq_len, n_features), dtype=np.float32)
        empty_y = np.zeros((0,), dtype=np.float32) if y is not None else None
        return empty_X, empty_y

    # sliding_window_view yields (n_rows - seq_len + 1, n_features, seq_len);
    # the last window has no following row to predict, so it is dropped, and
    # the axes are swapped to the (samples, timesteps, features) layout.
    windows = np.lib.stride_tricks.sliding_window_view(X, seq_len, axis=0)
    X_seq = np.ascontiguousarray(windows[:n_samples].transpose(0, 2, 1), dtype=np.float32)

    if y is None:
        return X_seq, None

    y = np.asarray(y)
    if y.shape[0] < n_rows:
        raise IndexError(
            f"y has {y.shape[0]} entries but X has {n_rows} rows; "
            "every window needs a target"
        )
    y_seq = y[seq_len:seq_len + n_samples].astype(np.float32)
    return X_seq, y_seq

# Training windows: every column except the target is used as a feature.
X_train_vals = train.drop(columns=[TARGET]).values
y_train_vals = train[TARGET].values
X_train, y_train = create_sequences(X_train_vals, y_train_vals, SEQUENCE_LENGTH)

# Test windows: the target column is split off only if the test frame has one.
test_has_target = TARGET in test.columns
X_test_vals = test.drop(columns=[TARGET]).values if test_has_target else test.values
y_test_vals = test[TARGET].values if test_has_target else None
X_test, y_test = create_sequences(X_test_vals, y_test_vals, SEQUENCE_LENGTH)

# Both arrays are already (samples, timesteps, features); no manual reshape is needed.
print(f"X_train shape: {X_train.shape}, X_test shape: {X_test.shape}")


X_train shape: (30664, 12, 16), X_test shape: (13136, 12, 16)


# Separate features and target

- Feel free to drop any non-essential columns that you think might not contribute to modeling.

In [15]:
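# NOTE: template code kept for reference only. Features and target are already
# separated when the sliding-window sequences are built in the cell above.
# X_train = train.drop(['pm2.5', 'No'], axis=1).values 
# y_train = train['pm2.5'].values 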

In [16]:
# Reshape data for LSTM input
# LSTM models require data in the shape (samples, timesteps, features).
# NOTE: template code kept for reference only. X_train already has that shape
# (see the printed shapes above), so no extra "timesteps" dimension is added here.
# X_train = np.expand_dims(X_train, axis=1)

# Build model

Below is a simple LSTM model. Your task is to experiment with different parameters, such as the number of layers, units, activation functions, and optimizers, to get the best-performing model. Experiment with other optimizers (e.g., SGD) or hyperparameters to improve performance.

In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, BatchNormalization, Bidirectional
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint

# =============================
# Stacked Bidirectional + unidirectional LSTM regressor
# =============================
# NOTE(review): per the Keras LSTM docs, a non-zero recurrent_dropout rules out
# the fused cuDNN kernel, so these layers train much more slowly on GPU.
model = Sequential([
    # Declare the (timesteps, features) input shape with an explicit Input
    # object. Passing `input_shape=` to the first layer makes Keras emit a
    # warning instead.
    tf.keras.Input(shape=(X_train.shape[1], X_train.shape[2])),

    # First Bidirectional LSTM: wide layer, returns the full sequence
    Bidirectional(LSTM(256, activation='tanh', return_sequences=True,
                       dropout=0.45, recurrent_dropout=0.35)),
    BatchNormalization(),

    # Second Bidirectional LSTM: stacked on top of the first
    Bidirectional(LSTM(256, activation='tanh', return_sequences=True,
                       dropout=0.45, recurrent_dropout=0.35)),
    BatchNormalization(),

    # Stacked unidirectional LSTMs of decreasing width
    # NOTE(review): dropout=0.9 zeroes 90% of this layer's inputs during
    # training — confirm this is intentional and not a typo.
    LSTM(128, activation='tanh', return_sequences=True, dropout=0.9, recurrent_dropout=0.1),
    LSTM(64, activation='tanh', return_sequences=True, dropout=0.5, recurrent_dropout=0.1),
    LSTM(64, activation='tanh', return_sequences=True, dropout=0.3, recurrent_dropout=0.1),

    # Final LSTM layer: returns only the last timestep
    LSTM(32, activation='tanh', dropout=0.15, recurrent_dropout=0.05),

    # Output layer: one regression value per window
    Dense(1)
])

# -------------------------
# Optimizer: RMSprop with gradient-norm clipping
# -------------------------
optimizer = RMSprop(learning_rate=0.001, clipnorm=5.0)
model.compile(optimizer=optimizer, loss='mse', metrics=['mae', 'mse'])

# -------------------------
# Callbacks, all driven by validation loss
# -------------------------
callbacks = [
    # Stop after 12 epochs without improvement and roll back to the best weights
    EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True),
    # Halve the learning rate after 5 stagnant epochs, down to 1e-6
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6),
    # Keep only the best model seen so far on disk
    ModelCheckpoint("best_model_max.h5", save_best_only=True, monitor='val_loss')
]

# -------------------------
# Model summary
# -------------------------
model.summary()

  super().__init__(**kwargs)


In [18]:
# Train the model; the callbacks handle early stopping, LR reduction and checkpointing
history = model.fit(
    X_train, y_train,
    validation_split=0.4,  # Hold out 40% of the training windows for validation
    epochs=50,            # Upper bound; EarlyStopping may end training sooner
    batch_size=16,
    callbacks=callbacks,   # Use the defined callbacks for optimization
    verbose=1,             # Show progress bar
    shuffle=True         # NOTE(review): training samples ARE shuffled here — confirm this is intended for time-series windows
)

print("Model training completed with early stopping and learning rate scheduling")

Epoch 1/50


KeyboardInterrupt: 

In [None]:
# Final training MSE, computed from model.predict on the training windows
train_predictions = model.predict(X_train)
train_loss = np.mean((y_train - train_predictions.flatten())**2)

# Plot the per-epoch training loss, with the final training MSE as a dashed reference line
plt.figure(figsize=(8, 6))
plt.plot(history.history['loss'], label='Training Loss')  # Training loss during epochs
plt.axhline(y=train_loss, color='blue', linestyle='--', label='Final Train Loss')  # Final training loss
plt.title('Loss on Training Data')
plt.xlabel('Epochs')
plt.ylabel('Loss (MSE)')
plt.legend()
plt.show()

print(f"Final Training Loss (MSE): {train_loss}")

Final Training Loss (MSE): 7489.71875

Final Training Loss (MSE): 10908.271484375

Final Training Loss (MSE): 7389.98681640625 (0.01)

Final Training Loss (MSE): 6858.7236328125

Final Training Loss (MSE): 7189.4912109375 ()

In [None]:
# Prepare the test data
# The model was trained on windows of SEQUENCE_LENGTH consecutive rows, so the
# test input must be windowed the same way, with the same feature columns in
# the same order as the training features.
feature_cols = train.drop(columns=[TARGET]).columns
test_features = test[feature_cols].values

# Prepend the last SEQUENCE_LENGTH training rows as context so that every test
# row, including the very first ones, gets a full window and thus a prediction.
# NOTE(review): assumes the test period starts right after the training period — confirm.
context_rows = train[feature_cols].values[-SEQUENCE_LENGTH:]
X_test = create_sequences(np.vstack([context_rows, test_features]), None, SEQUENCE_LENGTH)[0]
assert len(X_test) == len(test), "expected exactly one input window per test row"

# Make predictions on the test set using trained model to predict "pm2.5" concentrations
predictions = model.predict(X_test)

# Ensure predictions do not contain NaN values
predictions = np.nan_to_num(predictions)

# Convert predictions to integers
predictions = np.round(predictions).astype(int)

# Prepare the submission file
# 'row ID' is the timestamp with a non-padded hour (e.g. "2013-07-02 4:00:00").
# It is assembled by hand because the '%-H' strftime directive is
# platform-specific and not available everywhere.
row_ids = [
    f"{ts.strftime('%Y-%m-%d')} {ts.hour}:{ts.strftime('%M:%S')}"
    for ts in pd.to_datetime(test.index)
]
submission = pd.DataFrame({
    'row ID': row_ids,
    'pm2.5': predictions.flatten()
})

# Sort the submission by 'row ID' to match the solution file exactly
submission = submission.sort_values(by='row ID')

# Save the file in CSV format for submission on Kaggle
submission.to_csv('subm_fixed.csv', index=False)