In [2]:

#  ARIMA Model 

# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
import warnings
warnings.filterwarnings("ignore")

#  Load the Dataset
df = pd.read_csv("station_day_edited_with_missing.csv")

# Basic Preprocessing
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values('Date')

# Drop rows with missing values in any column 
df = df.dropna()

# Set Date as index (required for ARIMA)
df.set_index('Date', inplace=True)

#  Target Variable Selection
target_col = 'PM2.5' if 'PM2.5' in df.columns else df.columns[-1]

# Create Lag Features for better temporal learning
df['lag_1'] = df[target_col].shift(1)
df['lag_2'] = df[target_col].shift(2)
df['lag_3'] = df[target_col].shift(3)

# Drop any rows with NaN created due to lagging
df = df.dropna()

# Split into Train Set (80%)
split_index = int(len(df) * 0.8)
train = df.iloc[:split_index]

#  Build & Train ARIMA Model
# (p, d, q) parameters can be tuned later
p, d, q = 2, 1, 2

model = ARIMA(train[target_col], order=(p, d, q))
model_fit = model.fit()

print("ARIMA Model trained successfully!")
print(model_fit.summary())


ARIMA Model trained successfully!
                               SARIMAX Results                                
Dep. Variable:                  PM2.5   No. Observations:                 6324
Model:                 ARIMA(2, 1, 2)   Log Likelihood              -31883.228
Date:                Fri, 17 Oct 2025   AIC                          63776.455
Time:                        18:52:17   BIC                          63810.215
Sample:                             0   HQIC                         63788.148
                               - 6324                                         
Covariance Type:                  opg                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
ar.L1         -0.5331      0.135     -3.940      0.000      -0.798      -0.268
ar.L2         -0.0347      0.009     -3.774      0.000      -0.053      -0.017
ma.L1         -0.4

In [3]:

#  PROPHET Model (with lag features)

#  Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet
import warnings
warnings.filterwarnings("ignore")

# 2️⃣ Load the Dataset
df = pd.read_csv("station_day_edited_with_missing.csv")

# 3️⃣ Basic Preprocessing
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values('Date')

# Select target variable (example: PM2.5)
target_col = 'PM2.5' if 'PM2.5' in df.columns else df.columns[-1]

# Fill missing numeric values
df[target_col] = df[target_col].fillna(df[target_col].mean())

# 4️⃣ Create Lag Features
df['lag_1'] = df[target_col].shift(1)
df['lag_2'] = df[target_col].shift(2)
df['lag_3'] = df[target_col].shift(3)

# Drop missing rows from lagging
df = df.dropna()

# 5️⃣ Prepare Data for Prophet
# Prophet requires columns: ds (date) and y (target)
prophet_df = df[['Date', target_col, 'lag_1', 'lag_2', 'lag_3']].rename(columns={'Date': 'ds', target_col: 'y'})

# 6️⃣ Train–Test Split (we only train)
split_index = int(len(prophet_df) * 0.8)
train_df = prophet_df.iloc[:split_index]

# 7️⃣ Build Prophet Model
model = Prophet()

# Add lag features as external regressors
model.add_regressor('lag_1')
model.add_regressor('lag_2')
model.add_regressor('lag_3')

# 8️⃣ Fit (Train) the Model
model.fit(train_df)

print("✅ Prophet model trained successfully!")


18:52:54 - cmdstanpy - INFO - Chain [1] start processing
18:53:09 - cmdstanpy - INFO - Chain [1] done processing


✅ Prophet model trained successfully!


In [12]:

#  LSTM Model

#  Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense
import warnings
warnings.filterwarnings("ignore")

# Load the Dataset
df = pd.read_csv("station_day_edited_with_missing.csv")

#  Basic Preprocessing
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values('Date')

# Choose target variable 
target_col = 'PM2.5' if 'PM2.5' in df.columns else df.columns[-1]

# Fill missing numeric values
df[target_col] = df[target_col].fillna(df[target_col].mean())

# Create Lag Features (to help LSTM learn time dependencies)
df['lag_1'] = df[target_col].shift(1)
df['lag_2'] = df[target_col].shift(2)
df['lag_3'] = df[target_col].shift(3)

# Drop rows with missing lag values
df = df.dropna()

# Feature Scaling (LSTM works best with scaled data)
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[[target_col, 'lag_1', 'lag_2', 'lag_3']])

# Convert back to DataFrame
scaled_df = pd.DataFrame(scaled_data, columns=[target_col, 'lag_1', 'lag_2', 'lag_3'])
scaled_df.index = df['Date']

#  Prepare Features (X) and Target (y)
X = scaled_df[['lag_1', 'lag_2', 'lag_3']].values
y = scaled_df[target_col].values

# Reshape input to [samples, timesteps, features] for LSTM
X = X.reshape((X.shape[0], X.shape[1], 1))

# Split into Training Data (first 80%)
split_index = int(len(X) * 0.8)
X_train, y_train = X[:split_index], y[:split_index]

# Build LSTM Model
model = Sequential()
model.add(LSTM(50, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(Dense(1))
model.compile(optimizer='adam', loss='mse')

# Train the Model
history = model.fit(X_train, y_train, epochs=50, batch_size=16, verbose=1)

print("LSTM model trained successfully!")


Epoch 1/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 0.0034
Epoch 2/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0032
Epoch 3/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - loss: 0.0032
Epoch 4/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0032
Epoch 5/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0032
Epoch 6/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0032
Epoch 7/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0031
Epoch 8/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - loss: 0.0031
Epoch 9/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 0.0031
Epoch 10/50
[1m402/402[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - lo

In [9]:

# XGBoost Model (with lag features)


# Import Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from xgboost import XGBRegressor
import warnings
warnings.filterwarnings("ignore")

#  Load the Dataset
df = pd.read_csv("station_day_edited_with_missing.csv")

# Basic Preprocessing
df['Date'] = pd.to_datetime(df['Date'], errors='coerce')
df = df.sort_values('Date')

# Choose target variable (example: PM2.5)
target_col = 'PM2.5' if 'PM2.5' in df.columns else df.columns[-1]

# Fill missing numeric values
df[target_col] = df[target_col].fillna(df[target_col].mean())

#  Create Lag Features
# These represent past observations as features
df['lag_1'] = df[target_col].shift(1)
df['lag_2'] = df[target_col].shift(2)
df['lag_3'] = df[target_col].shift(3)
df['lag_4'] = df[target_col].shift(4)
df['lag_5'] = df[target_col].shift(5)

# Drop rows with missing lag values
df = df.dropna()

# Feature Scaling (optional but helps XGBoost learn faster)
scaler = MinMaxScaler()
scaled_data = scaler.fit_transform(df[[target_col, 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']])

scaled_df = pd.DataFrame(
    scaled_data, 
    columns=[target_col, 'lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']
)
scaled_df.index = df['Date']

#  Prepare Input and Output
X = scaled_df[['lag_1', 'lag_2', 'lag_3', 'lag_4', 'lag_5']]
y = scaled_df[target_col]

#  Split Data into Training (80%) and Testing (20%) 
split_index = int(len(X) * 0.8)
X_train, y_train = X.iloc[:split_index], y.iloc[:split_index]

# Build XGBoost Model
model = XGBRegressor(
    n_estimators=200,       
    learning_rate=0.05,   
    max_depth=5,           
    subsample=0.8,         
    colsample_bytree=0.8,  
    random_state=42
)

# Train the Model
model.fit(X_train, y_train)

print("XGBoost model trained successfully!")


XGBoost model trained successfully!
