In [12]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler

In [13]:
df = pd.read_csv("/content/final_dataset.csv")
print("Columns:", df.columns)


Columns: Index(['Date', 'Month', 'Year', 'Holidays_Count', 'Days', 'PM2.5', 'PM10',
       'NO2', 'SO2', 'CO', 'Ozone', 'AQI'],
      dtype='object')


In [14]:
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date')


In [15]:
features = ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Ozone']
extra_features = ['Month', 'Year', 'Holidays_Count', 'Days']

features = [f for f in features if f in df.columns]
extra_features = [f for f in extra_features if f in df.columns]

target = 'AQI'

df = df[['Date'] + features + extra_features + [target]]
print("Using features:", features + extra_features)
print("Target:", target)

Using features: ['PM2.5', 'PM10', 'NO2', 'SO2', 'CO', 'Ozone', 'Month', 'Year', 'Holidays_Count', 'Days']
Target: AQI


In [16]:
df = df.fillna(method='ffill').fillna(method='bfill')


  df = df.fillna(method='ffill').fillna(method='bfill')


In [17]:
scaler = MinMaxScaler()
scaled = scaler.fit_transform(df[features + extra_features + [target]])
df_scaled = pd.DataFrame(scaled, columns=features + extra_features + [target])
df_scaled['Date'] = df['Date'].values

In [18]:
def make_sequences(data, lookback=7, horizon=1):
    X, y = [], []
    for i in range(len(data) - lookback - horizon + 1):
        X.append(data[i:i+lookback, :-1])
        y.append(data[i+lookback+horizon-1, -1])
    return np.array(X), np.array(y)

data = df_scaled[features + extra_features + [target]].values
X, y = make_sequences(data, lookback=7, horizon=1)

print("X shape:", X.shape)
print("y shape:", y.shape)


X shape: (1454, 7, 10)
y shape: (1454,)


In [19]:
n = len(X)
train_end = int(n * 0.7)
val_end = int(n * 0.85)

X_train, y_train = X[:train_end], y[:train_end]
X_val, y_val = X[train_end:val_end], y[train_end:val_end]
X_test, y_test = X[val_end:], y[val_end:]

print(f"Train: {X_train.shape}, Val: {X_val.shape}, Test: {X_test.shape}")


Train: (1017, 7, 10), Val: (218, 7, 10), Test: (219, 7, 10)


In [20]:
np.save("X_train.npy", X_train)
np.save("y_train.npy", y_train)
np.save("X_val.npy", X_val)
np.save("y_val.npy", y_val)
np.save("X_test.npy", X_test)
np.save("y_test.npy", y_test)

print("Preprocessing complete. Data saved as .npy files.")

Preprocessing complete. Data saved as .npy files.
