In [945]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy.stats import boxcox, yeojohnson
from matplotlib import pyplot as plt
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.feature_selection import mutual_info_regression
from sklearn.metrics import mean_absolute_error, mean_squared_error, explained_variance_score
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import RandomizedSearchCV
from keras.models import Sequential
from keras.layers import Dense, LSTM, SimpleRNN, GRU, Dropout
from keras.wrappers.scikit_learn import KerasRegressor

In [946]:
df = pd.read_csv('data/processed_dataset.csv')

In [947]:
df.head()

Unnamed: 0,Date,Latitude,Longitude,Altitude,NO2,PM2.5,O3,PM10,temperature_2m (°C),relative_humidity_2m (%),...,soil_temperature_0_to_7cm (°C),soil_moisture_0_to_7cm (m³/m³),is_day (),PM2.5/N02,PM2.5/03,Weekend,Season_Autumn,Season_Spring,Season_Summer,Season_Winter
0,2023-02-15 19:35:00+00:00,46.234818,15.267305,240.0,4.248495,4.26268,0.857538,4.49981,0.5,91.0,...,0.384574,-1.055553,0.0,0.014185,4.416369,0,False,False,False,True
1,2023-02-15 20:15:00+00:00,46.234818,15.267305,240.0,4.025352,4.330733,2.0,4.454347,0.1,92.0,...,0.195874,-1.055553,0.0,0.305382,3.637586,0,False,False,False,True
2,2023-02-15 20:35:00+00:00,46.234818,15.267305,240.0,4.025352,4.330733,2.0,4.454347,0.1,92.0,...,0.195874,-1.055553,0.0,0.305382,3.637586,0,False,False,False,True
3,2023-02-15 21:15:00+00:00,46.234818,15.267305,240.0,3.871201,4.290459,2.0,4.356709,-1.1,93.0,...,0.09893,-1.055553,0.0,0.419258,3.597312,0,False,False,False,True
4,2023-02-15 21:35:00+00:00,46.234818,15.267305,240.0,3.871201,4.290459,2.0,4.356709,-1.1,93.0,...,0.09893,-1.055553,0.0,0.419258,3.597312,0,False,False,False,True


In [948]:
df.isnull().sum()

Date                              0
Latitude                          0
Longitude                         0
Altitude                          0
NO2                               0
PM2.5                             0
O3                                0
PM10                              0
temperature_2m (°C)               0
relative_humidity_2m (%)          0
dew_point_2m (°C)                 0
apparent_temperature (°C)         0
precipitation (mm)                0
rain (mm)                         0
snowfall (cm)                     0
surface_pressure (hPa)            0
cloud_cover (%)                   0
wind_speed_10m (km/h)             0
wind_direction_10m (°)            0
soil_temperature_0_to_7cm (°C)    0
soil_moisture_0_to_7cm (m³/m³)    0
is_day ()                         0
PM2.5/N02                         0
PM2.5/03                          4
Weekend                           0
Season_Autumn                     0
Season_Spring                     0
Season_Summer               

In [949]:
# delete rows with null values
df.dropna(inplace=True)

In [950]:
df.isnull().sum()

Date                              0
Latitude                          0
Longitude                         0
Altitude                          0
NO2                               0
PM2.5                             0
O3                                0
PM10                              0
temperature_2m (°C)               0
relative_humidity_2m (%)          0
dew_point_2m (°C)                 0
apparent_temperature (°C)         0
precipitation (mm)                0
rain (mm)                         0
snowfall (cm)                     0
surface_pressure (hPa)            0
cloud_cover (%)                   0
wind_speed_10m (km/h)             0
wind_direction_10m (°)            0
soil_temperature_0_to_7cm (°C)    0
soil_moisture_0_to_7cm (m³/m³)    0
is_day ()                         0
PM2.5/N02                         0
PM2.5/03                          0
Weekend                           0
Season_Autumn                     0
Season_Spring                     0
Season_Summer               

In [951]:
df.shape

(5621, 29)

In [952]:
output_col = 'PM10'

input_cols = df.columns.tolist()
input_cols.remove(output_col)
input_cols.remove('Date')

information_gain_scores = mutual_info_regression(df[input_cols], df[output_col])

feature_importances = pd.Series(information_gain_scores, index=input_cols)
feature_importances.name = 'Information Gain Scores'
feature_importances.sort_values(ascending=False, inplace=True)

feature_importances.head(10)

PM2.5                             1.517766
PM2.5/03                          0.811420
PM2.5/N02                         0.561003
soil_moisture_0_to_7cm (m³/m³)    0.513139
dew_point_2m (°C)                 0.397203
soil_temperature_0_to_7cm (°C)    0.386281
apparent_temperature (°C)         0.332422
NO2                               0.316896
O3                                0.311178
temperature_2m (°C)               0.300859
Name: Information Gain Scores, dtype: float64

In [953]:
top_features = feature_importances.head(2).index.tolist()
print(top_features)

['PM2.5', 'PM2.5/03']


In [954]:
target = "PM10"
features = top_features + [target]
dataset = df[features]

In [955]:
dataset.head()

Unnamed: 0,PM2.5,PM2.5/03,PM10
0,4.26268,4.416369,4.49981
1,4.330733,3.637586,4.454347
2,4.330733,3.637586,4.454347
3,4.290459,3.597312,4.356709
4,4.290459,3.597312,4.356709


In [956]:
dataset.shape

(5621, 3)

In [957]:
test_split = round(len(dataset) * 0.2)

In [958]:
train_data = dataset[:-1041]
test_data = dataset[-1041:]

In [959]:
print(train_data.shape, test_data.shape)

(4580, 3) (1041, 3)


In [960]:
scaler = MinMaxScaler(feature_range=(0, 1))

train_data = scaler.fit_transform(train_data)
test_data = scaler.transform(test_data)

In [961]:
def create_time_series(data, n_past):
    X, y = [], []
    for i in range(n_past, len(data)):
        X.append(data[i - n_past:i, 0:data.shape[1]])
        y.append(data[i, 0])
    return np.array(X), np.array(y)

In [962]:
window_size = 48*7
X_train, y_train = create_time_series(train_data, window_size)
X_test, y_test = create_time_series(test_data, window_size)

In [963]:
print(X_train.shape, y_train.shape)

(4244, 336, 3) (4244,)


In [964]:
print(X_test.shape, y_test.shape)

(705, 336, 3) (705,)


In [965]:
model = Sequential()
model.add(LSTM(50, return_sequences=True, input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(LSTM(50))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(loss='mse', optimizer='adam')

In [None]:
history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_data=(X_test, y_test), verbose=1,
                    shuffle=False)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10

In [None]:
prediction = model.predict(X_test)

In [None]:
prediction

In [None]:
prediction.shape

In [None]:
X_test.shape

In [None]:
prediction_copies_array = np.repeat(prediction, 3, axis=-1)

In [None]:
prediction_copies_array.shape

In [None]:
pred = scaler.inverse_transform(np.reshape(prediction_copies_array, (len(prediction), 3)))[:, 0]

In [None]:
pred.shape

In [None]:
pred

In [None]:
original_copies_array = np.repeat(y_test, 3, axis=-1)
original = scaler.inverse_transform(np.reshape(original_copies_array, (len(y_test), 3)))[:, 0]

In [None]:
plt.figure(figsize=(12,8))
plt.plot(original, color='blue', label='Actual PM10 values')
plt.plot(pred, color='red', label='Predicted PM10 values')
plt.title('PM10 Prediction')
plt.xlabel('Time')
plt.ylabel('PM10')
plt.legend()
plt.show()

In [None]:
df_output = pd.DataFrame({'Actual': original, 'Predicted': pred})
df_output.head(1000)