## Qual a previsão da vazão de saída para as próximas 24h para um determinado dia e horário?

In [1]:
from xgboost import XGBRegressor
import numpy as np 
from sklearn.metrics import mean_squared_error, mean_absolute_error
from xgboost import callback
from sklearn.model_selection import GridSearchCV, KFold
import plotly.express as px
from pathlib import Path
import pandas as pd 
import pickle

In [2]:
df_path = Path("../data/curated_data/water_consumption_curated_1.parquet")
df = pd.read_parquet(df_path)
df.head()

Unnamed: 0,id,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_should_have_entered,liters_entered,liters_out,flow_out_(l/s)
0,0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,,,
1,1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,245045.5,60000.0,185045.5,49.88
2,2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,9846.0,3000.0,6846.0,45.64
3,3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,9846.0,3400.0,6446.0,42.97
4,4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,9846.0,3000.0,6846.0,45.64


In [3]:
df["year"] = df["timestamp"].dt.year
df["month"] = df["timestamp"].dt.month
df['week_of_year'] = df['timestamp'].dt.isocalendar().week
df['day_of_week'] = df['timestamp'].dt.dayofweek
df["day"] = df["timestamp"].dt.day
df["hour"] = df["timestamp"].dt.hour
df["second"] = df["timestamp"].dt.second

df = df.dropna(subset=["flow_out_(l/s)"])
df.head()


Unnamed: 0,id,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_should_have_entered,liters_entered,liters_out,flow_out_(l/s),year,month,week_of_year,day_of_week,day,hour,second
1,1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,245045.5,60000.0,185045.5,49.88,2023,3,11,4,17,12,56
2,2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,9846.0,3000.0,6846.0,45.64,2023,3,11,4,17,12,26
3,3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,9846.0,3400.0,6446.0,42.97,2023,3,11,4,17,12,56
4,4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,9846.0,3000.0,6846.0,45.64,2023,3,11,4,17,12,26
5,5,2023-03-17 12:38:56,65.24,36.8,38.17,0,1,368000.0,150.0,9786.0,0.0,9786.0,65.24,2023,3,11,4,17,12,56


In [4]:
df.to_parquet("../data/curated_data/water_consumption_curated_2.parquet")

In [5]:
X = df[["flow_in_(l/s)", "reservoir_level_(%)", "pressure_(mca)", "gmb_1_is_on", "gmb_2_is_on", "reservoir_level_liters", 
        "time_passed_seconds", "liters_entered", "year", "month", "week_of_year", "day_of_week", "day", "hour", "second"]]

y = df["flow_out_(l/s)"]

In [6]:
# train test split
train_size = int(0.8 * len(X))
x_train, x_test = X[:train_size], X[train_size:]
y_train, y_test = y[:train_size], y[train_size:]

In [7]:
class LearningRateDecay(callback.TrainingCallback):
    def __init__(self, initial_lr=0.01, decay_rate=0.1, decay_steps=1000):
        self.initial_lr = initial_lr
        self.decay_rate = decay_rate
        self.decay_steps = decay_steps

    def after_iteration(self, model, epoch, evals_log):
        new_lr = self.initial_lr * self.decay_rate ** (epoch / self.decay_steps)
        model.set_param('learning_rate', new_lr)
        return False

# Instantiate the XGBRegressor with the custom callback
model = XGBRegressor(
    n_estimators=100000, 
    learning_rate=0.01, 
    early_stopping_rounds=100,
    callbacks=[LearningRateDecay(initial_lr=0.01, decay_rate=0.1, decay_steps=1000)]
)

# Set up cross-validation
cv = KFold(n_splits=5)

# Define parameter grid for GridSearchCV
params = {
    'n_estimators': [100, 300, 500],
    'max_depth': [3, 5, 10, 14]
}

# Instantiate GridSearchCV
clf = GridSearchCV(estimator=model, param_grid=params, cv=cv)

# Train the model (no need to pass callbacks in fit)
clf.fit(
    x_train, y_train, 
    eval_set=[(x_train, y_train), (x_test, y_test)], 
    verbose=100
)

[0]	validation_0-rmse:29.50640	validation_1-rmse:31.71243
[99]	validation_0-rmse:19.45564	validation_1-rmse:23.21777
[0]	validation_0-rmse:29.49972	validation_1-rmse:31.79886
[99]	validation_0-rmse:19.97159	validation_1-rmse:23.57661
[0]	validation_0-rmse:29.49049	validation_1-rmse:31.85860
[99]	validation_0-rmse:19.18341	validation_1-rmse:23.70188
[0]	validation_0-rmse:29.49645	validation_1-rmse:31.94907
[99]	validation_0-rmse:19.26577	validation_1-rmse:23.82615
[0]	validation_0-rmse:29.50841	validation_1-rmse:31.96181
[99]	validation_0-rmse:19.56526	validation_1-rmse:23.53153
[0]	validation_0-rmse:29.50640	validation_1-rmse:31.71243
[100]	validation_0-rmse:19.39067	validation_1-rmse:23.15420
[200]	validation_0-rmse:15.74275	validation_1-rmse:20.12050
[299]	validation_0-rmse:13.76521	validation_1-rmse:18.77362
[0]	validation_0-rmse:29.49972	validation_1-rmse:31.79886
[100]	validation_0-rmse:19.91475	validation_1-rmse:23.53972
[200]	validation_0-rmse:16.24634	validation_1-rmse:20.71865

The numbers within square brackets, such as `[ ]`, in the log output represent the iteration number or boosting round during the training process of the `XGBRegressor` model. XGBoost utilizes a gradient boosting framework, which builds the model sequentially by adding one tree at a time and optimizing the model at each iteration. Each line in the log shows the root mean square error (RMSE) for both the training set (`validation_0-rmse`) and the validation set (`validation_1-rmse`) at the given iteration. For instance, `[0] validation_0-rmse:101.21552 validation_1-rmse:120.47506` indicates that at the first iteration (iteration 0), the RMSE for the training set is 101.21552, and the RMSE for the validation set is 120.47506. As training progresses, these RMSE values typically decrease, reflecting the model's improvement. However, they may plateau or even increase if the model begins to overfit the training data. Therefore, monitoring these metrics across iterations helps in understanding the model's performance and potential overfitting.

In [15]:
def plot_diagnostics(y_test, y_pred):
    print(f'MSE: {round(mean_squared_error(y_true=y_test, y_pred=y_pred),2)}')
    print(f'RMSE: {round(np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred)),2)}')
    print(f'MAE: {round(mean_absolute_error(y_true=y_test, y_pred=y_pred),2)}')
    
plot_diagnostics(y_test, clf.predict(x_test))

MSE: 277.16
RMSE: 16.65
MAE: 2.47


In [16]:
df["flow_out_forecast"] = clf.predict(X)
df.head()

Unnamed: 0,id,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_should_have_entered,...,liters_out,flow_out_(l/s),year,month,week_of_year,day_of_week,day,hour,second,flow_out_forecast
1,1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,245045.5,...,185045.5,49.88,2023,3,11,4,17,12,56,32.964901
2,2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,9846.0,...,6846.0,45.64,2023,3,11,4,17,12,26,44.503532
3,3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,9846.0,...,6446.0,42.97,2023,3,11,4,17,12,56,42.068813
4,4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,9846.0,...,6846.0,45.64,2023,3,11,4,17,12,26,44.503532
5,5,2023-03-17 12:38:56,65.24,36.8,38.17,0,1,368000.0,150.0,9786.0,...,9786.0,65.24,2023,3,11,4,17,12,56,63.514561


In [17]:
df.to_parquet("../data/curated_data/water_consumption_curated_2.parquet")

In [18]:
with open('../models/xgb_flow_out_forecast.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [20]:
results_df = pd.DataFrame({
    'timestamp': df['timestamp'].iloc[train_size:].reset_index(drop=True),
    'Actual': y_test.reset_index(drop=True),
    'Predicted': pd.Series(clf.predict(x_test))
})

# Plotar os resultados
fig = px.line(results_df, x='timestamp', y=['Actual', 'Predicted'],
              labels={'value': 'Flow Out (l/s)', 'timestamp': 'Timestamp'},
              title='Comparação entre valores reais e previstos')

fig.show()

In [23]:
print(results_df.head(50))

             timestamp  Actual  Predicted
0  2024-01-15 03:47:42   26.67  27.149809
1  2024-01-15 03:52:42    8.00  15.302932
2  2024-01-15 03:55:12   26.67  27.149809
3  2024-01-15 04:00:12   12.00  16.241867
4  2024-01-15 04:02:42   20.00  20.039970
5  2024-01-15 04:05:12   16.00  16.587841
6  2024-01-15 04:07:42    0.00   1.986433
7  2024-01-15 04:10:12   26.67  27.149809
8  2024-01-15 04:12:42   17.33  16.587841
9  2024-01-15 04:15:12   20.00  20.039970
10 2024-01-15 04:17:42   16.00  16.587841
11 2024-01-15 04:20:12    0.00   1.986433
12 2024-01-15 04:22:42   26.67  27.149809
13 2024-01-15 04:25:12   24.00  24.417923
14 2024-01-15 04:27:42    0.00   1.986433
15 2024-01-15 04:32:42   23.33  23.848652
16 2024-01-15 04:35:12   26.67  27.149809
17 2024-01-15 04:37:42    0.00   1.986433
18 2024-01-15 04:40:12   17.33  16.587841
19 2024-01-15 04:42:42   22.67  23.009981
20 2024-01-15 04:45:13    0.00   1.986433
21 2024-01-15 04:47:42   20.13  20.039970
22 2024-01-15 04:50:12   29.33  29