## Qual a previsão da vazão de saída para as próximas 24h para um determinado dia e horário?

In [18]:
from xgboost import XGBRegressor
import plotly.express as px
from pathlib import Path
import pandas as pd 
import pickle

In [2]:
# Read the curated water-consumption dataset (parquet format) into a
# DataFrame and preview its first rows.
df_path = Path("../data/curated_data") / "water_consumption_curated_1.partquet"
df = pd.read_parquet(df_path)
df.head()

Unnamed: 0,timestamp,flow_in_(l/s),reservoir_level_(%),pressure_(mca),gmb_1_is_on,gmb_2_is_on,reservoir_level_liters,time_passed_seconds,liters_entered,flow_out_(l/s)
0,2023-03-17 11:27:06,68.59,29.86,38.2,0,1,298600.0,,,
1,2023-03-17 12:28:56,66.05,35.86,38.2,0,1,358600.0,3710.0,60000.0,49.877493
2,2023-03-17 12:31:26,65.64,36.16,38.06,0,1,361600.0,150.0,3000.0,45.64
3,2023-03-17 12:33:56,65.64,36.5,38.03,0,1,365000.0,150.0,3400.0,42.973333
4,2023-03-17 12:36:26,65.64,36.8,38.17,0,1,368000.0,150.0,3000.0,45.64


In [11]:
# Columns fed to the model, the column being predicted, and the share of rows
# used for training (the remaining rows form the test set).
FEATURE_COLUMNS = [
    'flow_in_(l/s)', 'reservoir_level_(%)', 'pressure_(mca)', 'gmb_1_is_on', 'gmb_2_is_on',
    'reservoir_level_liters', 'time_passed_seconds', 'liters_entered', 'year', 'month',
    'day', 'hour', 'minute', 'second', 'dayofweek', 'weekofyear',
]
TARGET_COLUMN = 'flow_out_(l/s)'
TRAIN_FRACTION = 0.8

# Convert the 'timestamp' column to datetime and order the rows by time.
# The split below is positional, so it is only a chronological hold-out when
# the rows are sorted; a stable sort leaves already-ordered data untouched.
df['timestamp'] = pd.to_datetime(df['timestamp'])
df = df.sort_values('timestamp', kind='stable')

# Add calendar features derived from the timestamp.
df['year'] = df['timestamp'].dt.year
df['month'] = df['timestamp'].dt.month
df['day'] = df['timestamp'].dt.day
df['hour'] = df['timestamp'].dt.hour
df['minute'] = df['timestamp'].dt.minute
df['second'] = df['timestamp'].dt.second
df['dayofweek'] = df['timestamp'].dt.dayofweek
# isocalendar() yields nullable UInt32 columns; cast to a plain integer dtype
# so every feature column is an ordinary numeric column.
df['weekofyear'] = df['timestamp'].dt.isocalendar().week.astype('int64')

# Drop rows where the target 'flow_out_(l/s)' is missing.
df = df.dropna(subset=[TARGET_COLUMN])

# Separate the features and the target.
# NOTE(review): in the rows shown by df.head(), flow_out_(l/s) equals
# flow_in_(l/s) - liters_entered / time_passed_seconds, so the target looks
# derivable from same-row features. Those values would presumably not be known
# for a future 24h horizon -- confirm whether lagged features are intended.
X = df[FEATURE_COLUMNS]
y = df[TARGET_COLUMN]

# Split into training and test sets: the first TRAIN_FRACTION of the rows
# (earliest timestamps) train the model, the rest are held out.
train_size = int(len(X) * TRAIN_FRACTION)
X_train, X_test = X.iloc[:train_size], X.iloc[train_size:]
y_train, y_test = y.iloc[:train_size], y.iloc[train_size:]

In [13]:
# Hyperparameters for the gradient-boosted regressor, kept in one place so
# they are easy to read and tune.
xgb_params = dict(
    objective='reg:squarederror',
    n_estimators=1000,
    max_depth=3,
    learning_rate=0.01,
    colsample_bytree=0.7,
    subsample=0.8,
)
model = XGBRegressor(**xgb_params)
# Fit on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

In [19]:
# Predict on the held-out rows and summarise the errors: mean of the squared
# residuals (MSE) and mean of the absolute residuals (MAE).
y_pred = model.predict(X_test)
residuals = y_test - y_pred
mse = (residuals ** 2).mean()
mae = residuals.abs().mean()
print(f'Mean Squared Error: {mse}', f'Mean Absolute Error: {mae}', sep='\n')

Mean Squared Error: 4406.898012191103
Mean Absolute Error: 8.630151012778931


In [15]:
# Persist the fitted model with pickle. The target directory is created first
# so the cell also works when ../models does not exist yet.
# Note: pickle files should only ever be loaded from trusted sources.
model_path = Path('../models/xgb_flow_out_forecast.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
with model_path.open('wb') as f:
    pickle.dump(model, f)

In [16]:
# Put the held-out timestamps, observed values and predictions on a fresh
# 0..n-1 index so the three columns line up row by row.
test_timestamps = df['timestamp'].iloc[train_size:].reset_index(drop=True)
actual_values = y_test.reset_index(drop=True)
predicted_values = pd.Series(y_pred)
results_df = pd.DataFrame({
    'timestamp': test_timestamps,
    'Actual': actual_values,
    'Predicted': predicted_values,
})

# Plot the observed and predicted series against time.
axis_labels = {'value': 'Flow Out (l/s)', 'timestamp': 'Timestamp'}
fig = px.line(
    results_df,
    x='timestamp',
    y=['Actual', 'Predicted'],
    labels=axis_labels,
    title='Comparação entre valores reais e previstos',
)

fig.show()