In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
import plotly.express as px
from pathlib import Path
from typing import Tuple
import pandas as pd
import numpy as np
import pickle

In [2]:
df_path = Path("../data/curated_data/water_consumption_curated_1.parquet")
df = pd.read_parquet(df_path)

In [3]:
def create_date_columns(original_df: pd.DataFrame) -> pd.DataFrame:
    df = original_df.copy()
    
    # Extract date-related components from the timestamp
    df["year"] = df["timestamp"].dt.year
    df["month"] = df["timestamp"].dt.month
    df['week_of_year'] = df['timestamp'].dt.isocalendar().week
    df['day_of_week'] = df['timestamp'].dt.dayofweek
    df["day"] = df["timestamp"].dt.day
    df["hour"] = df["timestamp"].dt.hour
    df["second"] = df["timestamp"].dt.second

    # Drop rows with missing values in 'flow_out_(l/s)'
    df = df.dropna(subset=["flow_out_(l/s)"])
    
    return df

def prepare_train_test_data(df: pd.DataFrame, train_size: float = 0.8) -> Tuple[pd.DataFrame, pd.DataFrame, pd.Series, pd.Series]:
    # Select features for the model
    X = df[["flow_in_(l/s)", "reservoir_level_(%)", "pressure_(mca)", "gmb_1_is_on", "gmb_2_is_on", "reservoir_level_liters", 
            "time_passed_seconds", "liters_entered", "year", "month", "week_of_year", "day_of_week", "day", "hour", "second"]]
    
    # Target variable
    y = df["flow_out_(l/s)"]
    
    # Calculate the number of training samples
    train_samples = int(train_size * len(X))
    
    # Split the data into training and testing sets
    x_train, x_test = X[:train_samples], X[train_samples:]
    y_train, y_test = y[:train_samples], y[train_samples:]
    
    return x_train, x_test, y_train, y_test

In [4]:
def set_model_training_pipeline() -> GridSearchCV:
    # Initialize the XGBoost regressor with specified hyperparameters
    model = XGBRegressor(
        n_estimators=100000, 
        learning_rate=0.01, 
        early_stopping_rounds=100
    )
    
    # Define time series cross-validation strategy
    cv = TimeSeriesSplit(n_splits=5)
    
    # Define an expanded parameter grid for hyperparameter tuning
    params = {
        'n_estimators': [100, 500, 1000, 5000],
        'max_depth': [3, 5, 10, 14],
        'learning_rate': [0.01, 0.05, 0.1]
    }
    
    # Initialize GridSearchCV with the model, parameter grid, and cross-validation strategy
    clf = GridSearchCV(
        estimator=model, 
        param_grid=params, 
        cv=cv, 
        scoring='neg_mean_squared_error',
        n_jobs=-1,  # Use all available cores
        verbose=2
    )
    
    return clf

def train_model(clf: GridSearchCV, x_train: pd.DataFrame, y_train: pd.Series, x_test: pd.DataFrame, y_test: pd.Series) -> GridSearchCV:
    # Train the model with training and validation data
    clf.fit(
        x_train, y_train,
        eval_set=[(x_train, y_train), (x_test, y_test)],
        verbose=100
    )
    
    return clf.best_estimator_
    
    
def calculate_error(y_test: pd.Series, y_pred: pd.Series) -> Tuple[float, float, float, float, float]:
    # Calculate error metrics
    mae = round(mean_absolute_error(y_true=y_test, y_pred=y_pred), 2)
    mse = round(mean_squared_error(y_true=y_test, y_pred=y_pred), 2)
    rmse = round(np.sqrt(mse), 2)
    r2 = round(r2_score(y_true=y_test, y_pred=y_pred), 2)
    
    return mae, mse, rmse, r2
    
    
def predict_with_model(clf: GridSearchCV, X: pd.DataFrame) -> pd.Series:
    # Generate predictions using the trained model
    y_pred = clf.predict(X)
    
    return pd.Series(y_pred)

In [5]:
df = create_date_columns(df)
x_train, x_test, y_train, y_test = prepare_train_test_data(df)
clf = set_model_training_pipeline()
clf = train_model(clf, x_train, y_train, x_test, y_test)

Fitting 5 folds for each of 48 candidates, totalling 240 fits
[0]	validation_0-rmse:29.74861	validation_1-rmse:32.63787
[0]	validation_0-rmse:29.64828	validation_1-rmse:32.44389
[0]	validation_0-rmse:29.56139	validation_1-rmse:32.22438
[0]	validation_0-rmse:29.74861	validation_1-rmse:32.63787
[0]	validation_0-rmse:29.64828	validation_1-rmse:32.44389
[0]	validation_0-rmse:29.52874	validation_1-rmse:32.08896
[0]	validation_0-rmse:29.49724	validation_1-rmse:31.95651
[0]	validation_0-rmse:29.56139	validation_1-rmse:32.22438
[0]	validation_0-rmse:29.52874	validation_1-rmse:32.08896
[0]	validation_0-rmse:29.49724	validation_1-rmse:31.95651
[0]	validation_0-rmse:29.64828	validation_1-rmse:32.44389
[0]	validation_0-rmse:29.74861	validation_1-rmse:32.63787
[0]	validation_0-rmse:29.52874	validation_1-rmse:32.08896
[0]	validation_0-rmse:29.74861	validation_1-rmse:32.63787
[0]	validation_0-rmse:29.56139	validation_1-rmse:32.22438
[0]	validation_0-rmse:29.49724	validation_1-rmse:31.95651
[99]	valid



[100]	validation_0-rmse:12.89958	validation_1-rmse:15.23269
[100]	validation_0-rmse:7.30019	validation_1-rmse:20.33798
[160]	validation_0-rmse:16.78603	validation_1-rmse:18.20906
[CV] END .learning_rate=0.05, max_depth=10, n_estimators=500; total time=   7.1s
[0]	validation_0-rmse:28.64026	validation_1-rmse:31.50576
[100]	validation_0-rmse:7.00986	validation_1-rmse:14.95454
[100]	validation_0-rmse:13.03895	validation_1-rmse:19.20130
[153]	validation_0-rmse:13.23373	validation_1-rmse:19.42245
[CV] END learning_rate=0.05, max_depth=10, n_estimators=1000; total time=   6.9s
[100]	validation_0-rmse:7.30019	validation_1-rmse:20.33798
[0]	validation_0-rmse:28.48287	validation_1-rmse:31.29306
[0]	validation_0-rmse:28.81033	validation_1-rmse:31.61076
[173]	validation_0-rmse:13.09939	validation_1-rmse:15.33306
[CV] END .learning_rate=0.05, max_depth=10, n_estimators=500; total time=   8.4s
[100]	validation_0-rmse:16.26389	validation_1-rmse:18.01950
[0]	validation_0-rmse:28.43290	validation_1-rm

In [6]:
mae, mse, rmse, r2 = calculate_error(y_test, predict_with_model(clf, x_test))
print(f"MAE: {mae}")
print(f"MSE: {mse}")
print(f"RMSE: {rmse}")
print(f"R2: {r2}")

MAE: 1.55
MSE: 238.13
RMSE: 15.43
R2: 0.76


In [7]:
X = df[["flow_in_(l/s)", "reservoir_level_(%)", "pressure_(mca)", "gmb_1_is_on", "gmb_2_is_on", "reservoir_level_liters", 
        "time_passed_seconds", "liters_entered", "year", "month", "week_of_year", "day_of_week", "day", "hour", "second"]]
df["flow_out_forecast"] = clf.predict(X)
df.to_parquet("../data/curated_data/water_consumption_curated_2.parquet")

In [8]:
with open('../models/xgb_flow_out_forecast_2.pkl', 'wb') as f:
    pickle.dump(clf, f)

In [9]:
# Assuming df, y_test, and x_test are already defined
results_df = pd.DataFrame({
    'timestamp': df['timestamp'].iloc[len(df) - len(y_test):].reset_index(drop=True),
    'Actual': y_test.reset_index(drop=True),
    'Predicted': pd.Series(clf.predict(x_test))
})

# Select a subset of rows for plotting
subset_size = 50  # Adjust this value to show more or fewer rows
results_subset = results_df.iloc[:subset_size]

# Plot the subset of results
fig = px.line(results_subset, x='timestamp', y=['Actual', 'Predicted'],
              labels={'value': 'Flow Out (l/s)', 'timestamp': 'Timestamp'},
              title='Comparação entre valores reais e previstos (Subset)')

fig.show()