In [1]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from sklearn.model_selection import RandomizedSearchCV, TimeSeriesSplit
from xgboost import XGBRegressor
import pandas as pd
import numpy as np
import pickle
from utils import all_features, all_non_weather_features

In [2]:
# Load the prepared training dataset; the path is relative to the notebook's working directory.
df = pd.read_parquet("../data/silver/training_dataset.parquet")
# Show a single row as a quick check of the loaded columns.
df.head(1)

Unnamed: 0,timestamp,hour,day_of_week,week_of_year,year,1_hours_ago_input_flow_rate_first,1_hours_agot_input_flow_rate_last,1_hours_ago_input_flow_rate_mean,1_hours_ago_reservoir_level_percentage_first,1_hours_ago_reservoir_level_percentage_last,...,target_15,target_16,target_17,target_18,target_19,target_20,target_21,target_22,target_23,target_24
0,2023-03-20 13:00:00,13,0,12,2023,66.46,65.64,66.378182,37.56,42.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
def create_training_samples(df, with_weather, target):
    """Build the feature matrix and target vector for one model.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the feature columns and the target column.
    with_weather : bool
        Truthy selects the ``all_features`` column list, falsy selects
        ``all_non_weather_features`` (both imported from ``utils``).
    target
        Column label of the value to predict, used as ``df[target]``.

    Returns
    -------
    tuple
        ``(X, y)``, each converted with ``np.array``.
    """
    feature_columns = all_features if with_weather else all_non_weather_features
    features = np.array(df[feature_columns])
    labels = np.array(df[target])
    return features, labels

def mse_scorer(estimator, X, y):
    """Print and return the mean squared error of ``estimator`` on ``(X, y)``.

    The signature ``(estimator, X, y)`` matches what scikit-learn expects of
    a callable passed as ``scoring=``.

    NOTE(review): scikit-learn treats a scorer's return value as
    greater-is-better, so using this function as ``scoring=`` unchanged would
    favour candidates with *higher* error. Confirm the intended use before
    wiring it into a search.

    Returns
    -------
    The (non-negated) mean squared error.
    """
    mse = mean_squared_error(y, estimator.predict(X))
    print(f"MSE: {mse}")
    return mse


def set_model_training_pipeline(use_search=False):
    """Create the unfitted estimator that the training helpers will fit.

    Parameters
    ----------
    use_search : bool, default False
        ``False`` (the default) returns a plain ``XGBRegressor`` configured
        with ``eval_metric="mae"``. ``True`` returns a ``RandomizedSearchCV``
        that tunes that regressor over the hyper-parameter grid below, using
        a 3-fold ``TimeSeriesSplit`` and negative MAE as the score.
        This flag replaces toggling the behaviour by commenting a ``return``
        in and out.

    Returns
    -------
    XGBRegressor or RandomizedSearchCV
        An unfitted estimator.
    """
    model = XGBRegressor(eval_metric="mae")
    if not use_search:
        return model

    params = {
        "n_estimators": [200, 300, 500, 1000],
        "max_depth": [1, 3, 5, 15, 30],
        "learning_rate": [0.01, 0.05, 0.1]
    }

    # Every entry is a list, so candidates are drawn without replacement and
    # there can never be more of them than the grid holds (4 * 5 * 3 = 60).
    # Cap n_iter accordingly instead of requesting 80 and triggering a warning.
    n_candidates = 1
    for values in params.values():
        n_candidates *= len(values)

    return RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        scoring="neg_mean_absolute_error",
        n_iter=min(80, n_candidates),
        cv=TimeSeriesSplit(n_splits=3),
        n_jobs=-1,
        verbose=2,
        random_state=42
    )

def train_model(model, X_train, y_train, X_test, y_test):
    """Fit ``model`` on the training data and return it.

    Both the training pair and the test pair are handed to ``fit`` through
    ``eval_set`` (training first, test second), and ``verbose=100`` is
    forwarded as well; with XGBoost's scikit-learn API an integer ``verbose``
    sets how often the evaluation metric is logged.

    Returns
    -------
    The same ``model`` object that was passed in, after ``fit`` has run.
    """
    monitored_sets = [(X_train, y_train), (X_test, y_test)]
    model.fit(X_train, y_train, eval_set=monitored_sets, verbose=100)
    return model


def train_xgb_model(df, with_weather, target):
    """Train one model end to end on a year-based split of ``df``.

    Rows whose ``year`` is 2023 form the training set; rows whose ``year`` is
    2024 form the test set that is passed along to ``train_model``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must expose a ``year`` column plus whatever columns
        ``create_training_samples`` selects.
    with_weather : bool
        Forwarded to ``create_training_samples`` to pick the feature set.
    target
        Target column label, forwarded to ``create_training_samples``.

    Returns
    -------
    Whatever ``train_model`` returns for the estimator produced by
    ``set_model_training_pipeline``.
    """
    print(f"\nTraining model! with_weather: {with_weather}")

    # Year-based split: 2023 for fitting, 2024 held out.
    is_train_row = df.year == 2023
    is_test_row = df.year == 2024

    X_train, y_train = create_training_samples(df[is_train_row], with_weather, target)
    X_test, y_test = create_training_samples(df[is_test_row], with_weather, target)

    # Build a fresh estimator and fit it in one step.
    return train_model(set_model_training_pipeline(), X_train, y_train, X_test, y_test)
    
    
def save_model(model, path):
    """Serialize ``model`` to ``path`` using ``pickle``.

    Parameters
    ----------
    model
        Any picklable object.
    path
        Destination file path; it is opened in binary write mode, so an
        existing file is overwritten.
    """
    # The context manager closes (and therefore flushes) the handle even if
    # pickling raises, instead of leaving that to garbage collection.
    with open(path, "wb") as file:
        pickle.dump(model, file)
    
    
def calculate_error(y_test: np.ndarray, y_pred: np.ndarray):
    """Compute MAE, MSE, RMSE and R2 for a set of predictions.

    Parameters
    ----------
    y_test : np.ndarray
        Ground-truth values.
    y_pred : np.ndarray
        Predicted values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r2)``, each rounded to two decimals.
    """
    mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)
    mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
    r2 = r2_score(y_true=y_test, y_pred=y_pred)

    # Take the root of the *unrounded* MSE. Rounding first compounds the
    # error (an MSE of 0.0036 would round to 0.0 and report an RMSE of 0.0
    # instead of 0.06).
    rmse = np.sqrt(mse)

    return round(mae, 2), round(mse, 2), round(rmse, 2), round(r2, 2)

    
def print_error(model, df, with_weather, target='target_1'):
    """Evaluate ``model`` on the 2024 rows of ``df`` and print its metrics.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    df : pandas.DataFrame
        Full dataset; only rows whose ``year`` is 2024 are evaluated.
    with_weather : bool
        Feature-set selector; must match the one the model was trained with.
    target : str, default 'target_1'
        Target column to score against. It should be the column the model
        was trained on; the default keeps the previous behaviour, which
        always scored against ``'target_1'``.

    Returns
    -------
    None
        MAE, MSE, RMSE and R2 are printed, one per line.
    """
    # split the data
    test_df = df[df.year==2024]

    # create the test samples for the requested target
    X_test, y_test = create_training_samples(test_df, with_weather=with_weather, target=target)

    # predict
    y_pred = model.predict(X_test)

    # calculate the error
    mae, mse, rmse, r2 = calculate_error(y_test, y_pred)

    print(f"Mean Absolute Error: {mae}")
    print(f"Mean Squared Error: {mse}")
    print(f"Root Mean Squared Error: {rmse}")
    print(f"R2 Score: {r2}")

In [19]:
# Train on target_1 with the weather-inclusive feature set, then print metrics on the 2024 rows.
# NOTE(review): in the recorded output the printed MAE (4.83) equals the round-0 validation MAE,
# not the final one (7.16), and R2 is 0.0 — re-run from a fresh kernel to confirm the output
# still corresponds to this code, and check for overfitting (train MAE falls while validation rises).
xgb = train_xgb_model(df, with_weather=True, target='target_1')
print_error(xgb, df, with_weather=True)


Training model! with_weather: True
[0]	validation_0-mae:5.32029	validation_1-mae:4.83092
[99]	validation_0-mae:2.49694	validation_1-mae:7.16480
Mean Absolute Error: 4.83
Mean Squared Error: 63.7
Root Mean Squared Error: 7.98
R2 Score: 0.0


In [20]:
# Same experiment without the weather features, for comparison against the weather-inclusive run.
# NOTE(review): the recorded output shows the same pattern as the previous cell — printed MAE 4.83
# versus a final validation MAE of 7.44, R2 0.0 — confirm after a clean re-run.
xgb_no_weather = train_xgb_model(df, with_weather=False, target='target_1')
print_error(xgb_no_weather, df, with_weather=False)


Training model! with_weather: False
[0]	validation_0-mae:5.32065	validation_1-mae:4.82914
[99]	validation_0-mae:2.48328	validation_1-mae:7.43989
Mean Absolute Error: 4.83
Mean Squared Error: 63.74
Root Mean Squared Error: 7.98
R2 Score: 0.0
