In [1]:
import pandas as pd
import pickle

In [2]:
# Read the training dataset from parquet and preview its first rows.
training_data_path = "../data/training/water_consumption_training.parquet"
df = pd.read_parquet(training_data_path)
df.head()

Unnamed: 0,average_input_flow_rate_24_hours,average_change_reservoir_level_percentage_24_hours,average_total_liters_entered_24_hours,sum_total_liters_entered_last_24_hours,average_effective_liters_entered_24_hours,sum_effective_liters_entered_last_24_hours,average_total_liters_out_last_24_hours,sum_total_liters_out_last_24_hours,average_output_flow_rate_last_24_hours,average_pressure_last_24_hours,...,timestamp,second,minute,hour,day,weekday,week_of_year,month,year,output_flow_rate
0,44.306319,45.528108,8309.346181,4786183.4,2611.111111,1504000.0,8280.873958,4769783.4,43.716719,36.048941,...,2023-03-18 19:00:53,53,0,19,18,5,11,3,2023,36.67
1,44.192361,45.528003,8292.252431,4776337.4,2605.902778,1501000.0,8293.294097,4776937.4,43.799514,36.049479,...,2023-03-18 19:03:23,23,3,19,18,5,11,3,2023,93.33
2,44.078403,45.527309,8275.158681,4766491.4,2600.0,1497600.0,8282.103125,4770491.4,43.724913,36.049896,...,2023-03-18 19:05:53,53,5,19,18,5,11,3,2023,0.0
3,43.964444,45.524462,8258.064931,4756645.4,2594.791667,1494600.0,8286.537153,4773045.4,43.754479,36.050243,...,2023-03-18 19:08:23,23,8,19,18,5,11,3,2023,62.67
4,43.851181,45.519948,8241.075347,4746859.4,2594.791667,1494600.0,8286.214236,4772859.4,43.752326,36.05059,...,2023-03-18 19:10:53,53,10,19,18,5,11,3,2023,64.0


In [3]:
# Base names of the weather measurements. The derived column names built
# below (rolling averages and "last value" columns) are generated from these.
weather_columns = [
    'total_precip_mm',
    'station_pressure_mb',
    'max_pressure_last_hour_mb',
    'min_pressure_last_hour_mb',
    'global_radiation_kj_m2',
    'air_temp_c',
    'dew_point_temp_c',
    'max_temp_last_hour_c',
    'min_temp_last_hour_c',
    'max_dew_point_last_hour_c',
    'min_dew_point_last_hour_c',
    'max_humidity_last_hour_percentage',
    'min_humidity_last_hour_percentage',
    'relative_humidity_percentage',
    'wind_direction_deg',
    'max_wind_gust_m_s',
    'wind_speed_m_s'
]

# Window suffixes used in the averaged weather column names.
weather_windows = ['24_hours', '10_hours', '1_hour', '10_minutes']

# One averaged column name per (window, measurement) pair.
weather_feature_columns = [
    f'average_{col}_last_{window_name}'
    for window_name in weather_windows
    for col in weather_columns
]
# The `last_<measurement>` names do not depend on the window, so they are
# added once here rather than once per window (which produced duplicates).
weather_feature_columns.extend(f'last_{col}' for col in weather_columns)

# Exclude the timestamp and all weather-derived columns from the frame that
# is passed on as model input. `drop` raises KeyError if any name is missing.
columns_to_exclude = ['timestamp'] + weather_feature_columns
training_df = df.drop(columns=columns_to_exclude)
training_df.head()

Unnamed: 0,average_input_flow_rate_24_hours,average_change_reservoir_level_percentage_24_hours,average_total_liters_entered_24_hours,sum_total_liters_entered_last_24_hours,average_effective_liters_entered_24_hours,sum_effective_liters_entered_last_24_hours,average_total_liters_out_last_24_hours,sum_total_liters_out_last_24_hours,average_output_flow_rate_last_24_hours,average_pressure_last_24_hours,...,last_pump_2_status,second,minute,hour,day,weekday,week_of_year,month,year,output_flow_rate
0,44.306319,45.528108,8309.346181,4786183.4,2611.111111,1504000.0,8280.873958,4769783.4,43.716719,36.048941,...,0,53,0,19,18,5,11,3,2023,36.67
1,44.192361,45.528003,8292.252431,4776337.4,2605.902778,1501000.0,8293.294097,4776937.4,43.799514,36.049479,...,0,23,3,19,18,5,11,3,2023,93.33
2,44.078403,45.527309,8275.158681,4766491.4,2600.0,1497600.0,8282.103125,4770491.4,43.724913,36.049896,...,0,53,5,19,18,5,11,3,2023,0.0
3,43.964444,45.524462,8258.064931,4756645.4,2594.791667,1494600.0,8286.537153,4773045.4,43.754479,36.050243,...,0,23,8,19,18,5,11,3,2023,62.67
4,43.851181,45.519948,8241.075347,4746859.4,2594.791667,1494600.0,8286.214236,4772859.4,43.752326,36.05059,...,0,53,10,19,18,5,11,3,2023,64.0


In [4]:
# Feature list: every column of training_df except the target column.
target_column = 'output_flow_rate'
training_features = [name for name in training_df.columns if name != target_column]

In [5]:
# Load the pickled model. The context manager closes the file handle as soon
# as unpickling finishes instead of leaving it open until garbage collection.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file,
# so only load model artifacts that come from a trusted source.
with open("../models/xgb_flow_out_forecast_3.pkl", "rb") as model_file:
    model = pickle.load(model_file)

In [6]:
# Predict on the selected feature columns and store the result as a new
# column on the full frame, then preview the first rows.
feature_matrix = training_df[training_features]
df['output_flow_rate_forecast'] = model.predict(feature_matrix)
df.head()

Unnamed: 0,average_input_flow_rate_24_hours,average_change_reservoir_level_percentage_24_hours,average_total_liters_entered_24_hours,sum_total_liters_entered_last_24_hours,average_effective_liters_entered_24_hours,sum_effective_liters_entered_last_24_hours,average_total_liters_out_last_24_hours,sum_total_liters_out_last_24_hours,average_output_flow_rate_last_24_hours,average_pressure_last_24_hours,...,second,minute,hour,day,weekday,week_of_year,month,year,output_flow_rate,output_flow_rate_forecast
0,44.306319,45.528108,8309.346181,4786183.4,2611.111111,1504000.0,8280.873958,4769783.4,43.716719,36.048941,...,53,0,19,18,5,11,3,2023,36.67,36.720798
1,44.192361,45.528003,8292.252431,4776337.4,2605.902778,1501000.0,8293.294097,4776937.4,43.799514,36.049479,...,23,3,19,18,5,11,3,2023,93.33,94.762589
2,44.078403,45.527309,8275.158681,4766491.4,2600.0,1497600.0,8282.103125,4770491.4,43.724913,36.049896,...,53,5,19,18,5,11,3,2023,0.0,0.080314
3,43.964444,45.524462,8258.064931,4756645.4,2594.791667,1494600.0,8286.537153,4773045.4,43.754479,36.050243,...,23,8,19,18,5,11,3,2023,62.67,62.582661
4,43.851181,45.519948,8241.075347,4746859.4,2594.791667,1494600.0,8286.214236,4772859.4,43.752326,36.05059,...,53,10,19,18,5,11,3,2023,64.0,63.940014


In [7]:
# Display (rows, columns) of the frame after the forecast column was added.
df.shape

(124496, 147)

In [8]:
# Write the frame, including the forecast column, to the curated data folder.
# index=False leaves the DataFrame index out of the written file.
forecast_output_path = "../data/curated/water_consumption_forecasted.parquet"
df.to_parquet(forecast_output_path, index=False)