In [1]:
import pandas as pd

# Remove pandas' column/row display limits (None = no limit) so frames are
# rendered without truncation. With max_rows unset, displaying a large frame
# prints every row, so prefer .head()/.sample() when inspecting data.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
# Read the parquet file from ../data/silver and keep the loaded frame untouched;
# all further work happens on the `df` copy.
original_df = pd.read_parquet('../data/silver/water_consumption_silver.parquet')
df = original_df.copy()

In [3]:
def create_lag_features(input_df):
    """Append lagged copies of the sensor and weather columns.

    For every lag in a fixed list, each source column is shifted down by
    that many rows (``Series.shift(lag)``) and stored under a name prefixed
    with the lag. The input frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain every source column listed in the spec tables below;
        a missing column raises ``KeyError``.
        # assumes one row per hour in chronological order, as the
        # "hours_ago" names suggest -- TODO confirm against the caller

    Returns
    -------
    tuple
        ``(df, all_features, non_weather_features)`` where ``df`` is a copy
        of the input with the lag columns appended, ``all_features`` lists
        every new column name, and ``non_weather_features`` lists the same
        names minus the temperature / precipitation / humidity lags.
    """
    # NOTE(review): 26 looks like it may be a typo for 36 -- confirm.
    lag_hours = (1, 2, 3, 6, 12, 24, 26, 48, 72)

    # (output name template, source column).
    # NOTE(review): "agot" in three templates is a typo for "ago". The names
    # are kept verbatim because they become column names in the returned
    # frame; rename only together with every consumer of these columns.
    sensor_specs = (
        ('{lag}_hours_ago_input_flow_rate_first', 'input_flow_rate_first'),
        ('{lag}_hours_agot_input_flow_rate_last', 'input_flow_rate_last'),
        ('{lag}_hours_ago_input_flow_rate_mean', 'input_flow_rate_mean'),
        ('{lag}_hours_ago_reservoir_level_percentage_first', 'reservoir_level_percentage_first'),
        ('{lag}_hours_ago_reservoir_level_percentage_last', 'reservoir_level_percentage_last'),
        ('{lag}_hours_ago_reservoir_level_percentage_mean', 'reservoir_level_percentage_mean'),
        ('{lag}_hours_ago_output_flow_rate_first', 'output_flow_rate_first'),
        ('{lag}_hours_ago_output_flow_rate_last', 'output_flow_rate_last'),
        ('{lag}_hours_ago_output_flow_rate_mean', 'output_flow_rate_mean'),
        ('{lag}_hours_ago_pressure_first', 'pressure_first'),
        ('{lag}_hours_ago_pressure_last', 'pressure_last'),
        ('{lag}_hours_ago_pressure_mean', 'pressure_mean'),
        ('{lag}_hours_agot_pump_1_duration_sum', 'pump_1_duration_sum'),
        ('{lag}_hours_agot_pump_2_duration_sum', 'pump_2_duration_sum'),
    )
    weather_specs = (
        ('{lag}_hours_ago_temperature', 'air_temp_c'),
        ('{lag}_hours_ago_precipitation', 'total_precip_mm'),
        ('{lag}_hours_ago_humidity', 'relative_humidity_percentage'),
    )

    frame = input_df.copy()

    shifted_columns = {}
    all_features = []
    non_weather_features = []

    for hours in lag_hours:
        # Sensor lags first, then weather lags, so the column order within
        # each lag is stable.
        for specs, is_weather in ((sensor_specs, False), (weather_specs, True)):
            for template, source in specs:
                feature_name = template.format(lag=hours)
                shifted_columns[feature_name] = frame[source].shift(hours)
                all_features.append(feature_name)
                if not is_weather:
                    non_weather_features.append(feature_name)

    # Attach all new columns in one concat rather than inserting one by one.
    frame = pd.concat([frame, pd.DataFrame(shifted_columns)], axis=1)

    return frame, all_features, non_weather_features

In [4]:
def create_window_features(input_df):
    """Append rolling-window features for several window sizes.

    For each window size ``w`` in a fixed list:

    * for the five change/diff columns, the feature is the last value of
      the window minus its first value, i.e. ``x[t] - x[t - (w - 1)]``;
    * for the three weather columns, the feature is the rolling mean.

    A feature is NaN wherever the trailing window of ``w`` rows is not
    complete (fewer than ``w`` rows available, or a missing value anywhere
    inside the window). The input frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the change/diff and weather columns listed below; a
        missing column raises ``KeyError``.

    Returns
    -------
    tuple
        ``(df, all_features, non_weather_features)`` where ``df`` is a copy
        of the input with the window columns appended, ``all_features``
        lists every new column name, and ``non_weather_features`` lists the
        same names minus the weather means.
    """
    df = input_df.copy()

    window_sizes = (2, 3, 6, 12, 24, 36, 48, 72)

    # Columns whose feature is "last minus first" over the window; the
    # feature name reuses the source column name.
    span_columns = (
        'input_flow_rate_diff',
        'output_flow_rate_diff',
        'reservoir_level_change',
        'pressure_change',
        'change_from_last_hour_output_flow_rate_mean',
    )
    # (feature name suffix, source column) for the rolling means.
    weather_columns = (
        ('temperature', 'air_temp_c'),
        ('precipitation', 'total_precip_mm'),
        ('humidity', 'relative_humidity_percentage'),
    )

    all_features = []
    non_weather_features = []
    window_features = {}

    for window in window_sizes:
        for column in span_columns:
            name = f'{window}_hours_rolling_{column}'
            series = df[column]
            # Vectorized equivalent of
            # rolling(window).apply(lambda x: x.iloc[-1] - x.iloc[0]):
            # the first row of a trailing window ending at t is t-(window-1).
            span = series - series.shift(window - 1)
            # rolling() defaults min_periods to the window size, so the
            # rolling sum is NaN exactly where the window is incomplete or
            # contains a missing value; use it to mask those positions.
            window_is_complete = series.rolling(window).sum().notna()
            window_features[name] = span.where(window_is_complete)
            all_features.append(name)
            non_weather_features.append(name)

        for suffix, column in weather_columns:
            name = f'{window}_hours_rolling_{suffix}'
            window_features[name] = df[column].rolling(window).mean()
            all_features.append(name)

    # Attach all new columns in one concat rather than inserting one by one.
    df = pd.concat([df, pd.DataFrame(window_features)], axis=1)

    return df, all_features, non_weather_features

In [5]:
def create_targets(input_df, horizon=24, source_column='change_from_last_hour_output_flow_rate_mean'):
    """Append forward-shifted copies of a column as prediction targets.

    ``target_i`` holds the value of ``source_column`` from ``i`` rows later
    (``Series.shift(-i)``), for ``i`` in ``1..horizon``. The last ``i`` rows
    of ``target_i`` are therefore NaN. The input frame is not modified.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Frame containing ``source_column``; a missing column raises
        ``KeyError``.
    horizon : int, default 24
        Number of steps ahead to generate targets for.
    source_column : str, default 'change_from_last_hour_output_flow_rate_mean'
        Column the targets are derived from.

    Returns
    -------
    tuple
        ``(df, targets)`` where ``df`` is a copy of the input with the
        target columns appended and ``targets`` is the ordered list of
        target column names.
    """
    targets = [f'target_{step}' for step in range(1, horizon + 1)]

    source = input_df[source_column]
    future_values = {
        name: source.shift(-step)
        for step, name in enumerate(targets, start=1)
    }

    # assign() returns a new frame, adding all target columns in one call
    # (and overwriting same-named columns, as item assignment would).
    return input_df.assign(**future_values), targets

In [6]:
def create_training_dataset(input_df):
    """Build the model-ready table from the input frame.

    Runs the three feature builders in sequence (lags, rolling windows,
    targets), drops incomplete rows, and keeps only the date, feature and
    target columns.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Must contain the date columns listed below plus every column the
        feature builders read.

    Returns
    -------
    tuple
        ``(training_df, all_features, all_non_weather_features)`` where
        ``training_df`` has a fresh 0..n-1 index and columns
        ``all_features`` followed by the target columns;
        ``all_non_weather_features`` is the feature list without the
        weather-derived lag/window features.
    """
    with_lags, lag_all, lag_non_weather = create_lag_features(input_df.copy())
    with_windows, window_all, window_non_weather = create_window_features(with_lags)
    with_targets, targets = create_targets(with_windows)

    # NOTE(review): dropna() looks at every column of the frame, including
    # columns that are not part of the returned training set, so a missing
    # value in any of them also removes the row -- confirm this is intended.
    complete_rows = with_targets.dropna()

    date_features = ['timestamp', 'hour', 'day_of_week', 'week_of_year', 'year']
    all_features = date_features + lag_all + window_all
    all_non_weather_features = date_features + lag_non_weather + window_non_weather

    training_columns = all_features + targets
    training_frame = complete_rows[training_columns].reset_index(drop=True)

    return training_frame, all_features, all_non_weather_features

# Build the training table once from the working copy; the two feature lists
# are kept alongside it (with and without weather-derived features).
training_df, all_features, all_non_weather_features = create_training_dataset(df)

In [7]:
# Show the first row to check the generated feature and target columns.
training_df.head(1)

Unnamed: 0,timestamp,hour,day_of_week,week_of_year,year,1_hours_ago_input_flow_rate_first,1_hours_agot_input_flow_rate_last,1_hours_ago_input_flow_rate_mean,1_hours_ago_reservoir_level_percentage_first,1_hours_ago_reservoir_level_percentage_last,1_hours_ago_reservoir_level_percentage_mean,1_hours_ago_output_flow_rate_first,1_hours_ago_output_flow_rate_last,1_hours_ago_output_flow_rate_mean,1_hours_ago_pressure_first,1_hours_ago_pressure_last,1_hours_ago_pressure_mean,1_hours_agot_pump_1_duration_sum,1_hours_agot_pump_2_duration_sum,1_hours_ago_temperature,1_hours_ago_precipitation,1_hours_ago_humidity,2_hours_ago_input_flow_rate_first,2_hours_agot_input_flow_rate_last,2_hours_ago_input_flow_rate_mean,2_hours_ago_reservoir_level_percentage_first,2_hours_ago_reservoir_level_percentage_last,2_hours_ago_reservoir_level_percentage_mean,2_hours_ago_output_flow_rate_first,2_hours_ago_output_flow_rate_last,2_hours_ago_output_flow_rate_mean,2_hours_ago_pressure_first,2_hours_ago_pressure_last,2_hours_ago_pressure_mean,2_hours_agot_pump_1_duration_sum,2_hours_agot_pump_2_duration_sum,2_hours_ago_temperature,2_hours_ago_precipitation,2_hours_ago_humidity,3_hours_ago_input_flow_rate_first,3_hours_agot_input_flow_rate_last,3_hours_ago_input_flow_rate_mean,3_hours_ago_reservoir_level_percentage_first,3_hours_ago_reservoir_level_percentage_last,3_hours_ago_reservoir_level_percentage_mean,3_hours_ago_output_flow_rate_first,3_hours_ago_output_flow_rate_last,3_hours_ago_output_flow_rate_mean,3_hours_ago_pressure_first,3_hours_ago_pressure_last,3_hours_ago_pressure_mean,3_hours_agot_pump_1_duration_sum,3_hours_agot_pump_2_duration_sum,3_hours_ago_temperature,3_hours_ago_precipitation,3_hours_ago_humidity,6_hours_ago_input_flow_rate_first,6_hours_agot_input_flow_rate_last,6_hours_ago_input_flow_rate_mean,6_hours_ago_reservoir_level_percentage_first,6_hours_ago_reservoir_level_percentage_last,6_hours_ago_reservoir_level_percentage_mean,6_hours_ago_output_flow_rate_first,6_hours_ago_outp
ut_flow_rate_last,6_hours_ago_output_flow_rate_mean,6_hours_ago_pressure_first,6_hours_ago_pressure_last,6_hours_ago_pressure_mean,6_hours_agot_pump_1_duration_sum,6_hours_agot_pump_2_duration_sum,6_hours_ago_temperature,6_hours_ago_precipitation,6_hours_ago_humidity,12_hours_ago_input_flow_rate_first,12_hours_agot_input_flow_rate_last,12_hours_ago_input_flow_rate_mean,12_hours_ago_reservoir_level_percentage_first,12_hours_ago_reservoir_level_percentage_last,12_hours_ago_reservoir_level_percentage_mean,12_hours_ago_output_flow_rate_first,12_hours_ago_output_flow_rate_last,12_hours_ago_output_flow_rate_mean,12_hours_ago_pressure_first,12_hours_ago_pressure_last,12_hours_ago_pressure_mean,12_hours_agot_pump_1_duration_sum,12_hours_agot_pump_2_duration_sum,12_hours_ago_temperature,12_hours_ago_precipitation,12_hours_ago_humidity,24_hours_ago_input_flow_rate_first,24_hours_agot_input_flow_rate_last,24_hours_ago_input_flow_rate_mean,24_hours_ago_reservoir_level_percentage_first,24_hours_ago_reservoir_level_percentage_last,24_hours_ago_reservoir_level_percentage_mean,24_hours_ago_output_flow_rate_first,24_hours_ago_output_flow_rate_last,24_hours_ago_output_flow_rate_mean,24_hours_ago_pressure_first,24_hours_ago_pressure_last,24_hours_ago_pressure_mean,24_hours_agot_pump_1_duration_sum,24_hours_agot_pump_2_duration_sum,24_hours_ago_temperature,24_hours_ago_precipitation,24_hours_ago_humidity,26_hours_ago_input_flow_rate_first,26_hours_agot_input_flow_rate_last,26_hours_ago_input_flow_rate_mean,26_hours_ago_reservoir_level_percentage_first,26_hours_ago_reservoir_level_percentage_last,26_hours_ago_reservoir_level_percentage_mean,26_hours_ago_output_flow_rate_first,26_hours_ago_output_flow_rate_last,26_hours_ago_output_flow_rate_mean,26_hours_ago_pressure_first,26_hours_ago_pressure_last,26_hours_ago_pressure_mean,26_hours_agot_pump_1_duration_sum,26_hours_agot_pump_2_duration_sum,26_hours_ago_temperature,26_hours_ago_precipitation,26_hours_ago_humidity,48_hours_ago_input_flo
w_rate_first,48_hours_agot_input_flow_rate_last,48_hours_ago_input_flow_rate_mean,48_hours_ago_reservoir_level_percentage_first,48_hours_ago_reservoir_level_percentage_last,48_hours_ago_reservoir_level_percentage_mean,48_hours_ago_output_flow_rate_first,48_hours_ago_output_flow_rate_last,48_hours_ago_output_flow_rate_mean,48_hours_ago_pressure_first,48_hours_ago_pressure_last,48_hours_ago_pressure_mean,48_hours_agot_pump_1_duration_sum,48_hours_agot_pump_2_duration_sum,48_hours_ago_temperature,48_hours_ago_precipitation,48_hours_ago_humidity,72_hours_ago_input_flow_rate_first,72_hours_agot_input_flow_rate_last,72_hours_ago_input_flow_rate_mean,72_hours_ago_reservoir_level_percentage_first,72_hours_ago_reservoir_level_percentage_last,72_hours_ago_reservoir_level_percentage_mean,72_hours_ago_output_flow_rate_first,72_hours_ago_output_flow_rate_last,72_hours_ago_output_flow_rate_mean,72_hours_ago_pressure_first,72_hours_ago_pressure_last,72_hours_ago_pressure_mean,72_hours_agot_pump_1_duration_sum,72_hours_agot_pump_2_duration_sum,72_hours_ago_temperature,72_hours_ago_precipitation,72_hours_ago_humidity,2_hours_rolling_input_flow_rate_diff,2_hours_rolling_output_flow_rate_diff,2_hours_rolling_reservoir_level_change,2_hours_rolling_pressure_change,2_hours_rolling_change_from_last_hour_output_flow_rate_mean,2_hours_rolling_temperature,2_hours_rolling_precipitation,2_hours_rolling_humidity,3_hours_rolling_input_flow_rate_diff,3_hours_rolling_output_flow_rate_diff,3_hours_rolling_reservoir_level_change,3_hours_rolling_pressure_change,3_hours_rolling_change_from_last_hour_output_flow_rate_mean,3_hours_rolling_temperature,3_hours_rolling_precipitation,3_hours_rolling_humidity,6_hours_rolling_input_flow_rate_diff,6_hours_rolling_output_flow_rate_diff,6_hours_rolling_reservoir_level_change,6_hours_rolling_pressure_change,6_hours_rolling_change_from_last_hour_output_flow_rate_mean,6_hours_rolling_temperature,6_hours_rolling_precipitation,6_hours_rolling_humidity,12_hours_rollin
g_input_flow_rate_diff,12_hours_rolling_output_flow_rate_diff,12_hours_rolling_reservoir_level_change,12_hours_rolling_pressure_change,12_hours_rolling_change_from_last_hour_output_flow_rate_mean,12_hours_rolling_temperature,12_hours_rolling_precipitation,12_hours_rolling_humidity,24_hours_rolling_input_flow_rate_diff,24_hours_rolling_output_flow_rate_diff,24_hours_rolling_reservoir_level_change,24_hours_rolling_pressure_change,24_hours_rolling_change_from_last_hour_output_flow_rate_mean,24_hours_rolling_temperature,24_hours_rolling_precipitation,24_hours_rolling_humidity,36_hours_rolling_input_flow_rate_diff,36_hours_rolling_output_flow_rate_diff,36_hours_rolling_reservoir_level_change,36_hours_rolling_pressure_change,36_hours_rolling_change_from_last_hour_output_flow_rate_mean,36_hours_rolling_temperature,36_hours_rolling_precipitation,36_hours_rolling_humidity,48_hours_rolling_input_flow_rate_diff,48_hours_rolling_output_flow_rate_diff,48_hours_rolling_reservoir_level_change,48_hours_rolling_pressure_change,48_hours_rolling_change_from_last_hour_output_flow_rate_mean,48_hours_rolling_temperature,48_hours_rolling_precipitation,48_hours_rolling_humidity,72_hours_rolling_input_flow_rate_diff,72_hours_rolling_output_flow_rate_diff,72_hours_rolling_reservoir_level_change,72_hours_rolling_pressure_change,72_hours_rolling_change_from_last_hour_output_flow_rate_mean,72_hours_rolling_temperature,72_hours_rolling_precipitation,72_hours_rolling_humidity,target_1,target_2,target_3,target_4,target_5,target_6,target_7,target_8,target_9,target_10,target_11,target_12,target_13,target_14,target_15,target_16,target_17,target_18,target_19,target_20,target_21,target_22,target_23,target_24
0,2023-03-20 13:00:00,13,0,12,2023,66.46,65.64,66.378182,37.56,42.5,40.150909,46.46,65.64,51.77204,30.49,30.15,30.325909,3600.0,3600.0,27.9,0.0,66.0,67.3,67.3,67.480455,32.1,37.26,34.81,47.3,67.3,52.054911,30.92,30.52,30.767273,3600.0,3600.0,25.1,0.0,79.0,0.0,68.15,40.894348,35.16,31.5,30.617391,0.0,68.15,48.754027,31.49,30.92,31.214783,2099.0,2099.0,21.7,0.0,91.0,68.59,67.3,67.117083,41.06,49.56,45.415,31.256667,67.3,41.950417,38.47,38.53,38.555417,0.0,3600.0,22.4,0.0,86.0,64.46,0.0,15.293333,58.46,52.16,57.327619,47.793333,38.0,29.928254,31.72,30.85,31.402381,900.0,900.0,24.8,0.0,72.0,68.59,67.72,67.741429,29.26,34.26,31.89619,53.923333,47.72,52.503333,39.03,39.37,39.234762,0.0,3450.0,26.1,0.0,81.0,45.63,45.63,45.63,62.26,46.66,54.14,90.963333,45.63,94.296667,38.43,38.63,38.527647,0.0,3450.0,22.4,0.0,87.0,53.64,67.72,65.370476,30.9,32.2,30.651429,122.306667,47.85245,64.493255,39.87,32.39,36.192857,0.0,3301.0,27.8,0.0,77.0,65.64,65.24,65.587143,39.0,44.4,41.617143,45.64,49.24,50.704846,38.13,38.27,38.181429,0.0,3682.0,28.9,0.0,59.0,-1.24,2.76,0.96,8.31,-2.662902,28.55,0.0,64.0,-2.06,1.94,0.74,8.37,-6.246656,27.4,0.0,69.0,63.18,41.846667,5.36,14.68,-3.89886,24.616667,0.0,79.5,-2.06,21.94,15.66,8.64,-1.866003,23.85,0.0,79.916667,-0.84,13.16,0.7,7.57,0.530288,26.1375,0.0,71.583333,-2.06,-11.393333,16.5,7.47,2.06872,24.45,0.0,78.111111,-0.84,23.982799,5.4,7.97,8.947285,25.510417,0.0,75.541667,-1.24,73.54125,0.44,7.74,-2.502555,25.615278,0.0,76.083333,-4.160296,-2.831518,0.512819,1.426672,6.314316,63.215072,-7.953333,-59.65,-5.005238,-0.971927,-3.154293,-12.108542,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [14]:
# Full feature list: date columns, then lag features, then window features.
print(all_features)

['timestamp', 'hour', 'day_of_week', 'week_of_year', 'year', '1_hours_ago_input_flow_rate_first', '1_hours_agot_input_flow_rate_last', '1_hours_ago_input_flow_rate_mean', '1_hours_ago_reservoir_level_percentage_first', '1_hours_ago_reservoir_level_percentage_last', '1_hours_ago_reservoir_level_percentage_mean', '1_hours_ago_output_flow_rate_first', '1_hours_ago_output_flow_rate_last', '1_hours_ago_output_flow_rate_mean', '1_hours_ago_pressure_first', '1_hours_ago_pressure_last', '1_hours_ago_pressure_mean', '1_hours_agot_pump_1_duration_sum', '1_hours_agot_pump_2_duration_sum', '1_hours_ago_temperature', '1_hours_ago_precipitation', '1_hours_ago_humidity', '2_hours_ago_input_flow_rate_first', '2_hours_agot_input_flow_rate_last', '2_hours_ago_input_flow_rate_mean', '2_hours_ago_reservoir_level_percentage_first', '2_hours_ago_reservoir_level_percentage_last', '2_hours_ago_reservoir_level_percentage_mean', '2_hours_ago_output_flow_rate_first', '2_hours_ago_output_flow_rate_last', '2_hours

In [15]:
# Same list with the weather-derived lag/window features left out.
print(all_non_weather_features)

['timestamp', 'hour', 'day_of_week', 'week_of_year', 'year', '1_hours_ago_input_flow_rate_first', '1_hours_agot_input_flow_rate_last', '1_hours_ago_input_flow_rate_mean', '1_hours_ago_reservoir_level_percentage_first', '1_hours_ago_reservoir_level_percentage_last', '1_hours_ago_reservoir_level_percentage_mean', '1_hours_ago_output_flow_rate_first', '1_hours_ago_output_flow_rate_last', '1_hours_ago_output_flow_rate_mean', '1_hours_ago_pressure_first', '1_hours_ago_pressure_last', '1_hours_ago_pressure_mean', '1_hours_agot_pump_1_duration_sum', '1_hours_agot_pump_2_duration_sum', '2_hours_ago_input_flow_rate_first', '2_hours_agot_input_flow_rate_last', '2_hours_ago_input_flow_rate_mean', '2_hours_ago_reservoir_level_percentage_first', '2_hours_ago_reservoir_level_percentage_last', '2_hours_ago_reservoir_level_percentage_mean', '2_hours_ago_output_flow_rate_first', '2_hours_ago_output_flow_rate_last', '2_hours_ago_output_flow_rate_mean', '2_hours_ago_pressure_first', '2_hours_ago_pressure