In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Relative location of the curated water-consumption parquet file.
df_path = Path('../data/curated_data/water_consumption_curated_1.parquet')
# Read the parquet file into a DataFrame.
df = pd.read_parquet(path=df_path)

In [3]:
def get_average_flow_out_across_day(original_df: pd.DataFrame) -> pd.DataFrame:
    """Average ``flow_out_(l/s)`` per hour of day, separately for weekdays and weekends.

    Rows are classified by ``timestamp.dt.dayofweek``: values below 5
    (Monday-Friday) are weekdays, values of 5 and above (Saturday-Sunday)
    are weekends. Within each class the flow is averaged per ``timestamp.dt.hour``.

    Args:
        original_df: Frame with a datetime-like ``timestamp`` column (it must
            support the ``.dt`` accessor) and a numeric ``flow_out_(l/s)``
            column. The frame is not modified.

    Returns:
        A new DataFrame with columns ``hour``, ``average_flow_out_(l/s)`` and
        ``type`` (``'weekdays'`` or ``'weekends'``). Weekday rows come first,
        each class sorted by hour, and the index is a fresh 0..n-1 range so
        labels are unique across both classes.

    Raises:
        KeyError: If ``timestamp`` or ``flow_out_(l/s)`` is missing.
        AttributeError: If ``timestamp`` is not datetime-like.
    """
    # Copy only the two columns that are needed; this keeps the caller's frame
    # untouched without duplicating every other column in memory.
    df = original_df[['timestamp', 'flow_out_(l/s)']].copy()
    df['hour'] = df['timestamp'].dt.hour
    day_of_week = df['timestamp'].dt.dayofweek

    # Both comparisons are False for missing timestamps, so such rows fall
    # into neither class.
    day_type_masks = {
        'weekdays': day_of_week < 5,
        'weekends': day_of_week >= 5,
    }

    averaged_parts = []
    for day_type, mask in day_type_masks.items():
        averaged = (
            df[mask]
            .groupby('hour')['flow_out_(l/s)']
            .mean()
            .reset_index()
            .rename(columns={'flow_out_(l/s)': 'average_flow_out_(l/s)'})
        )
        averaged['type'] = day_type
        averaged_parts.append(averaged)

    # ignore_index avoids repeating the labels 0..23 for both day types.
    return pd.concat(averaged_parts, ignore_index=True)

In [4]:
# Collapse the loaded readings into per-hour averages and display them.
# NOTE(review): `df` is rebound to the aggregated frame, which has no
# 'timestamp' column, so re-running this cell without first re-running the
# load cell will fail inside get_average_flow_out_across_day.
df = df.pipe(get_average_flow_out_across_day)
df.head(n=100)

Unnamed: 0,hour,average_flow_out_(l/s),type
0,0,27.611543,weekdays
1,1,25.07562,weekdays
2,2,25.063462,weekdays
3,3,23.032023,weekdays
4,4,23.727896,weekdays
5,5,27.374432,weekdays
6,6,30.720331,weekdays
7,7,33.832132,weekdays
8,8,35.11513,weekdays
9,9,35.874879,weekdays
