In [1]:
from pathlib import Path
import pandas as pd

In [2]:
# Load the curated water-consumption dataset from Parquet.
# The path is relative, so it resolves against the notebook's working directory.
df_path = Path('../data/curated/water_consumption_curated.parquet')
df = pd.read_parquet(df_path)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,id,timestamp,second,minute,hour,day,weekday,week_of_year,month,year,...,max_temp_last_hour_c,min_temp_last_hour_c,max_dew_point_last_hour_c,min_dew_point_last_hour_c,max_humidity_last_hour_percentage,min_humidity_last_hour_percentage,relative_humidity_percentage,wind_direction_deg,max_wind_gust_m_s,wind_speed_m_s
0,1,2023-03-17 12:28:56,56,28,12,17,4,11,3,2023,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
1,2,2023-03-17 12:31:26,26,31,12,17,4,11,3,2023,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
2,3,2023-03-17 12:33:56,56,33,12,17,4,11,3,2023,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
3,4,2023-03-17 12:36:26,26,36,12,17,4,11,3,2023,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8
4,5,2023-03-17 12:38:56,56,38,12,17,4,11,3,2023,...,27.9,25.9,22.9,20.8,82.0,66.0,70.0,337.0,2.2,0.8


In [4]:
def get_avg_use_per_bomb_in_minutes_corrected(
    original_df: pd.DataFrame,
    peak_start_hour: int = 18,
    peak_end_hour: int = 21,
) -> pd.DataFrame:
    """Summarise each pump's average daily run time in peak and off-peak hours.

    For every row the pump state is multiplied by ``time_passed_seconds`` to
    get the seconds the pump ran during that sample. Those seconds are summed
    per calendar day, separately for peak and off-peak rows, and the daily
    totals are then averaged across days.

    Note: a day only contributes to an average if it has at least one row in
    the corresponding (peak / off-peak) subset; days without such rows are not
    counted as zero.

    Args:
        original_df: Frame with the columns ``timestamp``, ``hour``,
            ``pump_1``, ``pump_2`` and ``time_passed_seconds``. It is not
            modified. (Assumes ``pump_*`` are 0/1 or boolean flags and
            ``hour`` is the hour of day of ``timestamp`` -- TODO confirm
            against the curation step.)
        peak_start_hour: First hour (inclusive) of the peak window.
        peak_end_hour: Last hour (inclusive) of the peak window. With the
            defaults, rows whose ``hour`` is 18, 19, 20 or 21 are peak rows.

    Returns:
        A two-row DataFrame with the columns ``pump``,
        ``average_time_used_peak_hours`` and
        ``average_time_used_offpeak_hours``; the averages are formatted as
        ``"<h> hours and <m> minutes"`` (fractions of a minute are truncated).

    Raises:
        KeyError: If any required column is missing from ``original_df``.
        ValueError: If the peak window is not within 0..23 or is reversed.
    """
    pumps = ['pump_1', 'pump_2']
    required_columns = ['timestamp', 'hour', 'time_passed_seconds', *pumps]

    missing_columns = [col for col in required_columns if col not in original_df.columns]
    if missing_columns:
        raise KeyError(f"original_df is missing required columns: {missing_columns}")
    if not 0 <= peak_start_hour <= peak_end_hour <= 23:
        raise ValueError(
            "peak hours must satisfy 0 <= peak_start_hour <= peak_end_hour <= 23, "
            f"got {peak_start_hour} and {peak_end_hour}"
        )

    # Copy only the columns that are used, so the (wide) input frame is
    # neither mutated nor duplicated in full.
    df = original_df[required_columns].copy()

    # Calendar day of each sample; parsing also accepts string timestamps.
    df['date'] = pd.to_datetime(df['timestamp']).dt.date

    # Inclusive on both ends, i.e. start <= hour <= end.
    is_peak_hour = df['hour'].between(peak_start_hour, peak_end_hour)

    # Seconds each pump ran during the interval covered by the row.
    duration_columns = [f'{pump}_duration' for pump in pumps]
    for pump, duration_column in zip(pumps, duration_columns):
        df[duration_column] = df[pump] * df['time_passed_seconds']

    def average_daily_minutes(rows: pd.DataFrame) -> pd.Series:
        """Mean over days of the per-day summed run time, in minutes."""
        daily_totals = rows.groupby('date')[duration_columns].sum()
        return daily_totals.mean() / 60  # seconds -> minutes

    def convert_to_hours_and_minutes(minutes) -> str:
        """Format a minute count as '<h> hours and <m> minutes' (NaN -> zero)."""
        # NaN appears when a subset has no rows at all (mean of nothing).
        if pd.isna(minutes):
            return "0 hours and 0 minutes"
        hours, remaining_minutes = divmod(int(minutes), 60)
        return f"{hours} hours and {remaining_minutes} minutes"

    peak_avg = average_daily_minutes(df[is_peak_hour])
    off_peak_avg = average_daily_minutes(df[~is_peak_hour])

    # The column names below are part of the return contract; keep them stable.
    return pd.DataFrame({
        'pump': pumps,
        'average_time_used_peak_hours': [
            convert_to_hours_and_minutes(peak_avg[col]) for col in duration_columns
        ],
        'average_time_used_offpeak_hours': [
            convert_to_hours_and_minutes(off_peak_avg[col]) for col in duration_columns
        ],
    })

In [5]:
# Build the per-pump summary of average daily run time (peak vs. off-peak) and display it.
average_use_per_bomb_df_corrected = get_avg_use_per_bomb_in_minutes_corrected(df)
average_use_per_bomb_df_corrected

Unnamed: 0,pump,average_time_used_peak_hours,average_time_used_off_peak_hours
0,pump_1,1 hours and 24 minutes,7 hours and 57 minutes
1,pump_2,0 hours and 38 minutes,5 hours and 9 minutes
