---
# Setup

In [1]:
import os
import sys
import logging
import datetime
import calendar
from typing import (
    List,
    Dict,
    Tuple,
    Any,
    Optional,
    Callable,
    Iterable,
)

import dateutil
import pandas as pd
import numpy as np

from IPython.display import display, HTML

## Logging

In [2]:
# logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
# Named logger used by the feature-extraction functions in this notebook.
# No handler or level is configured here (the basicConfig call above is disabled).
logger = logging.getLogger("analysis")

## PYTHONPATH

In [3]:
# Make the modules under ../../../lib (relative to the current working directory)
# importable; the project-local imports in the next cell rely on this.
sys.path.append(os.getcwd() + "/../../../lib")

In [4]:
%load_ext autoreload
%autoreload 2
from constant import (
    TYPE_FLOAT,
)
from util_datetime import (
    get_datetime_components,
    convert_date_into_datetime,
    convert_time_into_timedelta,
    convert_date_time_into_datetime,
    parse_date_string,
    parse_time_string,
    get_dates_from_string,
    has_date_in_string,
    parse_datetime_string,
    get_epoch_from_datetime,
    get_epoch_from_string,
    get_seconds_between_datetimes,
    get_datetime_after_duration,
    get_elapsed_time,
    get_holidays,
    is_holiday,
    is_weekend,
    get_cyclic_time_of_day,
    get_cyclic_day_of_week,
    get_cyclic_month_of_year,
)

## Pandas

In [5]:
# pd.set_option('display.max_rows', None)
# Show every column when a DataFrame is rendered; the row limit is left at its default.
pd.set_option('display.max_columns', None)

## Constant

In [7]:
# Input JSON file, given as a relative path (resolved against the working directory).
PATH_TO_DATA: str = "../data/merged.json"

In [64]:
# Location
# Country code passed to is_holiday() when flagging public holidays.
COUNTRY_CODE: str = "AU"

# Actors
COLUMN_FACILITY_CODE: str = "facility_code"
COLUMN_FACILITY: str = 'facility'
COLUMN_SUPPLIER_CODE: str = 'supplier_code'
# Timing
COLUMN_PROCESS_DATE: str = 'process_date'
COLUMN_START_TIME: str = 'start_date_time'
COLUMN_START_HOUR: str = "start_hour"
COLUMN_START_TIME_SINX: str = "start_time_sin_x"
COLUMN_START_TIME_COSY: str = "start_time_cos_y"
COLUMN_END_TIME: str = "end_date_time"
COLUMN_WEEKDAY: str = "dayofweek"
COLUMN_WEEKDAY_SINX: str = "dayofweek_sin_x"
COLUMN_WEEKDAY_COSY: str = "dayofweek_cos_y"
COLUMN_IS_HOLIDAY: str = 'is_holiday'  # 1 on public holidays or weekends, else 0
# Performance
COLUMN_PROCESS_TIME: str = "process_time"  # end - start, in seconds
COLUMN_INPUT: str = "input"  # from suppliedM3
COLUMN_OUTPUT: str = "output"  # from recoveredM3
COLUMN_THROUGHPUT: str = 'throughput'  # recoveredM3 per minute of process time
COLUMN_RECOVERY_RATE: str = 'recovery_rate'  # recoveredM3 / suppliedM3

In [62]:
# Utility
# DEBUG gates the optional display(...) calls throughout the notebook.
DEBUG: bool = False
# Divisor used to convert a process time in seconds into minutes.
SECONDS_IN_MIN: int = 60

---
# Data



In [8]:
# Load the raw records. convert_dates=False stops pandas from parsing date-like
# columns, so date/timeStart/timeEnd stay as plain strings for the parsers used later.
# Other dtypes are still inferred (suppliedM3/recoveredM3 load as float64).
raw_df = pd.read_json(
    PATH_TO_DATA,
    convert_dates=False
)
raw_df

Unnamed: 0,facility,date,timeStart,timeEnd,supplierCode,suppliedM3,recoveredM3,processTime,supplier
0,Newcastle,"Aug 1, 2022",8:29:00 AM,9:07:00 AM,har,2.00,1.55,,
1,Newcastle,"Aug 1, 2022",9:27:00 AM,11:28:00 AM,dic,6.80,4.15,,
2,Newcastle,"Aug 1, 2022",11:38:00 AM,12:21:00 PM,har,1.95,1.55,,
3,Newcastle,"Aug 1, 2022",12:40:00 PM,2:04:00 PM,tom,3.95,2.55,,
4,Newcastle,"Aug 1, 2022",2:25:00 PM,4:29:00 PM,dic,5.30,3.10,,
...,...,...,...,...,...,...,...,...,...
805,Newcastle,"Sep 30, 2022",11:40:00 AM,12:41:00 PM,tom,3.70,2.35,,
806,Newcastle,"Sep 30, 2022",12:52:00 PM,2:36:00 PM,dic,6.35,4.55,,
807,Bundaberg,,9/30/22 1:48 PM,,,4.53,2.73,3:40,Mary Therese
808,Newcastle,"Sep 30, 2022",3:02:00 PM,3:42:00 PM,har,2.00,1.45,,


### Clone supplierCode into supplier when supplier column does not exist

In [9]:
# When the data has no supplier column at all, reuse supplierCode as the supplier.
supplier_column_missing = 'supplier' not in raw_df.columns
if supplier_column_missing:
    raw_df['supplier'] = raw_df['supplierCode']

### Add an all-NaN processTime column if it does not exist

In [10]:
# Guarantee that a processTime column exists so later cells can read it
# unconditionally. When the data lacks one, every row gets NaN.
# (The former intermediate copy of supplierCode was overwritten immediately and
# only added a needless dependency on that column.)
if 'processTime' not in raw_df.columns:
    raw_df['processTime'] = np.nan

In [11]:
# Column dtypes and non-null counts of the raw data.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   facility      810 non-null    object 
 1   date          481 non-null    object 
 2   timeStart     810 non-null    object 
 3   timeEnd       481 non-null    object 
 4   supplierCode  481 non-null    object 
 5   suppliedM3    810 non-null    float64
 6   recoveredM3   810 non-null    float64
 7   processTime   329 non-null    object 
 8   supplier      329 non-null    object 
dtypes: float64(2), object(7)
memory usage: 57.1+ KB


In [12]:
# Distinct facility names; get_numeric_facility_code() must map each of them.
raw_df['facility'].unique()

array(['Newcastle', 'Bundaberg'], dtype=object)

In [13]:
# Distinct supplier codes, including NaN for rows that carry no code.
raw_df['supplierCode'].unique()

array(['har', 'dic', 'tom', nan], dtype=object)

In [14]:
# Distinct supplier names; evaluates to None when the column is absent.
raw_df['supplier'].unique() if 'supplier' in raw_df.columns else None

array([nan, 'Mary Therese', 'Mary Jane', 'Mary', 'Mary Anne'],
      dtype=object)

In [15]:
# Render the raw frame only when debugging.
if DEBUG:
    display(raw_df)

---
# Feature extractions


## Start Time

In [16]:
def get_start_date_time(row: pd.Series) -> pd.Series:
    """Build the start date/time of one raw record.

    Args:
        row: a raw record providing the 'timeStart' and 'date' columns.

    Returns:
        The start as a datetime.datetime, or NaN to mark the row as invalid:
        'timeStart' is null, or 'timeStart' holds no date and 'date' is null.
    """
    name: str = "get_start_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    raw_start = row['timeStart']

    # Without timeStart there is no way to recover the start time.
    if pd.isnull(raw_start):
        return np.nan

    # timeStart already includes the date: parse it as a whole.
    if has_date_in_string(raw_start):
        combined = parse_datetime_string(raw_start)
        logger.debug("%s: start_date_time is [%s]", name, combined)
        return combined

    # timeStart is time-only, so the 'date' column must supply the date.
    if pd.isnull(row['date']):
        return np.nan

    # Date part: parse the 'date' column and drop whatever time comes with it.
    parsed_date = parse_datetime_string(row['date'])
    assert isinstance(parsed_date, datetime.datetime)
    day: datetime.date = parsed_date.date()
    logger.debug("%s: date is [%s]", name, day)

    # Time part: parse 'timeStart' and keep only its time of day.
    parsed_time = parse_datetime_string(raw_start)
    assert isinstance(parsed_time, datetime.datetime)
    clock: datetime.time = parsed_time.time()
    logger.debug("%s: start_time is [%s]", name, clock)

    combined = convert_date_time_into_datetime(day, clock)
    logger.debug("%s: start_date_time is [%s]", name, combined)
    return combined


In [17]:
# Derive the start date/time for every raw record (row-wise) and label the result
# with the output column name.
start_date_time: pd.Series = raw_df.apply(func=get_start_date_time, axis=1)
start_date_time.name = COLUMN_START_TIME
if DEBUG:
    display(start_date_time)

## End Time

In [18]:
# Working frame for deriving the end time: the parsed start plus the two raw
# columns that get_end_date_time() reads (timeEnd and processTime).
interim_end_time_df: pd.DataFrame = pd.DataFrame({
    COLUMN_START_TIME: start_date_time,
    'processTime': raw_df['processTime'],
    'timeEnd': raw_df['timeEnd']
})
    
if DEBUG:
    display(interim_end_time_df)

In [19]:
def get_end_date_time(row: pd.Series) -> pd.Series:
    """Derive the end date/time of one record.

    The end is taken from 'timeEnd' when it is present; otherwise it is
    computed as start_date_time + 'processTime'.

    Args:
        row: a row providing COLUMN_START_TIME (already a datetime),
            'timeEnd' and 'processTime'.

    Returns:
        The end as a datetime.datetime, or NaN when both 'timeEnd' and
        'processTime' are null, or when parsing 'processTime' raises ValueError.

    Raises:
        AssertionError: if the start is not a datetime.datetime, if a time-only
            'timeEnd' does not parse into a datetime, or if 'processTime' is
            needed but is not a string.
    """
    name: str = "get_end_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    assert isinstance(row[COLUMN_START_TIME], datetime.datetime), \
        f"expected start_date_time as datetime.datetime, got [{type(row[COLUMN_START_TIME])}]."
    start_date_time: datetime.datetime = row[COLUMN_START_TIME]

    # --------------------------------------------------------------------------------
    # If neither timeEnd nor processTime is available, the row is invalid.
    # --------------------------------------------------------------------------------
    if pd.isnull(row['timeEnd']) and pd.isnull(row['processTime']):
        return np.nan

    # Begin end_date_time extraction
    end_date_time: datetime.datetime = np.nan

    if not pd.isnull(row['timeEnd']):
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, timeEnd)
        # --------------------------------------------------------------------------------
        if has_date_in_string(row['timeEnd']):
            # timeEnd already includes the date.
            end_date_time = parse_datetime_string(row['timeEnd'])
            logger.debug("%s: end_date_time is [%s]", name, end_date_time)

        else:
            # timeEnd is a time-only expression; the date comes from the start time.
            _dummy_end_date_time: datetime.datetime = parse_datetime_string(row['timeEnd'])
            assert isinstance(_dummy_end_date_time, datetime.datetime)

            # Take only the time part; the date the parser filled in is discarded.
            _time: datetime.time = _dummy_end_date_time.time()
            logger.debug("%s: timeEnd is [%s]", name, _time)

            end_date_time = convert_date_time_into_datetime(
                date_in_year=start_date_time.date(),
                time_in_day=_time
            )

            # An end that is not after the start is treated as a process that
            # crossed midnight, so the end is moved to the following day.
            if end_date_time <= start_date_time:
                logger.warning(
                    "%s: end_date_time [%s] is before start_date_time [%s]",
                    name, end_date_time, start_date_time
                )
                # Advance the end_date_time by 24h.
                end_date_time = end_date_time + datetime.timedelta(days=1)
                logger.debug("%s: end_date_time is [%s]", name, end_date_time)

    else:
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, processTime)
        # NOTE(review): the expected processTime format is whatever
        # parse_time_string() accepts. In the displayed output a value of "3:40"
        # adds 3 min 40 s to the start; confirm that this is the intended unit.
        # --------------------------------------------------------------------------------
        try:
            assert isinstance(row['processTime'], str), \
                f"expected row['processTime'] of type str, got [{type(row['processTime'])}]"

            _time: datetime.time = parse_time_string(row['processTime'])
            delta: datetime.timedelta = convert_time_into_timedelta(time_in_day=_time)
            end_date_time = start_date_time + delta

        except ValueError:
            # Report through the notebook's logger (not the root logger) and
            # mark the row as invalid.
            logger.error("%s: invalid time expression [%s]", name, row['processTime'])
            end_date_time = np.nan

    # End end_date_time extraction

    return end_date_time


In [20]:
# Derive the end date/time row by row and append it as the last column.
# NOTE: insert(..., allow_duplicates=False) raises if the column already exists,
# so re-running this cell requires rebuilding interim_end_time_df first.
end_date_time: pd.Series = interim_end_time_df.apply(func=get_end_date_time, axis=1)
end_date_time.name = COLUMN_END_TIME
interim_end_time_df.insert(
    loc=len(interim_end_time_df.columns),
    column=COLUMN_END_TIME,
    value=end_date_time,
    allow_duplicates=False
)
interim_end_time_df

Unnamed: 0,start_date_time,processTime,timeEnd,end_date_time
0,2022-08-01 08:29:00,,9:07:00 AM,2022-08-01 09:07:00
1,2022-08-01 09:27:00,,11:28:00 AM,2022-08-01 11:28:00
2,2022-08-01 11:38:00,,12:21:00 PM,2022-08-01 12:21:00
3,2022-08-01 12:40:00,,2:04:00 PM,2022-08-01 14:04:00
4,2022-08-01 14:25:00,,4:29:00 PM,2022-08-01 16:29:00
...,...,...,...,...
805,2022-09-30 11:40:00,,12:41:00 PM,2022-09-30 12:41:00
806,2022-09-30 12:52:00,,2:36:00 PM,2022-09-30 14:36:00
807,2022-09-30 13:48:00,3:40,,2022-09-30 13:51:40
808,2022-09-30 15:02:00,,3:42:00 PM,2022-09-30 15:42:00


In [21]:
# When debugging, show the raw records whose end time could not be derived;
# otherwise release the working frame.
# NOTE: after the `del`, cells that read interim_end_time_df cannot be re-run
# without rebuilding it first.
if DEBUG:
    display(raw_df[interim_end_time_df['end_date_time'].isnull()])
else:
    del interim_end_time_df

## Process Time

In [22]:
# Working frame pairing each start with its end, the two inputs of get_process_time().
interim_process_time_df: pd.DataFrame = pd.DataFrame({
    COLUMN_START_TIME: start_date_time,
    COLUMN_END_TIME: end_date_time,
})
if DEBUG:
    display(interim_process_time_df)

In [23]:
def get_process_time(row: pd.Series) -> pd.Series:
    """Get the process time of one record in seconds.

    The process time is end time - start time.

    Args:
        row: a row providing COLUMN_START_TIME and COLUMN_END_TIME as datetimes.

    Returns:
        The elapsed time in seconds, converted with TYPE_FLOAT.

    Raises:
        AssertionError: if either value is not a datetime.datetime, or their
            difference is not a datetime.timedelta.
    """
    assert isinstance(row[COLUMN_START_TIME], datetime.datetime)
    assert isinstance(row[COLUMN_END_TIME], datetime.datetime)

    delta: datetime.timedelta = row[COLUMN_END_TIME] - row[COLUMN_START_TIME]
    # The braces around type(delta) were missing, so the message used to print
    # the literal text "[type(delta)]" instead of the offending type.
    assert isinstance(delta, datetime.timedelta), \
        f"expected delta as type datetime.timedelta, got {delta} of type [{type(delta)}]"

    return TYPE_FLOAT(delta.total_seconds())

In [24]:
# Process time per record in seconds (see get_process_time), labelled with the
# output column name.
process_time: pd.Series = interim_process_time_df.apply(func=get_process_time, axis=1)
process_time.name = COLUMN_PROCESS_TIME
if DEBUG:
    display(process_time)

In [25]:
# Append the process time as the last column of the working frame, then either
# show the frame (debugging) or release it.
# NOTE: insert(..., allow_duplicates=False) raises if the column already exists,
# and the `del` removes the frame, so this cell cannot simply be re-run on its own.
interim_process_time_df.insert(
    loc=len(interim_process_time_df.columns),
    column=COLUMN_PROCESS_TIME,
    value=process_time,
    allow_duplicates=False
)

if DEBUG:
    display(interim_process_time_df)
else:
    del interim_process_time_df

## Process Date

In [26]:
def get_process_date(row: pd.Series):
    """Reduce the start date/time of one record to its date.

    Args:
        row: a row providing COLUMN_START_TIME.

    Returns:
        The result of convert_date_into_datetime() applied to the date part of
        the start, or NaN when the start is null.
    """
    started_at = row[COLUMN_START_TIME]
    if pd.isnull(started_at):
        return np.nan
    return convert_date_into_datetime(started_at.date())

In [27]:
# Process date per record. to_frame() turns the Series into a one-column frame so
# that get_process_date() can look the value up by column name in each row.
process_date: pd.Series = start_date_time.to_frame().apply(func=get_process_date, axis=1)
process_date.name = COLUMN_PROCESS_DATE
if DEBUG:
    display(process_date)

## Supplier Code

In [28]:
def get_code_from_supplier(supplier: str):
    """Translate a supplier's full name into its three-letter supplier code.

    The lookup is case-insensitive.

    Args:
        supplier: supplier name, e.g. "Mary Therese". A missing value
            (NaN/None) or any other non-string is accepted.

    Returns:
        The code (e.g. "mat"), or NaN when the name is missing or unknown.
    """
    supplier_to_code = {
        "mary therese": "mat",
        "mary": "mar",
        'mary anne': "maa",
        'mary jane': 'maj',
        'dick tracey': 'dic',
        'tom hanks': 'tom',
        'harry houdini': 'har',
    }
    # A missing supplier arrives as NaN (a float), which has no .lower();
    # report it as unknown instead of raising AttributeError.
    if not isinstance(supplier, str):
        return np.nan
    return supplier_to_code.get(supplier.lower(), np.nan)

def get_numeric_supplier_code(code: str):
    """Map a three-letter supplier code to its integer label.

    The lookup is case-insensitive.

    Args:
        code: supplier code such as "har". NaN/None (e.g. the NaN produced when
            a supplier name could not be resolved) or any other non-string is
            accepted.

    Returns:
        The integer label, or NaN when the code is missing or unknown.
    """
    str_to_num = {
        "mat": 0,
        "mar": 1,
        "maa": 2,
        "maj": 3,
        "har": 4,
        "dic": 5,
        "tom": 6,
    }
    # An unresolved code arrives as NaN (a float), which has no .lower();
    # report it as unknown instead of raising AttributeError.
    if not isinstance(code, str):
        return np.nan
    return str_to_num.get(code.lower(), np.nan)
    
def get_supplier_code(row: pd.Series):
    """Resolve the numeric supplier code of one raw record.

    'supplierCode' is used when present; otherwise the code is looked up from
    the 'supplier' name.

    Args:
        row: a raw record providing 'supplierCode' and 'supplier'.

    Returns:
        The value get_numeric_supplier_code() gives for the resolved code.
    """
    code: str
    # pd.isnull() recognises every missing-value marker (None, any float NaN,
    # pd.NA, NaT). The former `not in (np.nan, None)` test only matched the
    # np.nan singleton itself, because NaN never compares equal to NaN.
    if not pd.isnull(row['supplierCode']):
        code = row['supplierCode']
    else:
        code = get_code_from_supplier(row['supplier'])

    return get_numeric_supplier_code(code)

In [29]:
# Numeric supplier code per raw record, labelled with the output column name.
supplier_code: pd.Series = raw_df.apply(func=get_supplier_code, axis=1)
supplier_code.name = COLUMN_SUPPLIER_CODE
if DEBUG:
    display(supplier_code)

## Facility

In [30]:
def get_numeric_facility_code(code: str):
    """Map a facility name to its integer label (case-insensitive).

    Args:
        code: facility name, e.g. "Newcastle".

    Returns:
        0 for Newcastle, 1 for Bundaberg, NaN for any other name.
    """
    facility_numbers = dict(newcastle=0, bundaberg=1)
    key = code.lower()
    if key in facility_numbers:
        return facility_numbers[key]
    return np.nan
    
def get_facility_code(row: pd.Series):
    """Resolve the numeric facility code of one raw record.

    Args:
        row: a raw record providing 'facility'.

    Returns:
        The value get_numeric_facility_code() gives for the facility name; a
        missing facility is looked up under the placeholder name "n/a".
    """
    code: str
    # pd.isnull() recognises every missing-value marker (None, any float NaN,
    # pd.NA, NaT). The former `not in (np.nan, None)` test only matched the
    # np.nan singleton itself, because NaN never compares equal to NaN.
    if not pd.isnull(row['facility']):
        code = row['facility']
    else:
        code = "n/a"

    return get_numeric_facility_code(code)

In [31]:
# Numeric facility code per raw record, labelled with the output column name.
facility_code: pd.Series = raw_df.apply(func=get_facility_code, axis=1)
facility_code.name = COLUMN_FACILITY_CODE
if DEBUG:
    display(facility_code)

## Throughput

In [32]:
# M3 / min
throughput: pd.Series = raw_df['recoveredM3'] / (process_time / SECONDS_IN_MIN)
throughput.name = COLUMN_THROUGHPUT

## Recovery Rate

Recovery rate = output / input, i.e. `recoveredM3 / suppliedM3`.

In [33]:
# Fraction of the supplied volume that was recovered, per record.
recovery_rate: pd.Series = (
    raw_df['recoveredM3'] / raw_df['suppliedM3']
).rename(COLUMN_RECOVERY_RATE)

## Weekday

In [34]:
def get_weekday(row: pd.Series):
    """Day of the week of the start time.

    Args:
        row: a row providing COLUMN_START_TIME as a datetime.

    Returns:
        The weekday as an integer (0: Mon, 6: Sun).

    Raises:
        AssertionError: if the start is not a datetime.datetime.
    """
    date_time: datetime.datetime = row[COLUMN_START_TIME]
    # Report the type actually received; the message used to print
    # type(datetime.datetime), which is always <class 'type'>.
    assert isinstance(date_time, datetime.datetime), \
        f"expected [{COLUMN_START_TIME}] as datetime.datetime, got [{type(date_time)}]"

    return date_time.weekday()  # 0: Mon, 6: Sun

In [35]:
# Weekday (0: Mon .. 6: Sun) of each record's start, labelled with the output column name.
weekday: pd.Series = start_date_time.to_frame().apply(func=get_weekday, axis=1)
weekday.name = COLUMN_WEEKDAY

## Weekday as cyclic (sin, cos) features

In [102]:
def get_weekday_cyclic(row: pd.Series) -> pd.Series:
    """Day of the week of the start time as a cyclic pair (sin_as_x, cos_as_y).

    Args:
        row: a row providing COLUMN_START_TIME as a datetime.

    Returns:
        A two-element Series indexed by COLUMN_WEEKDAY_SINX and
        COLUMN_WEEKDAY_COSY, holding the pair returned by
        get_cyclic_day_of_week() for the weekday (0: Mon, 6: Sun).

    Raises:
        AssertionError: if the start is not a datetime.datetime.
    """
    date_time: datetime.datetime = row[COLUMN_START_TIME]
    # Report the type actually received; the message used to print
    # type(datetime.datetime), which is always <class 'type'>.
    assert isinstance(date_time, datetime.datetime), \
        f"expected [{COLUMN_START_TIME}] as datetime.datetime, got [{type(date_time)}]"

    x: TYPE_FLOAT
    y: TYPE_FLOAT
    x, y = get_cyclic_day_of_week(date_time.weekday())
    return pd.Series([x, y], index=[COLUMN_WEEKDAY_SINX, COLUMN_WEEKDAY_COSY])

In [103]:
# apply() yields a two-column frame here because get_weekday_cyclic returns a
# Series per row; items() iterates its (label, column) pairs, which are unpacked
# into the sin and cos Series in column order. Each Series keeps its column
# label as its name.
weekday_sin_x: pd.Series
weekday_cos_y: pd.Series
    
weekday_sin_x, weekday_cos_y = ( # Return views, hence modifying them will cause warnings.
    value 
    for _, value 
    in start_date_time.to_frame().apply(func=get_weekday_cyclic, axis=1).items()
)

## Holiday

In [36]:
def get_holiday(row: pd.Series):
    """Flag if the start date/time is on a holiday (1: holiday, 0: not holiday).

    A weekend counts as a holiday as well.

    Args:
        row: a row providing COLUMN_START_TIME as a datetime.

    Returns:
        1 when is_holiday() (for COUNTRY_CODE) or is_weekend() is true for the
        start, otherwise 0.

    Raises:
        AssertionError: if the start is not a datetime.datetime.
    """
    date_time: datetime.datetime = row[COLUMN_START_TIME]
    # Report the type actually received; the message used to print
    # type(datetime.datetime), which is always <class 'type'>.
    assert isinstance(date_time, datetime.datetime), \
        f"expected [{COLUMN_START_TIME}] as datetime.datetime, got [{type(date_time)}]"

    return int(is_holiday(target=date_time, country=COUNTRY_CODE) or is_weekend(target=date_time))

In [37]:
# Holiday/weekend flag (1 or 0) per record, labelled with the output column name.
holiday: pd.Series = start_date_time.to_frame().apply(func=get_holiday, axis=1)
holiday.name = COLUMN_IS_HOLIDAY

## Start Hour

In [38]:
def get_start_hour(row: pd.Series):
    """Hour of the start time.

    Args:
        row: a row providing COLUMN_START_TIME as a datetime.

    Returns:
        The hour component of the start as an int.

    Raises:
        AssertionError: if the start is not a datetime.datetime.
    """
    date_time: datetime.datetime = row[COLUMN_START_TIME]
    # Report the type actually received; the message used to print
    # type(datetime.datetime), which is always <class 'type'>.
    assert isinstance(date_time, datetime.datetime), \
        f"expected [{COLUMN_START_TIME}] as datetime.datetime, got [{type(date_time)}]"

    return int(date_time.hour)

In [39]:
# Hour of each record's start, labelled with the output column name.
start_hour: pd.Series = start_date_time.to_frame().apply(func=get_start_hour, axis=1)
start_hour.name = COLUMN_START_HOUR

## Start time as cyclic (sin, cos) features

In [97]:
def get_start_time_cyclic(row: pd.Series) -> pd.Series:
    """Time of day of the start time as a cyclic pair (sin_as_x, cos_as_y).

    Args:
        row: a row providing COLUMN_START_TIME as a datetime.

    Returns:
        A two-element Series indexed by COLUMN_START_TIME_SINX and
        COLUMN_START_TIME_COSY, holding the pair returned by
        get_cyclic_time_of_day() for the start's hour, minute and second.

    Raises:
        AssertionError: if the start is not a datetime.datetime.
    """
    date_time: datetime.datetime = row[COLUMN_START_TIME]
    # Report the type actually received; the message used to print
    # type(datetime.datetime), which is always <class 'type'>.
    assert isinstance(date_time, datetime.datetime), \
        f"expected [{COLUMN_START_TIME}] as datetime.datetime, got [{type(date_time)}]"

    x: TYPE_FLOAT
    y: TYPE_FLOAT
    x, y = get_cyclic_time_of_day(
        hours=date_time.hour,
        minutes=date_time.minute,
        seconds=date_time.second
    )
    # Label the pair with the start-time column names; the weekday column names
    # used before were a copy-paste slip from get_weekday_cyclic.
    return pd.Series([x, y], index=[COLUMN_START_TIME_SINX, COLUMN_START_TIME_COSY])

In [98]:
# https://stackoverflow.com/questions/23586510
# https://stackoverflow.com/questions/51225275
# apply() yields a two-column frame here because get_start_time_cyclic returns a
# Series per row; items() iterates its (label, column) pairs, which are unpacked
# into the sin and cos Series in column order. The labels are discarded here.
start_time_sin_x: pd.Series
start_time_cos_y: pd.Series
    
start_time_sin_x, start_time_cos_y = ( # Return views, hence modifying them will cause warnings.
    value 
    for _, value 
    in start_date_time.to_frame().apply(func=get_start_time_cyclic, axis=1).items()
)

In [100]:
# Label the two cyclic start-time Series with their output column names.
start_time_sin_x.name = COLUMN_START_TIME_SINX
start_time_cos_y.name = COLUMN_START_TIME_COSY
if DEBUG:
    # A bare expression inside an `if` block is not rendered by the notebook,
    # so the Series has to be displayed explicitly.
    display(start_time_sin_x)

---
# Result


In [105]:
# Assemble the final feature table from the Series derived above. Volumes,
# throughput and recovery rate are cast with TYPE_FLOAT; the holiday flag is
# stored as uint8.
df: pd.DataFrame = pd.DataFrame({
    COLUMN_FACILITY_CODE: facility_code,
    COLUMN_SUPPLIER_CODE: supplier_code,
    COLUMN_START_TIME: start_date_time,
    COLUMN_START_HOUR: start_hour,
    COLUMN_START_TIME_SINX: start_time_sin_x,
    COLUMN_START_TIME_COSY: start_time_cos_y,
    COLUMN_WEEKDAY: weekday,
    COLUMN_WEEKDAY_SINX: weekday_sin_x,
    COLUMN_WEEKDAY_COSY: weekday_cos_y,
    COLUMN_IS_HOLIDAY: holiday.astype(np.uint8),
    COLUMN_PROCESS_TIME: process_time,
    COLUMN_INPUT: raw_df['suppliedM3'].astype(TYPE_FLOAT),
    COLUMN_OUTPUT: raw_df['recoveredM3'].astype(TYPE_FLOAT),
    COLUMN_THROUGHPUT: throughput.astype(TYPE_FLOAT),
    COLUMN_RECOVERY_RATE: recovery_rate.astype(TYPE_FLOAT)
})
df

Unnamed: 0,facility_code,supplier_code,start_date_time,start_hour,start_time_sin_x,start_time_cos_y,dayofweek,dayofweek_sin_x,dayofweek_cos_y,is_holiday,process_time,input,output,throughput,recovery_rate
0,0,4,2022-08-01 08:29:00,8,0.796002,-0.605294,0,0.000000,1.000000,0,2280.0,2.00,1.55,0.040789,0.775000
1,0,5,2022-08-01 09:27:00,9,0.619094,-0.785317,0,0.000000,1.000000,0,7260.0,6.80,4.15,0.034298,0.610294
2,0,4,2022-08-01 11:38:00,11,0.095846,-0.995396,0,0.000000,1.000000,0,2580.0,1.95,1.55,0.036047,0.794872
3,0,6,2022-08-01 12:40:00,12,-0.173648,-0.984808,0,0.000000,1.000000,0,5040.0,3.95,2.55,0.030357,0.645570
4,0,5,2022-08-01 14:25:00,14,-0.591309,-0.806445,0,0.000000,1.000000,0,7440.0,5.30,3.10,0.025000,0.584906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,0,6,2022-09-30 11:40:00,11,0.087156,-0.996195,4,-0.433884,-0.900969,0,3660.0,3.70,2.35,0.038525,0.635135
806,0,5,2022-09-30 12:52:00,12,-0.224951,-0.974370,4,-0.433884,-0.900969,0,6240.0,6.35,4.55,0.043750,0.716535
807,1,0,2022-09-30 13:48:00,13,-0.453991,-0.891007,4,-0.433884,-0.900969,0,220.0,4.53,2.73,0.744545,0.602649
808,0,4,2022-09-30 15:02:00,15,-0.713251,-0.700909,4,-0.433884,-0.900969,0,2400.0,2.00,1.45,0.036250,0.725000


In [41]:
# Column dtypes and non-null counts of the feature table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   facility_code    810 non-null    int64         
 1   supplier_code    810 non-null    int64         
 2   start_date_time  810 non-null    datetime64[ns]
 3   start_hour       810 non-null    int64         
 4   day_of_week      810 non-null    int64         
 5   is_holiday       810 non-null    uint8         
 6   process_time     810 non-null    float32       
 7   input            810 non-null    float32       
 8   output           810 non-null    float32       
 9   throughput       810 non-null    float32       
 10  recovery_rate    810 non-null    float32       
dtypes: datetime64[ns](1), float32(5), int64(4), uint8(1)
memory usage: 48.4 KB


In [42]:
# Summary statistics of the numeric feature columns.
df.describe()

Unnamed: 0,facility_code,supplier_code,start_hour,day_of_week,is_holiday,process_time,input,output,throughput,recovery_rate
count,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0,810.0
mean,0.406173,3.576543,11.332099,2.767901,0.224691,3913.073975,4.109407,2.805049,0.801889,0.690545
std,0.491421,1.991873,2.579194,1.914219,0.417637,4159.885742,1.387956,0.944784,1.081123,0.0856
min,0.0,0.0,8.0,0.0,0.0,35.0,1.9,1.2,0.009524,0.415584
25%,0.0,2.0,9.0,1.0,0.0,115.0,3.0125,2.11,0.025297,0.635354
50%,0.0,4.0,11.0,3.0,0.0,2760.0,4.15,2.835,0.038197,0.698286
75%,1.0,5.0,14.0,4.0,0.0,6465.0,5.05,3.52,1.526667,0.75
max,1.0,6.0,16.0,6.0,1.0,17820.0,7.0,5.5,4.422857,0.873418


In [None]:
# Release the raw frame. Cells that read raw_df cannot be re-run after this
# without loading the data again.
del raw_df