---
# Setup

In [1]:
import os
import sys
import logging
import datetime
import calendar
from typing import (
    List,
    Dict,
    Tuple,
    Any,
    Optional,
    Callable,
    Iterable,
)

import dateutil
import pandas as pd
import numpy as np

## Logging

In [2]:
# Route log records to stdout and let only ERROR (and above) through by default.
logging.basicConfig(level=logging.ERROR, stream=sys.stdout)
# Named logger used by the helper functions defined in this notebook.
logger = logging.getLogger(name="analysis")

## PYTHONPATH

In [3]:
# Add the shared lib directory (three levels above the working directory) to the
# module search path. Presumably this is where the local modules imported in the
# next cell live -- TODO confirm.
# The membership guard keeps the cell idempotent: re-running it no longer appends
# a duplicate entry to sys.path.
LIB_DIR: str = f"{os.getcwd()}/../../../lib"
if LIB_DIR not in sys.path:
    sys.path.append(LIB_DIR)

In [43]:
%load_ext autoreload
%autoreload 2
from constant import (
    TYPE_FLOAT,
)
from util_datetime import (
    get_datetime_components,
    convert_date_into_datetime,
    convert_time_into_timedelta,
    convert_date_time_into_datetime,
    parse_date_string,
    parse_time_string,
    get_dates_from_string,
    has_date_in_string,
    parse_datetime_string,
    get_epoch_from_datetime,
    get_epoch_from_string,
    get_seconds_between_datetimes,
    get_datetime_after_duration,
    get_elapsed_time,
    get_holidays,
    get_cyclic_time_of_day,
    get_cyclic_day_of_week,
    get_cyclic_month_of_year,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pandas

In [5]:
# Never truncate columns when a DataFrame is displayed.
# (The row limit is left at the pandas default; lift it with
#  pd.set_option('display.max_rows', None) if every row must be shown.)
pd.options.display.max_columns = None

## Constant

In [60]:
# Names of the columns produced by this notebook.

# Identification
COLUMN_SUPPLIER_CODE: str = "supplier_code"
COLUMN_FACILITY: str = "facility"

# Timing
COLUMN_START_TIME: str = "start_date_time"
COLUMN_END_TIME: str = "end_date_time"
COLUMN_PROCESS_DATE: str = "process_date"
COLUMN_PROCESS_TIME: str = "process_time"

# Volumes
COLUMN_INPUT: str = "input"
COLUMN_OUTPUT: str = "output"

---
# Data

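Read the raw JSON with automatic date conversion disabled (`convert_dates=False`), so the date/time columns stay as strings. Numeric columns are still type-inferred (see the `float64` columns in `raw_df.info()` below).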

In [7]:
# Load the September records. convert_dates=False stops pandas from converting
# date-like columns, so they stay as the raw strings found in the file.
raw_df = pd.read_json(path_or_buf="../data/september.json", convert_dates=False)
raw_df

Unnamed: 0,facility,timeStart,processTime,supplier,suppliedM3,recoveredM3,date,timeEnd,supplierCode
0,Bundaberg,9/1/22 8:16 AM,4:05,Mary,5.09,4.13,,,
1,Newcastle,8:29:00 AM,,,2.00,1.55,"Sep 1, 2022",9:07:00 AM,har
2,Newcastle,9:27:00 AM,,,6.80,4.15,"Sep 1, 2022",11:28:00 AM,dic
3,Newcastle,11:38:00 AM,,,1.95,1.55,"Sep 1, 2022",12:21:00 PM,har
4,Bundaberg,9/1/22 12:34 PM,1:50,Mary Therese,3.78,2.56,,,
...,...,...,...,...,...,...,...,...,...
227,Newcastle,11:40:00 AM,,,3.70,2.35,"Sep 30, 2022",12:41:00 PM,tom
228,Newcastle,12:52:00 PM,,,6.35,4.55,"Sep 30, 2022",2:36:00 PM,dic
229,Bundaberg,9/30/22 1:48 PM,3:40,Mary Therese,4.53,2.73,,,
230,Newcastle,3:02:00 PM,,,2.00,1.45,"Sep 30, 2022",3:42:00 PM,har


In [8]:
# Print the per-column dtypes and non-null counts of the raw frame.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   facility      232 non-null    object 
 1   timeStart     232 non-null    object 
 2   processTime   111 non-null    object 
 3   supplier      111 non-null    object 
 4   suppliedM3    232 non-null    float64
 5   recoveredM3   232 non-null    float64
 6   date          121 non-null    object 
 7   timeEnd       121 non-null    object 
 8   supplierCode  121 non-null    object 
dtypes: float64(2), object(7)
memory usage: 16.4+ KB


---
# Feature extractions


## Start Time

In [9]:
def get_start_date_time(row: pd.Series) -> pd.Series:
    """Derive the start date-time of one raw record.

    When 'timeStart' already carries a date it is parsed on its own. Otherwise
    the date part of the 'date' field is combined with the time part of
    'timeStart'.

    Args:
        row: one record exposing the 'timeStart' and 'date' fields.

    Returns:
        The parsed/combined start date-time, or np.nan to mark the row as
        invalid when 'timeStart' is null, or when 'timeStart' carries no date
        and 'date' is null.
        NOTE(review): the annotation says pd.Series, but one scalar is returned
        per row.

    Raises:
        AssertionError: if parse_datetime_string does not yield a
            datetime.datetime for 'date' or 'timeStart'.
    """
    name: str = "get_start_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    # Without timeStart there is no way to recover the start time: invalid row.
    raw_start = row['timeStart']
    if pd.isnull(raw_start):
        return np.nan

    # Case 1: timeStart is a full date-time expression.
    if has_date_in_string(raw_start):
        full_start = parse_datetime_string(raw_start)
        logger.debug("%s: start_date_time is [%s]", name, full_start)
        return full_start

    # Case 2: timeStart is a time only, so the 'date' column must supply the day.
    raw_date = row['date']
    if pd.isnull(raw_date):
        # Neither source gives a date: invalid row.
        return np.nan

    # Keep only the date part of the 'date' column.
    parsed_date: datetime.datetime = parse_datetime_string(raw_date)
    assert isinstance(parsed_date, datetime.datetime)
    date_part: datetime.date = parsed_date.date()
    logger.debug("%s: date is [%s]", name, date_part)

    # Keep only the time part of the 'timeStart' column.
    parsed_time: datetime.datetime = parse_datetime_string(raw_start)
    assert isinstance(parsed_time, datetime.datetime)
    time_part: datetime.time = parsed_time.time()
    logger.debug("%s: start_time is [%s]", name, time_part)

    full_start = convert_date_time_into_datetime(date_part, time_part)
    logger.debug("%s: start_date_time is [%s]", name, full_start)
    return full_start


In [11]:
# Row-wise extraction of the start date-time, labelled with its target column name.
start_date_time: pd.Series = (
    raw_df
    .apply(get_start_date_time, axis=1)
    .rename(COLUMN_START_TIME)
)

## End Time

In [49]:
# Inputs needed to derive each record's end time: the already-extracted start
# date-time plus the two raw columns that can describe the end ('timeEnd') or
# the duration ('processTime').
# The start column is keyed by COLUMN_START_TIME (not a repeated literal) because
# get_end_date_time() looks the value up through that constant.
interim_end_time_df: pd.DataFrame = pd.DataFrame({
    COLUMN_START_TIME: start_date_time,
    'processTime': raw_df['processTime'],
    'timeEnd': raw_df['timeEnd']
})
# interim_end_time_df

In [50]:
def get_end_date_time(row: pd.Series) -> pd.Series:
    """Derive the end date-time of one record.

    Sources, in order of preference:
      1. 'timeEnd' when present: parsed as a full date-time if it carries a
         date; otherwise its time part is combined with the date of the start
         date-time, and moved forward by one day if the result is not after
         the start (processing that crosses midnight).
      2. 'processTime' otherwise: parsed as a time expression, converted to a
         timedelta and added to the start date-time.

    Args:
        row: one record exposing COLUMN_START_TIME, 'timeEnd' and 'processTime'.

    Returns:
        The end date-time, or np.nan when both 'timeEnd' and 'processTime' are
        null, or when 'processTime' cannot be parsed (ValueError).
        NOTE(review): the annotation says pd.Series, but one scalar is returned
        per row.

    Raises:
        AssertionError: if the start date-time is not a datetime.datetime, if
            parse_datetime_string does not yield a datetime.datetime for
            'timeEnd', or if 'processTime' is used but is not a str.
    """
    name: str = "get_end_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    assert isinstance(row[COLUMN_START_TIME], datetime.datetime), \
        f"expected start_date_time as datetime.datetime, got [{type(row[COLUMN_START_TIME])}]."
    start_date_time: datetime.datetime = row[COLUMN_START_TIME]

    # --------------------------------------------------------------------------------
    # if timeEnd and processTime columns are both missing, the row is invalid
    # --------------------------------------------------------------------------------
    if pd.isnull(row['timeEnd']) and pd.isnull(row['processTime']):
        return np.nan

    # Begin end_date_time extraction
    end_date_time: datetime.datetime = np.nan

    if not pd.isnull(row['timeEnd']):
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, timeEnd)
        # --------------------------------------------------------------------------------
        if has_date_in_string(row['timeEnd']):
            # --------------------------------------------------------------------------------
            # timeEnd already includes date.
            # --------------------------------------------------------------------------------
            end_date_time = parse_datetime_string(row['timeEnd'])
            logger.debug("%s: end_date_time is [%s]", name, end_date_time)

        else:
            # --------------------------------------------------------------------------------
            # timeEnd has no date, hence it is only a time expression, and the date
            # is taken from start_date_time.
            # --------------------------------------------------------------------------------
            _dummy_end_date_time: datetime.datetime = parse_datetime_string(row['timeEnd'])
            assert isinstance(_dummy_end_date_time, datetime.datetime)

            # take only time part
            _time: datetime.time = _dummy_end_date_time.time()
            logger.debug("%s: timeEnd is [%s]", name, _time)

            # start_date_time was already validated as datetime.datetime above.
            _date: datetime.date = start_date_time.date()

            end_date_time = convert_date_time_into_datetime(
                date_in_year=_date,
                time_in_day=_time
            )

            # Make sure timeEnd is after start_time in case the processing is crossing midnight.
            if end_date_time <= start_date_time:
                # The format string needs one lowercase %s per argument, with `name` first.
                logger.warning(
                    "%s: end_date_time [%s] is before start_date_time [%s]",
                    name, end_date_time, start_date_time
                )
                # Advance the end_date_time with 24h.
                end_date_time = end_date_time + datetime.timedelta(days=1)
                logger.debug("%s: end_date_time is [%s]", name, end_date_time)

    else:
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, processTime)
        # [Assumption] processTime is a clock-style duration string (e.g. "4:05").
        # How its fields are interpreted is decided by parse_time_string -- TODO confirm
        # the units against util_datetime.
        # --------------------------------------------------------------------------------
        try:
            assert isinstance(row['processTime'], str), \
                f"expected row['processTime'] of type str, got [{type(row['processTime'])}]"

            _time: datetime.time = parse_time_string(row['processTime'])
            delta: datetime.timedelta = convert_time_into_timedelta(time_in_day=_time)
            end_date_time = start_date_time + delta

        except ValueError:
            # Unparseable duration: report through the notebook logger and mark the row invalid.
            logger.error("%s: invalid time expression [%s]", name, row['processTime'])
            end_date_time = np.nan

    # End end_date_time extraction

    return end_date_time


In [51]:
# Row-wise extraction of the end date-time, labelled with its target column name.
end_date_time: pd.Series = (
    interim_end_time_df
    .apply(get_end_date_time, axis=1)
    .rename(COLUMN_END_TIME)
)

# Append the result as the last column of the interim frame (handy for inspection) ...
interim_end_time_df.insert(
    len(interim_end_time_df.columns),
    COLUMN_END_TIME,
    end_date_time,
    allow_duplicates=False,
)
# interim_end_time_df

# ... then drop the interim frame; only end_date_time is used from here on.
del interim_end_time_df

## Process Time

In [52]:
# Start and end date-times side by side: the inputs for the process-time computation.
interim_process_time_df: pd.DataFrame = pd.concat(
    [start_date_time, end_date_time],
    axis=1,
    keys=[COLUMN_START_TIME, COLUMN_END_TIME],
)
# interim_process_time_df

In [53]:
def get_process_time(row: pd.Series) -> pd.Series:
    """Compute the processing duration of one record in seconds.

    The duration is the COLUMN_END_TIME value minus the COLUMN_START_TIME value.

    Args:
        row: one record exposing COLUMN_START_TIME and COLUMN_END_TIME.

    Return: elapsed seconds converted with TYPE_FLOAT.
        NOTE(review): the annotation says pd.Series, but one scalar is returned
        per row.

    Raises:
        AssertionError: if either value is not a datetime.datetime, or their
            difference is not a datetime.timedelta.
    """
    started = row[COLUMN_START_TIME]
    assert isinstance(started, datetime.datetime)

    ended = row[COLUMN_END_TIME]
    assert isinstance(ended, datetime.datetime)

    elapsed: datetime.timedelta = ended - started
    assert isinstance(elapsed, datetime.timedelta)

    seconds = elapsed.total_seconds()
    return TYPE_FLOAT(seconds)

In [54]:
# Row-wise process time in seconds, labelled with its target column name.
process_time: pd.Series = (
    interim_process_time_df
    .apply(get_process_time, axis=1)
    .rename(COLUMN_PROCESS_TIME)
)

In [55]:
# Append the process time as the last column of the interim frame (handy for inspection) ...
interim_process_time_df.insert(
    len(interim_process_time_df.columns),
    COLUMN_PROCESS_TIME,
    process_time,
    allow_duplicates=False,
)
# interim_process_time_df

# ... then drop the interim frame; only process_time is used from here on.
del interim_process_time_df

## Process Date

In [56]:
def get_process_date(row: pd.Series):
    """Derive the process date of one record from its start date-time.

    Args:
        row: one record exposing the 'start_date_time' field.

    Returns:
        np.nan when 'start_date_time' is null; otherwise the result of
        convert_date_into_datetime applied to the date part of the start
        date-time.
    """
    started = row['start_date_time']
    if pd.isnull(started):
        return np.nan

    return convert_date_into_datetime(started.date())

In [57]:
# Row-wise process date derived from the start date-time, labelled with its column name.
process_date: pd.Series = (
    start_date_time
    .to_frame()
    .apply(get_process_date, axis=1)
    .rename(COLUMN_PROCESS_DATE)
)

## Supplier Code

In [58]:
def get_code_from_supplier(supplier: str):
    """Translate a supplier name into its supplier code, ignoring letter case.

    Args:
        supplier: supplier name.

    Returns:
        "mar" for "mary" or "mary therese" (in any casing), np.nan for every
        other name.
    """
    known_as_mar = ("mary therese", "mary")
    if supplier.lower() in known_as_mar:
        return "mar"
    return np.nan
    
    
def get_supplier_code(row: pd.Series):
    """Resolve the supplier code of one record.

    Args:
        row: one record exposing the 'supplier' and 'supplierCode' fields.

    Returns:
        The code mapped from the supplier name via get_code_from_supplier when
        'supplier' is present; otherwise the record's own 'supplierCode' value.
    """
    supplier = row['supplier']

    # pd.isnull recognises every missing-value marker (None, any NaN object, NaT,
    # pd.NA). A membership test against (np.nan, None) only matches NaN by object
    # identity, so a different NaN object would be passed on as if it were a name.
    if pd.isnull(supplier):
        return row['supplierCode']

    return get_code_from_supplier(supplier)

In [59]:
# Row-wise supplier code, labelled with its target column name.
supplier_code: pd.Series = (
    raw_df
    .apply(get_supplier_code, axis=1)
    .rename(COLUMN_SUPPLIER_CODE)
)

---
# Result


In [65]:
# Assemble the result frame: raw columns that pass through unchanged plus the
# extracted features, each placed under its output column name.
df: pd.DataFrame = pd.concat(
    [
        raw_df['facility'],
        supplier_code,
        start_date_time,
        process_time,
        raw_df['suppliedM3'],
        raw_df['recoveredM3'],
    ],
    axis=1,
    keys=[
        COLUMN_FACILITY,
        COLUMN_SUPPLIER_CODE,
        COLUMN_START_TIME,
        COLUMN_PROCESS_TIME,
        COLUMN_INPUT,
        COLUMN_OUTPUT,
    ],
)
df

Unnamed: 0,facility,supplier_code,start_date_time,process_time,input,output
0,Bundaberg,mar,2022-09-01 08:16:00,245.0,5.09,4.13
1,Newcastle,har,2022-09-01 08:29:00,2280.0,2.00,1.55
2,Newcastle,dic,2022-09-01 09:27:00,7260.0,6.80,4.15
3,Newcastle,har,2022-09-01 11:38:00,2580.0,1.95,1.55
4,Bundaberg,mar,2022-09-01 12:34:00,110.0,3.78,2.56
...,...,...,...,...,...,...
227,Newcastle,tom,2022-09-30 11:40:00,3660.0,3.70,2.35
228,Newcastle,dic,2022-09-30 12:52:00,6240.0,6.35,4.55
229,Bundaberg,mar,2022-09-30 13:48:00,220.0,4.53,2.73
230,Newcastle,har,2022-09-30 15:02:00,2400.0,2.00,1.45
