---
# Setup

In [1]:
import os
import sys
import logging
import datetime
import calendar
from typing import (
    List,
    Dict,
    Tuple,
    Any,
    Optional,
    Callable,
    Iterable,
)

import dateutil
import pandas as pd
import numpy as np

from IPython.display import display, HTML

## Logging

In [2]:
# Logger shared by the helper functions defined in this notebook.
logger = logging.getLogger("analysis")

# Root logging configuration: write to stdout so messages show up in the cell
# output, and only emit records at ERROR level or above.
logging.basicConfig(level=logging.ERROR, stream=sys.stdout)

## PYTHONPATH

In [3]:
# Make the project-local modules (constant, util_datetime) importable.
# Guarded so that re-running this cell does not keep appending the same entry.
_lib_path: str = f"{os.getcwd()}/../../../lib"
if _lib_path not in sys.path:
    sys.path.append(_lib_path)

In [4]:
%load_ext autoreload
%autoreload 2
from constant import (
    TYPE_FLOAT,
)
from util_datetime import (
    get_datetime_components,
    convert_date_into_datetime,
    convert_time_into_timedelta,
    convert_date_time_into_datetime,
    parse_date_string,
    parse_time_string,
    get_dates_from_string,
    has_date_in_string,
    parse_datetime_string,
    get_epoch_from_datetime,
    get_epoch_from_string,
    get_seconds_between_datetimes,
    get_datetime_after_duration,
    get_elapsed_time,
    get_holidays,
    get_cyclic_time_of_day,
    get_cyclic_day_of_week,
    get_cyclic_month_of_year,
)

## Pandas

In [5]:
# Show every column when a DataFrame is rendered (no horizontal truncation).
# Row truncation is left at the pandas default; uncomment to lift it as well:
# pd.options.display.max_rows = None
pd.options.display.max_columns = None

## Constant

In [34]:
# Names used for the derived Series and for the columns of the interim/result frames.
COLUMN_SUPPLIER_CODE: str = 'supplier_code'
COLUMN_FACILITY: str = 'facility'
COLUMN_START_TIME: str = "start_date_time"
COLUMN_END_TIME: str = "end_date_time"
COLUMN_PROCESS_DATE: str = 'process_date'
COLUMN_PROCESS_TIME: str = "process_time"
COLUMN_INPUT: str = "input"
COLUMN_OUTPUT: str = "output"

# JSON file loaded into raw_df with pd.read_json().
# NOTE(review): most rendered outputs below show October 2022 records while this
# path points at november.json, and this cell's execution count is out of order.
# The outputs look stale - confirm with Restart Kernel -> Run All.
PATH_TO_DATA: str = "../data/november.json"

# When True, intermediate Series/DataFrames are displayed and kept;
# when False, the interim frames are deleted after they have been used.
DEBUG: bool = True

---
# Data



In [35]:
# Load the raw records. convert_dates=False stops pandas from converting
# date-like columns, so date/time values stay as the original strings for the
# parsing done in the feature extraction section. Numeric columns are still
# read as numbers.
raw_df = pd.read_json(PATH_TO_DATA, convert_dates=False)
raw_df

Unnamed: 0,facility,date,timeStart,timeEnd,supplierCode,suppliedM3,recoveredM3,processTime,supplier
0,Newcastle,"Nov 1, 2022",8:22:00 AM,10:39:00 AM,tom,4.25,2.50,,
1,Bundaberg,,11/1/22 8:26 AM,,,3.83,2.86,1:25,Mary Therese
2,Bundaberg,,11/1/22 10:08 AM,,,5.04,3.87,2:05,Mary Jane
3,Newcastle,"Nov 1, 2022",11:03:00 AM,12:07:00 PM,har,2.00,1.45,,
4,Newcastle,"Nov 1, 2022",12:37:00 PM,1:20:00 PM,har,2.00,1.50,,
...,...,...,...,...,...,...,...,...,...
223,Bundaberg,,11/30/22 11:27 AM,,,2.93,2.32,0:40,Mary Anne
224,Bundaberg,,11/30/22 12:25 PM,,,4.94,3.75,1:45,Mary
225,Newcastle,"Nov 30, 2022",1:30:00 PM,3:33:00 PM,tom,4.05,2.80,,
226,Bundaberg,,11/30/22 2:28 PM,,,4.59,3.13,3:00,Mary Therese


### Clone supplierCode into supplier when supplier column does not exist

In [8]:
# Inputs without a 'supplier' column get one cloned from 'supplierCode', so that
# get_supplier_code() can read row['supplier'] without a KeyError.
if 'supplier' not in raw_df.columns:
    raw_df['supplier'] = raw_df['supplierCode']

### Add a NaN `processTime` column if it does not exist

In [9]:
# Inputs without a 'processTime' column get an all-NaN one, so that
# get_end_date_time() can read row['processTime'] without a KeyError.
# The column is created directly from NaN; it does not need 'supplierCode'
# to exist.
if 'processTime' not in raw_df.columns:
    raw_df['processTime'] = np.nan

In [10]:
# Summarize the raw frame: columns, non-null counts and dtypes.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   facility      227 non-null    object 
 1   timeStart     227 non-null    object 
 2   processTime   105 non-null    object 
 3   supplier      105 non-null    object 
 4   suppliedM3    227 non-null    float64
 5   recoveredM3   227 non-null    float64
 6   date          122 non-null    object 
 7   timeEnd       122 non-null    object 
 8   supplierCode  122 non-null    object 
dtypes: float64(2), object(7)
memory usage: 16.1+ KB


In [11]:
# Distinct facilities present in the raw data.
raw_df['facility'].unique()

array(['Bundaberg', 'Newcastle'], dtype=object)

In [12]:
# Distinct supplier codes present in the raw data (NaN where a row has none).
raw_df['supplierCode'].unique()

array([nan, 'har', 'tom', 'dic'], dtype=object)

In [13]:
# Distinct supplier names present in the raw data.
# NOTE(review): the column guard looks redundant once the earlier
# "Clone supplierCode into supplier" cell has run, because that cell creates
# the column when it is missing.
raw_df['supplier'].unique() if 'supplier' in raw_df.columns else None

array(['Mary Anne', nan, 'Mary', 'Mary Jane', 'Mary Therese'],
      dtype=object)

In [14]:
# Render the raw frame only in debug mode.
if DEBUG:
    display(raw_df)

Unnamed: 0,facility,timeStart,processTime,supplier,suppliedM3,recoveredM3,date,timeEnd,supplierCode
0,Bundaberg,10/1/22 8:33 AM,2:05,Mary Anne,2.82,2.24,,,
1,Newcastle,8:49:00 AM,,,2.00,1.40,"Oct 1, 2022",9:37:00 AM,har
2,Newcastle,10:37:00 AM,,,1.90,1.40,"Oct 1, 2022",11:24:00 AM,har
3,Bundaberg,10/1/22 11:35 AM,4:05,Mary,5.00,3.15,,,
4,Newcastle,12:03:00 PM,,,4.00,2.25,"Oct 1, 2022",2:20:00 PM,tom
...,...,...,...,...,...,...,...,...,...
222,Newcastle,10:40:00 AM,,,5.80,4.05,"Oct 31, 2022",1:23:00 PM,dic
223,Bundaberg,10/31/22 11:14 AM,4:55,Mary,5.00,3.29,,,
224,Newcastle,1:35:00 PM,,,3.80,2.70,"Oct 31, 2022",2:41:00 PM,tom
225,Newcastle,2:55:00 PM,,,2.05,1.40,"Oct 31, 2022",3:40:00 PM,har


---
# Feature extractions


## Start Time

In [15]:
def get_start_date_time(row: pd.Series) -> pd.Series:
    """Derive the start date/time of one raw record.

    Two layouts are handled:
    * 'timeStart' already holds a date and a time -> it is parsed as is.
    * 'timeStart' holds a time only -> the date is taken from the 'date' column
      and combined with that time.

    Args:
        row: one raw record providing 'timeStart' and, for time-only
            'timeStart' values, 'date'.

    Returns:
        The start date/time of the record, or NaN to mark the row as invalid
        when 'timeStart' is missing, or when 'timeStart' has no date and the
        'date' column is missing too. The value is a scalar per row;
        DataFrame.apply(axis=1) assembles the scalars into a Series.
    """
    name: str = "get_start_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    # Without timeStart there is nothing to recover the start time from.
    if pd.isnull(row['timeStart']):
        return np.nan

    # timeStart carries its own date: parse date and time in one go.
    if has_date_in_string(row['timeStart']):
        combined = parse_datetime_string(row['timeStart'])
        logger.debug("%s: start_date_time is [%s]", name, combined)
        return combined

    # timeStart is a time only, so the date has to come from the 'date' column.
    # A missing date makes the row invalid.
    if pd.isnull(row['date']):
        return np.nan

    # Date part: parse the 'date' column and drop its time component.
    parsed_date = parse_datetime_string(row['date'])
    assert isinstance(parsed_date, datetime.datetime)
    day: datetime.date = parsed_date.date()
    logger.debug("%s: date is [%s]", name, day)

    # Time part: parse 'timeStart' and keep only its time component.
    parsed_time = parse_datetime_string(row['timeStart'])
    assert isinstance(parsed_time, datetime.datetime)
    clock: datetime.time = parsed_time.time()
    logger.debug("%s: start_time is [%s]", name, clock)

    combined = convert_date_time_into_datetime(day, clock)
    logger.debug("%s: start_date_time is [%s]", name, combined)
    return combined


In [16]:
# Row-wise extraction of the start date/time; the Series is named after its
# target column in the result frame.
start_date_time: pd.Series = raw_df.apply(get_start_date_time, axis=1)
start_date_time.name = COLUMN_START_TIME

if DEBUG:
    display(start_date_time)

0     2022-10-01 08:33:00
1     2022-10-01 08:49:00
2     2022-10-01 10:37:00
3     2022-10-01 11:35:00
4     2022-10-01 12:03:00
              ...        
222   2022-10-31 10:40:00
223   2022-10-31 11:14:00
224   2022-10-31 13:35:00
225   2022-10-31 14:55:00
226   2022-10-31 15:53:00
Name: start_date_time, Length: 227, dtype: datetime64[ns]

## End Time

In [17]:
# Inputs needed to derive the end date/time of each record: the derived start
# date/time plus the raw 'processTime' and 'timeEnd' columns.
# The start column is keyed by COLUMN_START_TIME, the same constant
# get_end_date_time() uses to read it.
interim_end_time_df: pd.DataFrame = pd.DataFrame({
    COLUMN_START_TIME: start_date_time,
    'processTime': raw_df['processTime'],
    'timeEnd': raw_df['timeEnd'],
})

if DEBUG:
    display(interim_end_time_df)

Unnamed: 0,start_date_time,processTime,timeEnd
0,2022-10-01 08:33:00,2:05,
1,2022-10-01 08:49:00,,9:37:00 AM
2,2022-10-01 10:37:00,,11:24:00 AM
3,2022-10-01 11:35:00,4:05,
4,2022-10-01 12:03:00,,2:20:00 PM
...,...,...,...
222,2022-10-31 10:40:00,,1:23:00 PM
223,2022-10-31 11:14:00,4:55,
224,2022-10-31 13:35:00,,2:41:00 PM
225,2022-10-31 14:55:00,,3:40:00 PM


In [18]:
def get_end_date_time(row: pd.Series) -> Any:
    """Derive the end date/time of one record.

    Resolution order:
    1. 'timeEnd' holds a date and a time -> it is parsed as is.
    2. 'timeEnd' holds a time only -> it is combined with the date of the
       start date/time, and moved forward by one day when the result is not
       after the start (processing that crosses midnight).
    3. 'timeEnd' is missing -> the duration in 'processTime' is added to the
       start date/time.

    Args:
        row: record providing COLUMN_START_TIME (a datetime), 'timeEnd' and
            'processTime'.

    Returns:
        The end date/time (datetime.datetime), or NaN to mark the row as
        invalid when both 'timeEnd' and 'processTime' are missing or when
        'processTime' cannot be parsed (ValueError).

    Raises:
        AssertionError: if the start date/time is not a datetime, or if
            'processTime' is used and is not a string.
    """
    name: str = "get_end_date_time()"
    logger.debug("%s: row type[%s] row[%s]", name, type(row), row)

    start_date_time: datetime.datetime = row[COLUMN_START_TIME]
    assert isinstance(start_date_time, datetime.datetime), \
        f"expected start_date_time as datetime.datetime, got [{type(start_date_time)}]."

    # --------------------------------------------------------------------------------
    # If both timeEnd and processTime are missing, the row is invalid.
    # --------------------------------------------------------------------------------
    if pd.isnull(row['timeEnd']) and pd.isnull(row['processTime']):
        return np.nan

    # Begin end_date_time extraction
    end_date_time: Any = np.nan

    if not pd.isnull(row['timeEnd']):
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, timeEnd)
        # --------------------------------------------------------------------------------
        if has_date_in_string(row['timeEnd']):
            # timeEnd already includes the date.
            end_date_time = parse_datetime_string(row['timeEnd'])
            logger.debug("%s: end_date_time is [%s]", name, end_date_time)

        else:
            # timeEnd is a time only; the date is taken from start_date_time.
            _parsed_end: datetime.datetime = parse_datetime_string(row['timeEnd'])
            assert isinstance(_parsed_end, datetime.datetime)

            # Keep only the time part of the parsed value.
            _time: datetime.time = _parsed_end.time()
            logger.debug("%s: timeEnd is [%s]", name, _time)

            end_date_time = convert_date_time_into_datetime(
                date_in_year=start_date_time.date(),
                time_in_day=_time
            )

            # An end that is not after the start means the processing crossed midnight.
            if end_date_time <= start_date_time:
                logger.warning(
                    "%s: end_date_time [%s] is not after start_date_time [%s]",
                    name, end_date_time, start_date_time
                )
                # Advance the end_date_time by 24h.
                end_date_time = end_date_time + datetime.timedelta(days=1)
                logger.debug("%s: end_date_time is [%s]", name, end_date_time)

    else:
        # --------------------------------------------------------------------------------
        # Acquire end_date_time from (start_date_time, processTime).
        # NOTE(review): the rendered output of this notebook shows processTime
        # "2:05" moving the start forward by 2 minutes 5 seconds. Confirm whether
        # processTime is meant as h:mm or m:ss and whether parse_time_string()
        # agrees with that.
        # --------------------------------------------------------------------------------
        try:
            assert isinstance(row['processTime'], str), \
                f"expected row['processTime'] of type str, got [{type(row['processTime'])}]"

            _duration: datetime.time = parse_time_string(row['processTime'])
            delta: datetime.timedelta = convert_time_into_timedelta(time_in_day=_duration)
            end_date_time = start_date_time + delta

        except ValueError:
            # Unparsable duration: report through the notebook logger and mark the row invalid.
            logger.error("%s: invalid time expression [%s]", name, row['processTime'])
            end_date_time = np.nan

    # End end_date_time extraction

    return end_date_time


In [19]:
# Row-wise derivation of the end date/time from (start_date_time, timeEnd, processTime).
end_date_time: pd.Series = interim_end_time_df.apply(func=get_end_date_time, axis=1)
end_date_time.name = COLUMN_END_TIME

# Plain column assignment appends the column on the first run and overwrites it
# on later runs, so the cell can be re-executed. DataFrame.insert() with
# allow_duplicates=False raises ValueError once the column already exists.
interim_end_time_df[COLUMN_END_TIME] = end_date_time
interim_end_time_df

Unnamed: 0,start_date_time,processTime,timeEnd,end_date_time
0,2022-10-01 08:33:00,2:05,,2022-10-01 08:35:05
1,2022-10-01 08:49:00,,9:37:00 AM,2022-10-01 09:37:00
2,2022-10-01 10:37:00,,11:24:00 AM,2022-10-01 11:24:00
3,2022-10-01 11:35:00,4:05,,2022-10-01 11:39:05
4,2022-10-01 12:03:00,,2:20:00 PM,2022-10-01 14:20:00
...,...,...,...,...
222,2022-10-31 10:40:00,,1:23:00 PM,2022-10-31 13:23:00
223,2022-10-31 11:14:00,4:55,,2022-10-31 11:18:55
224,2022-10-31 13:35:00,,2:41:00 PM,2022-10-31 14:41:00
225,2022-10-31 14:55:00,,3:40:00 PM,2022-10-31 15:40:00


In [20]:
if DEBUG:
    # Show the raw records for which no end date/time could be derived.
    display(raw_df[interim_end_time_df[COLUMN_END_TIME].isnull()])
else:
    # The interim frame is no longer needed once end_date_time has been extracted.
    del interim_end_time_df

Unnamed: 0,facility,timeStart,processTime,supplier,suppliedM3,recoveredM3,date,timeEnd,supplierCode


## Process Time

In [21]:
# Inputs for get_process_time(): the derived start and end date/time of each
# record, keyed by the column names that function reads.
interim_process_time_df: pd.DataFrame = pd.DataFrame({
    COLUMN_START_TIME: start_date_time,
    COLUMN_END_TIME: end_date_time,
})
if DEBUG:
    display(interim_process_time_df)

Unnamed: 0,start_date_time,end_date_time
0,2022-10-01 08:33:00,2022-10-01 08:35:05
1,2022-10-01 08:49:00,2022-10-01 09:37:00
2,2022-10-01 10:37:00,2022-10-01 11:24:00
3,2022-10-01 11:35:00,2022-10-01 11:39:05
4,2022-10-01 12:03:00,2022-10-01 14:20:00
...,...,...
222,2022-10-31 10:40:00,2022-10-31 13:23:00
223,2022-10-31 11:14:00,2022-10-31 11:18:55
224,2022-10-31 13:35:00,2022-10-31 14:41:00
225,2022-10-31 14:55:00,2022-10-31 15:40:00


In [22]:
def get_process_time(row: pd.Series) -> "TYPE_FLOAT":
    """Get the process time of one record in seconds.

    The process time is the end date/time minus the start date/time.

    Args:
        row: record providing COLUMN_START_TIME and COLUMN_END_TIME as datetimes.

    Returns:
        Elapsed time in seconds, converted with TYPE_FLOAT.

    Raises:
        AssertionError: if either value is not a datetime, or if their
            difference is not a timedelta.
    """
    start = row[COLUMN_START_TIME]
    end = row[COLUMN_END_TIME]
    assert isinstance(start, datetime.datetime), \
        f"expected {COLUMN_START_TIME} as datetime.datetime, got [{type(start)}]"
    assert isinstance(end, datetime.datetime), \
        f"expected {COLUMN_END_TIME} as datetime.datetime, got [{type(end)}]"

    delta: datetime.timedelta = end - start
    assert isinstance(delta, datetime.timedelta), \
        f"expected delta as type datetime.timedelta, got {delta} of type [{type(delta)}]"

    return TYPE_FLOAT(delta.total_seconds())

In [23]:
# Row-wise elapsed time in seconds; the Series is named after its target
# column in the result frame.
process_time: pd.Series = interim_process_time_df.apply(get_process_time, axis=1)
process_time.name = COLUMN_PROCESS_TIME

if DEBUG:
    display(process_time)

0        125.0
1       2880.0
2       2820.0
3        245.0
4       8220.0
        ...   
222     9780.0
223      295.0
224     3960.0
225     2700.0
226    11220.0
Name: process_time, Length: 227, dtype: float32

In [24]:
# Plain column assignment appends the column on the first run and overwrites it
# on later runs, so the cell can be re-executed. DataFrame.insert() with
# allow_duplicates=False raises ValueError once the column already exists.
interim_process_time_df[COLUMN_PROCESS_TIME] = process_time

if DEBUG:
    display(interim_process_time_df)
else:
    # The interim frame is no longer needed once process_time has been extracted.
    del interim_process_time_df

Unnamed: 0,start_date_time,end_date_time,process_time
0,2022-10-01 08:33:00,2022-10-01 08:35:05,125.0
1,2022-10-01 08:49:00,2022-10-01 09:37:00,2880.0
2,2022-10-01 10:37:00,2022-10-01 11:24:00,2820.0
3,2022-10-01 11:35:00,2022-10-01 11:39:05,245.0
4,2022-10-01 12:03:00,2022-10-01 14:20:00,8220.0
...,...,...,...
222,2022-10-31 10:40:00,2022-10-31 13:23:00,9780.0
223,2022-10-31 11:14:00,2022-10-31 11:18:55,295.0
224,2022-10-31 13:35:00,2022-10-31 14:41:00,3960.0
225,2022-10-31 14:55:00,2022-10-31 15:40:00,2700.0


## Process Date

In [25]:
def get_process_date(row: pd.Series):
    """Get the date on which the processing of one record started.

    Args:
        row: record providing 'start_date_time'.

    Returns:
        The result of convert_date_into_datetime() applied to the date part of
        'start_date_time', or NaN when 'start_date_time' is missing.
    """
    if pd.isnull(row['start_date_time']):
        return np.nan

    return convert_date_into_datetime(row['start_date_time'].date())

In [26]:
# get_process_date() reads row['start_date_time'], so the Series is turned into
# a one-column frame before the row-wise apply.
process_date: pd.Series = start_date_time.to_frame().apply(get_process_date, axis=1)
process_date.name = COLUMN_PROCESS_DATE

if DEBUG:
    display(process_date)

0     2022-10-01
1     2022-10-01
2     2022-10-01
3     2022-10-01
4     2022-10-01
         ...    
222   2022-10-31
223   2022-10-31
224   2022-10-31
225   2022-10-31
226   2022-10-31
Name: process_date, Length: 227, dtype: datetime64[ns]

## Supplier Code

In [27]:
def get_code_from_supplier(supplier: str):
    """Translate a supplier name into its three-letter supplier code.

    The lookup ignores letter case and surrounding whitespace.

    Args:
        supplier: supplier name, e.g. "Mary Anne".

    Returns:
        The supplier code, or NaN when the name is missing (None/NaN), is not
        a string, or is not a known supplier.
    """
    supplier_to_code = {
        "mary therese": "mat",
        "mary": "mar",
        'mary anne': "maa",
        'mary jane': 'maj',
    }
    # Missing names arrive as NaN (a float) or None, neither of which has .lower().
    if not isinstance(supplier, str):
        return np.nan

    return supplier_to_code.get(supplier.strip().lower(), np.nan)


def get_supplier_code(row: pd.Series):
    """Get the supplier code of one raw record.

    Args:
        row: record providing 'supplierCode' and 'supplier'.

    Returns:
        row['supplierCode'] when it is present; otherwise the code looked up
        from row['supplier'], which is NaN when that lookup fails.
    """
    code = row['supplierCode']
    # pd.isnull() recognises None and every NaN value. A membership test against
    # (np.nan, None) only matches the np.nan singleton itself, because NaN never
    # compares equal to another NaN.
    if not pd.isnull(code):
        return code

    return get_code_from_supplier(row['supplier'])

In [28]:
# Row-wise supplier code; the Series is named after its target column in the
# result frame.
supplier_code: pd.Series = raw_df.apply(get_supplier_code, axis=1)
supplier_code.name = COLUMN_SUPPLIER_CODE

if DEBUG:
    display(supplier_code)

0      maa
1      har
2      har
3      mar
4      tom
      ... 
222    dic
223    mar
224    tom
225    har
226    dic
Name: supplier_code, Length: 227, dtype: object

---
# Result


In [29]:
# Assemble the normalized result frame from the raw columns and the derived Series.
# NOTE(review): process_date and end_date_time are derived above but are not
# part of this frame in the visible notebook - confirm that this is intended.
df: pd.DataFrame = pd.DataFrame({
    COLUMN_FACILITY: raw_df['facility'],
    COLUMN_SUPPLIER_CODE: supplier_code,
    COLUMN_START_TIME: start_date_time,
    # Process time in seconds, as returned by get_process_time().
    COLUMN_PROCESS_TIME: process_time,
    # Supplied / recovered volumes cast to TYPE_FLOAT.
    COLUMN_INPUT: raw_df['suppliedM3'].astype(TYPE_FLOAT),
    COLUMN_OUTPUT: raw_df['recoveredM3'].astype(TYPE_FLOAT),
})
df

Unnamed: 0,facility,supplier_code,start_date_time,process_time,input,output
0,Bundaberg,maa,2022-10-01 08:33:00,125.0,2.82,2.24
1,Newcastle,har,2022-10-01 08:49:00,2880.0,2.00,1.40
2,Newcastle,har,2022-10-01 10:37:00,2820.0,1.90,1.40
3,Bundaberg,mar,2022-10-01 11:35:00,245.0,5.00,3.15
4,Newcastle,tom,2022-10-01 12:03:00,8220.0,4.00,2.25
...,...,...,...,...,...,...
222,Newcastle,dic,2022-10-31 10:40:00,9780.0,5.80,4.05
223,Bundaberg,mar,2022-10-31 11:14:00,295.0,5.00,3.29
224,Newcastle,tom,2022-10-31 13:35:00,3960.0,3.80,2.70
225,Newcastle,har,2022-10-31 14:55:00,2700.0,2.05,1.40


In [30]:
# Summarize the result frame: columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 227 entries, 0 to 226
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   facility         227 non-null    object        
 1   supplier_code    227 non-null    object        
 2   start_date_time  227 non-null    datetime64[ns]
 3   process_time     227 non-null    float32       
 4   input            227 non-null    float32       
 5   output           227 non-null    float32       
dtypes: datetime64[ns](1), float32(3), object(2)
memory usage: 8.1+ KB


In [31]:
# Descriptive statistics of the numeric result columns.
df.describe()

Unnamed: 0,process_time,input,output
count,227.0,227.0,227.0
mean,3452.400879,4.096255,2.807401
std,3928.424561,1.31145,0.900538
min,35.0,1.9,1.3
25%,100.0,3.07,2.15
50%,2280.0,4.15,2.85
75%,5550.0,5.035,3.46
max,16740.0,6.95,5.05


In [32]:
# Distinct supplier codes in the result ('supplier_code' is the value of COLUMN_SUPPLIER_CODE).
df['supplier_code'].unique()

array(['maa', 'har', 'mar', 'tom', 'maj', 'mat', 'dic'], dtype=object)

In [33]:
# Distinct facilities in the result ('facility' is the value of COLUMN_FACILITY).
df['facility'].unique()

array(['Bundaberg', 'Newcastle'], dtype=object)