---
# Setup

In [11]:
import os
import sys
import logging
import datetime
import calendar
from typing import (
    List,
    Dict,
    Tuple,
    Any,
    Optional,
    Callable,
    Iterable,
)

import dateutil
import pandas as pd
import numpy as np

## Logging

In [12]:
# Route log records to stdout so they show up in the cell output, and keep only
# ERROR and above (the logger.debug() traces in this notebook stay silent).
logging.basicConfig(level=logging.ERROR, stream=sys.stdout)
logger = logging.getLogger("analysis")

## PYTHONPATH

In [13]:
# Make the helper modules under ../../../lib (relative to the current working
# directory) importable. The membership test keeps this cell idempotent:
# re-running it does not append a duplicate sys.path entry each time.
lib_dir = f"{os.getcwd()}/../../../lib"
if lib_dir not in sys.path:
    sys.path.append(lib_dir)

In [14]:
%load_ext autoreload
%autoreload 2

from util_datetime import (
    get_datetime_components,
    convert_date_into_datetime,
    convert_time_into_timedelta,
    convert_date_time_into_datetime,
    parse_date_string,
    parse_time_string,
    get_dates_from_string,
    has_date_in_string,
    parse_datetime_string,
    get_epoch_from_datetime,
    get_epoch_from_string,
    get_seconds_between_datetimes,
    get_datetime_after_duration,
    get_elapsed_time,
    get_holidays,
    get_cyclic_time_of_day,
    get_cyclic_day_of_week,
    get_cyclic_month_of_year,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Pandas

In [15]:
# Show every column when a DataFrame is rendered. The row limit is left at the
# pandas default (set 'display.max_rows' to None to lift it as well).
pd.options.display.max_columns = None

---
# Data

In [16]:
# Load the raw September records. convert_dates=False turns off pandas' automatic
# date conversion, so the date/time columns arrive as plain strings and are
# parsed explicitly later in the notebook.
raw_df = pd.read_json("../data/september.json", convert_dates=False)
raw_df

Unnamed: 0,facility,timeStart,processTime,supplier,suppliedM3,recoveredM3,date,timeEnd,supplierCode
0,Bundaberg,9/1/22 8:16 AM,4:05,Mary,5.09,4.13,,,
1,Newcastle,8:29:00 AM,,,2.00,1.55,"Sep 1, 2022",9:07:00 AM,har
2,Newcastle,9:27:00 AM,,,6.80,4.15,"Sep 1, 2022",11:28:00 AM,dic
3,Newcastle,11:38:00 AM,,,1.95,1.55,"Sep 1, 2022",12:21:00 PM,har
4,Bundaberg,9/1/22 12:34 PM,1:50,Mary Therese,3.78,2.56,,,
...,...,...,...,...,...,...,...,...,...
227,Newcastle,11:40:00 AM,,,3.70,2.35,"Sep 30, 2022",12:41:00 PM,tom
228,Newcastle,12:52:00 PM,,,6.35,4.55,"Sep 30, 2022",2:36:00 PM,dic
229,Bundaberg,9/30/22 1:48 PM,3:40,Mary Therese,4.53,2.73,,,
230,Newcastle,3:02:00 PM,,,2.00,1.45,"Sep 30, 2022",3:42:00 PM,har


In [17]:
# Column dtypes and non-null counts of the raw frame, used to see which columns
# are only partially populated.
raw_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 232 entries, 0 to 231
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   facility      232 non-null    object 
 1   timeStart     232 non-null    object 
 2   processTime   111 non-null    object 
 3   supplier      111 non-null    object 
 4   suppliedM3    232 non-null    float64
 5   recoveredM3   232 non-null    float64
 6   date          121 non-null    object 
 7   timeEnd       121 non-null    object 
 8   supplierCode  121 non-null    object 
dtypes: float64(2), object(7)
memory usage: 16.4+ KB


---
# Feature extractions


## Start Time

In [37]:
def get_start_time(row: pd.Series):
    """Build the start date-time of one record from its 'timeStart'/'date' columns.

    Two layouts are handled:

    * 'timeStart' already includes a date (``has_date_in_string`` is true):
      the value is parsed as-is with ``parse_datetime_string``.
    * 'timeStart' holds a time only: the date part is taken from the 'date'
      column and combined with the time part of 'timeStart' through
      ``convert_date_time_into_datetime``.

    Args:
        row: one record exposing the 'timeStart' and 'date' columns.

    Returns:
        The start date-time, or ``np.nan`` to mark the row as invalid when
        'timeStart' is null, or when 'timeStart' has no date and 'date' is null.

    Raises:
        AssertionError: if ``parse_datetime_string`` does not return a
            ``datetime.datetime`` for the 'date' or 'timeStart' value.
    """
    logger.debug("get_start_time(): row type[%s] row[%s]", type(row), row)

    # --------------------------------------------------------------------------------
    # If the timeStart column is not valid, the row is invalid as there is no way
    # to recover the start time.
    # --------------------------------------------------------------------------------
    time_start = row['timeStart']
    if pd.isnull(time_start):
        return np.nan

    # --------------------------------------------------------------------------------
    # timeStart already includes the date.
    # --------------------------------------------------------------------------------
    if has_date_in_string(time_start):
        start_date_time = parse_datetime_string(time_start)
        logger.debug("get_start_time(): start_date_time is [%s]", start_date_time)
        return start_date_time

    # --------------------------------------------------------------------------------
    # timeStart has no date, so the (date, timeStart) columns must both be valid.
    # Otherwise the row is invalid: return NaN to mark it as such.
    # --------------------------------------------------------------------------------
    if pd.isnull(row['date']):
        return np.nan

    # --------------------------------------------------------------------------------
    # Date from the 'date' column; the time part of the parsed value is dropped.
    # --------------------------------------------------------------------------------
    parsed_date = parse_datetime_string(row['date'])
    assert isinstance(parsed_date, datetime.datetime)

    date_part: datetime.date = parsed_date.date()
    logger.debug("get_start_time(): date is [%s]", date_part)

    # --------------------------------------------------------------------------------
    # Time from the 'timeStart' column; the date part of the parsed value is dropped.
    # --------------------------------------------------------------------------------
    parsed_time = parse_datetime_string(time_start)
    assert isinstance(parsed_time, datetime.datetime)

    time_part: datetime.time = parsed_time.time()
    logger.debug("get_start_time(): start_time is [%s]", time_part)

    start_date_time = convert_date_time_into_datetime(date_part, time_part)
    logger.debug("get_start_time(): start_date_time is [%s]", start_date_time)

    return start_date_time


In [38]:
# Row-wise pass over the raw records: one start date-time (or NaN) per row,
# then label the resulting Series.
start_date_time: pd.Series = raw_df.apply(get_start_time, axis="columns")
start_date_time.name = "start_date_time"

## Process Time / End Date Time

In [39]:
# Put the derived start date-time next to the two raw columns that describe how
# a record ends ('processTime' and 'timeEnd') for inspection.
interim_df: pd.DataFrame = pd.DataFrame(
    data=dict(
        start_date_time=start_date_time,
        processTime=raw_df['processTime'],
        timeEnd=raw_df['timeEnd'],
    )
)
interim_df

Unnamed: 0,start_date_time,processTime,timeEnd
0,2022-09-01 08:16:00,4:05,
1,2022-09-01 08:29:00,,9:07:00 AM
2,2022-09-01 09:27:00,,11:28:00 AM
3,2022-09-01 11:38:00,,12:21:00 PM
4,2022-09-01 12:34:00,1:50,
...,...,...,...
227,2022-09-30 11:40:00,,12:41:00 PM
228,2022-09-30 12:52:00,,2:36:00 PM
229,2022-09-30 13:48:00,3:40,
230,2022-09-30 15:02:00,,3:42:00 PM


## Process Date

In [29]:
def get_process_date(row: pd.Series):
    """Derive the process date of a record from its 'start_date_time' value.

    Args:
        row: one record exposing 'start_date_time'.

    Returns:
        ``np.nan`` when 'start_date_time' is null; otherwise whatever
        ``convert_date_into_datetime`` returns for the date part of
        'start_date_time' (presumably a datetime at midnight of that day;
        confirm against util_datetime).
    """
    started_at = row['start_date_time']

    # No start date-time means there is no process date to derive.
    if pd.isnull(started_at):
        return np.nan

    return convert_date_into_datetime(started_at.date())

In [31]:
# to_frame() turns the Series into a one-column frame named after the Series
# ('start_date_time'), which is the key get_process_date() reads from each row.
process_date: pd.Series = start_date_time.to_frame().apply(get_process_date, axis="columns")
process_date.name = 'process_date'

## Supplier Code

In [33]:
# Known supplier names (lower-case, single-spaced) -> supplier code.
# Built once instead of on every call.
_SUPPLIER_TO_CODE = {
    "mary therese": "mar",
    "mary": "mar",
}


def get_code_from_supplier(supplier: str):
    """Map a supplier name to its supplier code.

    The lookup ignores case as well as leading, trailing and repeated
    whitespace, so "Mary", " mary " and "Mary  Therese" all resolve.

    Args:
        supplier: supplier name as found in the raw data.

    Returns:
        The supplier code, or ``np.nan`` when the name is unknown or the value
        is not a string (e.g. a missing value such as NaN or None).
    """
    # A missing value reaching this function must not crash on .lower().
    if not isinstance(supplier, str):
        return np.nan

    normalized = " ".join(supplier.split()).lower()
    return _SUPPLIER_TO_CODE.get(normalized, np.nan)
    
    
def get_supplier_code(row: pd.Series):
    """Resolve the supplier code of one record.

    Records either carry a supplier name in 'supplier' or a ready-made code in
    'supplierCode'.

    Args:
        row: one record exposing the 'supplier' and 'supplierCode' columns.

    Returns:
        ``get_code_from_supplier(row['supplier'])`` when 'supplier' is present;
        otherwise the value of 'supplierCode' unchanged.
    """
    supplier = row['supplier']

    # pd.isnull() recognises every missing-value marker (None, any NaN object,
    # NaT, pd.NA). A tuple membership test against np.nan only matches the
    # np.nan singleton by identity, because NaN never compares equal to itself.
    if pd.isnull(supplier):
        return row['supplierCode']

    return get_code_from_supplier(supplier)

In [34]:
# Row-wise pass over the raw records: one supplier code per row, then label the
# resulting Series.
supplier_code: pd.Series = raw_df.apply(get_supplier_code, axis="columns")
supplier_code.name = 'supplier_code'