---
# Setup

In [1]:
import os
import sys
import logging
import datetime
import calendar
from typing import (
    List,
    Dict,
    Tuple,
    Any,
    Optional,
    Callable,
    Iterable,
)

import dateutil
import pandas as pd
import numpy as np

import xgboost as xgb
print(xgb.__version__)

from sklearn.metrics import (
    mean_squared_error
)
from sklearn.model_selection import (
    cross_val_score,
    RepeatedKFold,
    GridSearchCV
)
from xgboost import plot_importance
import matplotlib.pyplot as plt
from IPython.display import display, HTML

%matplotlib inline

1.7.3


## Logging

In [2]:
# Optional: uncomment to configure root logging to stdout at ERROR level.
# logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
# Named logger obtained for this notebook.
logger = logging.getLogger("analysis")

## PYTHONPATH

In [3]:
# Make the project helper modules under <cwd>/../../../lib importable.
# The membership guard keeps the cell idempotent: re-running it no longer
# appends a duplicate entry to sys.path on every execution.
lib_dir = os.path.join(os.getcwd(), "..", "..", "..", "lib")
if lib_dir not in sys.path:
    sys.path.append(lib_dir)

In [4]:
%load_ext autoreload
%autoreload 2
from util_constant import (
    TYPE_FLOAT,
)
from util_datetime import (
    get_datetime_components,
    convert_date_into_datetime,
    convert_time_into_timedelta,
    convert_date_time_into_datetime,
    parse_date_string,
    parse_time_string,
    get_dates_from_string,
    has_date_in_string,
    parse_datetime_string,
    get_epoch_from_datetime,
    get_epoch_from_string,
    get_seconds_between_datetimes,
    get_datetime_after_duration,
    get_elapsed_time,
    get_holidays,
    is_holiday,
    is_weekend,
    get_cyclic_time_of_day,
    get_cyclic_day_of_week,
    get_cyclic_month_of_year,
)

In [5]:
from functions import (
    get_start_date_time,
    get_end_date_time,
    get_process_time,
    get_process_date,
    get_code_from_supplier,
    get_numeric_supplier_code,
    get_supplier_code,
    get_numeric_facility_code,
    get_facility_code,
    get_weekday,
    get_weekday_cyclic,
    get_holiday,
    get_start_hour,
    get_start_time_cyclic,
)

## Pandas

In [6]:
# Render every column of a DataFrame instead of eliding the middle ones.
# The row limit is deliberately left at the pandas default; lift it with
# `pd.options.display.max_rows = None` if a full dump is ever needed.
pd.options.display.max_columns = None

## Constant

In [7]:
# Relative path of the JSON input file that is loaded into raw_df below.
PATH_TO_DATA: str = "../data/schedules.json"

In [8]:
from constants import (
    # location
    COUNTRY_CODE,
    # Actors
    COLUMN_FACILITY_CODE,
    COLUMN_FACILITY,
    COLUMN_SUPPLIER_CODE,
    # Timing
    COLUMN_PROCESS_DATE,
    COLUMN_START_TIME,
    COLUMN_START_HOUR,
    COLUMN_START_TIME_SINX,
    COLUMN_START_TIME_COSY,
    COLUMN_END_TIME,
    COLUMN_WEEKDAY,
    COLUMN_WEEKDAY_SINX,
    COLUMN_WEEKDAY_COSY,
    COLUMN_IS_HOLIDAY,
    # Performance
    COLUMN_PROCESS_TIME,
    COLUMN_INPUT,
    COLUMN_OUTPUT,
    COLUMN_THROUGHPUT,
    COLUMN_RECOVERY_RATE,

    # utility
    SECONDS_IN_MIN
)

In [9]:
# Utility
# When True, the `if DEBUG:` guarded cells display intermediate results.
DEBUG = False

---
# Data



In [10]:
# Load the input records. Automatic date parsing is switched off
# (convert_dates=False); no other conversion option is set here, so the
# remaining pandas read_json defaults still apply to the other columns.
raw_df = pd.read_json(
    PATH_TO_DATA,
    convert_dates=False
)

In [11]:
# Inspect the column labels of the freshly loaded frame.
raw_df.columns

Index(['facility', 'date', 'time', 'supplier', 'volumeM3'], dtype='object')

In [12]:
# Map the input column labels onto the names used in the rest of the notebook.
column_aliases = {'time': 'timeStart', 'volumeM3': 'suppliedM3'}
raw_df.rename(columns=column_aliases, inplace=True)

### Add a NaN-filled `supplierCode` column if it does not exist

In [13]:
# Guarantee that a 'supplierCode' column exists, filled with NaN when the
# input does not provide one.
# The former intermediate copy of raw_df['supplier'] was removed: it was
# overwritten on the very next line, and it raised KeyError whenever the
# input had no 'supplier' column.
if 'supplierCode' not in raw_df.columns:
    raw_df['supplierCode'] = np.nan

In [14]:
# List the distinct values in the 'facility' column.
raw_df['facility'].unique()

array(['Bundaberg', 'Newcastle'], dtype=object)

In [15]:
# List the distinct values in the 'supplierCode' column.
raw_df['supplierCode'].unique()

array([nan])

In [16]:
# List the distinct supplier values; evaluates to None when the input has no
# 'supplier' column.
raw_df['supplier'].unique() if 'supplier' in raw_df.columns else None

array(['Mary Anne', 'Dick Tracey', 'Mary', 'Tom Hanks', 'Mary Jane',
       'Harry Houdini', 'Mary Therese'], dtype=object)

In [17]:
# Debug aid: show the first ten input rows.
if DEBUG:
    display(raw_df.head(10))

---
# Feature extractions


## Start Time

In [18]:
# Run get_start_date_time over every row of raw_df and label the resulting
# Series with COLUMN_START_TIME.
start_date_time: pd.Series = raw_df.apply(get_start_date_time, axis="columns")
start_date_time.name = COLUMN_START_TIME

# Debug aid: peek at the first five values.
if DEBUG:
    display(start_date_time.head(5))

## Process Date

In [19]:
# get_process_date is applied row-wise, so the start-time Series is first
# wrapped in a single-column frame.
process_date: pd.Series = (
    start_date_time
    .to_frame()
    .apply(get_process_date, axis="columns")
)
process_date.name = COLUMN_PROCESS_DATE

# Debug aid: peek at the first five values.
if DEBUG:
    display(process_date.head(5))

## Supplier Code

In [20]:
# Run get_supplier_code over every row of raw_df and label the result with
# COLUMN_SUPPLIER_CODE.
supplier_code: pd.Series = raw_df.apply(get_supplier_code, axis="columns")
supplier_code.name = COLUMN_SUPPLIER_CODE

# Debug aid: peek at the first five values.
if DEBUG:
    display(supplier_code.head(5))

## Facility

In [21]:
# Run get_facility_code over every row of raw_df and label the result with
# COLUMN_FACILITY_CODE.
facility_code: pd.Series = raw_df.apply(get_facility_code, axis="columns")
facility_code.name = COLUMN_FACILITY_CODE

# Debug aid: peek at the first five values.
if DEBUG:
    display(facility_code.head(5))

## Weekday

In [22]:
# Row-wise get_weekday over the start-time column, labelled with COLUMN_WEEKDAY.
weekday: pd.Series = (
    start_date_time
    .to_frame()
    .apply(get_weekday, axis="columns")
)
weekday.name = COLUMN_WEEKDAY

## Weekday as cyclic features

In [23]:
# Apply get_weekday_cyclic row by row; the unpacking below requires the result
# to iterate as exactly two (label, column) pairs.
weekday_cyclic = start_date_time.to_frame().apply(get_weekday_cyclic, axis="columns")

weekday_sin_x: pd.Series
weekday_cos_y: pd.Series

# NOTE(review): the original author warns that these column Series are views
# and that modifying them triggers pandas warnings — confirm before mutating.
weekday_sin_x, weekday_cos_y = (
    column for _label, column in weekday_cyclic.items()
)

## Holiday

In [24]:
# Row-wise get_holiday over the start-time column, labelled with COLUMN_IS_HOLIDAY.
holiday: pd.Series = (
    start_date_time
    .to_frame()
    .apply(get_holiday, axis="columns")
)
holiday.name = COLUMN_IS_HOLIDAY

## Start Hour

In [25]:
# Row-wise get_start_hour over the start-time column, labelled with COLUMN_START_HOUR.
start_hour: pd.Series = (
    start_date_time
    .to_frame()
    .apply(get_start_hour, axis="columns")
)
start_hour.name = COLUMN_START_HOUR

## Start time as cyclic features

In [26]:
# Background reading kept from the original cell:
# https://stackoverflow.com/questions/23586510
# https://stackoverflow.com/questions/51225275
# Apply get_start_time_cyclic row by row; the unpacking below requires the
# result to iterate as exactly two (label, column) pairs.
start_time_cyclic = start_date_time.to_frame().apply(get_start_time_cyclic, axis="columns")

start_time_sin_x: pd.Series
start_time_cos_y: pd.Series

# NOTE(review): the original author warns that these column Series are views
# and that modifying them triggers pandas warnings — confirm before mutating.
start_time_sin_x, start_time_cos_y = (
    column for _label, column in start_time_cyclic.items()
)

In [27]:
# Label the two cyclic start-time components with their feature column names.
start_time_sin_x.name = COLUMN_START_TIME_SINX
start_time_cos_y.name = COLUMN_START_TIME_COSY

# Debug aid. A bare expression nested inside `if` is not rendered by IPython
# (only a trailing top-level expression is), so the value must be passed to
# display() explicitly, as the other debug cells do.
if DEBUG:
    display(start_time_sin_x[:5])

---
# Prediction Input


In [28]:
# Collect the engineered features, keyed by output column name. The dict's
# insertion order becomes the column order of the frame.
# NOTE(review): presumably this order has to match what the model was trained
# with — confirm against the training notebook.
feature_columns: Dict[str, pd.Series] = {
    COLUMN_FACILITY_CODE: facility_code,
    COLUMN_SUPPLIER_CODE: supplier_code,
    COLUMN_PROCESS_DATE: process_date,
    COLUMN_START_HOUR: start_hour,
    COLUMN_START_TIME_SINX: start_time_sin_x,
    COLUMN_START_TIME_COSY: start_time_cos_y,
    COLUMN_WEEKDAY: weekday,
    COLUMN_WEEKDAY_SINX: weekday_sin_x,
    COLUMN_WEEKDAY_COSY: weekday_cos_y,
    # The holiday flag is cast to an unsigned 8-bit integer.
    COLUMN_IS_HOLIDAY: holiday.astype(np.uint8),
    # The supplied volume is cast to the project's float type.
    COLUMN_INPUT: raw_df['suppliedM3'].astype(TYPE_FLOAT),
}
df: pd.DataFrame = pd.DataFrame(feature_columns)

# Show the first five feature rows.
df.head(5)

Unnamed: 0,facility_code,supplier_code,process_date,start_hour,start_time_sin_x,start_time_cos_y,dayofweek,dayofweek_sin_x,dayofweek_cos_y,is_holiday,input
0,1,2,9,8,0.809017,-0.587785,0,0.0,1.0,0,2.9
1,0,5,9,8,0.801254,-0.598325,0,0.0,1.0,0,6.5
2,1,1,9,9,0.625923,-0.779885,0,0.0,1.0,0,5.0
3,0,5,9,11,0.01309,-0.999914,0,0.0,1.0,0,6.3
4,1,2,9,12,-0.199368,-0.979925,0,0.0,1.0,0,3.0


---
# Prediction

In [29]:
# Instantiate an XGBoost regressor with default arguments, then load the saved
# model from "xgb_model_sklearn.txt" (a path relative to the working directory).
model = xgb.XGBRegressor()
model.load_model("xgb_model_sklearn.txt")

In [30]:
# Run the loaded model on the assembled feature frame.
predictions = model.predict(df)

# Save Result

In [31]:
# Restore the input schema and attach the model output as 'prediction'.
# Written as a non-mutating chain with errors='ignore' so the cell can be
# re-run: the former in-place drop raised KeyError on a second execution
# because 'supplierCode' had already been removed.
raw_df = (
    raw_df
    .drop(columns=['supplierCode'], errors='ignore')
    .rename(
        columns={
            'timeStart': 'time',
            'suppliedM3': 'volumeM3',
        }
    )
    .assign(prediction=predictions)
)
raw_df

Unnamed: 0,facility,date,time,supplier,volumeM3,prediction
0,Bundaberg,"Jan 9, 2023",8:24:00 AM,Mary Anne,2.9,2.377049
1,Newcastle,"Jan 9, 2023",8:27:00 AM,Dick Tracey,6.5,4.767631
2,Bundaberg,"Jan 9, 2023",9:25:00 AM,Mary,5.0,3.61918
3,Newcastle,"Jan 9, 2023",11:57:00 AM,Dick Tracey,6.3,4.198112
4,Bundaberg,"Jan 9, 2023",12:46:00 PM,Mary Anne,3.0,2.358366
5,Newcastle,"Jan 9, 2023",1:49:00 PM,Tom Hanks,3.7,2.281763
6,Bundaberg,"Jan 9, 2023",2:49:00 PM,Mary Anne,2.9,2.157882
7,Newcastle,"Jan 10, 2023",8:17:00 AM,Dick Tracey,6.7,4.804445
8,Bundaberg,"Jan 10, 2023",8:30:00 AM,Mary,5.1,3.839736
9,Newcastle,"Jan 10, 2023",10:16:00 AM,Dick Tracey,7.0,4.433242


In [32]:
# Write the frame, including the 'prediction' column, to "prediction.json"
# in the working directory using the 'records' orientation.
raw_df.to_json("prediction.json", orient='records')