# Linear regression training/predict/evaluate

Train the model to predict the recovery M3.

---
# Setup

## Ingestion

In [1]:
%run ingestion.ipynb

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   facility      810 non-null    object 
 1   date          481 non-null    object 
 2   timeStart     810 non-null    object 
 3   timeEnd       481 non-null    object 
 4   supplierCode  481 non-null    object 
 5   suppliedM3    810 non-null    float64
 6   recoveredM3   810 non-null    float64
 7   processTime   329 non-null    object 
 8   supplier      329 non-null    object 
dtypes: float64(2), object(7)
memory usage: 57.1+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   facility_code     810 non-null    int64         
 1   supplier_code     810 non-null    int64         
 2   process_date      810 non-null    int64     

## PYTHONPATH

In [2]:
if f"{os.getcwd()}/../../../lib" not in sys.path:
    sys.path.append(f"{os.getcwd()}/../../../lib") 

## Logging

In [3]:
# logging.basicConfig(stream=sys.stdout, level=logging.ERROR)
logger = logging.getLogger("training")

## Dependencies

In [4]:
import xgboost as xgb
print(xgb.__version__)

from sklearn.metrics import (
    mean_squared_error
)
from sklearn.preprocessing import (
    OneHotEncoder,
    MinMaxScaler
)
from sklearn.model_selection import (
    cross_val_score,
    RepeatedKFold,
    GridSearchCV
)
from sklearn.linear_model import (
    Ridge
)
from sklearn.compose import (
    ColumnTransformer
)
from sklearn.pipeline import (
    Pipeline
)

1.7.3


In [5]:
%load_ext autoreload
%autoreload 2
from util_pandas.ml import (
    stratified_shuffle_split_into_train_test
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


## Constant

In [6]:
DEBUG = False

---
# Data

In [7]:
df

Unnamed: 0,facility_code,supplier_code,process_date,start_date_time,start_hour,start_time_sin_x,start_time_cos_y,dayofweek,dayofweek_sin_x,dayofweek_cos_y,is_holiday,process_time,input,output,throughput,recovery_rate
0,0,4,1,2022-08-01 08:29:00,8,0.796002,-0.605294,0,0.000000,1.000000,0,2280.0,2.00,1.55,0.040789,0.775000
1,0,5,1,2022-08-01 09:27:00,9,0.619094,-0.785317,0,0.000000,1.000000,0,7260.0,6.80,4.15,0.034298,0.610294
2,0,4,1,2022-08-01 11:38:00,11,0.095846,-0.995396,0,0.000000,1.000000,0,2580.0,1.95,1.55,0.036047,0.794872
3,0,6,1,2022-08-01 12:40:00,12,-0.173648,-0.984808,0,0.000000,1.000000,0,5040.0,3.95,2.55,0.030357,0.645570
4,0,5,1,2022-08-01 14:25:00,14,-0.591309,-0.806445,0,0.000000,1.000000,0,7440.0,5.30,3.10,0.025000,0.584906
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
805,0,6,30,2022-09-30 11:40:00,11,0.087156,-0.996195,4,-0.433884,-0.900969,0,3660.0,3.70,2.35,0.038525,0.635135
806,0,5,30,2022-09-30 12:52:00,12,-0.224951,-0.974370,4,-0.433884,-0.900969,0,6240.0,6.35,4.55,0.043750,0.716535
807,1,0,30,2022-09-30 13:48:00,13,-0.453991,-0.891007,4,-0.433884,-0.900969,0,220.0,4.53,2.73,0.744545,0.602649
808,0,4,30,2022-09-30 15:02:00,15,-0.713251,-0.700909,4,-0.433884,-0.900969,0,2400.0,2.00,1.45,0.036250,0.725000


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 810 entries, 0 to 809
Data columns (total 16 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   facility_code     810 non-null    int64         
 1   supplier_code     810 non-null    int64         
 2   process_date      810 non-null    int64         
 3   start_date_time   810 non-null    datetime64[ns]
 4   start_hour        810 non-null    int64         
 5   start_time_sin_x  810 non-null    float32       
 6   start_time_cos_y  810 non-null    float32       
 7   dayofweek         810 non-null    int64         
 8   dayofweek_sin_x   810 non-null    float64       
 9   dayofweek_cos_y   810 non-null    float64       
 10  is_holiday        810 non-null    uint8         
 11  process_time      810 non-null    float32       
 12  input             810 non-null    float32       
 13  output            810 non-null    float32       
 14  throughput        810 non-

## Drop 

Drop non feature columns e.g. throughput.

In [9]:
df.drop([
    COLUMN_START_TIME,
    COLUMN_RECOVERY_RATE,
    COLUMN_THROUGHPUT,
    COLUMN_PROCESS_TIME,
], axis=1, inplace=True)
df

Unnamed: 0,facility_code,supplier_code,process_date,start_hour,start_time_sin_x,start_time_cos_y,dayofweek,dayofweek_sin_x,dayofweek_cos_y,is_holiday,input,output
0,0,4,1,8,0.796002,-0.605294,0,0.000000,1.000000,0,2.00,1.55
1,0,5,1,9,0.619094,-0.785317,0,0.000000,1.000000,0,6.80,4.15
2,0,4,1,11,0.095846,-0.995396,0,0.000000,1.000000,0,1.95,1.55
3,0,6,1,12,-0.173648,-0.984808,0,0.000000,1.000000,0,3.95,2.55
4,0,5,1,14,-0.591309,-0.806445,0,0.000000,1.000000,0,5.30,3.10
...,...,...,...,...,...,...,...,...,...,...,...,...
805,0,6,30,11,0.087156,-0.996195,4,-0.433884,-0.900969,0,3.70,2.35
806,0,5,30,12,-0.224951,-0.974370,4,-0.433884,-0.900969,0,6.35,4.55
807,1,0,30,13,-0.453991,-0.891007,4,-0.433884,-0.900969,0,4.53,2.73
808,0,4,30,15,-0.713251,-0.700909,4,-0.433884,-0.900969,0,2.00,1.45


## Train/Test Split

Split data into train and test with preserving the strata of supplier code. By preserving the propotion of supplier code, the propotion of facilities in the data is preserved as well.

In [10]:
train_data_set: pd.DataFrame
test_data_set: pd.DataFrame
    
train_data_set, test_data_set = stratified_shuffle_split_into_train_test(
    dataframe=df, column_name=COLUMN_SUPPLIER_CODE
)

In [11]:
train_data_set[COLUMN_SUPPLIER_CODE].value_counts() / len(train_data_set)

6    0.200617
5    0.200617
4    0.191358
0    0.114198
3    0.104938
2    0.095679
1    0.092593
Name: supplier_code, dtype: float64

In [12]:
test_data_set[COLUMN_SUPPLIER_CODE].value_counts() / len(test_data_set)

5    0.203704
6    0.203704
4    0.191358
0    0.111111
3    0.104938
1    0.092593
2    0.092593
Name: supplier_code, dtype: float64

In [13]:
X_train = train_data_set.drop(COLUMN_OUTPUT, axis=1)
y_train = train_data_set[COLUMN_OUTPUT].copy()
# del train_data_set

In [14]:
if DEBUG:
    display(X_train)
    display(y_train)

In [15]:
X_test = test_data_set.drop(COLUMN_OUTPUT, axis=1)
y_test = test_data_set[COLUMN_OUTPUT].copy()
# del test_data_set

---
# Feature Engineering 
## Normalisation

Normalize the numeric columns so that the value is in between 0-1. Untouch already normalized columns.

## OHE

One hot encode the categorical columns.

In [16]:
numeric_columns = [
    COLUMN_START_HOUR,
    COLUMN_PROCESS_DATE,
    COLUMN_WEEKDAY,
    COLUMN_INPUT,
]
category_columns = [
    COLUMN_FACILITY_CODE,
    COLUMN_SUPPLIER_CODE,
]

In [17]:
numeric_pipeline = Pipeline([
    ('normalizer', MinMaxScaler()),
])

In [18]:
categorical_pipeline = Pipeline([
    ('one_hot_encoder', OneHotEncoder()),
])

In [19]:
full_pipeline = ColumnTransformer(
    transformers=[
        ("numeric", numeric_pipeline, numeric_columns),
        ("category", categorical_pipeline, category_columns),
    ],
    remainder='passthrough'
)

In [20]:
X_train = full_pipeline.fit_transform(X_train)

In [21]:
pd.DataFrame(X_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
0,0.375,0.200000,1.000000,0.421569,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.034899,-0.999391,-0.781831,0.623490,1.0
1,0.375,0.533333,0.833333,0.556863,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.203642,-0.979046,-0.974928,-0.222521,1.0
2,0.750,0.266667,0.666667,0.000000,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.580703,-0.814116,-0.433884,-0.900969,0.0
3,0.125,0.800000,1.000000,0.372549,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.672367,-0.740218,-0.781831,0.623490,1.0
4,0.000,0.200000,0.000000,0.656863,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.814116,-0.580703,0.000000,1.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
643,0.500,0.600000,0.666667,0.029412,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,-0.207912,-0.978148,-0.433884,-0.900969,0.0
644,0.625,0.900000,0.333333,0.619608,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.292372,-0.956305,0.974928,-0.222521,0.0
645,0.750,0.533333,0.500000,0.209804,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,-0.646124,-0.763232,0.433884,-0.900969,0.0
646,0.625,0.933333,0.500000,0.607843,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,-0.337917,-0.941176,0.433884,-0.900969,0.0


In [22]:
X_test = full_pipeline.fit_transform(X_test)

---
# Model Training

## Ridge Linear Regression

* [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html#sklearn.linear_model.Ridge)

> Linear least squares with l2 regularization.


## Grid Search/K Hold Cross Validation

* [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

In [23]:
model = Ridge()

In [24]:
param_grid = {
    "alpha": [0.01, 0.03, 0.05, 0.07, 0.1],
}

In [25]:
cv = RepeatedKFold(n_splits=3, n_repeats=10, random_state=3)
search = GridSearchCV(
    model, 
    param_grid, 
    cv=cv, 
    n_jobs=-1,
    verbose=1, 
    return_train_score=True,
).fit(X_train, y_train)

Fitting 30 folds for each of 5 candidates, totalling 150 fits


In [26]:
print("The best hyperparameters are ", search.best_params_)

The best hyperparameters are  {'alpha': 0.03}


In [27]:
estimator = search.best_estimator_

In [28]:
print("Best Estimator: \n{}\n".format(search.best_estimator_))
print("Best Parameters: \n{}\n".format(search.best_params_))
print("Best Test Score: \n{}\n".format(search.best_score_))

Best Estimator: 
Ridge(alpha=0.03)

Best Parameters: 
{'alpha': 0.03}

Best Test Score: 
0.920354623544662



---
# Model Testing

In [29]:
predictions = estimator.predict(X_test)

In [30]:
scores = np.sqrt(mean_squared_error(y_true=y_test, y_pred=predictions))
scores

0.29320670137464744

In [31]:
result = pd.DataFrame({
    'prediction': predictions, 
    'truth': y_test
})
result['deviation %'] = np.abs((1.0 - result['prediction'] / result['truth']) * 100.0)
result

Unnamed: 0,prediction,truth,deviation %
221,3.602854,3.55,1.488843
439,3.655683,3.25,12.482551
33,2.616441,1.95,34.176442
442,3.464537,3.14,10.335565
568,2.252493,2.45,8.061526
...,...,...,...
611,2.330597,2.45,4.873581
294,3.320798,2.83,17.342683
4,3.339682,3.10,7.731666
668,2.279064,2.60,12.343690


## Percentage of deviation (mean)

In [32]:
result['deviation %'].mean()

8.548833678564877