# Model


## Seed Fix

In [3]:
import random
import numpy as np

seed = 42

random.seed(seed)
np.random.seed(seed)

## Data Loading

In [1]:
import os

import numpy as np
import pandas as pd

In [4]:
ROOT_DIR = '.'
DATA_PATH = os.path.join(ROOT_DIR, 'data', 'delivery_raw.csv')

delivery = pd.read_csv(DATA_PATH, sep='\t')

delivery.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift,total_busy,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


## Data Cleaning & Preprocessing


- Null value handling
  - `actual_delivery_time` : null 값 제거 및 레이블링에 사용 후 drop
  - `market_id` : mode
  - `order_protocol` : mode
  - `store_primary_category` : other 
  - `total_onshift` : mean
  - `total_busy` : mean
  - `total_outstanding_orders` : mean
  - `estimated_store_to_consumer_driving_duration`: mean
- Cleaning
  - 제거
    - `label` : >= 60000 제거
    - `total_items` : >= 400 제거
    - `max_item_price` >= 10000 제거
  - 변경
    - `total_outstanding_orders` : < 0 -> 0
    - `min_item_price` : < 0 -> 0
    - `total_outstanding_orders` : < 0 -> 0
- Extra Column
  - `onshift` = `total_onshift` - `total_busy` 값 중 음의 값을 0으로 만들어 학습에 사용
  - `created_at` : 시간대를 범주형 데이터 (19 ~ 1], (1 ~ 5] (5 ~ 19]

- Numeric Columns
  - `total_items`
  - `subtotal`
  - `num_distint_item`
  - `min_item_price`
  - `max_item_price`
  - `total_outstanding_orders`
  - `estimated_store_to_consumer_driving_duration`
  - `onshift`
- Category Columns
  - `market_id`
  - `order_protocol`
  - `created_at` : one-hot encoding
  - `store_primary_category` : ordinal encoding
  - 
- **DROP COLUMNS**
  - `total_onshift`, `total_busy`, `store_id`, `actual_delivery_time`, **`estimated_order_place_duration`**

In [None]:
def preprocessing(data):
    # 레이블링
    data = data.drop(data.index[data['actual_delivery_time'].isnull()], axis=0)
    data['created_at'] = pd.to_datetime(data['created_at'])
    data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])
    data['label'] = (pd.DatetimeIndex(data['actual_delivery_time']) - pd.DatetimeIndex(data['created_at'])).total_seconds()

    # cleaning
    ## 최빈값으로 채우기
    data['market_id'].fillna(float(data['market_id'].mode()), inplace=True)
    data['order_protocol'].fillna(float(data['order_protocol'].mode()), inplace=True)
    ## 평균으로 채우기
    data['total_outstanding_orders'].fillna(float(data['total_outstanding_orders'].mean()), inplace=True)
    data['total_onshift'].fillna(float(data['total_onshift'].mean()), inplace=True)
    data['total_busy'].fillna(float(data['total_busy'].mean()), inplace=True)
    data['estimated_store_to_consumer_driving_duration'].fillna(float(data['estimated_store_to_consumer_driving_duration'].mean()), inplace=True)
    ## 특정값으로 채우기
    data['store_primary_category'].fillna('other', inplace=True)

    ## 이상치 제거
    mask = (data['label'] > 60000) | (data['total_items'] >= 400) | (data['max_item_price'] > 10000)
    data.drop(data[mask].index, axis=0, inplace=True)
    data['min_item_price'][data['min_item_price'] < 0] = 0
    data['total_outstanding_orders'][data['total_outstanding_orders'] < 0] = 0
    data['onshift'] = data['total_onshift'] - data['total_busy']
    data['onshift'][data['onshift'] < 0] = 0
    data['onshift'].fillna(float(data['onshift'].mean()), inplace=True)
    
    ## 시간 범주화 
    data['created_at'] = (data['created_at'].dt.hour)
    data['created_at'][(data['created_at'] >= 19) | (data['created_at'] < 1)] = 0
    data['created_at'][(data['created_at'] >= 1) & (data['created_at'] <= 4)] = 1
    data['created_at'][(data['created_at'] >= 5) & (data['created_at'] <= 18)] = 2
    
    drop_list = ['actual_delivery_time', 'store_id', 'total_onshift', 'total_busy', 'estimated_order_place_duration']
    data.drop(drop_list, axis=1, inplace=True)
    
    return data

In [None]:
# split train validation set [0.9, 0.1]
from sklearn.model_selection import train_test_split

cleaned_delivery = preprocessing(delivery)

train_data, test_data = train_test_split(cleaned_delivery, test_size=0.1)

train_label = train_data['label']
train_feature = train_data.drop(['label'], axis=1)
test_label = test_data['label']
test_feature = test_data.drop(['label'], axis=1)

train_feature.shape, train_label.shape, test_feature.shape, test_label.shape

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['min_item_price'][data['min_item_price'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['total_outstanding_orders'][data['total_outstanding_orders'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['onshift'][data['onshift'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

((177674, 12), (177674,), (19742, 12), (19742,))

In [None]:
# 파이프라인
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

num_attribs = ['total_items', 'subtotal', 'num_distinct_items', 
               'min_item_price', 'max_item_price', 'total_outstanding_orders', 
               'estimated_store_to_consumer_driving_duration', 'onshift']
one_hot_attribs = ['created_at']
ord_attribs = ['store_primary_category']

num_pipline = Pipeline([
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipline, num_attribs),
    ('one-hot', OneHotEncoder(), one_hot_attribs),
    ('ord', OrdinalEncoder(), ord_attribs),
], remainder='passthrough')

delivery_prepared = full_pipeline.fit_transform(train_feature)

## Model Selection

In [None]:
def display_score(scores):
    print(f"Mean: {scores.mean()} \t Std: {scores.std()}")

In [None]:
from sklearn.model_selection import cross_val_score
import sklearn.linear_model as lin_model
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


models = [
    {
    'name': 'LinearRegression',
    'model': lin_model.LinearRegression()
    }, {
    'name': 'ElasticNet',
    'model': lin_model.ElasticNet()
    }, {
    'name': 'DecisionTreeRegressor',
    'model': DecisionTreeRegressor()
    }, {
    'name': 'GradientBoostingRegressor',
    'model': GradientBoostingRegressor()
    }, {
    'name': 'RandomForestRegressor',
    'model': RandomForestRegressor()
    },
]

for model in models:
    model_scores = cross_val_score(model['model'], X_train, y_train,
                                   scoring='neg_mean_squared_error', cv=5)
    print(model['name'])
    display_score(np.sqrt(-model_scores))

LinearRegression
Mean: 1081.0139592493238 	 Std: 32.532987815255396
ElasticNet
Mean: 1091.629814542836 	 Std: 32.56719530478251
DecisionTreeRegressor
Mean: 1559.9843475224595 	 Std: 32.29266179560744
GradientBoostingRegressor
Mean: 1048.0335287447708 	 Std: 32.67576313825857
RandomForestRegressor
Mean: 1071.0723168044428 	 Std: 35.67669197114766


## FineTune Model


In [None]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
import random

param_distribs = {
    'n_estimators': randint(low=50, high=200),
    'max_depth': randint(low=1, high=6),
    'max_features': randint(low=5, high=14),
}
grad_reg = GradientBoostingRegressor(random_state=42)
rnd_search = RandomizedSearchCV(grad_reg, param_distributions=param_distribs,
                                n_iter=10, cv=5, scoring='neg_mean_squared_error', random_state=42)

rnd_search.fit(X_train, y_train)

RandomizedSearchCV(cv=5, estimator=GradientBoostingRegressor(random_state=42),
                   param_distributions={'max_depth': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f183c3834c0>,
                                        'max_features': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f183bc98f40>,
                                        'n_estimators': <scipy.stats._distn_infrastructure.rv_frozen object at 0x7f183abd3910>},
                   random_state=42, scoring='neg_mean_squared_error')

In [None]:
cvres = rnd_search.cv_results_
for mean_score, params in zip(cvres['mean_test_score'], cvres['params']):
    print(np.sqrt(-mean_score), params)

1047.097710879185 {'max_depth': 4, 'max_features': 12, 'n_estimators': 70}
1055.8574849205818 {'max_depth': 2, 'max_features': 7, 'n_estimators': 124}
1043.9616026620017 {'max_depth': 3, 'max_features': 12, 'n_estimators': 166}
1037.94472075293 {'max_depth': 4, 'max_features': 12, 'n_estimators': 180}
1037.435502651077 {'max_depth': 5, 'max_features': 6, 'n_estimators': 137}
1038.5038317036124 {'max_depth': 4, 'max_features': 10, 'n_estimators': 179}
1042.5863357332553 {'max_depth': 4, 'max_features': 9, 'n_estimators': 107}
1039.6212690402067 {'max_depth': 5, 'max_features': 13, 'n_estimators': 98}
1045.1805886797656 {'max_depth': 3, 'max_features': 7, 'n_estimators': 157}
1038.2486462181926 {'max_depth': 4, 'max_features': 13, 'n_estimators': 180}


## 테스트 셋 평가

In [None]:
from sklearn.metrics import mean_squared_error

final_model = rnd_search.best_estimator_

final_preds = final_model.predict(X_test)

final_rmse = np.sqrt(mean_squared_error(y_test, final_preds))
under_pred_rate = ((final_preds - y_test) < 0).sum() / y_test.shape[0]

print(f"TEST SET RMSE : {final_rmse:0.4f}, Under Predict Rate: {under_pred_rate:0.3f}")

TEST SET RMSE : 1108.9209, Under Predict Rate: 0.423
