# Deep Learning Model

## Load Data in Google Colab

In [1]:
from google.colab import drive

drive.mount('./drive', force_remount=True)

Mounted at ./drive


In [2]:
!unzip '/content/drive/MyDrive/Colab Notebooks/KDT-MISSION/month2/delivery_raw.csv.zip'

Archive:  /content/drive/MyDrive/Colab Notebooks/KDT-MISSION/month2/delivery_raw.csv.zip
  inflating: delivery_raw.csv        
  inflating: __MACOSX/._delivery_raw.csv  


## Seed Set

In [53]:
import torch
import random
import numpy as np
import os

seed = 42

os.environ['PYTHONHASHSEED'] = str(seed)
random.seed(seed)

np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.enabled = False

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

## Load Data

In [54]:
import os
import pandas as pd

ROOT_DIR = '/content/'
DATA_PATH = os.path.join(ROOT_DIR, 'delivery_raw.csv')
delivery = pd.read_csv(DATA_PATH, sep='\t')

delivery.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift,total_busy,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,mexican,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


## Data Cleaning & Preprocessing

- Null value handling
  - `actual_delivery_time` : null 값 제거 및 레이블링에 사용 후 drop
  - `market_id` : mode
  - `order_protocol` : mode
  - `store_primary_category` : other 
  - `total_onshift` : mean
  - `total_busy` : mean
  - `total_outstanding_orders` : mean
  - `estimated_store_to_consumer_driving_duration`: mean
- Cleaning
  - 제거
    - `label` : >= 60000 제거
    - `total_items` : >= 400 제거
    - `max_item_price` >= 10000 제거
  - 변경
    - `total_outstanding_orders` : < 0 -> 0
    - `min_item_price` : < 0 -> 0
    - `total_outstanding_orders` : < 0 -> 0
- Extra Column
  - `onshift` = `total_onshift` - `total_busy` 값 중 음의 값을 0으로 만들어 학습에 사용
  - `created_at` : 시간대를 범주형 데이터 (19 ~ 1], (1 ~ 5] (5 ~ 19]

- Numeric Columns
  - `total_items`
  - `subtotal`
  - `num_distint_item`
  - `min_item_price`
  - `max_item_price`
  - `total_outstanding_orders`
  - `estimated_store_to_consumer_driving_duration`
  - `onshift`
- Category Columns
    - One-hot
        - `created_at`
    - Ordinal
        - `market_id`
        - `order_protocol`
        - `store_primary_category`
- **DROP COLUMNS**
  - `total_onshift`, `total_busy`, `store_id`, `actual_delivery_time`, `estimated_order_place_duration`

In [55]:
def cleaning(data, labeling=True):
    if labeling:
        # 레이블링
        data = data.drop(data.index[data['actual_delivery_time'].isnull()], axis=0)
        data['created_at'] = pd.to_datetime(data['created_at'])
        data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])
        data['label'] = (pd.DatetimeIndex(data['actual_delivery_time']) - pd.DatetimeIndex(data['created_at'])).total_seconds()

    # cleaning
    ## 최빈값으로 채우기
    data['market_id'].fillna(float(data['market_id'].mode()), inplace=True)
    data['order_protocol'].fillna(float(data['order_protocol'].mode()), inplace=True)
    ## 평균으로 채우기
    data['total_outstanding_orders'].fillna(float(data['total_outstanding_orders'].mean()), inplace=True)
    data['total_onshift'].fillna(float(data['total_onshift'].mean()), inplace=True)
    data['total_busy'].fillna(float(data['total_busy'].mean()), inplace=True)
    data['estimated_store_to_consumer_driving_duration'].fillna(float(data['estimated_store_to_consumer_driving_duration'].mean()), inplace=True)
    ## 특정값으로 채우기
    data['store_primary_category'].fillna('other', inplace=True)

    ## 이상치 제거
    mask = (data['label'] > 60000) | (data['total_items'] >= 400) | (data['max_item_price'] > 10000)
    data.drop(data[mask].index, axis=0, inplace=True)
    data['min_item_price'][data['min_item_price'] < 0] = 0
    data['total_outstanding_orders'][data['total_outstanding_orders'] < 0] = 0
    data['onshift'] = data['total_onshift'] - data['total_busy']
    data['onshift'][data['onshift'] < 0] = 0
    data['onshift'].fillna(float(data['onshift'].mean()), inplace=True)
    
    ## 시간 범주화 
    data['created_at'] = (data['created_at'].dt.hour)
    data['created_at'][(data['created_at'] >= 19) | (data['created_at'] < 1)] = 0
    data['created_at'][(data['created_at'] >= 1) & (data['created_at'] <= 4)] = 1
    data['created_at'][(data['created_at'] >= 5) & (data['created_at'] <= 18)] = 2
    
    drop_list = ['actual_delivery_time', 'store_id', 'total_onshift', 'total_busy', 'estimated_order_place_duration']
    data.drop(drop_list, axis=1, inplace=True)
    
    return data

In [56]:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

num_attribs = ['total_items', 'subtotal', 'num_distinct_items', 
               'min_item_price', 'max_item_price', 'total_outstanding_orders', 
               'estimated_store_to_consumer_driving_duration', 'onshift']
one_hot_attribs = ['created_at']
ord_attribs = ['market_id', 'order_protocol', 'store_primary_category']

num_pipline = Pipeline([
    ('std_scaler', StandardScaler())
])

full_pipeline = ColumnTransformer([
    ('num', num_pipline, num_attribs),
    ('one-hot', OneHotEncoder(), one_hot_attribs),
    ('ord', OrdinalEncoder(), ord_attribs),
], remainder='passthrough')

In [57]:
import torch
from torch.utils.data import DataLoader, Dataset

class TensorData(Dataset):

    def __init__(self, X, y):
        self.X_data = torch.FloatTensor(X)
        self.y_data = torch.FloatTensor(y)
    

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
    

    def __len__(self):
        return self.y_data.shape[0]

In [58]:
cleaned_delivery = cleaning(delivery)

# split train test set [0.9, 0.1]
train_data, test_data = train_test_split(cleaned_delivery, test_size=0.1)

X_train = train_data.drop(['label'], axis=1)
y_train = train_data['label'].to_numpy().reshape((-1, 1))
X_test = test_data.drop(['label'], axis=1)
y_test = test_data['label'].to_numpy().reshape((-1, 1))

X_train.shape, y_train.shape, X_test.shape, y_test.shape

X_train = full_pipeline.fit_transform(X_train)
X_test = full_pipeline.fit_transform(X_test)

train_set = TensorData(X_train, y_train)
test_set = TensorData(X_test, y_test)

# train_dataloader = DataLoader(train_set, batch_size=32, shuffle=True)
test_dataloader = DataLoader(test_set, batch_size=32, shuffle=False)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['min_item_price'][data['min_item_price'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['total_outstanding_orders'][data['total_outstanding_orders'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['onshift'][data['onshift'] < 0] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

## Model

In [59]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F

from sklearn.model_selection import KFold

from sklearn.metrics import mean_squared_error

import matplotlib.pyplot as plt

from tqdm import tqdm

In [60]:
class Regressor(nn.Module):

    def __init__(self, input_dim):
        super(Regressor, self).__init__()
        self.reg = nn.Sequential(
            nn.Linear(input_dim, 24, bias=True),
            nn.ReLU(),
            nn.Linear(24, 12, bias=True),
            nn.ReLU(),
            nn.Linear(12, 6, bias=True),
            nn.ReLU(),
            nn.Linear(6, 1)            
        )
    

    def forward(self, X):
        out = self.reg(X)
        return out


class UnderPredPreventLoss(nn.Module):

    def __init__(self):
        super(UnderPredPreventLoss, self).__init__()

    
    def forward(self, output, target):
        size = target.size(0)
        output = output.view(-1)
        target = target.view(-1)
        diff = output - target
        mask = diff < 0
        weight = (mask.float() + 1)

        return ((diff * weight)**2).sum() / size

In [70]:
def train_one_epoch(model, dataloader, criterion, optimizer, device, scheduler=None):
    train_loss = 0.
    tot_size = 0

    model.train()
    for data, label in dataloader:
        data = data.to(device)
        label = label.to(device)

        optimizer.zero_grad()

        output = model(data)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()

        train_loss += loss
        tot_size += label.size(0)
        
    if scheduler is not None:
        scheduler.step()

    train_loss /= tot_size
    return train_loss 


def evaluation(model, dataloader, device):
    preds = torch.tensor([], dtype=torch.float)
    actual = torch.tensor([], dtype=torch.float)
    preds = preds.to(device)
    actual = actual.to(device)
    
    with torch.no_grad():
        model.eval()
        for data, label in dataloader:
            data = data.to(device)
            label = label.to(device)

            output = model(data)
            
            preds = torch.cat((preds, output), dim=0)
            actual = torch.cat((actual, label), dim=0)
        
    preds = preds.cpu().numpy()
    actual = actual.cpu().numpy()
    rmse = np.sqrt(mean_squared_error(preds, actual))
    return rmse

In [75]:
def full_train(model, epochs, train_loader, val_loader,
               criterion, optimizer, scheduler, save_path, device, 
               max_patience=5):
    min_val_rmse = float('inf')
    patience = 0

    for epoch in range(epochs):
        train_one_epoch(model, train_loader, criterion, optimizer, device, scheduler)
        train_rmse = evaluation(model, train_loader, device)
        val_rmse = evaluation(model, val_loader, device)
        print(f"Epoch: {epoch + 1:03d} Train Loss: {train_rmse:0.4f} Val Loss: {val_rmse:0.4f}")  
        
        if min_val_rmse > val_rmse:
            print("Detected New Best Model")
            torch.save(model, save_path)
            patience = 0
            min_val_rmse = val_rmse
        else:
            patience += 1
            if patience > max_patience:
                print("Early Stopping")
                return
            

In [68]:
def val_cross_train(dataset, epoch, device, save_path):
    validation_loss = []
    min_val_rmse = float('inf')

    kfold = KFold(n_splits=10, shuffle=True)
    for fold, (train_idx, val_idx) in enumerate(kfold.split(dataset)):
        train_subsampler = torch.utils.data.SubsetRandomSampler(train_idx)
        val_subsampler = torch.utils.data.SubsetRandomSampler(val_idx)

        train_dataloader = DataLoader(dataset, batch_size=128, sampler=train_subsampler)
        val_dataloader = DataLoader(dataset, batch_size=128, sampler=val_subsampler)

        model = Regressor(14)
        model.to(device)
        criterion = UnderPredPreventLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

        for _ in range(epoch):
            cur_loss = train_one_epoch(model, train_dataloader, criterion, optimizer, device, scheduler)
            print(f"Epoch { _ + 1:03d} Loss: {cur_loss:0.4f}")

        train_rmse = evaluation(model, train_dataloader, device)
        val_rmse = evaluation(model, val_dataloader, device)
        print(f"K-fold: {fold:03d} Train Loss: {train_rmse:0.4f} Val Loss: {val_rmse:0.4f}")  
        validation_loss.append(val_rmse)

        if min_val_rmse > val_rmse:
            min_val_rmse = val_rmse
            print("Detected New Best Model Save ...")
            torch.save(model, save_path)
        
    validation_loss = np.array(validation_loss)
    mean = np.mean(validation_loss)
    std = np.std(validation_loss)
    print(f"Validation Score: {mean:04f} , {std:0.4f}")
    return validation_loss

In [34]:
val_cross_train(train_set, 15, device, './val_cross_best_1.h5')

Epoch 001 Loss: 25367.5020
Epoch 002 Loss: 8993.7207
Epoch 003 Loss: 6457.7368
Epoch 004 Loss: 5982.7700
Epoch 005 Loss: 5934.2158
Epoch 006 Loss: 5912.3232
Epoch 007 Loss: 5893.2559
Epoch 008 Loss: 5880.7881
Epoch 009 Loss: 5866.8579
Epoch 010 Loss: 5865.7598
Epoch 011 Loss: 5853.6094
Epoch 012 Loss: 5852.0737
Epoch 013 Loss: 5846.8159
Epoch 014 Loss: 5842.2212
Epoch 015 Loss: 5837.2285
K-fold: 000 Train Loss: 1178.3875 Val Loss: 1142.9750
Detected New Best Model Save ...
Epoch 001 Loss: 24587.2930
Epoch 002 Loss: 8482.4209
Epoch 003 Loss: 6014.3320
Epoch 004 Loss: 5730.3213
Epoch 005 Loss: 5707.9570
Epoch 006 Loss: 5687.3555
Epoch 007 Loss: 5679.6323
Epoch 008 Loss: 5667.1807
Epoch 009 Loss: 5657.5898
Epoch 010 Loss: 5654.1987
Epoch 011 Loss: 5649.8096
Epoch 012 Loss: 5648.3076
Epoch 013 Loss: 5643.9204
Epoch 014 Loss: 5647.3481
Epoch 015 Loss: 5643.2847
K-fold: 001 Train Loss: 1228.1510 Val Loss: 1290.7897
Epoch 001 Loss: 24606.9590
Epoch 002 Loss: 8559.1465
Epoch 003 Loss: 6224.579

In [97]:
final_preds = predict(test_dataloader, './val_cross_best_1.h5')

final_rmse = np.sqrt(mean_squared_error(y_test.reshape(-1), final_preds))
under_pred_rate = ((final_preds - y_test.reshape(-1)) < 0).sum() / y_test.shape[0]

print(f"TEST SET RMSE : {final_rmse:0.4f}, Under Predict Rate: {under_pred_rate:0.3f}")

TEST SET RMSE : 1167.2243, Under Predict Rate: 0.233
