# 음식배달에 걸리는 시간 예측하기

Time2Vec 참고 자료 : https://towardsdatascience.com/time2vec-for-time-series-features-encoding-a03a4f3f937e

https://ojus1.github.io/posts/time2vec/

In [241]:
# 필요한 라이브러리 불러오기
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os

import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
%matplotlib inline

In [242]:
# Load the tab-separated raw delivery data, then report the total row count
# and, for every column that has missing values, how many are missing.
data = pd.read_csv('./delivery_raw.csv', sep='\t')
print('=============결측치 개수=============')
print(f'총 데이터 개수 : {len(data)}개')
missing_counts = data.isna().sum()
for column, n_missing in missing_counts[missing_counts > 0].items():
    print(f'{column} : {n_missing}')

총 데이터 개수 : 197428개
market_id : 987
actual_delivery_time : 7
store_primary_category : 4760
order_protocol : 995
total_onshift : 16262
total_busy : 16262
total_outstanding_orders : 16262
estimated_store_to_consumer_driving_duration : 526


## 결측치 처리
- market_id : 지역 번호. 결측치가 987개뿐이므로 삭제하거나 임의의 값으로 대체
- actual_delivery_time : 예측해야 하는 값(target)을 만드는 컬럼이므로 결측 행은 dropna로 삭제
- store_primary_category : 음식 종류. unique 값은 74개이며, 결측치 4,760개가 아깝긴 하지만 삭제 검토
- order_protocol : 주문 방법. unique 값은 7개이며, 결측치 995개는 삭제 검토
- total_onshift, total_busy, total_outstanding_orders : 결측치가 각각 16,262개로 매우 많아 삭제하기 아까움 -> 처리 방법 미정

In [243]:
# Remove missing values.
# Every row that contains a null in ANY column is dropped (not only rows whose
# delivery time is missing). Alternatives that were tried and left disabled:
# dropping only on a subset of columns such as ['actual_delivery_time',
# 'market_id', 'order_protocol', 'store_primary_category'], or filling
# market_id with a random id instead of dropping.
rows_before = len(data)
data.dropna(axis=0, how='any', inplace=True)
data_lenth = rows_before - len(data)      # number of rows removed
print(f'{data_lenth}개의 null data 삭제')

21651개의 null data 삭제


In [244]:
# Convert the timestamp columns to datetime and build the regression target
# (elapsed seconds between order creation and actual delivery).
data['created_at'] = pd.to_datetime(data['created_at'])
data['actual_delivery_time'] = pd.to_datetime(data['actual_delivery_time'])

# `.dt.seconds` is only the seconds *component* of a timedelta (0..86399): it
# silently drops whole days and wraps negative durations around, so bad rows
# could slip past the outlier filter. `total_seconds()` is the real duration.
elapsed = (data['actual_delivery_time'] - data['created_at']).dt.total_seconds()

# Keep only plausible deliveries: non-negative and shorter than 10000 seconds.
# `.copy()` detaches the result from the original frame so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
valid = (elapsed >= 0) & (elapsed < 10000)
data = data[valid].copy()
data['target'] = elapsed[valid].astype('int64')

- 주문 날짜는 2015년 1월 ~ 2월

## 범주형
1. 명목형 : market_id, store_id, store_primary_category, order_protocol , 요일
2. 순서형

## 수치형
1. 이산형 : total_items, subtotal, num_distinct_items, min_item_price, max_item_price, total_onshift, total_busy, total_outstanding_orders
2. 연속형 : created_at, estimated_store_to_consumer_driving_duration, estimated_order_place_duration , 주문 시간

In [245]:
# Print the DataFrame summary (index range, columns, non-null counts, dtypes).
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 175606 entries, 0 to 197427
Data columns (total 17 columns):
 #   Column                                        Non-Null Count   Dtype         
---  ------                                        --------------   -----         
 0   market_id                                     175606 non-null  float64       
 1   created_at                                    175606 non-null  datetime64[ns]
 2   actual_delivery_time                          175606 non-null  datetime64[ns]
 3   store_id                                      175606 non-null  int64         
 4   store_primary_category                        175606 non-null  object        
 5   order_protocol                                175606 non-null  float64       
 6   total_items                                   175606 non-null  int64         
 7   subtotal                                      175606 non-null  int64         
 8   num_distinct_items                            175606 n

## 데이터 전처리
- 날짜 , 범주형 데이터 , 수치형 데이터로 나눔
    - 범주형 데이터에 날짜도 포함시키기

    
날짜 -> 년, 월, 일, 시간으로 나누기

임베딩을 위해서는 범주화해야 함

In [246]:
# Derive time-of-day and weekday features from the order creation timestamp.
# The vectorised `.dt` accessors replace per-row Python lambdas, which are
# slow on a frame of this size.
created = data['created_at']
hours = created.dt.hour
minutes = created.dt.minute

data['hour'] = hours.astype('category')        # author's note: hours 9-13 do not occur in this data
data['minute'] = minutes.astype('category')    # author's note: all of 0-59 occur

# Minutes since midnight, with hours <= 10 pushed to the next day (+24h).
# Presumably this keeps after-midnight orders adjacent to late-evening ones
# on a single continuous axis -- confirm against the intended service hours.
shifted_hours = hours.where(hours > 10, hours + 24)
data['order_time'] = (60 * shifted_hours + minutes).astype('int64')

data['day_of_week'] = created.dt.day_name().astype('category')

In [247]:
# Numeric features: selected as one sub-frame and converted to a 2-D array
# with one column per feature, in the order listed below.
numeric_features = [
    'total_items',
    'subtotal',
    'num_distinct_items',
    'min_item_price',
    'max_item_price',
    'total_onshift',
    'total_busy',
    'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
    'order_time',
]

X_data_numeric = data[numeric_features].to_numpy()
print(f'수치형 데이터 : {X_data_numeric.shape}')

# Categorical features: every column is re-encoded as integer codes so it can
# be used as an index into an embedding table.
categorical_features = ['market_id', 'store_id', 'store_primary_category', 'order_protocol', 'day_of_week']

# Weekday name -> 0..6 (Monday first).
day_of_week_dic = dict(zip(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    range(7),
))
# Raw store id -> dense index, assigned in ascending order of the raw id.
store_id_dic = {store: idx for idx, store in enumerate(sorted(data['store_id'].unique()))}


def convert_to_int_day(day):
    """Return the integer code (Monday=0 ... Sunday=6) for a weekday name."""
    return day_of_week_dic[day]


def convert_to_int_category(category):
    """Return integer codes for `category`, numbered by order of first appearance."""
    codes, _uniques = pd.factorize(category)
    return codes


def convert_to_int_store_id(store_id):
    """Return the dense index assigned to a raw store id."""
    return store_id_dic[store_id]


data['market_id'] = data['market_id'].sub(1)                                                # shift ids down by one
data['store_id'] = data['store_id'].map(store_id_dic)                                       # dense store index
data['store_primary_category'] = convert_to_int_category(data['store_primary_category'])    # food category codes
data['order_protocol'] = data['order_protocol'].sub(1)                                      # shift ids down by one
data['day_of_week'] = data['day_of_week'].apply(convert_to_int_day)                         # weekday codes
X_data_categorical = np.stack([data[name] for name in categorical_features], axis=1)
print(f'범주형 데이터 : {X_data_categorical.shape}')


# Final feature matrix (categorical columns first, then numeric columns) and
# the target as a single-column 2-D array.
X_data = np.hstack([X_data_categorical, X_data_numeric])
y_data = data['target'].to_numpy(copy=True).reshape(-1, 1)
print(X_data.shape, y_data.shape)



# Embedding table sizes for the categorical columns, as a list of
# (num_embeddings, embedding_dim) pairs.
# num_embeddings is derived from the largest code (+1) rather than from the
# number of distinct values: the codes are used directly as embedding indices,
# so a gap in the codes (e.g. an id that never occurs after filtering) would
# otherwise produce an out-of-range index. For contiguous 0..n-1 codes both
# definitions give the same size.
categorical_features_size = [int(np.asarray(data[col]).max()) + 1 for col in categorical_features]
# Embedding width: half the cardinality (rounded up), capped at 100.
categorical_features_size = [(n, min(100, (n + 1) // 2)) for n in categorical_features_size]
print(categorical_features_size)

수치형 데이터 : (175606, 11)
범주형 데이터 : (175606, 5)
(175606, 16) (175606, 1)
[(6, 3), (5644, 100), (73, 37), (7, 4), (7, 4)]


In [248]:
from sklearn.model_selection import train_test_split

# Fixed seed so the train/validation/test partition (and therefore every
# reported metric) is reproducible between runs.
SPLIT_SEED = 42

# 80% train, then the 20% hold-out is halved into validation and test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X_data, y_data, test_size=0.2, shuffle=True, random_state=SPLIT_SEED)
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, test_size=0.5, shuffle=True, random_state=SPLIT_SEED)

# Everything is stored as float32; the model casts the categorical columns
# back to integer indices itself.
X_train, y_train = torch.tensor(X_train, dtype=torch.float32), torch.tensor(y_train, dtype=torch.float32)
X_val, y_val = torch.tensor(X_val, dtype=torch.float32), torch.tensor(y_val, dtype=torch.float32)
X_test, y_test = torch.tensor(X_test, dtype=torch.float32), torch.tensor(y_test, dtype=torch.float32)
print(X_train.shape,y_train.shape)

# Data loaders. Only the training loader is shuffled: evaluation does not
# depend on sample order, and a fixed order keeps its results deterministic.
batch_size = 64
train_loader = DataLoader(TensorDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
valid_loader = DataLoader(TensorDataset(X_val, y_val), batch_size=batch_size, shuffle=False)
test_loader = DataLoader(TensorDataset(X_test, y_test), batch_size=batch_size, shuffle=False)

torch.Size([140484, 16]) torch.Size([140484, 1])


In [266]:
# 모델 생성

class Model(nn.Module):
    """Feed-forward regressor over embedded categorical and normalised numeric features.

    The input is one 2-D tensor whose first ``categorical_size`` columns hold
    integer category codes (stored as floats) and whose remaining
    ``numeric_size`` columns hold numeric features.

    Args:
        embedding_size: One ``(num_embeddings, embedding_dim)`` pair per
            categorical column, in column order.
        layers: Widths of the hidden blocks (Linear -> ReLU -> BatchNorm ->
            Dropout). May be empty, in which case the concatenated features
            feed the final projection directly.
        layer_size: Width of the last linear layer before the output layer.
        p: Dropout probability, used on the concatenated input and after
            every hidden block.
        categorical_size: Number of leading categorical columns in the input.
        numeric_size: Number of numeric columns in the input.
    """

    def __init__(self, embedding_size, layers, layer_size=10, p=0.4, categorical_size=5, numeric_size=11):
        super().__init__()
        self.all_embeddings = nn.ModuleList([nn.Embedding(ni, nf) for ni, nf in embedding_size])
        self.dropout = nn.Dropout(p)
        self.categorical_size = categorical_size
        num_categorical_cols = sum(nf for _, nf in embedding_size)
        self.input_batch = nn.BatchNorm1d(numeric_size)

        all_layers = []
        input_size_t = num_categorical_cols + numeric_size
        for width in layers:
            all_layers.append(nn.Linear(input_size_t, width))
            all_layers.append(nn.ReLU(inplace=True))
            all_layers.append(nn.BatchNorm1d(width))
            all_layers.append(nn.Dropout(p))
            input_size_t = width
        # Use the running width instead of layers[-1] so that an empty
        # `layers` list builds a valid network instead of raising IndexError.
        all_layers.append(nn.Linear(input_size_t, layer_size))
        self.layers = nn.Sequential(*all_layers)
        self.outlayer = nn.Linear(layer_size, 1)

    def forward(self, x):
        """Return a ``(batch, 1)`` prediction for the ``(batch, features)`` input ``x``."""
        # Column i of the input indexes embedding table i.
        embeddings = [emb(x[:, i].long()) for i, emb in enumerate(self.all_embeddings)]
        x1 = torch.cat(embeddings, 1)
        # The columns after the categorical ones are batch-normalised.
        x2 = self.input_batch(x[:, self.categorical_size:])
        x = torch.cat((x1, x2), 1)
        x = self.dropout(x)
        x = self.layers(x)
        x = self.outlayer(x)
        return x

In [267]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [268]:
# Build the network with hidden widths 200 -> 100 -> 50 and dropout 0.2, move
# it to the selected device, and optimise all of its parameters with Adam.
hidden_widths = [200, 100, 50]
model = Model(categorical_features_size, hidden_widths, p=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
class RMSELoss(nn.Module):
    """Root-mean-squared error: the square root of the mean squared error."""

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return the scalar RMSE between ``y_pred`` and ``y_true``."""
        return nn.functional.mse_loss(y_pred, y_true).sqrt()


RMSEloss = RMSELoss()

class MyLoss(nn.Module):
    """Asymmetric squared-error loss that penalises under-prediction twice as much.

    Each element's squared error is weighted by 2 where the prediction is
    below the true value and by 1 otherwise. The result is the square root of
    the SUM of the weighted errors (not their mean), so its magnitude grows
    with the number of elements in the batch.
    """

    def __init__(self):
        super().__init__()

    def forward(self, y_pred, y_true):
        """Return the scalar weighted loss for ``y_pred`` against ``y_true``."""
        # Element-wise squared error, no reduction.
        squared_error = nn.functional.mse_loss(y_pred, y_true, reduction='none')
        # 2.0 where the model under-predicts, 1.0 elsewhere.
        weights = (y_pred < y_true).float() + 1.0
        return (weights * squared_error).sum().sqrt()


myloss = MyLoss()

In [269]:
# Training loop: optimise the asymmetric loss (`myloss`), and track per epoch
# the batch-size-weighted mean of the per-batch RMSE together with the share
# of under-predictions (output < target), for both training and validation.
# NOTE: the tracked value is an average of per-batch RMSEs, which is not
# exactly the RMSE over the whole dataset.
Epoch = 40
checkpoint_dir = './model_data/'
os.makedirs(checkpoint_dir, exist_ok=True)   # torch.save does not create missing directories

train_loss , valid_loss = [] , []
train_uprate , valid_uprate = [] , []
for epoch in range(1,Epoch+1):
    total_loss , total_count = 0.0 , 0
    model.train()
    # The batch is named `features` so the DataFrame `data` is not overwritten.
    for features, y in train_loader:
        features , y = features.to(device) , y.to(device)
        output = model(features)
        rmseloss = RMSEloss(output, y)
        uploss = myloss(output,y)
        optimizer.zero_grad()
        uploss.backward()
        optimizer.step()
        # .item() yields plain Python numbers; accumulating the tensor itself
        # would keep every batch's autograd graph alive for the whole epoch.
        total_loss += len(features)*rmseloss.item()
        total_count += (output < y).sum().item()
    epoch_loss = total_loss/len(train_loader.dataset)
    train_uprate.append(total_count/len(train_loader.dataset))
    train_loss.append(epoch_loss)
    print(f'Epoch{epoch} train_loss:{epoch_loss} {100*train_uprate[epoch-1]:.0f}%',end='  ')

    total_loss , total_count = 0.0 , 0
    model.eval()
    with torch.no_grad():
        for features, y in valid_loader:
            features , y = features.to(device) , y.to(device)
            output = model(features)
            rmseloss = RMSEloss(output, y)
            total_loss += len(features)*rmseloss.item()
            total_count += (output < y).sum().item()
    epoch_loss = total_loss/len(valid_loader.dataset)
    valid_loss.append(epoch_loss)
    valid_uprate.append(total_count/len(valid_loader.dataset))
    print(f'valid_loss:{epoch_loss} {100*valid_uprate[epoch-1]:.0f}%')
    # One checkpoint per epoch, named by the zero-padded epoch number.
    torch.save(model.state_dict(), os.path.join(checkpoint_dir, '{0:0=2d}.pth'.format(epoch)))


Epoch1 train_loss:1617.4173583984375 61%  valid_loss:940.6710205078125 32%
Epoch2 train_loss:998.389892578125 34%  valid_loss:951.8718872070312 29%
Epoch3 train_loss:982.769287109375 34%  valid_loss:923.7914428710938 32%
Epoch4 train_loss:973.6511840820312 34%  valid_loss:925.972412109375 31%
Epoch5 train_loss:963.0562744140625 34%  valid_loss:918.4759521484375 33%
Epoch6 train_loss:955.578857421875 34%  valid_loss:927.7946166992188 30%
Epoch7 train_loss:950.878662109375 34%  valid_loss:925.0840454101562 30%
Epoch8 train_loss:945.3604125976562 34%  valid_loss:930.6998901367188 31%
Epoch9 train_loss:941.4080810546875 34%  valid_loss:899.0188598632812 33%
Epoch10 train_loss:935.7836303710938 34%  valid_loss:911.0087890625 31%
Epoch11 train_loss:935.882568359375 34%  valid_loss:915.2188720703125 31%
Epoch12 train_loss:930.0257568359375 34%  valid_loss:910.0231323242188 32%
Epoch13 train_loss:925.6402587890625 34%  valid_loss:903.5979614257812 32%
Epoch14 train_loss:923.5065307617188 34%  

In [270]:
# Test-set evaluation: batch-size-weighted mean of the per-batch RMSE and the
# share of under-predictions (output < target).
total_loss , total_count = 0.0 , 0
model.eval()
with torch.no_grad():
    # The batch is named `features` so the DataFrame `data` is not overwritten.
    for features, y in test_loader:
        features , y = features.to(device) , y.to(device)
        output = model(features)
        rmseloss = RMSEloss(output, y)
        total_loss += len(features)*rmseloss.item()
        total_count += (output < y).sum().item()
# Normalise by the size of the TEST set; the validation set's size was used
# here before, which is only right when both sets happen to be equally large.
total_loss = total_loss/len(test_loader.dataset)
total_count = total_count / len(test_loader.dataset)
print(f'test_loss:{total_loss} {100*total_count:.0f}%')

test_loss:887.2420654296875 38%


결론 : 테스트 데이터에서 RMSE는 약 887.2초

Under-prediction(예측값 < 실제값)의 비율 : 38%