In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from haversine import haversine
import re
import gc
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost

import matplotlib.pyplot as plt
import seaborn as sns

## data load

In [2]:
# Read the training and test tables from parquet files and report their shapes.
train = pd.read_parquet('./train07.parquet')  # July only
test = pd.read_parquet('./test.parquet')
print(train.shape, test.shape)

(274695, 23) (291241, 22)


## 데이터 전처리

### base_date
# Count the rows for every (base_date, road_name) pair within each split and attach that
# count to each row as `base_date_cnt`.
# NOTE(review): neither the `train` preview nor the column list used in the forecasting loop
# further down contains `base_date_cnt`, so this snippet was presumably not executed for the
# recorded run — confirm before relying on the feature.
train_base_date_df = train.groupby(['base_date','road_name']).count()[['id']].reset_index()
test_base_date_df = test.groupby(['base_date','road_name']).count()[['id']].reset_index()

# Rename the counted `id` column to the feature name.
train_base_date_df.columns = ['base_date','road_name','base_date_cnt']
test_base_date_df.columns = ['base_date','road_name','base_date_cnt']

# Join the per-pair counts back onto the row-level frames.
train = train.merge(train_base_date_df, how='inner', on=['base_date','road_name'])
test = test.merge(test_base_date_df, how='inner', on=['base_date','road_name'])

In [3]:
# 위경도로 거리 도출
%time
train["distance"] = train[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")
test["distance"] = test[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")

Wall time: 995 µs


In [4]:
# Label-encode the string-valued columns: fit on train, then reuse the encoder for test.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']

for col_name in str_col:
    encoder = LabelEncoder().fit(train[col_name])
    train[col_name] = encoder.transform(train[col_name])

    # Labels that only occur in test are appended to the encoder's class list so that
    # transform() does not reject them.
    for candidate in np.unique(test[col_name]):
        if candidate in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, candidate)

    test[col_name] = encoder.transform(test[col_name])

In [5]:
# One-hot encode the categorical columns.
cols = ['road_type', 'start_turn_restricted', 'end_turn_restricted', 'lane_count',
        'road_rating', 'weight_restricted', 'day_of_week']

# Pin every column's category list to the values observed in `train` before encoding.
# get_dummies() is applied to train and test separately; without a shared category list a
# value missing from one split would give the two frames different dummy columns, and
# drop_first=True could drop a different baseline level in each. With the categories fixed,
# both frames get the same columns in the same order, and a test value never seen in train
# is encoded as all zeros. The train output is unchanged by this.
for col in cols:
    train[col] = train[col].astype('category')
    test[col] = pd.Categorical(test[col], categories=train[col].cat.categories)

train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns=cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns=cols)

In [6]:
# Remove the columns that are excluded from the model inputs (same list for both frames).
del_cols = [
    'id', 'connect_code', 'height_restricted', 'vehicle_restricted',
    'start_node_name', 'end_node_name', 'road_name',
]
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

In [None]:
# Rank the columns by the absolute value of their correlation with `target`.
(
    train.corr()
    .loc[:, ['target']]
    .abs()
    .sort_values(by='target', ascending=False)
)

In [None]:
#train[(train['base_date'] == 20220701) & (train['base_hour'] == 0)].sort_values('target').to_excel('temp/20220701_00.xlsx', encoding='utf-8-sig') 

## 모델링

In [7]:
# Run a garbage-collection pass before the modelling cells.
gc.collect()

40

In [None]:
# Print the column index of each frame so the train and test feature sets can be compared.
for frame in (train, test):
    print(frame.columns)

In [8]:
from pycaret.regression import *

In [None]:
# Set up the PyCaret regression experiment on `train`, using the `target` column as the label.
# session_id=42 is passed as a fixed seed — presumably for reproducibility; confirm in the PyCaret docs.
sup = setup(train, target = 'target', session_id=42)

In [None]:
# Compare the candidate regressors with 3 folds, sorted by MAE, and keep the top three.
# The estimator IDs listed in `exclude` are left out of the comparison.
best_3 = compare_models(sort = 'MAE', n_select=3, fold = 3, exclude=['knn','ada','lar','lr','ridge','br','omp','lasso','en','huber','llar','par'])

In [None]:
# Display the models selected by compare_models.
best_3

In [9]:
from sklearn.ensemble import RandomForestRegressor

In [11]:
# Stack the train and test rows into a single frame.
# NOTE(review): `X_train` is not referenced again in the visible part of the notebook — confirm it is still needed.
X_train = pd.concat([train, test])

In [24]:
# Display the prepared training frame.
train

Unnamed: 0,base_date,base_hour,multi_linked,maximum_speed_limit,start_latitude,start_longitude,end_latitude,end_longitude,target,distance,road_type_3,start_turn_restricted_1,end_turn_restricted_1,lane_count_2,lane_count_3,road_rating_106,road_rating_107,weight_restricted_32400.0,weight_restricted_43200.0,weight_restricted_50000.0,day_of_week_1,day_of_week_2,day_of_week_3,day_of_week_4,day_of_week_5,day_of_week_6
1,20220728,21,0,60.0,33.500730,126.529107,33.504811,126.526240,30.0,0.525891,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0
11,20220724,2,0,50.0,33.248505,126.569797,33.248633,126.567766,40.0,0.189391,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0
16,20220701,22,0,50.0,33.485885,126.489979,33.485975,126.486409,35.0,0.331239,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0
20,20220701,21,0,70.0,33.500103,126.512851,33.500132,126.512046,21.0,0.074720,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
37,20220702,19,0,50.0,33.485704,126.496451,33.483589,126.496368,24.0,0.235388,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4701149,20220711,19,0,50.0,33.251092,126.435439,33.251095,126.435138,41.0,0.028025,1,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0
4701153,20220720,23,0,60.0,33.251426,126.509066,33.251045,126.510574,28.0,0.146448,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0
4701164,20220708,9,0,70.0,33.471061,126.545467,33.469352,126.547314,27.0,0.255851,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
4701178,20220731,15,0,60.0,33.290116,126.489006,33.289861,126.487409,52.0,0.151111,1,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0


In [13]:
from datetime import datetime, timedelta

In [16]:
from tqdm import tqdm

In [28]:
# Rolling day-by-day forecast. For each of 31 steps the model is fitted on a sliding window of
# `train` rows, predicts the test rows of the following day, and those predictions are written
# into `target` and appended to the window. Later steps are therefore trained partly on
# earlier predictions, and `train` is reassigned on every iteration.
start_date = pd.to_datetime('20220630')
end_date = pd.to_datetime('20220730')
pred_date = pd.to_datetime('20220731')

for d in tqdm(range(1, 32)):

    # `base_date` is compared against integers of the form YYYYMMDD, so the shifted
    # timestamps are formatted and converted to int.
    p_start_date = int((start_date + timedelta(days=d)).strftime('%Y%m%d'))
    p_end_date = int((end_date + timedelta(days=d)).strftime('%Y%m%d'))
    p_pred_date = int((pred_date + timedelta(days=d)).strftime('%Y%m%d'))
    print(d, p_start_date, p_end_date, p_pred_date)

    train_temp = train[(train['base_date'] >= p_start_date) & (train['base_date'] <= p_end_date)]
    test_temp = test[test['base_date'] == p_pred_date]

    if test_temp.empty:
        # No rows to predict for this day: predict() would raise on an empty frame, so only
        # slide the training window forward and move on.
        train = train_temp
        continue

    # Only the non-default arguments are passed. The previous call spelled out every default,
    # including `criterion='mse'`, `max_features='auto'` and `min_impurity_split=None`, which
    # current scikit-learn releases reject; relying on the defaults keeps the same model
    # configuration while remaining valid across versions.
    model = RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42)
    model.fit(train_temp.drop('target', axis=1), train_temp['target'])

    # Attach the predictions as `target` while keeping test's own columns and row labels.
    # The row labels are what the submission step aligns on, and no hard-coded column list
    # has to be kept in sync with the feature engineering above.
    pred = test_temp.assign(target=model.predict(test_temp))

    train = pd.concat([train_temp, pred])

  0%|                                                                                           | 0/31 [00:00<?, ?it/s]

1 20220701 20220731 20220801


  3%|██▋                                                                                | 1/31 [00:49<24:56, 49.89s/it]

2 20220702 20220801 20220802


  6%|█████▎                                                                             | 2/31 [01:40<24:13, 50.12s/it]

3 20220703 20220802 20220803


 10%|████████                                                                           | 3/31 [02:30<23:26, 50.22s/it]

4 20220704 20220803 20220804


 13%|██████████▋                                                                        | 4/31 [03:21<22:36, 50.24s/it]

5 20220705 20220804 20220805


 16%|█████████████▍                                                                     | 5/31 [04:11<21:44, 50.17s/it]

6 20220706 20220805 20220806


 19%|████████████████                                                                   | 6/31 [05:01<20:57, 50.30s/it]

7 20220707 20220806 20220807


 23%|██████████████████▋                                                                | 7/31 [05:54<20:24, 51.04s/it]

8 20220708 20220807 20220808


 26%|█████████████████████▍                                                             | 8/31 [06:44<19:28, 50.80s/it]

9 20220709 20220808 20220809


 29%|████████████████████████                                                           | 9/31 [07:35<18:33, 50.61s/it]

10 20220710 20220809 20220810


 32%|██████████████████████████▍                                                       | 10/31 [08:25<17:38, 50.42s/it]

11 20220711 20220810 20220811


 35%|█████████████████████████████                                                     | 11/31 [09:15<16:45, 50.29s/it]

12 20220712 20220811 20220812


 39%|███████████████████████████████▋                                                  | 12/31 [10:04<15:52, 50.14s/it]

13 20220713 20220812 20220813


 42%|██████████████████████████████████▍                                               | 13/31 [10:55<15:04, 50.25s/it]

14 20220714 20220813 20220814


 45%|█████████████████████████████████████                                             | 14/31 [11:45<14:14, 50.28s/it]

15 20220715 20220814 20220815


 48%|███████████████████████████████████████▋                                          | 15/31 [12:36<13:25, 50.32s/it]

16 20220716 20220815 20220816


 52%|██████████████████████████████████████████▎                                       | 16/31 [13:26<12:33, 50.24s/it]

17 20220717 20220816 20220817


 55%|████████████████████████████████████████████▉                                     | 17/31 [14:16<11:44, 50.31s/it]

18 20220718 20220817 20220818


 58%|███████████████████████████████████████████████▌                                  | 18/31 [15:07<10:55, 50.40s/it]

19 20220719 20220818 20220819


 61%|██████████████████████████████████████████████████▎                               | 19/31 [15:58<10:06, 50.53s/it]

20 20220720 20220819 20220820


 65%|████████████████████████████████████████████████████▉                             | 20/31 [16:49<09:17, 50.69s/it]

21 20220721 20220820 20220821


 68%|███████████████████████████████████████████████████████▌                          | 21/31 [17:39<08:26, 50.66s/it]

22 20220722 20220821 20220822


 71%|██████████████████████████████████████████████████████████▏                       | 22/31 [18:30<07:37, 50.84s/it]

23 20220723 20220822 20220823


 74%|████████████████████████████████████████████████████████████▊                     | 23/31 [19:22<06:48, 51.11s/it]

24 20220724 20220823 20220824


 77%|███████████████████████████████████████████████████████████████▍                  | 24/31 [20:13<05:57, 51.09s/it]

25 20220725 20220824 20220825


 81%|██████████████████████████████████████████████████████████████████▏               | 25/31 [21:04<05:05, 50.95s/it]

26 20220726 20220825 20220826


 84%|████████████████████████████████████████████████████████████████████▊             | 26/31 [21:55<04:15, 51.01s/it]

27 20220727 20220826 20220827


 87%|███████████████████████████████████████████████████████████████████████▍          | 27/31 [22:46<03:24, 51.09s/it]

28 20220728 20220827 20220828


 90%|██████████████████████████████████████████████████████████████████████████        | 28/31 [23:37<02:33, 51.03s/it]

29 20220729 20220828 20220829


 94%|████████████████████████████████████████████████████████████████████████████▋     | 29/31 [24:29<01:42, 51.23s/it]

30 20220730 20220829 20220830


 97%|███████████████████████████████████████████████████████████████████████████████▎  | 30/31 [25:22<00:51, 51.79s/it]

31 20220731 20220830 20220831


100%|██████████████████████████████████████████████████████████████████████████████████| 31/31 [26:16<00:00, 50.87s/it]


In [None]:
# Blend the models held in `best_3`, using 5 folds.
blended = blend_models(estimator_list = best_3, fold = 5)

In [None]:
# Run the blended model through predict_model without passing data — presumably this scores
# the hold-out split created by setup(); confirm in the PyCaret docs.
pred_holdout = predict_model(blended)

In [None]:
# Finalize the blended model — presumably a refit on the full dataset; confirm in the PyCaret docs.
final_model = finalize_model(blended)

In [None]:
# Apply the finalized model to the `test` frame.
predictions = predict_model(final_model, data = test)

In [29]:
# Load the sample submission file.
sample_submission = pd.read_csv('input/sample_submission.csv')

In [41]:
# Pair each submission id with the `target` value stored under the same row label in `train`,
# restricted to rows dated 2022-08-01 or later.
predicted_target = train.loc[train.base_date >= 20220801, ['target']]
sample_submission = pd.concat([sample_submission[['id']], predicted_target], axis=1)

# concat(axis=1) aligns on index labels with an outer join, so a label present on only one
# side would silently produce a NaN id or a NaN target in the output file. Fail loudly instead.
if sample_submission.isna().any().any():
    raise ValueError(
        "Submission ids and predicted targets are misaligned: "
        "NaN values found after joining on the row index."
    )

In [42]:
# Write the submission to CSV without the index column.
sample_submission.to_csv("output/20221030-1.csv", index = False)