In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from haversine import haversine
import re
import gc
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost

import matplotlib.pyplot as plt
import seaborn as sns

## data load

In [2]:
# Load the training subset (July only) and the test set, then report both shapes.
train = pd.read_parquet('./train07.parquet')
test = pd.read_parquet('./test.parquet')
print(*(frame.shape for frame in (train, test)))

(274695, 23) (291241, 22)


## 데이터 전처리

### base_date
# Per-(base_date, road_name) row count, attached to every row as `base_date_cnt`.
key_cols = ['base_date', 'road_name']

# Count only the `id` column instead of counting every column and discarding the rest.
train_base_date_df = train.groupby(key_cols)['id'].count().reset_index(name='base_date_cnt')
test_base_date_df = test.groupby(key_cols)['id'].count().reset_index(name='base_date_cnt')

# Left join keeps every original row in its original order. An inner join would
# silently drop rows whose key has no group (e.g. a missing road_name), which
# would desynchronise `test` from the submission file.
train = train.merge(train_base_date_df, how='left', on=key_cols)
test = test.merge(test_base_date_df, how='left', on=key_cols)

In [3]:
# 위경도로 거리 도출
%time
train["distance"] = train[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")
test["distance"] = test[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")

Wall time: 0 ns


In [4]:
# Label-encode the listed columns: fit on train, then reuse the same mapping for test.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']

for col in str_col:
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])

    # Labels that occur only in test are appended to the fitted classes so that
    # transform() does not fail on values it never saw in train.
    for label in np.unique(test[col]):
        if label in encoder.classes_:
            continue
        encoder.classes_ = np.append(encoder.classes_, label)
    test[col] = encoder.transform(test[col])

In [5]:
# One-hot encode the categorical feature columns.
cols = ['road_type', 'start_turn_restricted', 'end_turn_restricted', 'lane_count', 'road_rating','weight_restricted', 'day_of_week'
       ]

# Pin every column's category set to the levels observed in train. Calling
# get_dummies independently on train and test otherwise yields different dummy
# columns (and, with drop_first=True, a different dropped baseline) whenever the
# two frames do not contain exactly the same values.
# Test values never seen in train become NaN here and therefore encode as all zeros.
for col in cols:
    levels = np.sort(train[col].dropna().unique())
    train[col] = pd.Categorical(train[col], categories=levels)
    test[col] = pd.Categorical(test[col], categories=levels)

train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns=cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns=cols)

In [6]:
# Columns removed from both frames before modeling.
del_cols = [
    'id', 'connect_code', 'height_restricted', 'vehicle_restricted',
    'start_node_name', 'end_node_name', 'road_name',
]
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

In [7]:
# Rank every column by the absolute value of its correlation with `target`.
target_corr = train.corr()[['target']]
target_corr.abs().sort_values(by='target', ascending=False)

Unnamed: 0,target
target,1.0
road_rating_107,0.388484
road_rating_106,0.301745
weight_restricted_43200.0,0.29009
road_type_3,0.260593
maximum_speed_limit,0.253601
base_hour,0.208142
distance,0.191126
start_longitude,0.166391
end_longitude,0.163042


In [8]:
#train[(train['base_date'] == 20220701) & (train['base_hour'] == 0)].sort_values('target').to_excel('temp/20220701_00.xlsx', encoding='utf-8-sig') 

## 모델링

In [9]:
# Run a garbage-collection pass before starting the modeling section;
# the cell output is the value returned by gc.collect().
gc.collect()

40

In [10]:
# Print the column index of train, then test, to compare the two feature sets.
for frame in (train, test):
    print(frame.columns)

Index(['base_date', 'base_hour', 'multi_linked', 'maximum_speed_limit',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'target', 'distance', 'road_type_3', 'start_turn_restricted_1',
       'end_turn_restricted_1', 'lane_count_2', 'lane_count_3',
       'road_rating_106', 'road_rating_107', 'weight_restricted_32400.0',
       'weight_restricted_43200.0', 'weight_restricted_50000.0',
       'day_of_week_1', 'day_of_week_2', 'day_of_week_3', 'day_of_week_4',
       'day_of_week_5', 'day_of_week_6'],
      dtype='object')
Index(['base_date', 'base_hour', 'multi_linked', 'maximum_speed_limit',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'distance', 'road_type_3', 'start_turn_restricted_1',
       'end_turn_restricted_1', 'lane_count_2', 'lane_count_3',
       'road_rating_106', 'road_rating_107', 'weight_restricted_32400.0',
       'weight_restricted_43200.0', 'weight_restricted_50000.0',
       'day_of_week_1', '

In [11]:
from pycaret.regression import *

In [12]:
# Initialise the PyCaret regression experiment on the prepared training frame.
# session_id fixes the random seed so the run can be repeated.
sup = setup(
    data=train,
    target='target',
    session_id=42,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Original Data,"(274695, 26)"
3,Missing Values,False
4,Numeric Features,24
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(192286, 23)"


In [13]:
# Model ids left out of the comparison.
excluded_models = ['knn','ada','lar','lr','ridge','br','omp','lasso','en','huber','llar','par']

# Compare the remaining candidates, rank them by MAE and keep the top three.
best_3 = compare_models(sort='MAE', n_select=3, exclude=excluded_models)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,3.0308,18.6558,4.3191,0.9119,0.1476,0.1054,29.547
et,Extra Trees Regressor,3.1775,20.7888,4.5593,0.9018,0.1554,0.1101,35.755
catboost,CatBoost Regressor,3.3061,20.7086,4.5506,0.9022,0.1592,0.1186,23.389
lightgbm,Light Gradient Boosting Machine,3.8857,26.7656,5.1735,0.8736,0.1824,0.1418,0.873
dt,Decision Tree Regressor,3.9038,32.4632,5.6975,0.8467,0.1952,0.133,0.503
gbr,Gradient Boosting Regressor,5.449,48.7129,6.9794,0.77,0.2438,0.204,10.822
knn,K Neighbors Regressor,6.4614,72.3977,8.5085,0.6582,0.2772,0.2303,13.78
ada,AdaBoost Regressor,8.1975,98.2344,9.9109,0.5362,0.3163,0.2917,8.978
lar,Least Angle Regression,8.8466,123.4343,11.1098,0.4172,0.3418,0.3132,0.05
lr,Linear Regression,8.847,123.4327,11.1098,0.4172,0.3419,0.3132,0.998


In [15]:
# Combine the three selected models into one blended estimator, scored with 5 folds.
blended = blend_models(estimator_list = best_3, fold = 5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9873,17.7413,4.212,0.9163,0.1463,0.1057
1,2.9694,17.3732,4.1681,0.9186,0.1435,0.104
2,2.999,18.0255,4.2456,0.9148,0.1479,0.1066
3,2.9917,17.8818,4.2287,0.9144,0.1443,0.1047
4,3.0069,18.022,4.2452,0.9155,0.1478,0.1069
Mean,2.9909,17.8088,4.2199,0.9159,0.146,0.1056
SD,0.0126,0.2417,0.0287,0.0015,0.0018,0.0011


In [16]:
# Score the blended model without passing new data.
# NOTE(review): per the PyCaret docs this evaluates on the hold-out split created by setup() - confirm.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,2.9545,17.4395,4.1761,0.9175,0.1441,0.1037


In [17]:
# Produce the final estimator used for the test predictions.
# NOTE(review): per the PyCaret docs finalize_model refits on the full dataset, hold-out included - confirm.
final_model = finalize_model(blended)

In [18]:
# Predict on the preprocessed test frame; the 'Label' column of the result is read later for the submission.
predictions = predict_model(final_model, data = test)

In [19]:
# Load the submission template whose 'target' column is filled in with the predictions.
sample_submission = pd.read_csv('input/sample_submission.csv')

In [23]:
# Copy the predictions into the submission positionally. Assigning the Series
# directly aligns on index labels, so any index mismatch between `predictions`
# and `sample_submission` would silently write NaN instead of failing.
if len(predictions) != len(sample_submission):
    raise ValueError(
        f"prediction count ({len(predictions)}) does not match "
        f"submission rows ({len(sample_submission)})"
    )
sample_submission['target'] = predictions['Label'].to_numpy()

In [24]:
#sample_submission.to_csv("output/20221028-1.csv", index = False)