In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from haversine import haversine
import re
import gc
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost

import matplotlib.pyplot as plt
import seaborn as sns

## data load

In [2]:
# Load the training extract (July only) and the test set, then report their shapes.
train, test = (
    pd.read_parquet(path)
    for path in ('./train07.parquet', './test.parquet')
)
print(train.shape, test.shape)

(274695, 23) (291241, 22)


## 데이터 전처리

### base_date
def _count_by_date_and_road(frame):
    """Return one row per (base_date, road_name) with the number of non-null ids.

    The result has the columns ``base_date``, ``road_name`` and
    ``base_date_cnt`` and is meant to be merged back onto ``frame``.
    """
    return (
        frame.groupby(['base_date', 'road_name'])['id']
        .count()
        .reset_index(name='base_date_cnt')
    )


train_base_date_df = _count_by_date_and_road(train)
test_base_date_df = _count_by_date_and_road(test)

# how='left' keeps every original row in its original order. An inner join
# silently drops rows whose key has no group (e.g. a missing road_name), which
# would misalign the test predictions with the submission file.
# validate='many_to_one' guards against accidental row duplication.
train = train.merge(train_base_date_df, how='left', on=['base_date', 'road_name'],
                    validate='many_to_one')
test = test.merge(test_base_date_df, how='left', on=['base_date', 'road_name'],
                  validate='many_to_one')

In [3]:
# 위경도로 거리 도출
%time
train["distance"] = train[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")
test["distance"] = test[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")

Wall time: 0 ns


In [4]:
# Label-encode the string columns. Each encoder is fitted on train only;
# labels that occur only in test are appended to the encoder's classes so
# that transforming test does not raise on unseen values.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']

for col in str_col:
    encoder = LabelEncoder().fit(train[col])
    train[col] = encoder.transform(train[col])

    unseen = [value for value in np.unique(test[col]) if value not in encoder.classes_]
    if unseen:
        encoder.classes_ = np.append(encoder.classes_, unseen)
    test[col] = encoder.transform(test[col])

In [5]:
# One-hot encode the categorical columns.
cols = ['road_type', 'start_turn_restricted', 'end_turn_restricted', 'lane_count',
        'road_rating', 'weight_restricted', 'day_of_week']

# Pin each column's categories to the levels seen in train before encoding.
# Encoding train and test independently can yield different dummy columns
# when their level sets differ, and drop_first could drop a different baseline
# level in each frame. Test values not seen in train become missing and
# therefore encode as all-zero dummies.
for col in cols:
    train[col] = train[col].astype('category')
    test[col] = test[col].astype(train[col].dtype)

train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns=cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns=cols)

In [6]:
# Remove the columns listed below from both frames before modelling.
del_cols = [
    'id', 'connect_code', 'height_restricted', 'vehicle_restricted',
    'start_node_name', 'end_node_name', 'road_name',
]
train, test = (frame.drop(columns=del_cols) for frame in (train, test))

In [None]:
# Rank every column by its absolute correlation with the target, strongest first.
target_corr = train.corr()[['target']].abs()
target_corr.sort_values(by='target', ascending=False)

In [None]:
#train[(train['base_date'] == 20220701) & (train['base_hour'] == 0)].sort_values('target').to_excel('temp/20220701_00.xlsx', encoding='utf-8-sig') 

## 모델링

In [7]:
# Run a garbage-collection pass before modelling; the cell displays the
# value returned by gc.collect().
gc.collect()

40

In [None]:
# Show the column index of each frame so train/test features can be compared.
for frame in (train, test):
    print(frame.columns)

In [8]:
from pycaret.regression import *

In [9]:
# Initialise the PyCaret regression experiment on the training frame with a
# fixed session id and feature normalisation enabled.
sup = setup(
    data=train,
    target='target',
    session_id=42,
    normalize=True,
)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Original Data,"(274695, 26)"
3,Missing Values,False
4,Numeric Features,24
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(192286, 23)"


In [10]:
# Model ids left out of the comparison.
excluded_models = [
    'knn', 'ada', 'lar', 'lr', 'ridge', 'br',
    'omp', 'lasso', 'en', 'huber', 'llar', 'par',
]
# Keep the three best remaining models, ranked by MAE with 5-fold CV.
best_3 = compare_models(
    exclude=excluded_models,
    fold=5,
    n_select=3,
    sort='MAE',
)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2.8772,17.0671,4.1311,0.9194,0.1414,0.1001,26.881
et,Extra Trees Regressor,3.0799,19.7534,4.4443,0.9067,0.1511,0.1066,30.454
catboost,CatBoost Regressor,3.2705,20.3857,4.5149,0.9037,0.1577,0.1171,23.392
dt,Decision Tree Regressor,3.5283,29.2468,5.4078,0.8619,0.1848,0.1206,1.497
lightgbm,Light Gradient Boosting Machine,3.7905,25.6964,5.0691,0.8787,0.1779,0.1377,0.875
gbr,Gradient Boosting Regressor,5.3963,47.8751,6.9191,0.774,0.2421,0.2022,10.87


In [11]:
# Blend the three selected models into one estimator, evaluated with 5 folds
# (the hold-out output below reports it as a Voting Regressor).
blended = blend_models(estimator_list = best_3, fold = 5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.9133,16.9906,4.122,0.9199,0.1434,0.1031
1,2.8943,16.5819,4.0721,0.9223,0.1402,0.1013
2,2.9244,17.253,4.1537,0.9184,0.1446,0.1037
3,2.9141,17.0622,4.1306,0.9183,0.1411,0.102
4,2.9284,17.1966,4.1469,0.9193,0.1443,0.104
Mean,2.9149,17.0169,4.1251,0.9197,0.1427,0.1028
SD,0.0118,0.2366,0.0288,0.0014,0.0018,0.001


In [12]:
# Score the blended model without passing new data; presumably this uses the
# hold-out split reserved by setup() -- confirm against the PyCaret docs.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,2.857,16.3946,4.049,0.9225,0.1397,0.1002


In [13]:
# Finalize the blended model for inference; per the PyCaret docs this refits
# on the full dataset including the hold-out -- TODO confirm for the installed version.
final_model = finalize_model(blended)

In [14]:
# Predict on the preprocessed test frame with the finalized model.
# NOTE(review): test must carry the same feature columns as train here.
predictions = predict_model(final_model, data = test)

In [15]:
# Load the submission template whose 'target' column is filled in below.
sample_submission = pd.read_csv('input/sample_submission.csv')

In [16]:
# Assign by position, not by index label: a Series assignment aligns on the
# index, so any index mismatch between `predictions` and the template would
# silently produce NaN or misplaced targets. The length check turns a dropped
# or duplicated test row into an immediate error.
assert len(predictions) == len(sample_submission), (
    "prediction count does not match the number of submission rows"
)
sample_submission['target'] = predictions['Label'].to_numpy()

In [17]:
# Write the submission file without the index column.
# NOTE(review): the 'output' directory must already exist.
sample_submission.to_csv("output/20221029-3.csv", index = False)