In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from haversine import haversine
import re
import gc
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost

import matplotlib.pyplot as plt
import seaborn as sns

## data load

In [2]:
# Load the training data (this file holds July only) and the test set.
train = pd.read_parquet('./train07.parquet')
test = pd.read_parquet('./test.parquet')
print(f"{train.shape} {test.shape}")

(274695, 23) (291241, 22)


## 데이터 전처리

### base_date
def _count_by_date_and_road(df):
    """Return one row per (base_date, road_name) with the record count in `base_date_cnt`.

    The count is the number of non-null `id` values in each group.
    """
    counts = df.groupby(['base_date', 'road_name'])['id'].count().reset_index()
    counts.columns = ['base_date', 'road_name', 'base_date_cnt']
    return counts


train_base_date_df = _count_by_date_and_road(train)
test_base_date_df = _count_by_date_and_road(test)

# Left join so that no row is ever dropped: groupby omits null keys, and an
# inner join would silently remove those rows (fatal for `test`, whose row
# order must keep matching the submission file). `validate` asserts the
# lookup table really has one row per key.
train = train.merge(train_base_date_df, how='left', on=['base_date', 'road_name'], validate='many_to_one')
test = test.merge(test_base_date_df, how='left', on=['base_date', 'road_name'], validate='many_to_one')

In [3]:
# 위경도로 거리 도출
%time
train["distance"] = train[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")
test["distance"] = test[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")

Wall time: 0 ns


In [4]:
import math
from decimal import Decimal

def truncate(number, digits) -> float:
    """Truncate `number` toward zero to at most `digits` decimal places.

    The value is returned unchanged when its printed form already has no more
    than `digits` decimals. This avoids floating-point artefacts such as
    truncate(16.4, 2) == 16.39 or truncate(-1.13, 2) == -1.12.

    Parameters:
        number: int or float to truncate. NaN and +/-inf are returned as-is.
        digits: number of decimal places to keep.

    Returns:
        The truncated value (or `number` itself when nothing needs cutting).
    """
    # NaN/inf have no decimal representation to inspect.
    if not math.isfinite(number):
        return number
    # Count decimals from the printed form via Decimal so that integers ("5")
    # and scientific notation ("1e-05", "1.5e-07") are handled instead of
    # raising IndexError or being miscounted by a naive split on '.'.
    exponent = Decimal(str(number)).as_tuple().exponent
    nbDecimals = max(0, -exponent)
    if nbDecimals <= digits:
        return number
    stepper = 10.0 ** digits
    return math.trunc(stepper * number) / stepper

In [10]:
num = 1  # decimal places kept by truncate(); values 0 to 2 were tried
cols = ['start_node_name']
# Other grouping columns noted as candidates: 'start_latitude', 'start_longitude',
# 'end_latitude', 'end_longitude' (truncated to `num` decimals first),
# 'end_node_name', 'road_name'.


def _dist_cnt_table(df, group_cols, digits):
    """Build the per-group `dist_cnt` lookup table.

    Groups `df` by base_date, base_hour and `group_cols`; for each group takes
    the count of `id` divided by the maximum `lane_count`, truncates it to
    `digits` decimals and negates it.

    Returns a DataFrame with the grouping keys and a `dist_cnt` column.
    """
    keys = ['base_date', 'base_hour'] + group_cols
    table = df.groupby(keys).agg({'id': 'count', 'lane_count': 'max'})
    per_lane = table['id'] / table['lane_count']
    table['dist_cnt'] = [truncate(value, digits) * -1 for value in per_lane]
    return table.drop(['id', 'lane_count'], axis=1).reset_index()


train_dist = _dist_cnt_table(train, cols, num)
test_dist = _dist_cnt_table(test, cols, num)

# Left join so rows with a null grouping key are kept (with a missing
# dist_cnt) instead of being silently dropped by the default inner join;
# `test` must keep every row, in order, to line up with the submission file.
merge_keys = ['base_date', 'base_hour'] + cols
train = train.merge(train_dist, how='left', on=merge_keys, validate='many_to_one')
test = test.merge(test_dist, how='left', on=merge_keys, validate='many_to_one')

In [11]:
# Label-encode the columns listed in `str_col`, fitting each encoder on train.
str_col = ['day_of_week', 'start_turn_restricted', 'end_turn_restricted']
for col_name in str_col:
    le = LabelEncoder()
    train[col_name] = le.fit_transform(train[col_name])

    # Labels that occur only in test are appended to the fitted classes so
    # that transform() does not fail on them.
    unseen_labels = [label for label in np.unique(test[col_name]) if label not in le.classes_]
    for label in unseen_labels:
        le.classes_ = np.append(le.classes_, label)
    test[col_name] = le.transform(test[col_name])

In [12]:
# One-hot encode the categorical columns.
cols = ['road_type', 'start_turn_restricted', 'end_turn_restricted', 'lane_count', 'road_rating','weight_restricted', 'day_of_week'
       ]

# Pin each column's category set to the values seen in train before encoding.
# get_dummies is run on train and test separately, so without a shared
# category set the two frames can end up with different dummy columns, or
# with a different category dropped by drop_first. Values that appear only in
# test become missing and encode as all zeros.
for c in cols:
    shared_dtype = pd.CategoricalDtype(np.sort(train[c].dropna().unique()))
    train[c] = train[c].astype(shared_dtype)
    test[c] = test[c].astype(shared_dtype)

train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns = cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns = cols)

In [13]:
# Remove the columns that are not passed on to modelling.
del_cols = [
    'id', 'connect_code', 'height_restricted', 'vehicle_restricted',
    'start_node_name', 'end_node_name', 'road_name',
]
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

## 모델링

In [14]:
# Run a garbage-collection pass before modelling; the cell output is the
# value returned by gc.collect().
gc.collect()

104

In [15]:
# Import only the PyCaret regression helpers this notebook calls, instead of
# a star import that hides where names come from.
from pycaret.regression import (
    blend_models,
    compare_models,
    finalize_model,
    predict_model,
    setup,
)

In [16]:
# Initialise the PyCaret experiment on the prepared training frame with a
# fixed session id.
sup = setup(data=train, target='target', session_id=42)

Unnamed: 0,Description,Value
0,session_id,42
1,Target,target
2,Original Data,"(274695, 27)"
3,Missing Values,False
4,Numeric Features,25
5,Categorical Features,1
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(192286, 24)"


In [17]:
# Model ids left out of the comparison.
excluded_models = ['gbr','knn','ada','lar','lr','ridge','br','omp','lasso','en','huber','llar','par']
# Keep the three best models ranked by MAE under 5-fold cross-validation.
best_3 = compare_models(sort='MAE', n_select=3, fold=5, exclude=excluded_models)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
rf,Random Forest Regressor,2.9119,17.4429,4.1764,0.9177,0.1433,0.1016,24.922
et,Extra Trees Regressor,3.1115,20.0304,4.4755,0.9055,0.1523,0.1076,25.236
catboost,CatBoost Regressor,3.2549,20.2332,4.4981,0.9045,0.1572,0.1166,21.542
dt,Decision Tree Regressor,3.6154,30.0035,5.4775,0.8584,0.188,0.124,1.282
lightgbm,Light Gradient Boosting Machine,3.75,25.2901,5.0289,0.8806,0.1756,0.1356,0.822


In [18]:
# Blend the three selected models into one ensemble, evaluated with 5 folds.
blended = blend_models(best_3, fold=5)

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,2.913,16.9627,4.1186,0.9198,0.1434,0.103
1,2.9102,16.9943,4.1224,0.9204,0.1425,0.1025
2,2.9147,17.1049,4.1358,0.9183,0.1438,0.1033
3,2.8962,16.7284,4.09,0.9212,0.1405,0.1015
4,2.9148,17.1432,4.1404,0.9196,0.1434,0.1028
Mean,2.9098,16.9867,4.1215,0.9198,0.1427,0.1026
SD,0.007,0.1455,0.0177,0.001,0.0012,0.0006


In [19]:
# Score the blended model without passing `data` — presumably this evaluates
# on the hold-out split kept aside by setup(); confirm against PyCaret docs.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,2.858,16.5708,4.0707,0.9215,0.14,0.1001


In [None]:
# Scratch notes (this cell was never executed): scores jotted down while
# trying different grouping columns for the dist_cnt feature — presumably
# hold-out MAE values; verify before relying on them.
2.9545, 2.8580
# start 1 - 2.9528 (-2)
# end  1 - 2.9827
# start + end  - X
# start_node_name - 2.8580(-2-1)
# end
# start + end 
# road 2 1 

In [20]:
# Refit the blended model through finalize_model() and predict the test set.
final_model = finalize_model(blended)
predictions = predict_model(final_model, data = test)
sample_submission = pd.read_csv('input/sample_submission.csv')

# PyCaret 2.x names the prediction column 'Label'; 3.x uses 'prediction_label'.
label_col = 'Label' if 'Label' in predictions.columns else 'prediction_label'

# Assigning a Series aligns on index, so a row-count mismatch would show up
# only as silent NaNs in the submission. Fail loudly instead, then assign by
# position (test rows are expected to be in submission order).
if len(predictions) != len(sample_submission):
    raise ValueError(
        f"prediction rows ({len(predictions)}) do not match "
        f"sample_submission rows ({len(sample_submission)})"
    )
sample_submission['target'] = predictions[label_col].to_numpy()

In [21]:
# Write the submission to CSV without the index column.
sample_submission.to_csv("output/20221028-2-1.csv", index = False)