In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from haversine import haversine
import re
import gc
from sklearn.metrics import mean_absolute_error

import lightgbm as lgb
import xgboost

## data load

In [52]:
# Load the train and test tables from parquet files in the current directory.
train = pd.read_parquet(path='./train.parquet')
test = pd.read_parquet(path='./test.parquet')

In [53]:
### Predict using only the most recent month (2022-07)
print(train.shape)  # shape before filtering

# Parse the YYYYMMDD values into real datetimes so the date parts can be compared.
train['base_date'] = pd.to_datetime(train['base_date'], format='%Y%m%d')

# Keep only the rows whose date falls in July 2022.
base_date = train['base_date']
in_july_2022 = (base_date.dt.year == 2022) & (base_date.dt.month == 7)
train = train[in_july_2022]
print(train.shape)  # shape after filtering

(4701217, 23)
(274695, 23)


## 데이터 전처리

### base_date: count ids per (base_date, road_name) group and merge the count back as base_date_cnt
### NOTE(review): base_date_cnt is absent from the train.columns listing printed later in this notebook; confirm this cell is actually executed
train_base_date_df = train.groupby(['base_date','road_name']).count()[['id']].reset_index()
test_base_date_df = test.groupby(['base_date','road_name']).count()[['id']].reset_index()

train_base_date_df.columns = ['base_date','road_name','base_date_cnt']
test_base_date_df.columns = ['base_date','road_name','base_date_cnt']

train = train.merge(train_base_date_df, how='inner', on=['base_date','road_name'])
test = test.merge(test_base_date_df, how='inner', on=['base_date','road_name'])

In [54]:
# 위경도로 거리 도출
%time
train["distance"] = train[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")
test["distance"] = test[["start_latitude", "start_longitude", "end_latitude", "end_longitude"]].apply(lambda x: haversine((x[0], x[1]), (x[2], x[3])), axis="columns")

Wall time: 0 ns


In [55]:
# Label-encode the columns listed in str_col, learning the classes from train.
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted']
for col_name in str_col:
    le = LabelEncoder()
    # Fit on train and encode train in a single step.
    train[col_name] = le.fit_transform(train[col_name])

    # Labels that occur only in test are appended to the learned classes
    # before test is transformed, so they receive codes after the train ones.
    unseen_labels = [
        label for label in np.unique(test[col_name]) if label not in le.classes_
    ]
    for label in unseen_labels:
        le.classes_ = np.append(le.classes_, label)
    test[col_name] = le.transform(test[col_name])

In [56]:
# One-hot encode the categorical columns with a column layout shared by train and test.
cols = ['day_of_week','road_type', 'start_turn_restricted', 'end_turn_restricted', 'lane_count', 'road_rating'
       ,'weight_restricted']

# get_dummies derives its output columns (and, with drop_first=True, the dropped
# baseline level) from the values present in the frame it is given. Encoding
# train and test independently can therefore produce different columns, or a
# different baseline, whenever the two frames do not contain exactly the same
# levels. Casting both frames to one categorical dtype built from train pins
# the levels: train's output is unchanged, and test gets the same dummy columns
# in the same order. A test value never seen in train becomes NaN and is
# encoded as all zeros.
for one_hot_col in cols:
    shared_dtype = pd.CategoricalDtype(
        categories=sorted(train[one_hot_col].dropna().unique())
    )
    train[one_hot_col] = train[one_hot_col].astype(shared_dtype)
    test[one_hot_col] = test[one_hot_col].astype(shared_dtype)

train = pd.get_dummies(train, prefix_sep='_', sparse=False, drop_first=True, columns = cols)
test = pd.get_dummies(test, prefix_sep='_', sparse=False, drop_first=True, columns = cols)

In [57]:
# del 
# Drop the columns listed in del_cols from both frames.
del_cols = ['id','height_restricted','vehicle_restricted', 'start_node_name', 'end_node_name', 'base_date', 'road_name']
train = train.drop(columns=del_cols)
test = test.drop(columns=del_cols)

## 모델링

In [58]:
# Force a garbage-collection pass before modelling; the value displayed below
# the cell is gc.collect()'s return value.
gc.collect()

80

In [59]:
# Show the training frame's columns after preprocessing ('target' is still included).
print(train.columns)

Index(['base_hour', 'multi_linked', 'connect_code', 'maximum_speed_limit',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'target', 'distance', 'day_of_week_1', 'day_of_week_2', 'day_of_week_3',
       'day_of_week_4', 'day_of_week_5', 'day_of_week_6', 'road_type_3',
       'start_turn_restricted_1', 'end_turn_restricted_1', 'lane_count_2',
       'lane_count_3', 'road_rating_106', 'road_rating_107',
       'weight_restricted_32400.0', 'weight_restricted_43200.0',
       'weight_restricted_50000.0'],
      dtype='object')


In [68]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Hold out 30% of the rows, shuffled with a fixed seed, for validation.
X_train, X_test, y_train, y_test = train_test_split(
    train.drop(columns='target'),
    train['target'],
    test_size=0.3,
    shuffle=True,
    random_state=42,
)

# Display the shapes of the four pieces.
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((192286, 25), (82409, 25), (192286,), (82409,))

In [69]:
%%time 
# Create a LightGBM regressor; only random_state is set explicitly.
model = lgb.LGBMRegressor(random_state=42)

Wall time: 0 ns


In [70]:
%%time
# Fit the regressor on the training part of the hold-out split.
model.fit(X_train, y_train)

Wall time: 971 ms


LGBMRegressor(random_state=42)

## 예측

In [71]:
# Mean absolute error of the fitted model on the held-out rows.
holdout_pred = model.predict(X_test)
mean_absolute_error(y_test, holdout_pred)

3.6285139132464463

In [None]:
# NOTE(review): bare literal in an unexecuted cell; it appears to be a
# hand-copied note of the hold-out MAE. It assigns nothing, so later cells
# do not depend on it.
3.62

## 제출

In [18]:
# Load the submission template; its 'target' column is overwritten with predictions further down.
sample_submission = pd.read_csv('input/sample_submission.csv')

In [19]:
%%time
model2 = lgb.LGBMRegressor(random_state=42)

model2.fit(train.drop('target', axis=1), train['target'])

Wall time: 1.4 s


LGBMRegressor(random_state=42)

In [20]:
# Predict on the test frame using exactly the feature columns model2 was
# trained on, in the same order. Selecting by name does two things:
# a missing feature raises KeyError here instead of being silently
# mis-aligned, and a stray extra column in test is ignored.
feature_cols = [col for col in train.columns if col != 'target']
sample_submission['target'] = model2.predict(test[feature_cols])

In [21]:
# Write the filled-in submission without the index column.
# NOTE(review): nothing in this notebook creates the output/ directory; confirm it exists before running.
sample_submission.to_csv("output/20221024-1.csv", index = False)