## 1. Environment

In [1]:
import platform
platform.platform()

'Linux-5.10.133+-x86_64-with-Ubuntu-18.04-bionic'

In [2]:
!cat /etc/issue.net

Ubuntu 18.04.6 LTS


In [3]:
!python --version

Python 3.7.15


In [4]:
from google.colab import drive 
drive.mount('/content/drive') 

DATA_PATH = "/content/drive/MyDrive/DACON/제주도 도로 교통량 예측 AI 경진대회/Final Version/"

Mounted at /content/drive


### 1.2 Install Necessary Libraries

In [5]:
#catboost 설치
!pip install catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting catboost
  Downloading catboost-1.1.1-cp37-none-manylinux1_x86_64.whl (76.6 MB)
[K     |████████████████████████████████| 76.6 MB 1.2 MB/s 
Installing collected packages: catboost
Successfully installed catboost-1.1.1


### 1.3 Load Libraries

In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc
import copy

from sklearn.ensemble import HistGradientBoostingRegressor
import lightgbm as lgb
import catboost as cb

import warnings
warnings.filterwarnings("ignore")

## 2. Data Preprocessing

In [7]:
# parquet 변환
def csv_to_parquet(csv_path, save_name):
  df = pd.read_csv(csv_path)
  df.to_parquet(f"./{save_name}.parquet")
  del df
  gc.collect()
  print(save_name, "Done")

csv_to_parquet(DATA_PATH + 'train.csv', 'train')
csv_to_parquet(DATA_PATH + 'test.csv', 'test')

train Done
test Done


In [8]:
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

### 2.1 Change Road Name

In [9]:
def change_road_name(df):
  df.loc[(df['start_node_name']=='남수교') | (df['end_node_name']=='남수교'),'road_name']='서성로'
  df.loc[(df['start_node_name']=='성읍사거리') | (df['end_node_name']=='성읍사거리'),'road_name']='지방도1136호선' 
  df.loc[(df['start_node_name']=='난산사거리') | (df['end_node_name']=='난산사거리'),'road_name']='난산로'
  df.loc[df['start_node_name']=='교보생명','road_name']='고성오조로'
  df.loc[(df['start_node_name']=='성산포식당') | (df['end_node_name']=='성산포식당'),'road_name']='일출로' 
  df.loc[(df['start_node_name']=='수협') | (df['end_node_name']=='수협'),'road_name']='동류암로'
  df.loc[(df['start_node_name']=='주차장') | (df['end_node_name']=='주차장'),'road_name']='김녕로'
  df.loc[(df['start_node_name']=='해안교') | (df['end_node_name']=='해안교'),'road_name']='애조로'
  df.loc[(df['start_node_name']=='수간교차로') | (df['end_node_name']=='수간교차로'),'road_name']='애조로'
  df.loc[(df['start_node_name']=='상귀교차로') | (df['end_node_name']=='상귀교차로'),'road_name']='애조로'
  df.loc[(df['start_node_name']=='광삼교') | (df['end_node_name']=='광삼교'),'road_name']='애조로'
  df.loc[(df['start_node_name']=='일호유리') & (df['end_node_name']=='하귀입구'),'road_name']='하광로'
  df.loc[(df['start_node_name']=='하귀입구') & (df['end_node_name']=='일호유리'),'road_name']='하광로'
  df.loc[(df['start_node_name']=='감귤선과장') | (df['end_node_name']=='감귤선과장'),'road_name']='천덕로'
  df.loc[(df['start_node_name']=='삼다식품') | (df['end_node_name']=='삼다식품'),'road_name']='천덕로'
  df.loc[(df['start_node_name']=='나동') | (df['end_node_name']=='나동'),'road_name']='한림상로'
  df.loc[(df['start_node_name']=='월계교') | (df['end_node_name']=='월계교'),'road_name']='한림상로'
  df.loc[(df['start_node_name']=='농협주유소') & (df['end_node_name']=='하나로마트'),'road_name']='한림상로'
  df.loc[(df['start_node_name']=='하나로마트') & (df['end_node_name']=='농협주유소'),'road_name']='한림상로'
  df.loc[(df['start_node_name']=='창성세차장') | (df['end_node_name']=='창성세차장'),'road_name']='한림서길'
  df.loc[(df['start_node_name']=='수원씽크공장') | (df['end_node_name']=='수원씽크공장'),'road_name']='한수풀로'
  df.loc[(df['start_node_name']=='금덕해운') | (df['end_node_name']=='금덕해운'),'road_name']='한수풀로'
  df.loc[(df['start_node_name']=='원일공사') | (df['end_node_name']=='원일공사'),'road_name']='한수풀로'
  df.loc[(df['start_node_name']=='한림1리복지회관') | (df['end_node_name']=='한림1리복지회관'),'road_name']='한림해안로'
  df.loc[(df['start_node_name']=='한수풀횟집') | (df['end_node_name']=='한수풀횟집'),'road_name']='한림해안로'
  df.loc[(df['start_node_name']=='수협중앙회') | (df['end_node_name']=='수협중앙회'),'road_name']='한림해안로'
  df.loc[(df['start_node_name']=='한림어촌계') & (df['end_node_name']=='옹포사거리'),'road_name']='한림로'
  df.loc[(df['start_node_name']=='옹포사거리') & (df['end_node_name']=='한림어촌계'),'road_name']='한림로'
  df.loc[(df['start_node_name']=='고림동4거리') | (df['end_node_name']=='고림동4거리'),'road_name']='명월로'
  df.loc[(df['start_node_name']=='가는질') | (df['end_node_name']=='가는질'),'road_name']='일반국도16호선'    
  df.loc[(df['start_node_name']=='광평교차로') | (df['end_node_name']=='광평교차로'),'road_name']='신록남로'
  df.loc[(df['start_node_name']=='예래입구') | (df['end_node_name']=='예래입구'),'road_name']='천제연로'
  df.loc[(df['start_node_name']=='산신주유소') | (df['end_node_name']=='산신주유소'),'road_name']='천제연로'
  df.loc[(df['start_node_name']=='천제이교') | (df['end_node_name']=='천제이교'),'road_name']='관광단지1로' 
  df.loc[(df['start_node_name']=='중산간도로삼거리') | (df['end_node_name']=='중산간도로삼거리'),'road_name']='신서귀로'
  df.loc[(df['start_node_name']=='서울이용원') & (df['end_node_name']=='남양리조트'),'road_name']='태평로'
  df.loc[(df['start_node_name']=='남양리조트') & (df['end_node_name']=='서울이용원'),'road_name']='태평로'
  df.loc[(df['start_node_name']=='서울이용원') & (df['end_node_name']=='뉴본아파트'),'road_name']='태평로'
  df.loc[(df['start_node_name']=='뉴본아파트') & (df['end_node_name']=='서울이용원'),'road_name']='태평로'
  df.loc[(df['start_node_name']=='정방수퍼') & (df['end_node_name']=='서울이용원'),'road_name']='정방로'
  df.loc[(df['start_node_name']=='서울이용원') & (df['end_node_name']=='정방수퍼'),'road_name']='정방로'
  df.loc[(df['start_node_name']=='오렌지농원') | (df['end_node_name']=='오렌지농원'),'road_name']='일반국도11호선'
  df.loc[(df['start_node_name']=='송목교') | (df['end_node_name']=='송목교'),'road_name']='서성로' 
  df.loc[(df['start_node_name']=='한남교차로') | (df['end_node_name']=='한남교차로'),'road_name']='서성로' 
  df.loc[(df['start_node_name']=='서중2교') & (df['end_node_name']=='서중2교'),'road_name']='서성로' 
  df.loc[(df['start_node_name']=='신하교') & (df['end_node_name']=='신하교'),'road_name']='서성로' 
  df.loc[(df['start_node_name']=='상위미') | (df['end_node_name']=='상위미'),'road_name']='위미항구로'
  df.loc[(df['start_node_name']=='진은교차로') | (df['end_node_name']=='진은교차로'),'road_name']='일주동로' 
  df.loc[(df['start_node_name']=='제2태흥교') | (df['end_node_name']=='제2태흥교'),'road_name']='일주동로' 
  df.loc[(df['start_node_name']=='금성동교차로') | (df['end_node_name']=='금성동교차로'),'road_name']='일주동로' 
  df.loc[(df['start_node_name']=='하나로교') & (df['end_node_name']=='하나로교'),'road_name']='일주동로'
  df.loc[(df['start_node_name']=='우사') | (df['end_node_name']=='우사'),'road_name']='동광로'

  return df

In [10]:
train = change_road_name(train)
test = change_road_name(test)

### 2.2 Feature Engineering

#### 2.2.1 Create Month column

In [11]:
train["base_date"] = pd.to_datetime(train["base_date"],format='%Y%m%d')
test["base_date"] = pd.to_datetime(test["base_date"],format='%Y%m%d')

train['month']= train['base_date'].dt.month
test['month']= test['base_date'].dt.month

#### 2.2.2 Create Avg Speed per Hour by Road column

In [12]:
train["base_hour"] = [str(x) for x in train["base_hour"]]
test["base_hour"] = [str(x) for x in test["base_hour"]]

train["road_hours"] =  train['road_name']+train['base_hour']
test["road_hours"] =  test['road_name']+test['base_hour']

In [13]:
tmp_train = train[["road_hours", "target"]]
tmp_train_mean = tmp_train.groupby("road_hours").mean().reset_index()

road_hours_list = list(tmp_train_mean["road_hours"])
road_hours_targets = list(tmp_train_mean["target"])

train_road_hours_targets = []

for idx in range(len(train)):
  road_hour_index = road_hours_list.index(train.iloc[idx,:][24])
  train_road_hours_targets.append(road_hours_targets[road_hour_index])

train["road_hours_target"] = train_road_hours_targets

test_road_hours_targets = []

for idx in range(len(test)):
  road_hour_index = road_hours_list.index(test.iloc[idx,:][23])
  test_road_hours_targets.append(road_hours_targets[road_hour_index])

test["road_hours_target"] = test_road_hours_targets

train["base_hour"] = [int(x) for x in train["base_hour"]]
test["base_hour"] = [int(x) for x in test["base_hour"]]

#### 2.2.3 Create Avg Speed per Day of the Week by Road column

In [14]:
train["day_of_week"] = [str(x) for x in train["day_of_week"]]
test["day_of_week"] = [str(x) for x in test["day_of_week"]]

train["road_day"] =  train['road_name']+train['day_of_week']
test["road_day"] =  test['road_name']+test['day_of_week']

In [15]:
tmp_road_day = train[["road_day", "target"]]
tmp_road_day_mean = tmp_road_day.groupby("road_day").mean().reset_index()

road_day_list = list(tmp_road_day_mean["road_day"])
road_day_targets = list(tmp_road_day_mean["target"])

train_road_day_means = []

for idx in range(len(train)):
  road_hour_index = road_day_list.index(train.iloc[idx,:][26])
  train_road_day_means.append(road_day_targets[road_hour_index])

train["road_day_mean"] = train_road_day_means

test_road_day_means = []

for idx in range(len(test)):
  road_hour_index = road_day_list.index(test.iloc[idx,:][25])
  test_road_day_means.append(road_day_targets[road_hour_index])

test["road_day_mean"] = test_road_day_means

#### 2.2.4 Create avg speed by Road, Hour and Day of the Week column

In [16]:
train["base_hour"] = [str(x) for x in train["base_hour"]]
test["base_hour"] = [str(x) for x in test["base_hour"]]

train["road_day_hour"] =  train['road_name']+train['day_of_week'] + train['base_hour']
test["road_day_hour"] =  test['road_name']+test['day_of_week'] + test['base_hour']

In [17]:
tmp_road_day = train[["road_day_hour", "target"]]
tmp_road_day_mean = tmp_road_day.groupby("road_day_hour").mean().reset_index()

road_day_list = list(tmp_road_day_mean["road_day_hour"])
road_day_targets = list(tmp_road_day_mean["target"])

train_road_day_means = []

for idx in range(len(train)):
  road_hour_index = road_day_list.index(train.iloc[idx,:][28])
  train_road_day_means.append(road_day_targets[road_hour_index])

train["road_day_hour_mean"] = train_road_day_means

test_road_day_means = []

for idx in range(len(test)):
  try:
    road_hour_index = road_day_list.index(test.iloc[idx,:][27])
  except:
    tmo_str = test.iloc[idx,:][27][:-1]
    tmp_num = int(test.iloc[idx,:][27][-1])
    if tmp_num == 0:
      new_num = '23'
    else:
      new_num = str(tmp_num - 1)
    tmp = tmo_str + new_num
    road_hour_index = road_day_list.index(tmp)


  test_road_day_means.append(road_day_targets[road_hour_index])

test["road_day_hour_mean"] = test_road_day_means

train["base_hour"] = [int(x) for x in train["base_hour"]]
test["base_hour"] = [int(x) for x in test["base_hour"]]

In [18]:
train.head()

Unnamed: 0,id,base_date,day_of_week,base_hour,lane_count,road_rating,road_name,multi_linked,connect_code,maximum_speed_limit,...,end_longitude,end_turn_restricted,target,month,road_hours,road_hours_target,road_day,road_day_mean,road_day_hour,road_day_hour_mean
0,TRAIN_0000000,2022-06-23,목,17,1,106,지방도1112호선,0,0,60.0,...,126.662335,없음,52.0,6,지방도1112호선17,44.945994,지방도1112호선목,47.421487,지방도1112호선목17,44.918919
1,TRAIN_0000001,2022-07-28,목,21,2,103,일반국도11호선,0,0,60.0,...,126.52624,없음,30.0,7,일반국도11호선21,39.388762,일반국도11호선목,39.904327,일반국도11호선목21,39.335878
2,TRAIN_0000002,2021-10-10,일,7,2,103,일반국도16호선,0,0,80.0,...,126.362147,없음,61.0,10,일반국도16호선7,44.789037,일반국도16호선일,44.996507,일반국도16호선일7,46.530935
3,TRAIN_0000003,2022-03-11,금,13,2,107,태평로,0,0,50.0,...,126.566228,없음,20.0,3,태평로13,22.146268,태평로금,25.107775,태평로금13,21.907925
4,TRAIN_0000004,2021-10-05,화,8,2,103,일반국도12호선,0,0,80.0,...,126.330152,없음,38.0,10,일반국도12호선8,42.03626,일반국도12호선화,43.464148,일반국도12호선화8,41.254252


In [19]:
train.to_csv(DATA_PATH + 'mean_train.csv', index=False)
test.to_csv(DATA_PATH + 'mean_test.csv', index=False)

#### 2.2.5 Creating a moving average derived variable by road and time

In [20]:
tmp_road_day_mean

Unnamed: 0,road_day_hour,target
0,경찰로금0,30.991453
1,경찰로금1,32.100446
2,경찰로금10,22.527778
3,경찰로금11,22.395753
4,경찰로금12,22.671968
...,...,...
13372,호서중앙로화5,36.909091
13373,호서중앙로화6,32.928571
13374,호서중앙로화7,38.000000
13375,호서중앙로화8,26.800000


In [21]:
print("현재 도로+요일+시간 종류: ",len(tmp_road_day_mean.road_day_hour.unique()))
print("전체도로 숫자: ", len(train.road_name.unique()))
print("전체 도로+요일+시간 수: ", len(train.road_name.unique())*7*24)
print("없는 값의 수: ", len(train.road_name.unique())*7*24 - len(tmp_road_day_mean.road_day_hour.unique()))

현재 도로+요일+시간 종류:  13377
전체도로 숫자:  80
전체 도로+요일+시간 수:  13440
없는 값의 수:  63


In [22]:
road_day_hour = []
values = []

train_road_day_hour = list(tmp_road_day_mean.road_day_hour)
train_value = list(tmp_road_day_mean.target)

road_names = list(train.road_name.unique())
days = ["월", "화", "수", "목", "금", "토", "일"]
hours = [x for x in range(24)]

for road_name in road_names:
  for idx, day in enumerate(days):
    for hour in hours:
      tmp_name = road_name + day + str(hour)
      
      try:
        tmp_index = train_road_day_hour.index(tmp_name)
        tmp_value = train_value[tmp_index]
      except:
        tmp_value = 0
      
      road_day_hour.append(tmp_name)
      values.append(tmp_value)

In [23]:
for road_name in road_names:
  for idx, day in enumerate(days):
    for hour in hours:
      tmp_name = road_name + day + str(hour)
      tmp_index = road_day_hour.index(tmp_name)
      tmp_value = values[tmp_index]

      if tmp_value == 0:
        values[tmp_index] = values[tmp_index-1]

In [24]:
data = {"road_day_hour": road_day_hour,
        "target": values}

df = pd.DataFrame(data=data)

In [25]:
# 3시간 이동평균
road_day_hour_list = list(df.road_day_hour)
target_list = list(df.target)
rolling3 = []

road_names = list(train.road_name.unique())
days = ["월", "화", "수", "목", "금", "토", "일"]
hours = [x for x in range(24)]

for road_name in road_names:
  for idx, day in enumerate(days):
    for hour in hours:

      if hour > 2:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
      elif hour == 2:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
      elif hour == 1:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
      else:
        tmp1 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]

      rolling3.append((tmp1+tmp2+tmp3)/3)

df["ma3"] = rolling3

In [26]:
# 4 hour moving average
road_day_hour_list = list(df.road_day_hour)
target_list = list(df.target)
rolling4 = []

for road_name in road_names:
  for idx, day in enumerate(days):
    for hour in hours:
      if hour > 3:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+day+str(hour-4))]
      elif hour == 3:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
      elif hour == 2:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
      elif hour == 1:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
      else:
        tmp1 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"20")]

      rolling4.append((tmp1+tmp2+tmp3+tmp4)/4)

df["ma4"] = rolling4

In [27]:
# 6 hour moving average
road_day_hour_list = list(df.road_day_hour)
target_list = list(df.target)
rolling6 = []

for road_name in road_names:
  for idx, day in enumerate(days):
    for hour in hours:
      if hour > 5:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+day+str(hour-4))]
        tmp5 = target_list[road_day_hour_list.index(road_name+day+str(hour-5))]
        tmp6 = target_list[road_day_hour_list.index(road_name+day+str(hour-6))]
      elif hour == 5:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+day+str(hour-4))]
        tmp5 = target_list[road_day_hour_list.index(road_name+day+str(hour-5))]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
      elif hour == 4:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+day+str(hour-4))]
        tmp5 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
      elif hour == 3:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+day+str(hour-3))]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp5 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
      elif hour == 2:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+day+str(hour-2))]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp5 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"20")]
      elif hour == 1:
        tmp1 = target_list[road_day_hour_list.index(road_name+day+str(hour-1))]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
        tmp5 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"20")]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"19")]
      else:
        tmp1 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"23")]
        tmp2 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"22")]
        tmp3 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"21")]
        tmp4 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"20")]
        tmp5 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"19")]
        tmp6 = target_list[road_day_hour_list.index(road_name+days[idx-1]+"18")]

      rolling6.append((tmp1+tmp2+tmp3+tmp4+tmp5+tmp6)/6)

df["ma6"] = rolling6

In [28]:
train_mean_values = []
train_rolling3_values = []
train_rolling4_values = []
train_rolling6_values = []

for idx in range(len(train)):
  index = road_day_hour_list.index(train.iloc[idx,:][28])
  train_rolling3_values.append(rolling3[index])
  train_rolling4_values.append(rolling4[index])
  train_rolling6_values.append(rolling6[index])

train["rdh_ma3"] = train_rolling3_values
train["rdh_ma4"] = train_rolling4_values
train["rdh_ma6"] = train_rolling6_values

test_mean_values = []
test_rolling3_values = []
test_rolling4_values = []
test_rolling6_values = []

for idx in range(len(test)):
  index = road_day_hour_list.index(test.iloc[idx,:][27])
  test_rolling3_values.append(rolling3[index])
  test_rolling4_values.append(rolling4[index])
  test_rolling6_values.append(rolling6[index])

test["rdh_ma3"] = test_rolling3_values
test["rdh_ma4"] = test_rolling4_values
test["rdh_ma6"] = test_rolling6_values

In [29]:
train.to_csv(DATA_PATH + 'param_train.csv', index=False)
test.to_csv(DATA_PATH + 'param_test.csv', index=False)

#### 2.2.6 Day of the Week Encoding

In [30]:
from sklearn import preprocessing

str_col = ['day_of_week']
for i in str_col:
    le = preprocessing.LabelEncoder()
    le=le.fit(train[i])
    train[i]=le.transform(train[i])
    
    for label in np.unique(test[i]):
        if label not in le.classes_: 
            le.classes_ = np.append(le.classes_, label)
    test[i]=le.transform(test[i])

#### 2.2.7 Change Road Name

In [31]:
train['road_name']= train['road_name']+train['start_node_name']+train['end_node_name']
test['road_name']= test['road_name']+test['start_node_name']+test['end_node_name']

### 2.2.8 Avg of New Road Name

In [32]:
tmp_train = train[["road_name", "target"]]
tmp_train_mean = tmp_train.groupby("road_name").mean().reset_index()

road_name_list = list(tmp_train_mean["road_name"])
road_name_targets = list(tmp_train_mean["target"])

In [33]:
train_road_name_means = []

for idx in range(len(train)):
  road_hour_index = road_name_list.index(train.iloc[idx,:][6])
  train_road_name_means.append(road_name_targets[road_hour_index])

train["road_name_mean"] = train_road_name_means

test_road_name_means = []

for idx in range(len(test)):
  road_hour_index = road_name_list.index(test.iloc[idx,:][6])
  test_road_name_means.append(road_name_targets[road_hour_index])

test["road_name_mean"] = test_road_name_means

#### 2.2.9 Drop useless features

In [34]:
train.drop(["id", "base_date", "road_rating", "multi_linked", "connect_code", "vehicle_restricted", 
            "height_restricted", "start_node_name", "start_turn_restricted", "end_node_name", "end_turn_restricted", 
            "road_hours", "road_day", "road_day_hour"], axis=1, inplace=True)
test.drop(["id", "base_date", "road_rating", "multi_linked", "connect_code", "vehicle_restricted", 
            "height_restricted", "start_node_name", "start_turn_restricted", "end_node_name", "end_turn_restricted", 
            "road_hours", "road_day", "road_day_hour"], axis=1, inplace=True)

In [35]:
train.head()

Unnamed: 0,day_of_week,base_hour,lane_count,road_name,maximum_speed_limit,weight_restricted,road_type,start_latitude,start_longitude,end_latitude,end_longitude,target,month,road_hours_target,road_day_mean,road_day_hour_mean,rdh_ma3,rdh_ma4,rdh_ma6,road_name_mean
0,1,17,1,지방도1112호선제3교래교제3교래교,60.0,32400.0,3,33.427747,126.662612,33.427749,126.662335,52.0,6,44.945994,47.421487,44.918919,44.112361,44.051142,44.16191,49.511422
1,1,21,2,일반국도11호선광양사거리KAL사거리,60.0,0.0,0,33.50073,126.529107,33.504811,126.52624,30.0,7,39.388762,39.904327,39.335878,37.985248,37.765336,37.695086,26.400712
2,4,7,2,일반국도16호선창고천교상창육교,80.0,0.0,0,33.279145,126.368598,33.280072,126.362147,61.0,10,44.789037,44.996507,46.530935,49.607907,50.253697,50.690067,59.10172
3,0,13,2,태평로남양리조트서현주택,50.0,0.0,0,33.246081,126.567204,33.245565,126.566228,20.0,3,22.146268,25.107775,21.907925,22.233427,22.492023,23.03561,25.024923
4,6,8,2,일반국도12호선애월샷시애월입구,80.0,0.0,0,33.462214,126.326551,33.462677,126.330152,38.0,10,42.03626,43.464148,41.254252,46.695545,47.661913,48.869734,39.87367


In [36]:
train.to_csv(DATA_PATH + 'cleaned_train.csv', index=False)
test.to_csv(DATA_PATH + 'cleaned_test.csv', index=False)

## 3. Modelling

### 3.1 Load Data

In [7]:
train = pd.read_csv(DATA_PATH + 'cleaned_train.csv')
test = pd.read_csv(DATA_PATH + 'cleaned_test.csv')

### 3.2 Create Model Function

In [10]:
def train_predict_model(train, test, type,features):
  models = []
  min_values = []
  max_values = []

  for road_name in train.road_name.unique():
    sample_train_data = train[train["road_name"]==road_name]

    min_values.append(sample_train_data.target.min())
    max_values.append(sample_train_data.target.max())

    if type == "lgbm":
      train_model = lgb.LGBMRegressor(seed=42, metric="mae")
    elif type == "cat":
      train_model = cb.CatBoostRegressor(random_state=42, logging_level ="Silent")
    else:
      train_model = HistGradientBoostingRegressor(random_state=42)
    
    train_model.fit(sample_train_data[features],sample_train_data.target)

    models.append(train_model)

  print("{} training done".format(type))
  
  train_road_name = list(train.road_name.unique())

  test_features = copy.copy(features)
  test_features.append("road_name")
  sample_test = test[test_features]


  predictions = []
  road_min = []
  road_max = []

  for idx in range(len(sample_test)):
    model_index = train_road_name.index(sample_test.iloc[idx,:][len(test_features)-1])
    #print(test.iloc[idx,:][len(test_features)-1])
    tmp = []
    sample = []

    for x in sample_test.iloc[idx,:].to_numpy().reshape(1,-1):
      tmp.append(x)

    tmp = tmp[0]

    for i, x in enumerate(tmp):
      if i != (len(test_features)-1):
        sample.append(x)
      #else:
      #  print(x)

    sample = np.array(sample).reshape(1, -1)

    prediction = models[model_index].predict(sample)
    predictions.append(prediction)
    road_min.append(min_values[model_index])
    road_max.append(max_values[model_index])

  return predictions, road_min, road_max

In [13]:
def modeling(train, test, lgbm, cat, hgb, filename, feature):
  lgbm_pred, min_values, max_values = train_predict_model(train, test, "lgbm",feature)
  print("LGBM Model Done")
  cat_pred, min_values, max_values = train_predict_model(train, test, "cat",feature)
  print("Cat Model Done")
  if hgb != 0:
    hgb_pred, min_values, max_values = train_predict_model(train, test, "hgb",feature)
  print("HGB Model Done")

  sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')

  pred = []
  
  if hgb != 0:
    for idx in range(len(lgbm_pred)):
      pred.append(lgbm_pred[idx][0]*lgbm + cat_pred[idx][0]*cat + hgb_pred[idx][0]*hgb)
  else:
    for idx in range(len(lgbm_pred)):
      pred.append(lgbm_pred[idx][0]*lgbm + cat_pred[idx][0]*cat)
  
  #사후처리
  n = 0
  
  for idx in range(len(pred)):
    if pred[idx] < min_values[idx]:
      n += 1
      pred[idx] = min_values[idx]
    elif pred[idx] > max_values[idx]:
      n += 1
      pred[idx] = max_values[idx]

  print(pred)
  print(len(pred))

  sample_submission['target'] = pred

  #소수점이 0.3이하일 때 내림, 0.7 이상일 때 올림
  sample_submission['decimal'] = sample_submission['target'].astype(str).str.split('.').str[1]
  sample_submission['decimal'] = sample_submission['decimal'].astype(str).str[0].astype(int)
  sample_submission.loc[sample_submission['decimal'] < 4,'target']= np.floor(sample_submission['target'])
  sample_submission.loc[sample_submission['decimal'] > 6,'target']= np.ceil(sample_submission['target'])
  sample_submission = sample_submission.drop(columns=['decimal'], axis = 1)

  sample_submission.to_csv(DATA_PATH + filename, index = False)

### 3.3 Create multiple csv files

In [14]:
feature1 =['day_of_week', 'base_hour', 'lane_count',
          'maximum_speed_limit', 'weight_restricted', 'road_type',
          'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
          'month', 'road_hours_target', 'road_day_mean', 'road_name_mean']
modeling(train, test, 0.7, 0.3, 0, "file1.csv", feature1)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [16]:
feature2 =['day_of_week', 'base_hour', 'lane_count',
          'maximum_speed_limit', 'weight_restricted', 'road_type',
          'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
          'month', 'road_hours_target', 'road_day_mean', 'road_name_mean']
modeling(train, test, 0.6, 0.2, 0.2, "file2.csv", feature2)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
hgb training done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [19]:
feature3 =['day_of_week', 'base_hour', 'lane_count',
          'maximum_speed_limit', 'weight_restricted', 'road_type',
          'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
          'month', 'road_hours_target', 'road_day_mean']
modeling(train, test, 0.7, 0.3, 0, "file3.csv", feature3)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [21]:
feature4 =['day_of_week', 'base_hour', 'lane_count',
       'maximum_speed_limit', 'weight_restricted', 'road_type',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'month', 'road_hours_target', 'road_day_mean',
       'road_day_hour_mean', 'rdh_ma3', 'rdh_ma4', 'rdh_ma6',
       'road_name_mean']

modeling(train, test, 0.7, 0.3, 0, "file4.csv", feature4)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [23]:
feature5 =['day_of_week', 'base_hour', 'lane_count',
       'maximum_speed_limit', 'weight_restricted', 'road_type',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'month', 'rdh_ma3', 'rdh_ma4', 'rdh_ma6',]

modeling(train, test, 0.7, 0.3, 0, "file5.csv", feature5)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [26]:
feature6 =['day_of_week', 'base_hour', 'lane_count',
       'maximum_speed_limit', 'weight_restricted', 'road_type',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude',
       'month', 'road_hours_target', 'road_day_mean', 'road_name_mean','rdh_ma3']

modeling(train, test, 0.7, 0.3, 0, "file6.csv", feature6)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [28]:
feature7 =['day_of_week', 'base_hour', 'lane_count',
       'maximum_speed_limit', 'weight_restricted', 'road_type',
       'start_latitude', 'start_longitude', 'end_latitude', 'end_longitude','month']

modeling(train, test, 0.7, 0.3, 0, "file7.csv", feature7)

lgbm training done
LGBM Model Done
cat training done
Cat Model Done
HGB Model Done


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



### 3.4 Ensemble

In [30]:
file1 = pd.read_csv(DATA_PATH + 'file1.csv')
file2 = pd.read_csv(DATA_PATH + 'file2.csv')
file3 = pd.read_csv(DATA_PATH + 'file3.csv')
file4 = pd.read_csv(DATA_PATH + 'file4.csv')
file5 = pd.read_csv(DATA_PATH + 'file5.csv')
file6 = pd.read_csv(DATA_PATH + 'file6.csv')
file7 = pd.read_csv(DATA_PATH + 'file7.csv')

target1 = file1.target
target2 = file2.target
target3 = file3.target
target4 = file4.target
target5 = file5.target
target6 = file6.target
target7 = file7.target

target = list((target1 + target2 + target3 + target4 + target5 + target6 + target7)/7)


In [31]:
sample_submission = pd.read_csv(DATA_PATH + 'sample_submission.csv')

pred = []

for x in target:
  pred.append(x)

sample_submission['target'] = pred

# rounds down when the decimal point is less than 0.3, rounds up when the decimal point is greater than or equal to 0.7
sample_submission['decimal'] = sample_submission['target'].astype(str).str.split('.').str[1]
sample_submission['decimal'] = sample_submission['decimal'].astype(str).str[0].astype(int)
sample_submission.loc[sample_submission['decimal'] < 4,'target']= np.floor(sample_submission['target'])
sample_submission.loc[sample_submission['decimal'] > 6,'target']= np.ceil(sample_submission['target'])
sample_submission = sample_submission.drop(columns=['decimal'], axis = 1)

In [32]:
sample_submission.to_csv(DATA_PATH + "final_version.csv", index = False)

In [33]:
sample_submission

Unnamed: 0,id,target
0,TEST_000000,26.000000
1,TEST_000001,42.575787
2,TEST_000002,68.000000
3,TEST_000003,38.000000
4,TEST_000004,43.517414
...,...,...
291236,TEST_291236,46.632773
291237,TEST_291237,51.000000
291238,TEST_291238,22.000000
291239,TEST_291239,22.000000
