<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Competition/Dacon/JeJu_Traffic/High%20Rank%20Code%20Review/%5B3nd_Private_3_08467%5D_LGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

## Library

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import gc
import matplotlib.pyplot as plt
from tqdm import tqdm
import random
import os

from glob import glob
from scipy import interpolate
from sklearn.metrics import mean_squared_error, mean_absolute_error

import datetime
from datetime import timedelta
from pytz import timezone

import warnings

from sklearn.model_selection import train_test_split

import seaborn as sns 

import datetime
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns',None)

# Load Data

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into ``./<save_name>.parquet``.

    Args:
        csv_path: Path of the CSV file to read.
        save_name: Base name (without extension) of the parquet file that is
            written to the current working directory.
    """
    parquet_path = f'./{save_name}.parquet'
    frame = pd.read_csv(csv_path)
    frame.to_parquet(parquet_path)
    # Drop the only reference and collect immediately so the frame's memory
    # can be reclaimed before the caller loads anything else.
    del frame
    gc.collect()
    print(save_name, 'Done.')

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset paths below live under it.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Locations of the competition CSV files on the mounted Google Drive.
df_submission_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/sample_submission.csv'
df_train_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/train.csv'
df_test_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/test.csv'

In [None]:
# Convert the raw CSVs to ./train.parquet and ./test.parquet in the working directory.
csv_to_parquet(df_train_path, 'train')
csv_to_parquet(df_test_path, 'test')

In [None]:
# Read the parquet files from the same relative location csv_to_parquet writes
# them to ('./<name>.parquet'), so loading does not depend on the working
# directory being /content.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

# Preprocessing

## Data type -> int

In [None]:
# True when validating locally, False when generating a submission.
EVAL = False

In [None]:
def _add_date_parts(df):
    """Cast ``base_date`` to str and add integer ``year``/``month``/``day`` columns.

    The frame is modified in place and also returned for convenience.
    """
    df['base_date'] = df['base_date'].astype('str')
    # base_date is a YYYYMMDD string, so fixed-width slices yield the parts.
    # The vectorized .str accessor replaces a per-row Python lambda.
    df['year'] = df['base_date'].str[:4].astype('int')
    df['month'] = df['base_date'].str[4:6].astype('int')
    df['day'] = df['base_date'].str[6:8].astype('int')
    return df


train = _add_date_parts(train)

if EVAL:
    # Local validation: hold out 2022-07-16 .. 2022-07-31 from train and use
    # it in place of the real test set.
    held_out = (train['year'] == 2022) & (train['month'] == 7) & (train['day'] > 15)
    test = train[held_out].reset_index(drop=True)
    train = train[~held_out].reset_index(drop=True)
else:
    test = _add_date_parts(test)

# A bare expression inside if/else is never displayed by the notebook, so
# report the resulting shapes explicitly.
print(train.shape, test.shape)

# Label Encoding

In [None]:
# Columns that are label-encoded below and later cast to the pandas `category`
# dtype. The coordinate columns are included, i.e. they are handled as
# categorical identifiers rather than as continuous values.
str_col = ['day_of_week','start_turn_restricted','end_turn_restricted',
           'road_name','road_type','road_rating','start_node_name','end_node_name',
           'start_latitude','end_latitude','start_longitude','end_longitude']

In [None]:
# Label-encode each column: codes are fitted on train, and labels that occur
# only in test get fresh codes after the train codes.
for i in str_col:
    le = LabelEncoder().fit(train[i])

    # Explicit label -> code table. Appending unseen labels to `le.classes_`
    # leaves that array unsorted, and LabelEncoder.transform relies on sorted
    # classes when encoding numeric columns (the coordinate columns here), so
    # the appended-classes trick can silently return wrong codes.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    for label in np.unique(test[i]):
        if label not in code_of:
            code_of[label] = len(code_of)

    # NOTE(review): assumes these columns contain no NaN (the original
    # LabelEncoder path has the same requirement for string columns) — confirm.
    train[i] = train[i].map(code_of)
    test[i] = test[i].map(code_of)

## 파생변수 생성

주요 항목별 target의 train 데이터셋 평균과 표준편차를 파생변수로 생성

* [시간, 제한속도]별 target 평균은 별도 파생변수로 생성
* [시간]별 target 평균에 대한 [시간, 제한속도]별 target 평균의 차이 비율 추가
* 파생변수는 train데이터셋만으로 생성한 이후, test 데이터셋으로 merge

### 주요 컬럼에 대해서 컬럼별 target통계(평균) 추가
 train 데이터셋만으로 통계를 내고, test에는 merge만 함

In [None]:
# Columns for which a per-value mean of `target` is added as a feature.
names = ['day_of_week', 'base_hour','road_name','start_node_name',
         'end_node_name','maximum_speed_limit','start_latitude',
         'end_latitude','start_longitude','end_longitude']

In [None]:
# Per-column target mean: computed on train only, then left-joined onto both
# train and test.
for name in names:
    print(name)
    # Select `target` before aggregating. Averaging the whole frame is wasted
    # work and raises on non-numeric columns (e.g. the string `base_date`)
    # in pandas >= 2.0, where numeric_only defaults to False.
    df1 = (train.groupby(name)['target'].mean()
                .rename(f'{name}_mean_target')
                .reset_index())
    train = pd.merge(train, df1, on=name, how='left')
    test = pd.merge(test, df1, on=name, how='left')

# Target mean for each (hour, speed limit) combination.
hour_speed_keys = ['base_hour', 'maximum_speed_limit']
df1 = (train.groupby(hour_speed_keys)['target'].mean()
            .rename('whs_mean_target')
            .reset_index())
train = pd.merge(train, df1, on=hour_speed_keys, how='left')
test = pd.merge(test, df1, on=hour_speed_keys, how='left')

### ['base_hour'] 통계와 ['base_hour','maximum_speed_limit'] 통계 간의 차이 비율 추가

In [None]:
# Relative difference between the (hour, speed limit) target mean and the
# hour-only target mean. The hour-only mean comes from train and is looked up
# per row, so no temporary column has to be merged in and dropped again.
# Aggregating `target` alone also avoids averaging non-numeric columns, which
# raises in pandas >= 2.0.
hour_mean = train.groupby('base_hour')['target'].mean()

train_hour_mean = train['base_hour'].map(hour_mean)
train['whs_delta_targe'] = (train['whs_mean_target'] - train_hour_mean) / train_hour_mean

# Hours absent from train map to NaN, matching what a left merge would give.
test_hour_mean = test['base_hour'].map(hour_mean)
test['whs_delta_targe'] = (test['whs_mean_target'] - test_hour_mean) / test_hour_mean

### 주요 컬럼에 대해서 컬럼별 target통계(표준편차)추가

 train 데이터셋만으로 통계를 내고, test에는 merge만 함

In [None]:
# Columns for which a per-value standard deviation of `target` is added as a feature.
names = ['day_of_week', 'base_hour','road_name','start_node_name',
         'end_node_name','maximum_speed_limit','start_latitude',
         'end_latitude','start_longitude','end_longitude']

In [None]:
# Per-column target standard deviation: computed on train only, then
# left-joined onto both train and test.
for name in names:
    print(name)
    # Select `target` before aggregating. Taking the std of the whole frame is
    # wasted work and raises on non-numeric columns (e.g. the string
    # `base_date`) in pandas >= 2.0, where numeric_only defaults to False.
    df1 = (train.groupby(name)['target'].std()
                .rename(f'{name}_std_target')
                .reset_index())
    train = pd.merge(train, df1, on=name, how='left')
    test = pd.merge(test, df1, on=name, how='left')

# Target standard deviation for each (hour, speed limit) combination.
hour_speed_keys = ['base_hour', 'maximum_speed_limit']
df1 = (train.groupby(hour_speed_keys)['target'].std()
            .rename('whs_std_target')
            .reset_index())
train = pd.merge(train, df1, on=hour_speed_keys, how='left')
test = pd.merge(test, df1, on=hour_speed_keys, how='left')

### 시간 cos/sin 변환 추가

In [None]:
# Encode the hour of day as a point on the unit circle (24 hours = one turn),
# which places hour 23 next to hour 0.
for frame in (train, test):
    hour_angle = 2*np.pi*(frame['base_hour']/24)
    frame['cos_time'] = np.cos(hour_angle)
    frame['sin_time'] = np.sin(hour_angle)

## Fill NaN

In [None]:
# Replace every remaining NaN with 0 (e.g. the left merges above leave NaN
# wherever a key has no matching train statistic).
train = train.fillna(0)
test = test.fillna(0)

## Data type -> category

In [None]:
# Cast the label-encoded columns to the pandas `category` dtype.
# NOTE(review): presumably so LightGBM treats them as categorical features — confirm.
train[str_col] = train[str_col].astype('category')
test[str_col] = test[str_col].astype('category')

# Modeling

## LGBM

In [None]:
def lgbm_model_train(x_train, y_train, x_valid, y_valid, lr, seed):
    """Train one LightGBM regressor with early stopping on a validation split.

    Args:
        x_train: Training features.
        y_train: Training target.
        x_valid: Validation features, used for metric logging and early stopping.
        y_valid: Validation target.
        lr: Learning rate.
        seed: LightGBM random seed.

    Returns:
        The trained booster returned by ``lgb.train``.
    """
    params = {
        'learning_rate': lr,
        'max_depth': 16,
        'boosting': 'gbdt',
        'objective': 'regression',
        'is_training_metric': True,
        'num_leaves': 5000,
        'feature_fraction': 0.9,
        # NOTE(review): bagging_fraction only takes effect together with a
        # non-zero bagging_freq, which is not set here — confirm the intent.
        'bagging_fraction': 0.8,
        'seed': seed,
        'num_threads': 8,
        # A list keeps the metric order deterministic; a set does not.
        'metric': ['l2', 'l1'],
    }

    train_set = lgb.Dataset(data=x_train, label=y_train)
    valid_set = lgb.Dataset(data=x_valid, label=y_valid)

    # The number of rounds is given once via num_boost_round (the former
    # duplicate 'num_iterations' param only triggered an alias warning).
    # Early stopping and logging are passed as callbacks because the
    # early_stopping_rounds / verbose_eval keyword arguments were removed from
    # lgb.train in LightGBM 4.0.
    model = lgb.train(
        params,
        train_set=train_set,
        num_boost_round=2000,
        valid_sets=[valid_set],
        callbacks=[
            lgb.early_stopping(stopping_rounds=200),
            lgb.log_evaluation(period=50),
        ],
    )
    return model

In [None]:
# Turn off pandas' chained-assignment warning and ignore all Python warnings.
pd.set_option('mode.chained_assignment', None)
warnings.filterwarnings(action='ignore')

## Ensemble

In [None]:
def run(train, test, rs, ts):
    """Train 10 LightGBM models on stratified shuffle splits of ``train``.

    Each split holds out a fraction ``ts`` of the rows (stratified by
    ``month``) as validation data for early stopping. Every split uses its own
    (seed, learning rate) pair from the fixed lists below.

    Args:
        train: Training frame containing ``target``, ``month`` and the
            identifier/auxiliary columns that are dropped before training.
        test: Unused; kept so existing callers keep working.
        rs: ``random_state`` of the splitter.
        ts: Validation fraction (``test_size`` of the splitter).

    Returns:
        List with the 10 trained models, in split order.
    """
    seed_l = [1000,1001,51,51,191,1001,67,51,51,191]
    lr_l = [0.025, 0.03, 0.11,0.05, 0.07, 0.08, 0.02, 0.06, 0.12, 0.1]
    models = []

    train = train.drop(['id','base_date','year','day','multi_linked','vehicle_restricted','height_restricted', 'connect_code'], axis=1)

    st = datetime.datetime.now()
    print('start time : ',st)

    # 10 random splits; each keeps (1 - ts) of the rows for training.
    split = StratifiedShuffleSplit(n_splits=10, test_size=ts, random_state=rs)

    for i, (train_idx, valid_idx) in enumerate(split.split(train, train[['month']])):
        # The splitter yields positional indices, so select rows with iloc;
        # loc would only be correct for a default 0..n-1 index.
        fold_train = train.iloc[train_idx]
        fold_valid = train.iloc[valid_idx]

        X_train = fold_train.drop(['target'], axis=1)
        y_train = fold_train['target']
        X_valid = fold_valid.drop(['target'], axis=1)
        y_valid = fold_valid['target']

        seed = seed_l[i % len(seed_l)]
        lr = lr_l[i % len(lr_l)]

        print('random state : ',rs, 'test size : ',ts, 'seed : ',seed, 'lr : ',lr, 'seq : ',i)
        models.append(lgbm_model_train(X_train, y_train, X_valid, y_valid, lr, seed))

        # Report the time taken by this split only.
        ed = datetime.datetime.now()
        print(i, 'Elapsed time : ', ed -st )
        st = ed

    return models

## 공휴일 관련 변수 별 모델 학습

공휴일과 인접한 주말도 포함한 버전, 공휴일만 포함한 버전, 극성수기 포함 등 3개를 사용
* 관광지 특성상 공휴일과 인접 주말에 교통이 혼잡할 것으로 예상
* 공휴일은 출퇴근 시간대가 없으므로 일반적인 주중 패턴과 다를 것으로 예상
공휴일 변수를 추가했을 때 예측 정확도가 올라가는 것을 확인함

### 공휴일 인접 주말 포함

In [None]:
# Validation fraction and splitter random_state for each of the two runs.
ts_l = [0.01,  0.05]
rs_l = [51, 1000]

# Public holidays plus the weekends adjacent to them (YYYYMMDD).
hday = ['20210918','20210919','20210920','20210921','20210922','20211002','20211003','20211004','20211009',
        '20211010','20211011','20211225','20211226','20220101','20220102','20220129','20220130','20220131',
        '20220201','20220202','20220301','20220309','20220505','20220506','20220507','20220508','20220601',
        '20220604','20220605','20220606','20220813','20220814','20220815'
]

ncol_l = []
model_1 = []

# The holiday flag does not depend on the run, so build it once before the
# loop; isin replaces a per-row list lookup inside apply.
train['hday'] = train['base_date'].isin(hday).astype('int64')
test['hday'] = test['base_date'].isin(hday).astype('int64')

for rs, ts in zip(rs_l, ts_l):
    model_1.extend(run(train, test, rs, ts))

### 공휴일만

In [None]:
# Validation fraction and splitter random_state for each of the two runs.
ts_l = [0.01,  0.05]
rs_l = [51, 1000]

# Public holidays only (YYYYMMDD).
hday = ['20210920','20210921','20210922', '20211004','20211011','20220131','20220201','20220202','20220301',
        '20220309','20220505','20220601','20220606','20220815']

ncol_l = []
model_2 = []

# The holiday flag does not depend on the run, so build it once before the
# loop; isin replaces a per-row list lookup inside apply.
train['hday'] = train['base_date'].isin(hday).astype('int64')

for rs, ts in zip(rs_l, ts_l):
    model_2.extend(run(train, test, rs, ts))

### 제주 극성수기 포함

In [None]:
# Validation fraction and splitter random_state for each of the two runs.
ts_l = [0.01,  0.05]
rs_l = [51, 1000]

# Holidays and adjacent weekends, plus Jeju's peak season
# (2022-07-29 .. 2022-08-05), as YYYYMMDD.
hday = ['20210918','20210919','20210920','20210921','20210922','20211002','20211003','20211004','20211009',
        '20211010','20211011','20211225','20211226','20220101','20220102','20220129','20220130','20220131',
        '20220201','20220202','20220301','20220309','20220505','20220506','20220507','20220508','20220601',
        '20220604','20220605','20220606','20220729','20220730','20220731','20220801','20220802','20220803',
        '20220804','20220805','20220813','20220814','20220815'
]

ncol_l = []
model_3 = []

# The holiday flag does not depend on the run, so build it once before the
# loop; isin replaces a per-row list lookup inside apply.
train['hday'] = train['base_date'].isin(hday).astype('int64')

for rs, ts in zip(rs_l, ts_l):
    model_3.extend(run(train, test, rs, ts))

## 추론

### 공휴일 인접 주말 포함

In [None]:
# Public holidays plus the weekends adjacent to them (YYYYMMDD); must match
# the list the model_1 models were trained with.
hday = ['20210918','20210919','20210920','20210921','20210922','20211002','20211003','20211004','20211009',
        '20211010','20211011','20211225','20211226','20220101','20220102','20220129','20220130','20220131',
        '20220201','20220202','20220301','20220309','20220505','20220506','20220507','20220508','20220601',
        '20220604','20220605','20220606','20220813','20220814','20220815'
]

test_ = test.copy()
test_['hday'] = test_['base_date'].isin(hday).astype('int64')
# Drop the same non-feature columns that run() drops from train.
test_ = test_.drop(['id','base_date','year','day','multi_linked','vehicle_restricted', 'height_restricted','connect_code'], axis=1)

# One prediction column per model: pred_0, pred_1, ...
df_pred = pd.DataFrame()

# Iterate over the trained models directly instead of a hard-coded count with
# a manual (and ineffective) counter increment.
for i, model_ in enumerate(model_1):
    df_pred[f'pred_{i}'] = model_.predict(test_)

### 공휴일만

In [None]:
# Public holidays only (YYYYMMDD); must match the list the model_2 models
# were trained with.
hday = ['20210920','20210921','20210922', '20211004','20211011','20220131','20220201','20220202','20220301',
        '20220309','20220505','20220601','20220606','20220815']

test_ = test.copy()
test_['hday'] = test_['base_date'].isin(hday).astype('int64')
# Drop the same non-feature columns that run() drops from train.
test_ = test_.drop(['id','base_date','year','day','multi_linked','vehicle_restricted', 'height_restricted','connect_code'], axis=1)

# Columns continue at pred_20, after the 20 columns of the first model group.
# Iterate over the trained models directly instead of a hard-coded count with
# a manual (and ineffective) counter increment.
for i, model_ in enumerate(model_2, start=20):
    df_pred[f'pred_{i}'] = model_.predict(test_)

### 제주 극성수기 포함

In [None]:
# Holidays and adjacent weekends, plus Jeju's peak season
# (2022-07-29 .. 2022-08-05), as YYYYMMDD; must match the list the model_3
# models were trained with.
hday = ['20210918','20210919','20210920','20210921','20210922','20211002','20211003','20211004','20211009',
        '20211010','20211011','20211225','20211226','20220101','20220102','20220129','20220130','20220131',
        '20220201','20220202','20220301','20220309','20220505','20220506','20220507','20220508','20220601',
        '20220604','20220605','20220606','20220729','20220730','20220731','20220801','20220802','20220803',
        '20220804','20220805','20220813','20220814','20220815'
]

test_ = test.copy()
test_['hday'] = test_['base_date'].isin(hday).astype('int64')
# Drop the same non-feature columns that run() drops from train.
test_ = test_.drop(['id','base_date','year','day','multi_linked','vehicle_restricted', 'height_restricted','connect_code'], axis=1)

# Columns continue at pred_40, after the 40 columns of the first two groups.
# Iterate over the trained models directly instead of a hard-coded count with
# a manual (and ineffective) counter increment.
for i, model_ in enumerate(model_3, start=40):
    df_pred[f'pred_{i}'] = model_.predict(test_)

## 결과 종합 (앙상블)

In [None]:
# Average only the per-model columns (pred_0, pred_1, ...). Averaging the
# whole frame would fold an existing `pred` column into the mean whenever
# this cell is run a second time.
df_pred['pred'] = df_pred.filter(regex=r'^pred_\d+$').mean(axis=1)
df_pred

## 중요도 확인

### Split 중요도

In [None]:
# Every train column except the target; its length is only used as an upper
# bound on the number of bars drawn.
FEATURES = train.columns.drop('target')

fig, ax = plt.subplots(figsize=(10, 15))
# `model_` is whichever model the most recently executed cell bound to that name.
ax = lgb.plot_importance(
    model_,
    ax=ax,
    importance_type='split',
    max_num_features=len(FEATURES),
)

ax.set(
    title='Feature Importance (split)',
    xlabel='Feature Importance',
    ylabel='Features',
)

### Gain 중요도

In [None]:
# Every train column except the target; its length is only used as an upper
# bound on the number of bars drawn.
FEATURES = train.columns.drop('target')

fig, ax = plt.subplots(figsize=(10, 15))
# `model_` is whichever model the most recently executed cell bound to that name.
ax = lgb.plot_importance(
    model_,
    ax=ax,
    importance_type='gain',
    max_num_features=len(FEATURES),
)
ax.set(
    title='Feature Importance (gain)',
    xlabel='Feature Importance',
    ylabel='Features',
)

# Submission

In [None]:
# Read the template from the Drive path defined alongside the train/test
# paths; no cell in the visible notebook copies sample_submission.csv into
# the working directory.
sample_submission = pd.read_csv(df_submission_path)

In [None]:
# Put the ensemble average into the submission template and save it without
# the index column.
sample_submission['target'] = df_pred['pred'].to_numpy()
sample_submission.to_csv('./submit.csv', index=False)

In [None]:
# Display the final submission frame.
sample_submission
