<a href="https://colab.research.google.com/github/namwootree/Portfolio/blob/main/Competition/Dacon/JeJu_Traffic/High%20Rank%20Code%20Review/%5B2nd_Private_3_08350%5D_Optuna_Catboost_XGBoost_LGBM_Ensemble.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Setting

## GPU

In [None]:
# IPython shell escape: run nvidia-smi to list the GPUs visible to this runtime.
!nvidia-smi

## Library

In [None]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

import gc

from sklearn.preprocessing import LabelEncoder
from haversine import haversine
from sklearn.cluster import KMeans

import math

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_absolute_error
from catboost import CatBoostRegressor, Pool
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Load Data

In [None]:
def csv_to_parquet(csv_path, save_name):
    """Convert a CSV file into ``./<save_name>.parquet``.

    Args:
        csv_path: Location of the CSV file, handed to ``pd.read_csv``.
        save_name: Base name (no extension) of the Parquet file; it is
            written relative to the current working directory.

    The intermediate frame is never bound to a name, so nothing keeps it
    alive after the write; ``gc.collect()`` still runs before the completion
    message is printed.
    """
    parquet_file = f'./{save_name}.parquet'
    pd.read_csv(csv_path).to_parquet(parquet_file)
    gc.collect()
    print(save_name, 'Done.')

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive so the
# competition files below can be read by path.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Absolute Drive locations of the competition CSV files.
# NOTE(review): df_submission_path is not referenced anywhere else in the
# visible notebook (the submission cell reads './sample_submission.csv') -
# confirm which location is intended.
df_submission_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/sample_submission.csv'
df_train_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/train.csv'
df_test_path = '/content/drive/MyDrive/머신러닝 엔지니어링/데이콘/제주도 도로 교통량 예측/data/test.csv'

In [None]:
# Write ./train.parquet and ./test.parquet from the Drive CSV files.
for source_csv, parquet_name in ((df_train_path, 'train'), (df_test_path, 'test')):
    csv_to_parquet(source_csv, parquet_name)

In [None]:
# Load the Parquet copies produced by csv_to_parquet().  That helper writes
# to ./<name>.parquet, so the files are read back from the same relative
# location rather than from a hard-coded /content path, which only matches
# when the working directory happens to be /content.
train = pd.read_parquet('./train.parquet')
test = pd.read_parquet('./test.parquet')

# Preprocessing

## Drop

In [None]:
# Discard the same three columns from both splits, in place.
discarded_cols = ['vehicle_restricted', 'id', 'height_restricted']
for frame in (train, test):
    frame.drop(columns = discarded_cols, inplace = True)

## start_node_name과 end_node_name을 key값으로 만들어 LabelEncoding

In [None]:
# Single encoder instance; it is re-fitted for every categorical column that
# is encoded further down, so it only ever holds the most recent column's classes.
le = LabelEncoder()

In [None]:
# Build a "<start>_<end>" key from the two node names for both splits.
for frame in (train, test):
    start_names = frame['start_node_name']
    end_names = frame['end_node_name']
    frame['node_combination'] = start_names + '_' + end_names

In [None]:
# Fit the encoder on the training keys and replace them with integer codes.
train['node_combination'] = le.fit_transform(train['node_combination'])

In [None]:
# Make keys that occur only in test known to the fitted encoder so that
# transform() does not raise on unseen labels.  Each unseen key is appended
# after the classes learned from train.
for category in np.unique(test['node_combination']) :
    if category not in le.classes_ :
        le.classes_ = np.append(le.classes_, category)
test['node_combination'] = le.transform(test['node_combination'])

## 위경도 좌표만으로 Clustering(KMeans)

Clustering Plotting 결과 군집 수가 6일 때 각 좌표가 명확히 구분되어 6으로 설정

In [None]:
# K-Means with 6 clusters (the count chosen in the note above), a fixed seed
# and 15 initialisations.
km = KMeans(n_clusters = 6, max_iter = 1000, random_state = 42, n_init = 15)

In [None]:
# Cluster on the four raw coordinates: fit on train, then assign test rows
# to the clusters learned from train.
gps_features = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
train['gps_cls'] = km.fit_predict(train[gps_features])
test['gps_cls'] = km.predict(test[gps_features])

## 공휴일 전후 1 ~ 2일 여부

일반적인 공휴일을 기준으로 전후 1 ~ 2일의 기간을 더 두어 binary화

In [None]:
# Work with base_date as text so it can be sliced below.
for frame in (train, test):
    frame['base_date'] = frame['base_date'].astype(str)

In [None]:
# Keep everything after the first four characters of base_date
# (compared against the 'MMDD' strings in h_days later on).
for frame in (train, test):
    frame['date'] = frame['base_date'].str.slice(4)

In [None]:
# 'MMDD' strings treated as holiday-adjacent days (holidays plus the
# surrounding days, per the note above).
# NOTE(review): '0230' is not a calendar date and '0229' only exists in leap
# years; both are harmless for isin() but may be unintended.
h_days = ['1231', '0101', '0102', '0129', '0130', '0131', '0201', '0202', '0228', '0229', '0230', '0301', '0302', 
          '0505', '0506', '0507', '0508', '0605', '0607', '0606', '0814', '0815', '0816', '0920', '0921', '0504',
          '0922', '1002', '1003', '1004', '1008', '1009', '1010', '1224', '1225', '1226']

In [None]:
# Boolean flag: does the row's 'date' string appear in h_days?
for frame in (train, test):
    frame['in_h_days'] = frame['date'].isin(h_days)

## 년도

In [None]:
# Parse base_date into datetimes so the .dt accessors can be used.
for frame in (train, test):
    frame['base_date'] = pd.to_datetime(frame['base_date'])

In [None]:
# Calendar year of each observation.
for frame in (train, test):
    frame['year'] = frame['base_date'].dt.year

## 월

In [None]:
# Calendar month (1-12) of each observation.
for frame in (train, test):
    frame['month'] = frame['base_date'].dt.month

## 최고 제한 속도로 도로 주행시 소요 시간

In [None]:
# Haversine distance (km) between the start and end coordinates of every
# train row; consumed by the at_time computation in the next cell.
dist = [
    haversine((lat_from, lng_from), (lat_to, lng_to), unit = 'km')
    for lat_from, lat_to, lng_from, lng_to
    in train[['start_latitude', 'end_latitude', 'start_longitude', 'end_longitude']].values
]

In [None]:
# 60 * distance / speed limit, i.e. travel time in minutes if the speed
# limit is expressed in km/h (assumption - confirm the unit).
train['at_time'] = pd.Series(dist).mul(60).div(train['maximum_speed_limit'])

In [None]:
# Haversine distance (km) between the start and end coordinates of every
# test row; this overwrites the list that was built for train.
dist = [
    haversine((lat_from, lng_from), (lat_to, lng_to), unit = 'km')
    for lat_from, lat_to, lng_from, lng_to
    in test[['start_latitude', 'end_latitude', 'start_longitude', 'end_longitude']].values
]

In [None]:
# 60 * distance / speed limit, i.e. travel time in minutes if the speed
# limit is expressed in km/h (assumption - confirm the unit).
test['at_time'] = pd.Series(dist).mul(60).div(test['maximum_speed_limit'])

In [None]:
# Run a garbage-collection pass after the distance lists were built.
gc.collect()

## 방위각

In [None]:
def Azimuth(lat1, lng1, lat2, lng2):
    """Forward azimuth (bearing) from point 1 towards point 2, in degrees.

    Args:
        lat1, lng1: Latitude / longitude of the origin, in degrees.
        lat2, lng2: Latitude / longitude of the destination, in degrees.

    Returns:
        The bearing as a non-negative number of degrees; angles that come
        out of ``atan2`` negative are wrapped by adding 360.
    """
    phi1, phi2, lam1, lam2 = (math.radians(deg) for deg in (lat1, lat2, lng1, lng2))
    d_lam = lam2 - lam1

    east = math.sin(d_lam) * math.cos(phi2)
    north = math.cos(phi1) * math.sin(phi2) - math.sin(phi1) * math.cos(phi2) * math.cos(d_lam)

    bearing = np.rad2deg(math.atan2(east, north))

    # Negative angles are shifted by a full turn (written as 180 + (180 + a)).
    return (180 + (180 + bearing)) if bearing < 0 else bearing

In [None]:
# Bearing from the start point to the end point of every road segment.
segment_cols = ['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']
train['degree'] = [Azimuth(*coords) for coords in train[segment_cols].values]
test['degree'] = [Azimuth(*coords) for coords in test[segment_cols].values]

In [None]:
def get_season(x) :
    """Map a month number to a season code.

    Returns 3 for months 9-11, 2 for months 12, 1 and 2, 1 for months 3-6,
    and 0 for every other value (which includes months 7 and 8).
    """
    season_table = (
        ((9, 10, 11), 3),
        ((12, 1, 2), 2),
        ((3, 4, 5, 6), 1),
    )
    for months, code in season_table :
        if x in months :
            return code
    return 0

In [None]:
# Season code derived from the month column, for both splits.
for frame in (train, test):
    frame['season'] = frame['month'].apply(get_season)

## 계절

In [None]:
def get_season(x) :
    """Return the season code of month ``x``.

    Codes: 3 for months 9-11, 2 for months 12/1/2, 1 for months 3-6 and 0 for
    anything else (months 7 and 8 included).
    """
    if x in (9, 10, 11) :
        return 3
    if x in (12, 1, 2) :
        return 2
    if x in (3, 4, 5, 6) :
        return 1
    return 0

In [None]:
# Season code derived from the month column, for both splits.
for frame in (train, test):
    frame['season'] = frame['month'].apply(get_season)

## 요일

일반적인 요일 순서대로가 아닌 LabelEncoding으로 진행

In [None]:
# Re-fit the shared encoder on the training weekdays and store the integer codes.
train['day_of_week'] = le.fit_transform(train['day_of_week'])

In [None]:
# Make weekday labels that occur only in test known to the fitted encoder so
# that transform() does not raise on unseen labels.  Each unseen label is
# appended after the classes learned from train.
for category in np.unique(test['day_of_week']) :
    if category not in le.classes_ :
        le.classes_ = np.append(le.classes_, category)
test['day_of_week'] = le.transform(test['day_of_week'])

## 도로명

도로명 LabelEncoding

In [None]:
# Re-fit the shared encoder on the training road names and store the integer codes.
train['road_name'] = le.fit_transform(train['road_name'])

In [None]:
# Make road names that occur only in test known to the fitted encoder so
# that transform() does not raise on unseen labels.  Each unseen name is
# appended after the classes learned from train.
for category in np.unique(test['road_name']) :
    if category not in le.classes_ :
        le.classes_ = np.append(le.classes_, category)
test['road_name'] = le.transform(test['road_name'])

## 시작 노드 == 종료 노드 여부

In [None]:
# Boolean flag: start and end node carry the same name.
for frame in (train, test):
    frame['node_same'] = frame['start_node_name'].eq(frame['end_node_name'])

## 기타 칼럼 LabelEncoding

In [None]:
# Re-fit the shared encoder on the training values and store the integer codes.
train['start_turn_restricted'] = le.fit_transform(train['start_turn_restricted'])

In [None]:
# Make values that occur only in test known to the fitted encoder so that
# transform() does not raise on unseen labels.  Each unseen value is appended
# after the classes learned from train.
for category in np.unique(test['start_turn_restricted']) :
    if category not in le.classes_ :
        le.classes_ = np.append(le.classes_, category)

test['start_turn_restricted'] = le.transform(test['start_turn_restricted'])

In [None]:
# Re-fit the shared encoder on the training values and store the integer codes.
train['end_turn_restricted'] = le.fit_transform(train['end_turn_restricted'])

In [None]:
# Make values that occur only in test known to the fitted encoder so that
# transform() does not raise on unseen labels.  Each unseen value is appended
# after the classes learned from train.
for category in np.unique(test['end_turn_restricted']) :
    if category not in le.classes_ :
        le.classes_ = np.append(le.classes_, category)
test['end_turn_restricted'] = le.transform(test['end_turn_restricted'])

## 모델링 사용 제외 칼럼 

In [None]:
# The raw node names and the date helpers have been turned into features
# above; remove them from both splits in place.
excluded_cols = ['start_node_name', 'end_node_name', 'date', 'base_date']
for frame in (train, test):
    frame.drop(columns = excluded_cols, inplace = True)

## External Data

공공 데이터 포럼 (2022년 8월 이전)

* 무인교통단속카메라
* 전국초중등학교기본정보
* 어린이보호구역
* 제주시 주차장 정보
* 서귀포시 주차장 정보

GPS 값

In [None]:
# Distinct (start, end) coordinate combinations found in train; the facility
# counts below are attached to this frame, one row per combination.
gps_comb = train[['start_latitude', 'start_longitude', 'end_latitude', 'end_longitude']].drop_duplicates()

### 무인 단속 카메라

In [None]:
# Unmanned traffic-enforcement camera list (cp949-encoded CSV read from the
# current working directory).
cctv = pd.read_csv('경찰청_제주특별자치도경찰청_무인교통단속카메라_20220616.csv', encoding = 'cp949')

In [None]:
# Keep the columns from position 3 up to (not including) the last seven,
# then remove the two address columns.
cctv = cctv.iloc[:, 3:-7]
cctv = cctv.drop(columns = ['소재지도로명주소', '소재지지번주소'])

### 전국 초중등학교 기본 정보

In [None]:
# Elementary / secondary school list (cp949-encoded CSV read from the
# current working directory).
school = pd.read_csv('초중등학교.csv', encoding = 'cp949')

In [None]:
# Keep rows whose reference date is not later than 2022-07-31 and whose
# education-office name contains '제주'.
# NOTE(review): the date test compares against a string literal, so it
# presumably relies on the column holding ISO 'YYYY-MM-DD' text - confirm.
dated_in_range = school['데이터기준일자'] <= '2022-07-31'
jeju_office = school['시도교육청명'].str.contains('제주')
school = school[dated_in_range & jeju_office]

### 어린이 보호 구역

In [None]:
# Child-protection-zone list (cp949-encoded CSV read from the current
# working directory).
child = pd.read_csv('제주특별자치도_어린이보호구역_20220513.csv', encoding = 'cp949')

### 제주시 주차장

In [None]:
# Jeju-si parking-lot list (cp949-encoded CSV read from the current working
# directory).
parking1 = pd.read_csv("제주특별자치도_제주시_주차장정보_20210818_1630391997093_77385.csv", encoding = 'cp949')

In [None]:
# Drop parking lots with a missing latitude ('위도') or longitude ('경도'), in place.
parking1.dropna(subset = ['위도', '경도'], inplace = True)

### 서귀포시 주차장

In [None]:
# Seogwipo-si parking-lot list (cp949-encoded CSV read from the current
# working directory).
parking2 = pd.read_csv("제주특별자치도_서귀포시_주차장정보_20220425_1650966840250_33855.csv", encoding = 'cp949')

### 거리 계산

In [None]:
def cal_dist(x1, y1, x2, y2, a, b):
    """Distance from point ``(a, b)`` to the line through two points.

    Args:
        x1, y1: Scalar coordinates of the first point on the line.
        x2, y2: Scalar coordinates of the second point on the line.
        a, b: Scalar coordinates of the point being measured.

    Returns:
        The perpendicular distance, in the same planar units as the inputs.
        It is measured to the infinite line, not clamped to the segment
        between the two points.  If the two line points coincide there is
        no line, so the plain distance between ``(a, b)`` and that single
        point is returned instead of dividing by zero.
    """
    base = ((x1 - x2) ** 2 + (y1 - y2) ** 2) ** 0.5

    # Degenerate "line": both defining points are identical.
    if base == 0:
        return ((x1 - a) ** 2 + (y1 - b) ** 2) ** 0.5

    # |cross product| is twice the triangle area; dividing by the base
    # length yields the triangle height, i.e. the point-to-line distance.
    area = abs((x1 - a) * (y2 - b) - (y1 - b) * (x2 - a))
    return area / base

train 데이터의 도로와 각 시설 및 구역의 위경도 좌표의 거리(위경도 좌표상 거리)가 0.0005 이내일 경우 count + 1



In [None]:
def get_node_cnt(gps_values, infra_values) :
    """Count, for every road segment, the facilities lying close to it.

    Args:
        gps_values: Frame whose rows unpack as ``(y1, x1, y2, x2)``; the
            callers in this notebook pass start/end latitude and longitude.
        infra_values: Frame whose rows unpack as ``(a, b)``; the callers in
            this notebook pass longitude then latitude.

    Returns:
        A list with one integer per row of ``gps_values``: the number of
        facility points whose ``cal_dist`` to the segment's line is below
        0.0005 (raw coordinate units).
    """
    threshold = 0.0005
    facility_points = infra_values.values

    counts = []
    for seg_y1, seg_x1, seg_y2, seg_x2 in gps_values.values :
        nearby = sum(
            1
            for px, py in facility_points
            if cal_dist(seg_x1, seg_y1, seg_x2, seg_y2, px, py) < threshold
        )
        counts.append(nearby)

    return counts

In [None]:
# Facility counts per coordinate combination.  Every facility table is
# passed as (longitude, latitude) columns; the two parking tables are
# counted separately and then added element-wise.
lng_lat = ['경도', '위도']
cctv_cnt = get_node_cnt(gps_comb, cctv[lng_lat])
school_cnt = get_node_cnt(gps_comb, school[lng_lat])
child_cnt = get_node_cnt(gps_comb, child[lng_lat])
parking1_cnt = get_node_cnt(gps_comb, parking1[lng_lat])
parking2_cnt = get_node_cnt(gps_comb, parking2[lng_lat])
parking_cnt = list(np.add(parking1_cnt, parking2_cnt))

In [None]:
# Attach the four count lists to the coordinate-combination frame.
for column_name, counts in (
    ('CCTV_cnt', cctv_cnt),
    ('school_cnt', school_cnt),
    ('child_cnt', child_cnt),
    ('parking_cnt', parking_cnt),
):
    gps_comb[column_name] = counts

### Fill NaN

In [None]:
# Left-join the facility counts onto both splits (the join keys are the
# columns the frames share with gps_comb).  For test, every remaining NaN in
# the merged frame - not only in the count columns - is replaced with 0.
train = train.merge(gps_comb, how = 'left')
test = test.merge(gps_comb, how = 'left')
test = test.fillna(0)

### 제주 공항까지 거리 (Km)

train과 test의 시작 위경도 좌표와 제주 공항 위경도 좌표까지의 거리(km)

In [None]:
# (latitude, longitude) used as the Jeju airport reference point.
jeju_air = 33.506683, 126.493177

In [None]:
# Haversine distance (km) from each segment's start point to jeju_air.
for frame in (train, test):
    frame['j_a_dist'] = [
        haversine((lat, lng), jeju_air, unit = 'km')
        for lat, lng in frame[['start_latitude', 'start_longitude']].values
    ]

### 한라산까지 거리 (Km)

train과 test의 시작 위경도 좌표와 한라산 위경도 좌표까지의 거리(km)

In [None]:
# (latitude, longitude) used as the Hallasan reference point.
hanla = 33.36168194, 126.5291548

In [None]:
# Haversine distance (km) from each segment's start point to hanla.
for frame in (train, test):
    frame['h_a_dist'] = [
        haversine((lat, lng), hanla, unit = 'km')
        for lat, lng in frame[['start_latitude', 'start_longitude']].values
    ]

# Modeling

* lane_count를 1, 2, 3으로 나누어 모델링
* LGBM, XGBoost는 optuna로 파라미터 튜닝

## Split X / y

In [None]:
# Separate the label from the features and give test the same column order.
y = train['target']
X = train.drop(columns = ['target'])

target = test[X.columns]

## StratifiedKFold

In [None]:
# Shuffled, seeded 10-fold splitter shared by every model below.
skf = StratifiedKFold(n_splits = 10, random_state = 42, shuffle = True)

In [None]:
# One feature frame per lane_count value (1, 2, 3); the lane_count column
# itself is removed because it is constant within each subset.
X1, X2, X3 = (
    X.loc[X['lane_count'] == lanes].drop(columns = 'lane_count')
    for lanes in (1, 2, 3)
)

In [None]:
# Labels that belong to each lane_count subset, selected by index label.
y1, y2, y3 = y.loc[X1.index], y.loc[X2.index], y.loc[X3.index]

In [None]:
# The (encoded) weekday is handed to StratifiedKFold.split as the
# stratification variable for each subset.
standard1, standard2, standard3 = (subset['day_of_week'] for subset in (X1, X2, X3))

In [None]:
# Test features in the same column order as X (repeats the assignment made
# when X and y were split).
target = test[X.columns]

In [None]:
# Test rows per lane_count value, restricted to the columns of the matching
# training subset.
test_lanes = target['lane_count']
target1 = target.loc[test_lanes == 1, X1.columns]
target2 = target.loc[test_lanes == 2, X2.columns]
target3 = target.loc[test_lanes == 3, X3.columns]

## Catboost

### Catboost 1

In [None]:
# Cross-validated CatBoost for the X1 / y1 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target1 is added to cb_pred1
# with weight 1 / n_splits, so cb_pred1 ends up as the average over folds.
cb_mae = []
cb_pred1 = np.zeros(len(target1))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X1, standard1)):

    fit_x, fit_y = X1.iloc[fit_rows], y1.iloc[fit_rows]
    holdout_x, holdout_y = X1.iloc[holdout_rows], y1.iloc[holdout_rows]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033, use_best_model = True, iterations = 10000, eval_metric = 'MAE')
    cb.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 5, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, cb.predict(holdout_x).astype(int))
    cb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    cb_pred1 += cb.predict(target1) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./cb_pred1', cb_pred1)

### Catboost 2

In [None]:
# Cross-validated CatBoost for the X2 / y2 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target2 is added to cb_pred2
# with weight 1 / n_splits, so cb_pred2 ends up as the average over folds.
cb_mae = []
cb_pred2 = np.zeros(len(target2))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X2, standard2)):

    fit_x, fit_y = X2.iloc[fit_rows], y2.iloc[fit_rows]
    holdout_x, holdout_y = X2.iloc[holdout_rows], y2.iloc[holdout_rows]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033, use_best_model = True, iterations = 10000, eval_metric = 'MAE')
    cb.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 5, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, cb.predict(holdout_x).astype(int))
    cb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}\n")

    cb_pred2 += cb.predict(target2) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./cb_pred2', cb_pred2)

### Catboost 3

In [None]:
# Cross-validated CatBoost for the X3 / y3 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target3 is added to cb_pred3
# with weight 1 / n_splits, so cb_pred3 ends up as the average over folds.
cb_mae = []
cb_pred3 = np.zeros(len(target3))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X3, standard3)):

    fit_x, fit_y = X3.iloc[fit_rows], y3.iloc[fit_rows]
    holdout_x, holdout_y = X3.iloc[holdout_rows], y3.iloc[holdout_rows]

    cb = CatBoostRegressor(max_depth = 8, learning_rate = 0.033, use_best_model = True, iterations = 10000, eval_metric = 'MAE')
    cb.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 8, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, cb.predict(holdout_x).astype(int))
    cb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}\n")

    cb_pred3 += cb.predict(target3) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(cb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./cb_pred3', cb_pred3)

## LGBMRegressor

### LGBMRegressor 1

In [None]:
# LightGBM settings for the X1 (lane_count == 1) model; the notebook notes
# say the LGBM parameters were tuned with Optuna.
# NOTE(review): in LightGBM, `subsample` (bagging_fraction) only has an
# effect when `subsample_freq` (bagging_freq) is non-zero, and that is not
# set here - confirm whether row subsampling was intended.
lgbm_param1 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae',
    'verbose' : -1,
    'random_state' : 42,
    'n_estimators' : 1468,
    'learning_rate' : 0.033,
    'max_depth' : 29,
    'min_child_samples' : 16,
    'subsample' : 0.7,
    'colsample_bytree' : 0.9,
    'num_leaves' : 979
}

In [None]:
# Cross-validated LightGBM for the X1 / y1 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target1 is added to lgbm_pred1
# with weight 1 / n_splits, so lgbm_pred1 ends up as the average over folds.
lgbm_mae = []
lgbm_pred1 = np.zeros(len(target1))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X1, standard1)):

    fit_x, fit_y = X1.iloc[fit_rows], y1.iloc[fit_rows]
    holdout_x, holdout_y = X1.iloc[holdout_rows], y1.iloc[holdout_rows]

    lgbm = LGBMRegressor(**lgbm_param1)
    lgbm.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 5, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, lgbm.predict(holdout_x).astype(int))
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    lgbm_pred1 += lgbm.predict(target1) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./lgbm_pred1', lgbm_pred1)

### LGBMRegressor 2

In [None]:
# LightGBM settings for the X2 (lane_count == 2) model; the notebook notes
# say the LGBM parameters were tuned with Optuna.
lgbm_param2 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae',
    'verbose' : -1,
    'random_state' : 42,
    'n_estimators' : 1498,
    'learning_rate' : 0.043,
    'max_depth' : 21,
    'min_child_samples' : 23,
    'subsample' : 1.0,
    'colsample_bytree' : 1.0,
    'num_leaves' : 662
}

In [None]:
# Cross-validated LightGBM for the X2 / y2 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target2 is added to lgbm_pred2
# with weight 1 / n_splits, so lgbm_pred2 ends up as the average over folds.
lgbm_mae = []
lgbm_pred2 = np.zeros(len(target2))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X2, standard2)):

    fit_x, fit_y = X2.iloc[fit_rows], y2.iloc[fit_rows]
    holdout_x, holdout_y = X2.iloc[holdout_rows], y2.iloc[holdout_rows]

    lgbm = LGBMRegressor(**lgbm_param2)
    lgbm.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 5, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, lgbm.predict(holdout_x).astype(int))
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}\n")

    lgbm_pred2 += lgbm.predict(target2) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./lgbm_pred2', lgbm_pred2)

### LGBMRegressor 3

In [None]:
# LightGBM settings for the X3 (lane_count == 3) model; the notebook notes
# say the LGBM parameters were tuned with Optuna.
# NOTE(review): in LightGBM, `subsample` (bagging_fraction) only has an
# effect when `subsample_freq` (bagging_freq) is non-zero, and that is not
# set here - confirm whether row subsampling was intended.
lgbm_param3 = {
    'objective' : 'regression',
    'device' : 'gpu',
    'metric' : 'mae',
    'verbose' : -1,
    'random_state' : 42,
    'n_estimators' : 627,
    'learning_rate' : 0.047,
    'max_depth' : 23,
    'min_child_samples' : 28,
    'subsample' : 0.6,
    'colsample_bytree' : 1.0,
    'num_leaves' : 819
}

In [None]:


# Cross-validated LightGBM for the X3 / y3 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target3 is added to lgbm_pred3
# with weight 1 / n_splits, so lgbm_pred3 ends up as the average over folds.
lgbm_mae = []
lgbm_pred3 = np.zeros(len(target3))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X3, standard3)):

    fit_x, fit_y = X3.iloc[fit_rows], y3.iloc[fit_rows]
    holdout_x, holdout_y = X3.iloc[holdout_rows], y3.iloc[holdout_rows]

    lgbm = LGBMRegressor(**lgbm_param3)
    lgbm.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 8, verbose = 2500)

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, lgbm.predict(holdout_x).astype(int))
    lgbm_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}\n")

    lgbm_pred3 += lgbm.predict(target3) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(lgbm_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./lgbm_pred3', lgbm_pred3)

## XGBoost

### XGBoost 1

In [None]:
# XGBoost settings for the X1 (lane_count == 1) model; the notebook notes
# say the XGBoost parameters were tuned with Optuna.
# NOTE(review): 'gpu_hist' / 'predictor' are believed to be deprecated or
# removed in XGBoost 2.x - confirm against the installed version.
xgb_param1 = {
    'objective' : 'reg:squarederror',
    'n_estimators' : 574,
    'learning_rate' : 0.031000000000000003,
    'nthread' : -1,
    'max_depth' : 15,
    'min_child_weight' : 7,
    'gamma' : 0.35000000000000003,
    'colsample_bytree' : 0.8,
    'lambda' : 0.8500000000000001,
    'alpha' : 0.0,
    'subsample' : 1.0,
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor'
}

In [None]:
# Cross-validated XGBoost for the X1 / y1 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target1 is added to xgb_pred1
# with weight 1 / n_splits, so xgb_pred1 ends up as the average over folds.
xgb_mae = []
xgb_pred1 = np.zeros(len(target1))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X1, standard1)):

    fit_x, fit_y = X1.iloc[fit_rows], y1.iloc[fit_rows]
    holdout_x, holdout_y = X1.iloc[holdout_rows], y1.iloc[holdout_rows]

    xgb = XGBRegressor(**xgb_param1)
    xgb.fit(fit_x, fit_y, eval_set = [(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 8, verbose = 1000, eval_metric = 'mae')

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, xgb.predict(holdout_x).astype(int))
    xgb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    xgb_pred1 += xgb.predict(target1) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./xgb_pred1', xgb_pred1)

### XGBoost 2

In [None]:
# XGBoost settings for the X2 (lane_count == 2) model; the notebook notes
# say the XGBoost parameters were tuned with Optuna.
# NOTE(review): 'gpu_hist' / 'predictor' are believed to be deprecated or
# removed in XGBoost 2.x - confirm against the installed version.
xgb_param2 = {
    'objective' : 'reg:squarederror',
    'n_estimators' : 999,
    'learning_rate' : 0.047,
    'nthread' : -1,
    'max_depth' : 16,
    'min_child_weight' : 10,
    'gamma' : 0.8500000000000001,
    'colsample_bytree' : 0.7,
    'lambda' : 0.65,
    'alpha' : 0.45,
    'subsample' : 1.0,
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor'
}

In [None]:
# Cross-validated XGBoost for the X2 / y2 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target2 is added to xgb_pred2
# with weight 1 / n_splits, so xgb_pred2 ends up as the average over folds.
xgb_mae = []
xgb_pred2 = np.zeros(len(target2))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X2, standard2)):

    fit_x, fit_y = X2.iloc[fit_rows], y2.iloc[fit_rows]
    holdout_x, holdout_y = X2.iloc[holdout_rows], y2.iloc[holdout_rows]

    xgb = XGBRegressor(**xgb_param2)
    xgb.fit(fit_x, fit_y, eval_set=[(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 8, verbose = 1000, eval_metric = 'mae')

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, xgb.predict(holdout_x).astype(int))
    xgb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    xgb_pred2 += xgb.predict(target2) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./xgb_pred2', xgb_pred2)

### XGBoost 3

In [None]:
# XGBoost settings for the X3 (lane_count == 3) model; the notebook notes
# say the XGBoost parameters were tuned with Optuna.
# NOTE(review): 'gpu_hist' / 'predictor' are believed to be deprecated or
# removed in XGBoost 2.x - confirm against the installed version.
xgb_param3 = {
    'objective' : 'reg:squarederror',
    'n_estimators' : 310,
    'learning_rate' : 0.031000000000000003,
    'nthread' : -1,
    'max_depth' : 14,
    'min_child_weight' : 29,
    'gamma' : 0.8,
    'colsample_bytree' : 0.8,
    'lambda' : 0.6000000000000001,
    'alpha' : 0.35000000000000003,
    'subsample' : 1.0,
    'tree_method' : 'gpu_hist',
    'predictor' : 'gpu_predictor'
}

In [None]:
# Cross-validated XGBoost for the X3 / y3 subset.  Each fold model is scored
# on its hold-out rows, and its prediction for target3 is added to xgb_pred3
# with weight 1 / n_splits, so xgb_pred3 ends up as the average over folds.
xgb_mae = []
xgb_pred3 = np.zeros(len(target3))

for i, (fit_rows, holdout_rows) in enumerate(skf.split(X3, standard3)):

    fit_x, fit_y = X3.iloc[fit_rows], y3.iloc[fit_rows]
    holdout_x, holdout_y = X3.iloc[holdout_rows], y3.iloc[holdout_rows]

    xgb = XGBRegressor(**xgb_param3)
    xgb.fit(fit_x, fit_y, eval_set=[(fit_x, fit_y), (holdout_x, holdout_y)], early_stopping_rounds = 8, verbose = 1000, eval_metric = 'mae')

    # Hold-out predictions are truncated to integers before scoring.
    fold_mae = mean_absolute_error(holdout_y, xgb.predict(holdout_x).astype(int))
    xgb_mae.append(fold_mae)
    print(f"{i + 1} Fold MAE = {fold_mae}")

    xgb_pred3 += xgb.predict(target3) / skf.n_splits

print(f"\nAVG of MAE = {np.mean(xgb_mae)}")

In [None]:
# Persist the averaged prediction (np.save appends the .npy extension).
np.save('./xgb_pred3', xgb_pred3)

# Submission

In [None]:
# Submission template, read from the current working directory.
# NOTE(review): the Drive location stored in df_submission_path is not used
# here - confirm that a copy of sample_submission.csv exists in the cwd.
submission = pd.read_csv('./sample_submission.csv')

## Ensemble

In [None]:
### XGBoost
xgb_pred1 = np.load('./xgb_pred1.npy')
xgb_pred2 = np.load('./xgb_pred2.npy')
xgb_pred3 = np.load('./xgb_pred3.npy')

### LGBM
lgbm_pred1 = np.load('./lgbm_pred1.npy')
lgbm_pred2 = np.load('./lgbm_pred2.npy')
lgbm_pred3 = np.load('./lgbm_pred3.npy')

### CatBoost
cb_pred1 = np.load('./cb_pred1.npy')
cb_pred2 = np.load('./cb_pred2.npy')
cb_pred3 = np.load('./cb_pred3.npy')

In [None]:
# Ensemble - LGBM : XGBoost : CatBoost = 0.65 : 0.25 : 0.1
# The blended prediction of each lane_count subset is written to the
# submission rows that carry the same index labels as that subset.
w_lgbm, w_xgb, w_cb = 0.65, 0.25, 0.1

for subset_rows, lgbm_part, xgb_part, cb_part in (
    (target1.index, lgbm_pred1, xgb_pred1, cb_pred1),
    (target2.index, lgbm_pred2, xgb_pred2, cb_pred2),
    (target3.index, lgbm_pred3, xgb_pred3, cb_pred3),
):
    submission.loc[subset_rows, 'target'] = lgbm_part * w_lgbm + xgb_part * w_xgb + cb_part * w_cb

In [None]:
# Round the blended predictions to whole numbers and write the final file.
submission['target'] = submission['target'].round(0)
submission.to_csv('./lgbm_xgb_cb.csv', index = False)