In [None]:
# Colab-only: mount Google Drive so files under /content/drive become accessible.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import datetime
import os
import torch
import random

# Google Drive folders that hold the input CSV files and the generated submissions.
DATA_PATH = "/content/drive/MyDrive/DACON/대구/data/"
SUBMISSION_PATH = "/content/drive/MyDrive/DACON/대구/submission/"
# Base random seed handed to seed_everything().
SEED = 42

In [None]:
def seed_everything(seed):
    """Seed Python, NumPy and PyTorch RNGs and put cuDNN in a deterministic setup.

    Parameters
    ----------
    seed : int
        Value applied to ``PYTHONHASHSEED``, ``random``, ``numpy.random`` and the
        PyTorch CPU/CUDA generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same seeding calls, in the same order, expressed as one loop.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # cuDNN flags: deterministic kernels, no autotuning, and cuDNN itself switched off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


# Seed every RNG once, before any data is loaded.
seed_everything(SEED)

In [None]:
# Load the competition train/test tables and the nationwide accident table.
train_df, accident_df, test_df = (
    pd.read_csv(f"{DATA_PATH}{file_name}")
    for file_name in ("train.csv", "countrywide_accident.csv", "test.csv")
)

train_df.shape, test_df.shape, accident_df.shape

((39609, 23), (10963, 8), (602775, 23))

In [None]:
# Remove fog ('안개') weather rows from the training data.
train_df = train_df[train_df['기상상태'] != '안개']

# For the nationwide data also remove thawing ('해빙') / missing road-surface rows
# and railway-crossing ('철길건널목') accidents.
accident_keep = (
    (accident_df['기상상태'] != '안개')
    & (accident_df['노면상태'] != '해빙')
    & accident_df['노면상태'].notnull()
    & (accident_df['사고유형'] != '철길건널목')
)
accident_df = accident_df[accident_keep]

In [None]:
# Columns shared by the Daegu training data and the nationwide data.
cols = ['ID', '사고일시', '요일', '기상상태', '시군구', '도로형태', '노면상태', '사고유형', 'ECLO']
train_df, accident_df = (
    frame[cols].reset_index(drop=True) for frame in (train_df, accident_df)
)

train_df.shape, accident_df.shape

((39601, 9), (602140, 9))

In [None]:
import holidays

kr_holidays = holidays.KR()


def add_time_and_place_features(df):
    """Return a copy of ``df`` with calendar, address and road-type features.

    Added columns:
      * ``date`` (parsed from ``사고일시``), ``연``, ``월``, ``일``, ``시간``;
      * ``요일`` - day of week from the parsed date (Monday = 0), replacing any
        existing ``요일`` column;
      * ``공휴일`` - 1 when the date is contained in ``kr_holidays``, else 0;
      * ``시``/``구``/``동`` - the first three whitespace-separated parts of ``시군구``;
      * ``도로형태_1``/``도로형태_2`` - the two sides of ``도로형태`` around ' - '.

    Values that do not match a pattern yield NaN in the extracted columns.
    The input frame itself is left untouched.
    """
    df = df.copy()

    # Time features
    df['date'] = pd.to_datetime(df['사고일시'])
    df['연'] = df['date'].dt.year
    df['월'] = df['date'].dt.month
    df['일'] = df['date'].dt.day
    df['시간'] = df['date'].dt.hour
    df['요일'] = df['date'].dt.day_of_week
    df['공휴일'] = df['date'].apply(lambda x: int(x in kr_holidays))

    # Location features
    df[['시', '구', '동']] = df['시군구'].str.extract(r'(\S+) (\S+) (\S+)')

    # Road type
    df[['도로형태_1', '도로형태_2']] = df['도로형태'].str.extract(r'(.+) - (.+)')
    return df


train_cols = ['ID', 'ECLO', '연', '월', '시간', '요일', '공휴일', '시', '구', '동', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형']
test_cols = ['ID', '연', '월', '시간', '요일', '공휴일', '시', '구', '동', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형']

# One shared helper replaces the three copy-pasted feature sections.
train_df = add_time_and_place_features(train_df)[train_cols]
accident_df = add_time_and_place_features(accident_df)[train_cols]
test_df = add_time_and_place_features(test_df)[test_cols]

train_df.shape, accident_df.shape, test_df.shape

((39601, 15), (602140, 15), (10963, 14))

In [None]:
# Keep only nationwide accidents that happened in one of the cities listed below.
city = ['서울특별시', '인천광역시', '광주광역시', '부산광역시', '울산광역시', '대전광역시']
accident_df = accident_df[accident_df['시'].isin(city)].reset_index(drop=True)
accident_df

Unnamed: 0,ID,ECLO,연,월,시간,요일,공휴일,시,구,동,기상상태,도로형태_1,도로형태_2,노면상태,사고유형
0,COUNTRYWIDE_ACCIDENT_000000,3,2019,1,0,1,1,서울특별시,강서구,방화동,맑음,교차로,교차로횡단보도내,건조,차대사람
1,COUNTRYWIDE_ACCIDENT_000004,7,2019,1,0,1,1,인천광역시,부평구,부평동,맑음,교차로,교차로안,건조,차대차
2,COUNTRYWIDE_ACCIDENT_000005,3,2019,1,0,1,1,인천광역시,부평구,부평동,맑음,교차로,교차로부근,건조,차대사람
3,COUNTRYWIDE_ACCIDENT_000008,3,2019,1,0,1,1,광주광역시,광산구,수완동,맑음,단일로,기타,건조,차대차
4,COUNTRYWIDE_ACCIDENT_000011,10,2019,1,0,1,1,부산광역시,해운대구,우동,맑음,교차로,교차로안,건조,차대사람
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
227694,COUNTRYWIDE_ACCIDENT_602766,3,2021,12,23,4,0,서울특별시,서초구,반포동,맑음,단일로,기타,건조,차대사람
227695,COUNTRYWIDE_ACCIDENT_602769,5,2021,12,23,4,0,부산광역시,연제구,연산동,맑음,단일로,기타,건조,차대차
227696,COUNTRYWIDE_ACCIDENT_602770,8,2021,12,23,4,0,부산광역시,해운대구,우동,흐림,교차로,교차로안,건조,차대차
227697,COUNTRYWIDE_ACCIDENT_602771,6,2021,12,23,4,0,서울특별시,성동구,행당동,맑음,단일로,기타,건조,차대차


In [None]:
# Daegu CCTV data
cctv_df = pd.read_csv(f"{DATA_PATH}대구 CCTV 정보.csv", encoding='cp949')

cctv_df = cctv_df[['소재지지번주소', '단속구분', '설치연도', '제한속도']]
# One indicator column per enforcement type (단속구분_<code>).
cctv_df = pd.get_dummies(cctv_df, columns=['단속구분'])

# The first four whitespace-separated address tokens: city, district, neighbourhood, lot number.
pat = r'(\S+) (\S+) (\S+) (\S+)'

cctv_df[['시', '구', '동', '번지']] = cctv_df['소재지지번주소'].str.extract(pat)
cctv_df = cctv_df.drop(columns=['소재지지번주소', '번지'])

# Some addresses abbreviate the city as '대구'. Those rows would form separate
# groups and never match the full name in the later 시/구/동 merge, so the
# spelling is unified first.
# NOTE(review): assumes the accident tables spell the city '대구광역시' - confirm.
cctv_df['시'] = cctv_df['시'].replace({'대구': '대구광역시'})

# Per (city, district, neighbourhood, install year): mean speed limit and
# number of cameras of each enforcement type.
agg_dict = {
    '제한속도' : 'mean',
    '단속구분_1' : 'sum',
    '단속구분_2' : 'sum',
    '단속구분_4' : 'sum',
    '단속구분_99' : 'sum',
}
cctv_df = cctv_df.groupby(['시', '구', '동', '설치연도']).agg(agg_dict).reset_index()
cctv_df

Unnamed: 0,시,구,동,설치연도,제한속도,단속구분_1,단속구분_2,단속구분_4,단속구분_99
0,대구,달성군,가창면,2021.0,45.000000,2,0,0,0
1,대구,달성군,구지면,2021.0,30.000000,1,0,0,0
2,대구,달성군,다사읍,2021.0,37.500000,2,6,0,0
3,대구,달성군,옥포읍,2021.0,41.666667,3,2,0,1
4,대구,달성군,유가읍,2021.0,46.666667,0,2,0,1
...,...,...,...,...,...,...,...,...,...
529,대구광역시,중구,종로1가,2009.0,0.000000,0,0,1,0
530,대구광역시,중구,종로1가,2013.0,50.000000,0,1,0,0
531,대구광역시,중구,종로2가,2021.0,0.000000,0,0,1,0
532,대구광역시,중구,태평로1가,2006.0,0.000000,0,0,1,0


In [None]:
agg_dict = {
    '제한속도' : 'mean',
    '단속구분_1' : 'sum',
    '단속구분_2' : 'sum',
    '단속구분_4' : 'sum',
    '단속구분_99' : 'sum',
}


def _cctv_until(year):
    """Aggregate, per 시/구/동, every CCTV row installed in ``year`` or earlier."""
    installed = cctv_df[cctv_df['설치연도'] <= year]
    return (
        installed
        .drop(columns=['설치연도'])
        .groupby(['시', '구', '동'])
        .agg(agg_dict)
        .reset_index()
    )


# CCTV installation status as of 2019, 2020 and 2021.
cctv_2019, cctv_2020, cctv_2021 = map(_cctv_until, (2019, 2020, 2021))

In [None]:
# Join each year's accidents (2019-2021) with the CCTV snapshot of that same year.
cctv_by_year = {2019: cctv_2019, 2020: cctv_2020, 2021: cctv_2021}
train_df = pd.concat(
    [
        train_df[train_df['연'] == year].merge(snapshot, how='left', on=['시', '구', '동'])
        for year, snapshot in cctv_by_year.items()
    ]
).reset_index(drop=True)

# The test set (the "2022 accidents" in the original comment) gets the 2021 snapshot.
test_df = test_df.merge(cctv_2021, how='left', on=['시', '구', '동'])

train_df.shape, test_df.shape

((39601, 20), (10963, 19))

In [None]:
# Daegu street-light (보안등) data
security_raw = pd.read_csv(f"{DATA_PATH}대구 보안등 정보.csv", encoding='cp949')

# Split the lot-number address into city / district / neighbourhood / lot number.
pat = r'(\S+) (\S+) (\S+) (\S+)'
address_parts = security_raw['소재지지번주소'].str.extract(pat)
address_parts.columns = ['시', '구', '동', '번지']

# Total number of installed lights per city / district / neighbourhood.
security_df = (
    security_raw[['설치개수']]
    .join(address_parts[['시', '구', '동']])
    .groupby(['시', '구', '동'])
    .sum()
    .reset_index()
)
security_df

  security_df = pd.read_csv(f"{DATA_PATH}대구 보안등 정보.csv", encoding='cp949')


Unnamed: 0,시,구,동,설치개수
0,대구광역시,남구,대명동,5377
1,대구광역시,남구,봉덕동,1424
2,대구광역시,남구,이천동,556
3,대구광역시,달서구,갈산동,349
4,대구광역시,달서구,감삼동,932
...,...,...,...,...
223,대구광역시,중구,태평로2가,38
224,대구광역시,중구,태평로3가,47
225,대구광역시,중구,포정동,18
226,대구광역시,중구,향촌동,28


In [None]:
# Attach the street-light counts to both the train and the test accidents.
train_df, test_df = (
    frame.merge(security_df, how='left', on=['시', '구', '동'])
    for frame in (train_df, test_df)
)

train_df.shape, test_df.shape

((39601, 21), (10963, 20))

In [None]:
# Nationwide CCTV data (external standard dataset)
kr_cctv_df = pd.read_csv(f"{DATA_PATH}외부_전국CCTV표준데이터.csv", encoding='cp949')

kr_cctv_df = kr_cctv_df[['소재지지번주소', '단속구분', '설치연도', '제한속도']]

# Split the combined '01+02' code: a copy of those rows is tagged as type 2 ...
kr_cctv_df_2 = kr_cctv_df[kr_cctv_df['단속구분'] == '01+02'].copy()
kr_cctv_df_2['단속구분'] = 2

# ... while the original rows become type 1; the remaining string codes are
# mapped to their integer value.
kr_cctv_df.loc[kr_cctv_df['단속구분'] == '01+02', '단속구분'] = 1
kr_cctv_df.loc[kr_cctv_df['단속구분'] == '04', '단속구분'] = 4
kr_cctv_df.loc[kr_cctv_df['단속구분'] == '02', '단속구분'] = 2
kr_cctv_df.loc[kr_cctv_df['단속구분'] == '01', '단속구분'] = 1
kr_cctv_df.loc[kr_cctv_df['단속구분'] == '03', '단속구분'] = 3

# Append the type-2 copies, cast every code to int and discard type 3 rows.
kr_cctv_df = pd.concat([kr_cctv_df, kr_cctv_df_2])
kr_cctv_df['단속구분'] = kr_cctv_df['단속구분'].astype(int)
kr_cctv_df.drop(kr_cctv_df[kr_cctv_df['단속구분'] == 3].index, inplace=True)

# One indicator column per remaining enforcement type (단속구분_<code>).
kr_cctv_df = pd.get_dummies(kr_cctv_df, columns=['단속구분'])

# The first four whitespace-separated address tokens: 시, 구, 동, 번지.
pat = r'(\S+) (\S+) (\S+) (\S+)'

kr_cctv_df[['시', '구', '동', '번지']] = kr_cctv_df['소재지지번주소'].str.extract(pat)
kr_cctv_df = kr_cctv_df.drop(columns=['소재지지번주소', '번지'])

# Mean speed limit and per-type camera counts.
agg_dict = {
    '제한속도' : 'mean',
    '단속구분_1' : 'sum',
    '단속구분_2' : 'sum',
    '단속구분_4' : 'sum',
    '단속구분_99' : 'sum',
}

# Aggregate per 시 / 구 / 동 / installation year.
kr_cctv_df = kr_cctv_df.groupby(['시', '구', '동', '설치연도']).agg(agg_dict).reset_index()
kr_cctv_df

  kr_cctv_df = pd.read_csv(f"{DATA_PATH}외부_전국CCTV표준데이터.csv", encoding='cp949')


Unnamed: 0,시,구,동,설치연도,제한속도,단속구분_1,단속구분_2,단속구분_4,단속구분_99
0,강원도,강릉시,강문동,2021.0,0.0,0,0,2,0
1,강원도,강릉시,강문동,2022.0,0.0,0,0,1,0
2,강원도,강릉시,견소동,2017.0,0.0,0,0,4,0
3,강원도,강릉시,견소동,2018.0,0.0,0,0,2,0
4,강원도,강릉시,견소동,2019.0,0.0,0,0,2,0
...,...,...,...,...,...,...,...,...,...
11039,충청북도,충주시,호암동,2017.0,60.0,0,1,0,0
11040,충청북도,충주시,호암동,2019.0,60.0,0,1,0,0
11041,충청북도,충주시,호암동,2020.0,0.0,0,0,7,0
11042,충청북도,충주시,호암동,2021.0,30.0,0,1,0,0


In [None]:
agg_dict = {
    '제한속도' : 'mean',
    '단속구분_1' : 'sum',
    '단속구분_2' : 'sum',
    '단속구분_4' : 'sum',
    '단속구분_99' : 'sum',
}


def _kr_cctv_until(year):
    """Aggregate, per 시/구/동, every nationwide CCTV row installed in ``year`` or earlier."""
    installed = kr_cctv_df[kr_cctv_df['설치연도'] <= year]
    return (
        installed
        .drop(columns=['설치연도'])
        .groupby(['시', '구', '동'])
        .agg(agg_dict)
        .reset_index()
    )


# Nationwide CCTV installation status as of 2019, 2020 and 2021.
kr_cctv_2019, kr_cctv_2020, kr_cctv_2021 = map(_kr_cctv_until, (2019, 2020, 2021))

In [None]:
# Join each year's nationwide accidents (2019-2021) with that year's CCTV snapshot.
kr_cctv_by_year = {2019: kr_cctv_2019, 2020: kr_cctv_2020, 2021: kr_cctv_2021}
accident_df = pd.concat(
    [
        accident_df[accident_df['연'] == year].merge(snapshot, how='left', on=['시', '구', '동'])
        for year, snapshot in kr_cctv_by_year.items()
    ]
).reset_index(drop=True)

accident_df.shape

(227699, 20)

In [None]:
# Nationwide street-light (보안등) data
kr_security_raw = pd.read_csv(f"{DATA_PATH}외부_전국보안등정보표준데이터.csv", encoding='cp949')

# Missing values (including a missing installation year) are replaced by 0.
kr_security_filled = kr_security_raw[['소재지지번주소', '설치개수', '설치연도']].fillna(0)

# Split the lot-number address into 시 / 구 / 동 / 번지.
pat = r'(\S+) (\S+) (\S+) (\S+)'
kr_address_parts = kr_security_filled['소재지지번주소'].str.extract(pat)
kr_address_parts.columns = ['시', '구', '동', '번지']

# Number of installed lights per 시 / 구 / 동 / installation year.
kr_security_df = (
    kr_security_filled[['설치개수', '설치연도']]
    .join(kr_address_parts[['시', '구', '동']])
    .groupby(['시', '구', '동', '설치연도'])
    .sum()
    .reset_index()
)
kr_security_df

Unnamed: 0,시,구,동,설치연도,설치개수
0,강원도,강릉시,내곡동,0.0,6
1,강원도,강릉시,사천면,0.0,1
2,강원도,강릉시,회산동,0.0,1
3,강원도,삼척시,노곡면,0.0,10
4,강원도,삼척시,노곡면,2018.0,18
...,...,...,...,...,...
985,충청북도,제천시,두학동,0.0,25
986,충청북도,제천시,모산동,0.0,8
987,충청북도,제천시,봉양읍,0.0,328
988,충청북도,제천시,송학면,0.0,62


In [None]:
def _kr_security_until(year):
    """Sum, per 시/구/동, the street lights installed in ``year`` or earlier."""
    installed = kr_security_df[kr_security_df['설치연도'] <= year]
    return (
        installed
        .drop(columns=['설치연도'])
        .groupby(['시', '구', '동'])
        .sum()
        .reset_index()
    )


# Street-light installation status as of 2019, 2020 and 2021.
kr_security_2019, kr_security_2020, kr_security_2021 = map(_kr_security_until, (2019, 2020, 2021))

In [None]:
# Join each year's nationwide accidents (2019-2021) with that year's street-light snapshot.
kr_security_by_year = {2019: kr_security_2019, 2020: kr_security_2020, 2021: kr_security_2021}
accident_df = pd.concat(
    [
        accident_df[accident_df['연'] == year].merge(snapshot, how='left', on=['시', '구', '동'])
        for year, snapshot in kr_security_by_year.items()
    ]
).reset_index(drop=True)

accident_df.shape

(227699, 21)

In [None]:
# Stack the nationwide accidents under the Daegu training rows with a fresh 0..n-1 index.
train_df = pd.concat([train_df, accident_df], ignore_index=True)
train_df.shape

(267300, 21)

In [None]:
# Upper-tail quantiles (95th / 97th / 99th percentile) of the ECLO target.
train_df["ECLO"].quantile([0.95, 0.97, 0.99])

0.95    10.0
0.97    12.0
0.99    16.0
Name: ECLO, dtype: float64

In [None]:
# Replace every remaining NaN, in all columns, with 0 (e.g. rows without a match in the left merges).
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [None]:
# Mean ECLO per (day of week, hour), computed on the training rows.
# The feature column is named 'ECLO_mean' because it holds a mean, not a
# standard deviation.
ECLO_mean = (
    train_df.groupby(['요일', '시간'])['ECLO']
    .mean()
    .rename('ECLO_mean')
    .reset_index()
)

train_df = pd.merge(train_df, ECLO_mean, how='left', on=['요일', '시간'])
test_df = pd.merge(test_df, ECLO_mean, how='left', on=['요일', '시간'])

# (day of week, hour) pairs that are missing from ECLO_mean get 0.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

train_df.shape, test_df.shape

((267300, 22), (10963, 21))

In [None]:
# Feature matrices: everything except the row ID (and, for train, the target).
train_ft = train_df.drop(columns = ['ID', 'ECLO']).copy()
test_ft = test_df.drop(columns = ['ID']).copy()

# Regression target.
target = train_df['ECLO']

train_ft.shape, test_ft.shape, target.shape

((267300, 20), (10963, 20), (267300,))

In [None]:
!pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.3


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [None]:
# Treat the calendar-derived integer columns as categorical (object dtype) in both frames.
calendar_dtypes = dict.fromkeys(['월', '시간', '요일', '공휴일'], 'object')
train_ft = train_ft.astype(calendar_dtypes)
test_ft = test_ft.astype(calendar_dtypes)

In [None]:
# Split the feature names by dtype: object columns are categorical, the rest numeric.
numeric_cols = train_ft.select_dtypes(exclude="object").columns.tolist()
category_cols = train_ft.select_dtypes(include="object").columns.tolist()

In [None]:
numeric_cols

['연', '제한속도', '단속구분_1', '단속구분_2', '단속구분_4', '단속구분_99', '설치개수', 'ECLO_std']

In [None]:
category_cols

['월',
 '시간',
 '요일',
 '공휴일',
 '시',
 '구',
 '동',
 '기상상태',
 '도로형태_1',
 '도로형태_2',
 '노면상태',
 '사고유형']

In [None]:
# Split the categorical columns by cardinality: at most 10 distinct values
# -> category_cols_1, more than 10 -> category_cols_2.
category_nunique = train_ft[category_cols].nunique()  # computed once, reused below
mask = category_nunique <= 10
category_cols_1 = category_nunique.loc[mask].index.tolist()
# `~` is the boolean inversion operator; unary minus on a boolean mask only
# works because pandas special-cases it.
category_cols_2 = category_nunique.loc[~mask].index.tolist()
category_cols_1, category_cols_2

(['요일', '공휴일', '시', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형'],
 ['월', '시간', '구', '동'])

In [None]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, train_test_split, cross_val_score, StratifiedKFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor



In [None]:
def rmsle(y_actual, y_pred):
    """Root mean squared logarithmic error between targets and predictions.

    Both inputs are converted to plain float arrays first, so two pandas
    objects are compared position by position instead of being aligned on
    their index labels (differently-indexed Series would otherwise give NaN).

    Parameters
    ----------
    y_actual : array-like
        Ground-truth values.
    y_pred : array-like
        Predicted values, same shape as ``y_actual``.
        Values of -1 or below make ``log1p`` return -inf/NaN.

    Returns
    -------
    float
        ``sqrt(mean((log1p(y_pred) - log1p(y_actual)) ** 2))``.
    """
    actual = np.asarray(y_actual, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    diff = np.log1p(predicted) - np.log1p(actual)
    mean_error = np.mean(np.square(diff))

    return float(np.sqrt(mean_error))

# scikit-learn scorer built from rmsle; greater_is_better=False marks it as a loss (lower is better).
rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [None]:
# def rmsle(y_valid, pred):
#     if pred < 0:
#         pred = 0
#     msle = mean_squared_log_error(y_valid, pred)
#     return np.sqrt(msle)

# rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [None]:
# Work on copies so the un-encoded feature frames (train_ft / test_ft) stay available.
train_enc = train_ft.copy()
test_enc = test_ft.copy()

In [None]:
enc = OneHotEncoder(handle_unknown = 'ignore')


def _one_hot_replace(frame, encoded):
    """Return ``frame`` with ``category_cols_1`` swapped for their one-hot columns.

    ``encoded`` is the sparse matrix ``enc`` produced for ``frame``. The dummy
    frame reuses ``frame.index`` so the column-wise concat pairs rows correctly
    whatever index ``frame`` carries (a fresh RangeIndex would only line up
    with a default-indexed frame).
    """
    dummies = pd.DataFrame(
        encoded.toarray(),
        columns=enc.get_feature_names_out(),
        index=frame.index,
    )
    return pd.concat([frame.drop(columns=category_cols_1), dummies], axis=1)


# Training data: the encoder is fitted here.
train_enc = _one_hot_replace(train_enc, enc.fit_transform(train_enc[category_cols_1]))

# Test data: reuse the fitted encoder; unknown categories encode as all zeros.
test_enc = _one_hot_replace(test_enc, enc.transform(test_enc[category_cols_1]))

In [None]:
# Target-encode each high-cardinality categorical column with its own encoder.
# NOTE(review): the encoder is fitted on the full training set, i.e. before the
# K-fold split used for validation, so validation folds have contributed to the
# encoded values - confirm this is acceptable.
for col in category_cols_2:
    en = TargetEncoder(cols=[col])
    train_enc[col] = en.fit_transform(train_ft[col], target)
    test_enc[col] = en.transform(test_ft[col])

In [None]:
# Standardise the numeric columns; statistics are fitted on train and reused for test.
scaler = StandardScaler()

train_enc[numeric_cols] = scaler.fit_transform(train_enc[numeric_cols])
test_enc[numeric_cols] = scaler.transform(test_enc[numeric_cols])

In [None]:
# Hyper-parameters passed to every XGBRegressor.
# NOTE(review): presumably the result of an earlier tuning run not shown here - confirm.
xgb_params = {'n_estimators': 2455,
 'max_depth': 20,
 'learning_rate': 0.017200236313192504,
 'gamma': 0.8817632945421009,
 'min_child_weight': 2,
 'subsample': 0.6003357298234289,
 'colsample_bytree': 0.658489995685688,
 'reg_alpha': 0.23143851956849681,
 'reg_lambda': 0.0001984067331754491}

In [None]:
from tqdm.auto import tqdm

SEED_scores = []  # mean validation RMSLE per seed
SEED_models = []  # per seed: the list of the five fold models

# Lower-case loop variable: looping over `SEED` would overwrite the
# module-level SEED constant and leave it at the last seed tried.
for seed in tqdm(range(51, 101)):
    print(f"{seed}시드 시작")
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    scores = []
    models = []
    for tri , vai in cv.split(train_enc): # K-fold ensemble
        # Training split
        x_train = train_enc.iloc[tri]
        y_train = target.iloc[tri]

        # Validation split
        x_valid = train_enc.iloc[vai]
        y_valid = target.iloc[vai]

        xgb_model = XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle', **xgb_params, random_state=seed)
        xgb_model.fit(x_train, y_train)
        pred = xgb_model.predict(x_valid)
        score = rmsle(y_valid,pred)
        scores.append(score)
        models.append(xgb_model) # keep the fitted model of this fold

    SEED_scores.append(np.mean(scores))
    SEED_models.append(models)
    print(f"{seed}번 시드 : {np.mean(scores)}")

  0%|          | 0/50 [00:00<?, ?it/s]

51시드 시작
51번 시드 : 0.4208288513144713
52시드 시작
52번 시드 : 0.4208474996403015
53시드 시작
53번 시드 : 0.42081167023223537
54시드 시작
54번 시드 : 0.42082788729496573
55시드 시작
55번 시드 : 0.4207952932761098
56시드 시작
56번 시드 : 0.4208145314731714
57시드 시작
57번 시드 : 0.4208016873881676
58시드 시작
58번 시드 : 0.42078144944259466
59시드 시작
59번 시드 : 0.4207976456243026
60시드 시작
60번 시드 : 0.420796270675304
61시드 시작
61번 시드 : 0.420821730256632
62시드 시작
62번 시드 : 0.42082168015274535
63시드 시작
63번 시드 : 0.4208289447604365
64시드 시작
64번 시드 : 0.42085070267022706
65시드 시작
65번 시드 : 0.4208192341028928
66시드 시작
66번 시드 : 0.4208481193829371
67시드 시작
67번 시드 : 0.42084096405079485
68시드 시작
68번 시드 : 0.4208021409694044
69시드 시작
69번 시드 : 0.4208164094560628
70시드 시작
70번 시드 : 0.4208111660409237
71시드 시작
71번 시드 : 0.42081410376304584
72시드 시작
72번 시드 : 0.4208142681567374
73시드 시작
73번 시드 : 0.42082320845353305
74시드 시작
74번 시드 : 0.42078468358817417
75시드 시작
75번 시드 : 0.42078095236882457
76시드 시작
76번 시드 : 0.42077624961196847
77시드 시작
77번 시드 : 0.4208275003570847
78시드 시작
78번 시드 : 0.

In [None]:
# Score table: position i in SEED_scores belongs to seed 51 + i.
df = pd.DataFrame({
    'SEED': range(51, 51 + len(SEED_scores)),
    'rmsle': SEED_scores,
})
df.sort_values(by='rmsle', ascending=True).head()

Unnamed: 0,SEED,rmsle
41,92,0.420771
25,76,0.420776
44,95,0.420779
24,75,0.420781
7,58,0.420781


In [None]:
from scipy.stats.mstats import gmean

# Positions in SEED_models of the seeds to ensemble.
seed_list = [7, 24, 25, 41, 44]

# For each selected seed: geometric mean of its fold models' test predictions.
seed_pred = [
    gmean([model.predict(test_enc) for model in tqdm(SEED_models[seed])])
    for seed in tqdm(seed_list)
]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

In [None]:
# One column per selected seed, one row per test sample; saved for the final blend.
df = pd.DataFrame(seed_pred).transpose()
df.to_csv(f"{SUBMISSION_PATH}xgboost_seed_ensemble_2.csv", index=False)

In [None]:
# Load the two saved prediction tables.
# NOTE(review): xgboost_seed_ensemble_1.csv is not written anywhere in this
# notebook - presumably produced by an earlier run over other seeds; confirm.
df_1 = pd.read_csv(f"{SUBMISSION_PATH}xgboost_seed_ensemble_1.csv")
df_2 = pd.read_csv(f"{SUBMISSION_PATH}xgboost_seed_ensemble_2.csv")

df_1.shape, df_2.shape

((10963, 5), (10963, 5))

In [None]:
# Put both prediction tables side by side and relabel the columns.
# The last five labels equal seed_list positions + 51 (58, 75, 76, 92, 95);
# the first five are presumably the seeds behind ensemble_1 - TODO confirm.
pred_df = pd.concat([df_1, df_2], axis=1)
pred_df.columns = ['11', '14', '25', '26', '29', '58', '75', '76', '92', '95']
pred_df

Unnamed: 0,11,14,25,26,29,58,75,76,92,95
0,4.047260,4.037252,4.030489,4.061422,3.989610,4.018698,4.024758,4.010468,4.020039,4.044565
1,3.560719,3.572872,3.564795,3.584267,3.530520,3.579330,3.597771,3.554172,3.565999,3.580670
2,5.294594,5.300498,5.290721,5.274647,5.321956,5.280195,5.277908,5.275187,5.259026,5.293240
3,4.551689,4.562748,4.527700,4.577699,4.553953,4.548910,4.519914,4.538658,4.555993,4.536845
4,4.741784,4.753705,4.762837,4.759659,4.749365,4.727862,4.713924,4.742209,4.763188,4.748295
...,...,...,...,...,...,...,...,...,...,...
10958,4.957747,4.988351,5.006458,4.951049,4.985218,4.933262,4.930366,4.954448,4.972680,5.034380
10959,4.363650,4.348391,4.352368,4.334810,4.364995,4.366984,4.335772,4.363782,4.350953,4.355153
10960,4.887463,4.913072,4.918961,4.906999,4.905258,4.902282,4.940409,4.895525,4.900190,4.899249
10961,4.608330,4.596182,4.619480,4.624791,4.618173,4.623754,4.621063,4.638512,4.625694,4.615605


In [None]:
from scipy.stats.mstats import gmean

# Final prediction: row-wise geometric mean over all columns of pred_df.
pred = gmean(pred_df.values, axis=1)
pred

array([4.02840911, 3.56906773, 5.28677159, ..., 4.90692107, 4.61914609,
       4.55237034])

In [None]:
# Fill the sample submission's ECLO column with the blended predictions.
submission = pd.read_csv(f'{SUBMISSION_PATH}sample_submission.csv')
submission['ECLO'] = pred
submission

Unnamed: 0,ID,ECLO
0,ACCIDENT_39609,4.028409
1,ACCIDENT_39610,3.569068
2,ACCIDENT_39611,5.286772
3,ACCIDENT_39612,4.547382
4,ACCIDENT_39613,4.746259
...,...,...
10958,ACCIDENT_50567,4.971299
10959,ACCIDENT_50568,4.353672
10960,ACCIDENT_50569,4.906921
10961,ACCIDENT_50570,4.619146


In [None]:
# Write the final submission file (without the DataFrame index).
submission.to_csv(f'{SUBMISSION_PATH}xgboost_seed_ensemble_final.csv', index=False)