In [2]:
# Mount Google Drive at /content/drive so the data folder used below is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
import pandas as pd
import numpy as np
import datetime
import os
import torch
import random

# One Drive folder serves both as the input location (CSV files) and as the
# output location for the submission file, so both names point at it.
SUBMISSION_PATH = '/content/drive/MyDrive/데이터 분석/데이콘 캐글 컴페티션/예슬 개인 참가/대구 교통사고 피해 예측/data/'
DATA_PATH = SUBMISSION_PATH

# Notebook-wide default random seed.
SEED = 42

In [5]:
def seed_everything(seed):
    """Seed the Python, NumPy and PyTorch random generators and fix cuDNN flags.

    Args:
        seed: Integer seed applied to every generator.

    Side effects:
        Writes ``PYTHONHASHSEED`` into ``os.environ``, seeds ``random``,
        ``numpy.random`` and torch (CPU, current CUDA device, all CUDA devices),
        and sets cuDNN to deterministic, non-benchmarking and disabled.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Same call order as a hand-written sequence: stdlib, NumPy, then torch.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False
    cudnn.enabled = False


# Apply the notebook-wide SEED to every random generator before any data work.
seed_everything(SEED)

In [6]:
# Read train.csv, countrywide_accident.csv and test.csv from DATA_PATH.
train_df = pd.read_csv(f"{DATA_PATH}train.csv")
accident_df = pd.read_csv(f'{DATA_PATH}countrywide_accident.csv')

test_df = pd.read_csv(f"{DATA_PATH}test.csv")

# Display (rows, columns) of the three frames as a sanity check.
train_df.shape, test_df.shape, accident_df.shape

((39609, 23), (10963, 8), (602775, 23))

In [7]:
# Remove rows whose weather condition ('기상상태') is fog ('안개').
train_df = train_df.loc[train_df['기상상태'] != '안개'].copy()

# For the countrywide data, additionally remove rows whose road surface
# ('노면상태') is thaw ('해빙') or missing, and rows whose accident type
# ('사고유형') is railroad crossing ('철길건널목').
accident_keep = (
    (accident_df['기상상태'] != '안개')
    & (accident_df['노면상태'] != '해빙')
    & accident_df['노면상태'].notna()
    & (accident_df['사고유형'] != '철길건널목')
)
accident_df = accident_df.loc[accident_keep].copy()

In [8]:
import holidays

kr_holidays = holidays.KR()


def add_derived_features(df, holiday_calendar):
    """Add calendar, location and road-type feature columns to ``df`` in place.

    Args:
        df: Frame containing the raw '사고일시' (accident datetime), '시군구'
            (address) and '도로형태' (road type) columns.
        holiday_calendar: Object supporting ``timestamp in holiday_calendar``;
            used to build the 0/1 '공휴일' (holiday) flag.

    Returns:
        The same ``df`` object, with '사고일시' parsed to datetime and the
        columns '연', '월', '일', '시간', '요일', '공휴일', '시', '구', '동',
        '도로형태_1', '도로형태_2' added.
    """
    # Time-related features.
    df['사고일시'] = pd.to_datetime(df['사고일시'])
    accident_time = df['사고일시'].dt
    df['연'] = accident_time.year
    df['월'] = accident_time.month
    df['일'] = accident_time.day
    df['시간'] = accident_time.hour
    df['요일'] = accident_time.day_of_week
    df['공휴일'] = df['사고일시'].apply(lambda ts: int(ts in holiday_calendar))

    # Location-related features: three whitespace-separated address tokens.
    df[['시', '구', '동']] = df['시군구'].str.extract(r'(\S+) (\S+) (\S+)')

    # Road type: split "<part 1> - <part 2>" into two columns.
    df[['도로형태_1', '도로형태_2']] = df['도로형태'].str.extract(r'(.+) - (.+)')
    return df


# Apply the identical derivation to all three frames instead of repeating it.
for frame in (train_df, accident_df, test_df):
    add_derived_features(frame, kr_holidays)

# Keep only the metropolitan cities listed below from the countrywide data.
city = ['서울특별시', '인천광역시', '광주광역시', '부산광역시', '울산광역시', '대전광역시']
mask = accident_df['시'].isin(city)
accident_df = accident_df.loc[mask, :].reset_index(drop=True)

train_cols = ['ID', 'ECLO', '연', '월', '시간', '요일', '공휴일', '시', '구', '동', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형']
# The test columns are the training columns without the target.
test_cols = [col for col in train_cols if col != 'ECLO']
org_cols = ['ID', 'ECLO', '연', '월', '시간', '요일', '공휴일', '시', '구', '동', '사고유형 - 세부분류', '법규위반', '가해운전자 차종', '가해운전자 성별',
            '가해운전자 연령', '가해운전자 상해정도', '피해운전자 차종', '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도']

# Side tables holding the columns listed in org_cols (copied before the column cut below).
train_org = train_df[org_cols].copy()
accident_org = accident_df[org_cols].copy()

train_df = train_df[train_cols]
accident_df = accident_df[train_cols]
test_df = test_df[test_cols]

train_df.shape, accident_df.shape, test_df.shape

((39601, 15), (227699, 15), (10963, 14))

In [9]:
# Stack the countrywide rows beneath the original training rows and renumber
# the index from 0; the side table is stacked the same way so rows stay paired.
train_df = pd.concat([train_df, accident_df], ignore_index=True)
train_org = pd.concat([train_org, accident_org], ignore_index=True)

train_df.shape, train_org.shape

((267300, 15), (267300, 20))

In [10]:
# Show which columns the train_org side table holds.
train_org.columns

Index(['ID', 'ECLO', '연', '월', '시간', '요일', '공휴일', '시', '구', '동', '사고유형 - 세부분류',
       '법규위반', '가해운전자 차종', '가해운전자 성별', '가해운전자 연령', '가해운전자 상해정도', '피해운전자 차종',
       '피해운전자 성별', '피해운전자 연령', '피해운전자 상해정도'],
      dtype='object')

In [11]:
# Total number of missing cells in the training and test frames.
train_df.isnull().sum().sum(), test_df.isnull().sum().sum()

(0, 0)

In [12]:
# Replace missing values with 0. The null counts shown by the preceding cell
# are (0, 0), so on this data nothing is actually replaced.
train_df = train_df.fillna(0)
test_df = test_df.fillna(0)

In [13]:
# Target on the raw scale and on the log1p scale.
target = train_df['ECLO']
target_log = np.log1p(target).copy()

# Model inputs: everything except the identifier (and the target, for train).
train_ft = train_df.drop(['ID', 'ECLO'], axis=1).copy()
test_ft = test_df.drop('ID', axis=1).copy()

train_ft.shape, test_ft.shape, target.shape, target_log.shape

((267300, 13), (10963, 13), (267300,), (267300,))

In [14]:
!pip install category-encoders

Collecting category-encoders
  Downloading category_encoders-2.6.3-py2.py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.9/81.9 kB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: category-encoders
Successfully installed category-encoders-2.6.3


In [15]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [16]:
# Cast month ('월'), hour ('시간') and weekday ('요일') to object dtype so the
# dtype-based column split below classifies them as categorical.
time_as_category = {col: 'object' for col in ('월', '시간', '요일')}
train_ft = train_ft.astype(time_as_category)
test_ft = test_ft.astype(time_as_category)

In [17]:
# Partition the feature columns by dtype: object columns are categorical,
# everything else is numeric.
numeric_cols = train_ft.select_dtypes(exclude="object").columns.tolist()
category_cols = train_ft.select_dtypes(include="object").columns.tolist()

In [18]:
# Display the numeric column names.
numeric_cols

['연', '공휴일']

In [19]:
# Display the categorical column names.
category_cols

['월', '시간', '요일', '시', '구', '동', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형']

In [20]:
# Split categorical columns by cardinality: at most 10 distinct values goes to
# category_cols_1 (one-hot encoded later), the rest to category_cols_2
# (target encoded later).
category_cardinality = train_ft[category_cols].nunique()
mask = category_cardinality <= 10
category_cols_1 = category_cardinality.loc[mask].index.tolist()
# `~` is the boolean negation operator for masks (unary minus is not).
category_cols_2 = category_cardinality.loc[~mask].index.tolist()
category_cols_1, category_cols_2

(['요일', '시', '기상상태', '도로형태_1', '도로형태_2', '노면상태', '사고유형'],
 ['월', '시간', '구', '동'])

In [21]:
from category_encoders.target_encoder import TargetEncoder
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.metrics import mean_squared_log_error, make_scorer
from sklearn.model_selection import KFold, train_test_split, cross_val_score, StratifiedKFold
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# Default 5-fold shuffled splitter.
# NOTE(review): the training cells below rebuild `cv` for each seed, so this
# instance does not appear to be used by the visible code.
cv = KFold(n_splits=5, shuffle=True, random_state=SEED)

In [22]:
def rmsle(y_actual, y_pred):
    """Root mean squared logarithmic error between raw-scale targets and predictions.

    Args:
        y_actual: Array-like of true values (raw scale).
        y_pred: Array-like of predicted values (raw scale), same length.

    Returns:
        ``sqrt(mean((log1p(y_pred) - log1p(y_actual)) ** 2))`` as a float.
    """
    # Compare position by position: coercing to arrays prevents pandas from
    # aligning two Series on their index labels (which would yield NaN when
    # the labels differ) and also accepts plain Python lists.
    actual = np.asarray(y_actual, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)

    log_diff = np.log1p(predicted) - np.log1p(actual)
    return np.sqrt(np.mean(np.square(log_diff)))

# Scorer wrapper around rmsle; greater_is_better=False declares it a loss.
rmsle_score = make_scorer(rmsle, greater_is_better=False)

In [23]:
def rmsle_log(y_valid, pred):
    """RMSLE for targets and predictions that are both on the log1p scale.

    Args:
        y_valid: True values, log1p-transformed.
        pred: Predicted values, log1p-transformed.

    Returns:
        Square root of the mean squared log error computed after mapping both
        inputs back to the raw scale with ``expm1``.
    """
    actual_raw, predicted_raw = np.expm1(y_valid), np.expm1(pred)
    return np.sqrt(mean_squared_log_error(actual_raw, predicted_raw))


# Scorer wrapper around rmsle_log; greater_is_better=False declares it a loss.
rmsle_log_score = make_scorer(rmsle_log, greater_is_better=False)

In [24]:
# Work on copies for the encoded feature tables so train_ft / test_ft keep
# their original categorical columns.
train_enc = train_ft.copy()
test_enc = test_ft.copy()

In [25]:
enc = OneHotEncoder(handle_unknown = 'ignore')

# Training data: fit the encoder and replace the low-cardinality columns with
# their one-hot expansion. The encoded frame reuses train_enc's index so the
# column-wise concat pairs rows by label even if the index is not 0..n-1.
train_onehot = pd.DataFrame(
    enc.fit_transform(train_enc[category_cols_1]).toarray(),
    columns = enc.get_feature_names_out(),
    index = train_enc.index,
)
train_enc = pd.concat([train_enc.drop(columns=category_cols_1), train_onehot], axis=1)

# Test data: transform only, using the categories fitted on the training data.
test_onehot = pd.DataFrame(
    enc.transform(test_enc[category_cols_1]).toarray(),
    columns = enc.get_feature_names_out(),
    index = test_enc.index,
)
test_enc = pd.concat([test_enc.drop(columns=category_cols_1), test_onehot], axis=1)

In [26]:
# Target-encode each high-cardinality column with its own encoder, fitted on
# the training rows against the raw-scale target and then applied to test.
# NOTE(review): the fit uses every training row before the K-fold splits made
# in the training cells, so validation rows contribute to their own encodings.
for high_card_col in category_cols_2:
    target_encoder = TargetEncoder(cols=[high_card_col])
    train_enc[high_card_col] = target_encoder.fit_transform(train_ft[high_card_col], target)
    test_enc[high_card_col] = target_encoder.transform(test_ft[high_card_col])

In [27]:
# Standardise the numeric columns of both feature tables. Each train/test pair
# gets its own scaler, fitted on the training table and applied to the test one.
for train_table, test_table in ((train_ft, test_ft), (train_enc, test_enc)):
    scaler = StandardScaler()
    train_table[numeric_cols] = scaler.fit_transform(train_table[numeric_cols])
    test_table[numeric_cols] = scaler.transform(test_table[numeric_cols])

In [28]:
# Compare shapes of the raw-categorical tables and the encoded tables.
train_ft.shape, test_ft.shape, train_enc.shape, test_enc.shape

((267300, 13), (10963, 13), (267300, 49), (10963, 49))

In [29]:
# Fixed hyper-parameters for the three model families; each dict is unpacked
# into the matching regressor's constructor in the training cells.
xgb_params = dict(
    n_estimators=4247,
    max_depth=20,
    learning_rate=0.007304496147577258,
    gamma=0.5228200973732728,
    min_child_weight=3,
    subsample=0.55066556364394,
    colsample_bytree=0.676980203104331,
    reg_alpha=0.5002071038065621,
    reg_lambda=0.6810531070362105,
)

lgbm_params = dict(
    n_estimators=1945,
    num_leaves=54,
    learning_rate=0.010037423010422736,
    colsample_bytree=0.2676628531748305,
    subsample=0.09376966224835909,
    min_child_samples=1,
    reg_alpha=2.827334339720659,
    reg_lambda=1.977469291396739,
)

cat_params = dict(
    n_estimators=1968,
    depth=6,
    learning_rate=0.04556188853866844,
    min_data_in_leaf=95,
    l2_leaf_reg=3.0220940762404784,
)

In [None]:
from tqdm.auto import tqdm

SEED_list = [15, 2, 78, 94, 45]

# XGBoost: for every seed, run a shuffled 5-fold CV on the encoded features
# with the raw-scale target, keeping each fold's model and the mean fold score.
SEED_xgb_scores = []
SEED_xgb_models = []
# The loop variable is lower-case so the notebook-wide SEED constant is not overwritten.
for seed in tqdm(SEED_list):
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    xgb_scores = []
    xgb_models = []
    for tri , vai in tqdm(cv.split(train_enc), total=cv.get_n_splits()):
        x_train = train_enc.iloc[tri]
        y_train = target.iloc[tri]

        x_valid = train_enc.iloc[vai]
        y_valid = target.iloc[vai]

        xgb_model = XGBRegressor(objective='reg:squaredlogerror', eval_metric='rmsle', **xgb_params, random_state=seed)
        xgb_model.fit(x_train, y_train)
        pred = xgb_model.predict(x_valid)
        # Predictions are on the raw scale here, hence rmsle rather than rmsle_log.
        score = rmsle(y_valid,pred)
        xgb_scores.append(score)
        xgb_models.append(xgb_model)

    SEED_xgb_scores.append(np.mean(xgb_scores))
    SEED_xgb_models.append(xgb_models)


In [None]:
# LightGBM: for every seed, run a shuffled 5-fold CV on the encoded features
# with the log1p target, keeping each fold's model and the mean fold score.
SEED_lgbm_scores = []
SEED_lgbm_models = []
# The loop variable is lower-case so the notebook-wide SEED constant is not overwritten.
for seed in tqdm(SEED_list):
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    lgbm_scores = []
    lgbm_models = []
    for tri , vai in tqdm(cv.split(train_enc), total=cv.get_n_splits()):
        x_train = train_enc.iloc[tri]
        y_train = target_log.iloc[tri]

        x_valid = train_enc.iloc[vai]
        y_valid = target_log.iloc[vai]

        lgbm_model = LGBMRegressor(**lgbm_params, random_state=seed, objective='rmse', metric='rmse', verbosity=-1)
        lgbm_model.fit(x_train, y_train)
        pred = lgbm_model.predict(x_valid)
        # Both arguments are on the log1p scale, hence rmsle_log.
        score = rmsle_log(y_valid,pred)
        lgbm_scores.append(score)
        lgbm_models.append(lgbm_model)

    SEED_lgbm_scores.append(np.mean(lgbm_scores))
    SEED_lgbm_models.append(lgbm_models)


In [30]:
# CatBoost: for every seed, run a shuffled 5-fold CV on the un-encoded features
# (categorical columns are passed via cat_features) with the log1p target,
# keeping each fold's model and the mean fold score.
SEED_cat_scores = []
SEED_cat_models = []
# Train on GPU when a CUDA device is visible; otherwise fall back to CPU
# instead of failing on a CPU-only runtime.
cat_task_type = 'GPU' if torch.cuda.is_available() else 'CPU'
# The loop variable is lower-case so the notebook-wide SEED constant is not overwritten.
for seed in tqdm(SEED_list):
    cv = KFold(n_splits=5, shuffle=True, random_state=seed)
    cat_scores = []
    cat_models = []
    for tri , vai in tqdm(cv.split(train_ft), total=cv.get_n_splits()):
        x_train = train_ft.iloc[tri]
        y_train = target_log.iloc[tri]

        x_valid = train_ft.iloc[vai]
        y_valid = target_log.iloc[vai]

        cat_model = CatBoostRegressor(**cat_params, random_state=seed, eval_metric='RMSE', cat_features=category_cols, task_type=cat_task_type, verbose=False)
        cat_model.fit(x_train, y_train)
        pred = cat_model.predict(x_valid)
        # Both arguments are on the log1p scale, hence rmsle_log.
        score = rmsle_log(y_valid,pred)
        cat_scores.append(score)
        cat_models.append(cat_model)

    SEED_cat_scores.append(np.mean(cat_scores))
    SEED_cat_models.append(cat_models)

  0%|          | 0/5 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

KeyboardInterrupt: ignored

In [None]:
from scipy.stats.mstats import gmean


def _mean_fold_predictions(models_per_seed, features, inverse_log=False):
    """Average the fold models' test predictions separately for every seed.

    Args:
        models_per_seed: List with one entry per seed; each entry is the list
            of fitted fold models for that seed.
        features: Feature table passed to every model's ``predict``.
        inverse_log: When True, apply ``np.expm1`` to each prediction to map
            it from the log1p scale back to the raw scale before averaging.

    Returns:
        List with one array per seed: the arithmetic mean of that seed's fold
        predictions.
    """
    seed_predictions = []
    # Iterate the model lists directly so the loop follows however many seeds were trained.
    for fold_models in tqdm(models_per_seed):
        fold_predictions = []
        for model in fold_models:
            fold_pred = model.predict(features)
            if inverse_log:
                fold_pred = np.expm1(fold_pred)
            fold_predictions.append(fold_pred)
        seed_predictions.append(np.mean(fold_predictions, axis=0))
    return seed_predictions


# XGBoost was trained on the raw target; LightGBM and CatBoost on log1p(target).
xgb_seed_pred = _mean_fold_predictions(SEED_xgb_models, test_enc)
lgbm_seed_pred = _mean_fold_predictions(SEED_lgbm_models, test_enc, inverse_log=True)
cat_seed_pred = _mean_fold_predictions(SEED_cat_models, test_ft, inverse_log=True)

In [None]:
# Arithmetic blend: average each model family over its seeds, then take the
# unweighted mean of the three families.
# NOTE(review): the following cell assigns `pred` again, so when the cells run
# in order this value is replaced before the submission is built.
pred = (np.mean(xgb_seed_pred, axis=0) + np.mean(lgbm_seed_pred, axis=0) + np.mean(cat_seed_pred, axis=0))/3
pred

In [None]:
# Geometric blend: sqrt(lgbm * sqrt(xgb * cat)), i.e. exponent 1/2 for LightGBM
# and 1/4 each for XGBoost and CatBoost. Each gmean(...) call first combines
# that family's per-seed predictions.
pred = (gmean(lgbm_seed_pred)*((gmean(xgb_seed_pred)*gmean(cat_seed_pred))**(1/2)))**(1/2)
pred

In [None]:
# Load the sample submission and set its ECLO column to the blended predictions.
submission = pd.read_csv(f'{SUBMISSION_PATH}sample_submission.csv').assign(ECLO=pred)
submission

In [None]:
# Show the rows whose predicted ECLO is below 1.
submission.loc[submission['ECLO'].lt(1)]

In [None]:
# Write the submission next to the input data, without the index column.
submission.to_csv(f"{SUBMISSION_PATH}seed_ensemble_1_3.csv", index=False)

In [None]:
from google.colab import runtime
import time

# Pause for five seconds, then call runtime.unassign().
# NOTE(review): presumably this releases the Colab runtime once the run is done — confirm.
time.sleep(5)
runtime.unassign()