In [4]:
!pip install xgboost
!pip install catboost
!pip install lightgbm
!pip install category_encoders



### Stacking Ensemble 
- k-fold
- dict 부분 feature 적용
- meta model
- catboost, lgbm, xgboost 세개의 모델을 stacking 방식으로 앙상블하여 최종 결과를 도출
- person_rn과 content_rn이 높은 영향력을 가지는 변수로 파악되었음. data leak 문제가 발생하지 않도록 train data에서 출현한 person_rn의 key값과 content_rn의 key값으로 dictionary를 만들고, test data에 해당 key값이 있을 경우 value를 할당
- 시간(Hour)에 따른 target 분포를 파악하여 그룹화 진행. Target의 비율에 따라서 상-중-하 그룹을 나눔

In [5]:
DATA_PATH = "data/"
SUBMIT_PATH = "submit/"
SEED = 42

import os
import sys
import platform
import random
import math
from typing import List ,Dict, Tuple

import pandas as pd
import numpy as np
 
import sklearn 
from sklearn.model_selection import StratifiedKFold , KFold, train_test_split
from sklearn.metrics import f1_score 

from catboost import Pool,CatBoostClassifier
from datetime import datetime

### 데이터셋 불러오기

In [6]:
train_data = pd.read_csv(f'{DATA_PATH}train.csv')
test_data = pd.read_csv(f'{DATA_PATH}test.csv')

train_data.shape , test_data.shape

((501951, 35), (46404, 34))

### 데이터 전처리

In [7]:
# hour 변수를 넣기 위해서 추가
train_data['contents_open_dt'] = train_data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
test_data['contents_open_dt'] = test_data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

train_data['hour'] = train_data['contents_open_dt'].apply(lambda x : x.timetuple()[3])
test_data['hour'] = test_data['contents_open_dt'].apply(lambda x : x.timetuple()[3])

train_data.head(3)

Unnamed: 0,id,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_f,person_prefer_g,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,contents_open_dt,target,hour
0,0,True,True,True,False,False,False,1,4,3,5,275,370,369,8,1,1,4,95,59,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,2020-01-17 12:09:36,1,12
1,1,False,False,False,True,True,False,1,3,4,1,114,181,175,4,1,1,131,101,96,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,2020-06-18 17:48:52,0,17
2,2,False,False,False,True,False,False,2,0,3,5,464,175,452,3,1,1,54,263,56,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,2020-07-08 20:00:10,0,20


### 시간 변수추가
- 시간에 따른 target 분포를 파악하여 그룹화 진행. Target 분포에 따라 23-07시는  low, 17-22 시는 mid, 8-17 시는 good 으로 분류

In [8]:
train_data.loc[train_data['hour']<=7, 'hour_band'] = 'low_time'
train_data.loc[train_data['hour']==23, 'hour_band'] = 'low_time'

train_data.loc[(train_data['hour']>7) & (train_data['hour'] <= 16), 'hour_band'] = 'good_time'
train_data.loc[(train_data['hour']>=17) & (train_data['hour']<=22), 'hour_band'] =  'mid_time'
train_data = train_data.drop(columns = ['hour'])

test_data.loc[test_data['hour']<=7, 'hour_band'] = 'low_time'
test_data.loc[test_data['hour']==23, 'hour_band'] = 'low_time'

test_data.loc[(test_data['hour']>7) & (test_data['hour'] <= 16), 'hour_band'] =  'good_time'
test_data.loc[(test_data['hour']>=17) & (test_data['hour']<=22), 'hour_band'] =  'mid_time'
test_data = test_data.drop(columns = ['hour'])

add_code: train,test 데이터의 속성 코드와 속성 분류 코드 파일의 코드가 일치하면 속성 코드를 기존 데이터에 추가해주는 함수

In [9]:
from typing import Dict
import numpy as np
import pandas as pd

def add_code(
    df: pd.DataFrame,
    d_code: Dict[int, Dict[str, int]], 
    h_code: Dict[int, Dict[str, int]], 
    l_code: Dict[int, Dict[str, int]],
) -> pd.DataFrame:
    
    # Copy input data
    df = df.copy()   

    # D Code
    df['person_prefer_d_1_n'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_1_s'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_1_m'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_1_l'] = df['person_prefer_d_1'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_2_n'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_2_s'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_2_m'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_2_l'] = df['person_prefer_d_2'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['person_prefer_d_3_n'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['person_prefer_d_3_s'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['person_prefer_d_3_m'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['person_prefer_d_3_l'] = df['person_prefer_d_3'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    df['contents_attribute_d_n'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 세분류코드'])
    df['contents_attribute_d_s'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 소분류코드'])
    df['contents_attribute_d_m'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 중분류코드'])
    df['contents_attribute_d_l'] = df['contents_attribute_d'].apply(lambda x: d_code[x]['속성 D 대분류코드'])

    # H Code
    df['person_prefer_h_1_l'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_1_m'] = df['person_prefer_h_1'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    
    df['person_prefer_h_2_l'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_2_m'] = df['person_prefer_h_2'].apply(lambda x: h_code[x]['속성 H 중분류코드'])
    
    df['person_prefer_h_3_l'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['person_prefer_h_3_m'] = df['person_prefer_h_3'].apply(lambda x: h_code[x]['속성 H 중분류코드'])

    df['contents_attribute_h_l'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 대분류코드'])
    df['contents_attribute_h_m'] = df['contents_attribute_h'].apply(lambda x: h_code[x]['속성 H 중분류코드'])

    # L Code
    df['contents_attribute_l_n'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 세분류코드'])
    df['contents_attribute_l_s'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 소분류코드'])
    df['contents_attribute_l_m'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 중분류코드'])
    df['contents_attribute_l_l'] = df['contents_attribute_l'].apply(lambda x: l_code[x]['속성 L 대분류코드'])
    
    return df

In [10]:
d_code = pd.read_csv('data/속성_D_코드.csv', index_col=0).T.to_dict()
h_code = pd.read_csv('data/속성_H_코드.csv', index_col=0).T.to_dict()
l_code = pd.read_csv('data/속성_L_코드.csv', index_col=0).T.to_dict()


train_data = add_code(train_data, d_code, h_code, l_code)
test_data = add_code(test_data, d_code, h_code, l_code)



In [11]:
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_s" , "contents_attribute_d_s"),
    ("person_prefer_d_2_m" , "contents_attribute_d_m"),
    ("person_prefer_d_2_l" , "contents_attribute_d_l"),
    ("person_prefer_d_3_s" , "contents_attribute_d_s"),
    ("person_prefer_d_3_m" , "contents_attribute_d_m"),
    ("person_prefer_d_3_l" , "contents_attribute_d_l"),

    ("person_prefer_h_1_m" , "contents_attribute_h_m"),
    ("person_prefer_h_2_m" , "contents_attribute_h_m"),
    ("person_prefer_h_3_m" , "contents_attribute_h_m"),
    ("person_prefer_h_1_l" , "contents_attribute_h_l"),
    ("person_prefer_h_2_l" , "contents_attribute_h_l"),
    ("person_prefer_h_3_l" , "contents_attribute_h_l"),
]

    

preprocess_data: bool 타입의 데이터를 숫자로 바꾸어주는 함수

In [12]:
bool_cols = ['d_l_match_yn',	'd_m_match_yn',	'd_s_match_yn'	,'h_l_match_yn','h_m_match_yn',	'h_s_match_yn',	'person_attribute_a']
def preprocess_data(
                    df:pd.DataFrame,
                    cols_equi:List[Tuple[str,str]]= [] )->Tuple[pd.DataFrame,np.ndarray]:
    df = df.copy()


    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2] ).astype(int)

    return (df)

In [13]:
train_data = preprocess_data(train_data, cols_equi)
test_data = preprocess_data(test_data, cols_equi)

In [14]:
cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt", ]

train_data = train_data.drop(columns = cols_drop)
test_data = test_data.drop(columns = cols_drop)

In [15]:
y_train = train_data.pop('target')
x_train = train_data
x_test = test_data

In [16]:
x_train.shape, x_test.shape

((501951, 73), (46404, 73))

In [17]:
def Get_dict(df):
    content_dict = {}
    person_dict = {}
    for id in df['contents_rn']:
        try:
            content_dict[id] = content_dict[id] + 1
        except:
            content_dict[id] = 1

    for id in df['person_rn']:
        try:
            person_dict[id] = person_dict[id] + 1
        except:
            person_dict[id] = 1
    return content_dict, person_dict
    

In [18]:
def Variable_add(df, pd, cd):
    df = df.copy()
    
    threshold_often = 10
    threshold_somtine = 5
    threshold_few = 3
    
    contents_lst = []
    for id in df['contents_rn']:
        try:
            if cd[id] > threshold_often:
                group = 3
            elif cd[id] > threshold_somtime:
                group = 2
            elif cd[id] > threshold_few:
                group = 1
            else:
                group = 0
        except:
            group = 0
        contents_lst.append(group)
    df['contents_freq'] = contents_lst

    person_lst = []
    for id in df['person_rn']:
        try:
            if pd[id] > threshold_often:
                group = 3
            elif pd[id] > threshold_somtime:
                group = 2
            elif pd[id] > threshold_few:
                group = 1
            else:
                group = 0
        except:
            group = 0 
        person_lst.append(group)
    df['person_freq'] = person_lst
    
    return df
    

### Model - Stacking

### 1) Catboost - Stacking

In [19]:
test_data['contents_rn']

0        236865
1        236572
2        704612
3        704652
4        704413
          ...  
46399    726084
46400    156948
46401    175069
46402    174849
46403    173406
Name: contents_rn, Length: 46404, dtype: int64

In [20]:
is_holdout = False
n_splits = 5
iterations = 10000
patience = 50

cv = KFold(n_splits=n_splits, shuffle=True, random_state=SEED)

In [21]:
train_fold_pred = np.zeros((x_train.shape[0], 1))
test_pred = np.zeros((x_test.shape[0], n_splits))


for folder_counter, (train_idx, valid_idx) in enumerate(cv.split(x_train)) :
    # fold data
    train_df = x_train.iloc[train_idx]
    eval_df = x_train.iloc[valid_idx]
    
    # Dict 적용
    pdict, cdict = Get_dict(train_df)
    train_df = Variable_add(train_df, pdict, cdict)
    eval_df = Variable_add(eval_df, pdict, cdict)
    cols_drop = ["contents_rn", "person_rn"]
    train_df = train_df.drop(columns = cols_drop)
    eval_df = eval_df.drop(columns = cols_drop)
    
    x_test = Variable_add(test_data, pdict, cdict)
    x_test = x_test.drop(columns = cols_drop)
    
    cat_features = train_df.columns[train_df.nunique() > 2].tolist()
    
    model = CatBoostClassifier(iterations=iterations,
                               random_state=SEED,
                               task_type="GPU",
                               eval_metric="F1",
                               cat_features=cat_features,
                               one_hot_max_size=4,
                               depth=10,
                               use_best_model=True)

    model.fit(train_df, y_train[train_idx], 
            eval_set=[(eval_df, y_train[valid_idx])], 
            early_stopping_rounds=patience ,
            verbose = 100)

    # 폴드 세트 내부에서 다시 만들어지 검증 데이터로 기반 모델 예측 후 데이터 저장
    train_fold_pred[valid_idx, :] = model.predict_proba(eval_df)[:,1].reshape(-1, 1)
    # 입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장
    
    test_pred[:, folder_counter] = model.predict_proba(x_test)[:,1]

# 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성
test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)

# train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터
# test_pred_mean은 테스트 데이터
    

Learning rate set to 0.016489
0:	learn: 0.6273892	test: 0.6314457	best: 0.6314457 (0)	total: 679ms	remaining: 1h 53m 6s
100:	learn: 0.6436172	test: 0.6439084	best: 0.6439084 (100)	total: 58.9s	remaining: 1h 36m 16s
200:	learn: 0.6555482	test: 0.6567929	best: 0.6567929 (200)	total: 1m 56s	remaining: 1h 34m 30s
300:	learn: 0.6646575	test: 0.6678032	best: 0.6678032 (300)	total: 2m 47s	remaining: 1h 29m 47s
400:	learn: 0.6716572	test: 0.6736080	best: 0.6736879 (399)	total: 3m 37s	remaining: 1h 26m 35s
500:	learn: 0.6782159	test: 0.6764998	best: 0.6767030 (496)	total: 4m 25s	remaining: 1h 24m 2s
600:	learn: 0.6839880	test: 0.6796647	best: 0.6796647 (600)	total: 5m 15s	remaining: 1h 22m 17s
700:	learn: 0.6896043	test: 0.6807923	best: 0.6810125 (688)	total: 6m 4s	remaining: 1h 20m 34s
800:	learn: 0.6942286	test: 0.6812699	best: 0.6816298 (768)	total: 6m 50s	remaining: 1h 18m 35s
bestTest = 0.681629789
bestIteration = 768
Shrink model to first 769 iterations.
Learning rate set to 0.016489
0:	l

In [22]:
catboost_stacking = pd.DataFrame(train_fold_pred, columns = ["proba"])
catboost_stacking

Unnamed: 0,proba
0,0.508024
1,0.198327
2,0.293256
3,0.448970
4,0.416960
...,...
501946,0.583197
501947,0.650264
501948,0.665137
501949,0.638685


In [23]:
catboost_stacking['target'] = y_train
catboost_stacking.to_csv(f"{SUBMIT_PATH}catboost_for_stacking_final.csv")

In [24]:
catboost_test_stacking = pd.DataFrame(test_pred_mean, columns = ["proba"])

catboost_test_stacking.to_csv(f"{SUBMIT_PATH}catboost_for_stacking_test_final.csv")

### 2) lgbm + stacking

In [25]:
x_train_lgbm = x_train.copy()
x_test_lgbm = test_data.copy()
x_train_lgbm.head()

Unnamed: 0,d_l_match_yn,d_m_match_yn,d_s_match_yn,h_l_match_yn,h_m_match_yn,h_s_match_yn,person_attribute_a,person_attribute_a_1,person_attribute_b,person_prefer_c,person_prefer_d_1,person_prefer_d_2,person_prefer_d_3,person_prefer_e,person_prefer_h_1,person_prefer_h_2,person_prefer_h_3,contents_attribute_i,contents_attribute_a,contents_attribute_j_1,contents_attribute_j,contents_attribute_c,contents_attribute_k,contents_attribute_l,contents_attribute_d,contents_attribute_m,contents_attribute_e,contents_attribute_h,person_rn,contents_rn,hour_band,person_prefer_d_1_n,person_prefer_d_1_s,person_prefer_d_1_m,person_prefer_d_1_l,person_prefer_d_2_n,person_prefer_d_2_s,person_prefer_d_2_m,person_prefer_d_2_l,person_prefer_d_3_n,person_prefer_d_3_s,person_prefer_d_3_m,person_prefer_d_3_l,contents_attribute_d_n,contents_attribute_d_s,contents_attribute_d_m,contents_attribute_d_l,person_prefer_h_1_l,person_prefer_h_1_m,person_prefer_h_2_l,person_prefer_h_2_m,person_prefer_h_3_l,person_prefer_h_3_m,contents_attribute_h_l,contents_attribute_h_m,contents_attribute_l_n,contents_attribute_l_s,contents_attribute_l_m,contents_attribute_l_l,contents_attribute_c_person_prefer_c,contents_attribute_e_person_prefer_e,person_prefer_d_2_s_contents_attribute_d_s,person_prefer_d_2_m_contents_attribute_d_m,person_prefer_d_2_l_contents_attribute_d_l,person_prefer_d_3_s_contents_attribute_d_s,person_prefer_d_3_m_contents_attribute_d_m,person_prefer_d_3_l_contents_attribute_d_l,person_prefer_h_1_m_contents_attribute_h_m,person_prefer_h_2_m_contents_attribute_h_m,person_prefer_h_3_m_contents_attribute_h_m,person_prefer_h_1_l_contents_attribute_h_l,person_prefer_h_2_l_contents_attribute_h_l,person_prefer_h_3_l_contents_attribute_h_l
0,1,1,1,0,0,0,1,4,3,5,275,370,369,8,4,95,59,3,3,10,2,1,2,1608,275,1,4,139,618822,354805,good_time,275,274,274,216,369,368,297,216,369,368,297,216,275,274,274,216,3,316,94,398,58,368,94,422,1607,1606,1605,2016,0,0,0,0,1,0,0,1,0,0,0,0,1,0
1,0,0,0,1,1,0,1,3,4,1,114,181,175,4,131,101,96,1,3,5,1,1,2,1608,275,1,4,133,571659,346213,mid_time,114,109,56,1,175,152,56,1,175,152,56,1,275,274,274,216,94,417,94,400,94,399,94,417,1607,1606,1605,2016,1,1,0,0,0,0,0,0,1,0,0,1,1,1
2,0,0,0,1,0,0,2,0,3,5,464,175,452,3,54,263,56,3,1,10,2,1,1,1600,94,1,4,53,399816,206408,mid_time,464,463,450,377,175,152,56,1,452,451,450,377,92,91,56,1,48,364,250,528,48,366,48,363,1599,1595,1572,2016,0,0,0,1,1,0,0,0,0,0,0,1,0,1
3,0,0,0,1,0,0,2,0,2,5,703,705,704,3,72,227,2,1,3,5,1,1,2,1608,275,5,3,74,827967,572323,mid_time,703,703,690,618,703,703,690,618,703,703,690,618,275,274,274,216,71,379,226,495,1,315,71,381,1607,1606,1605,2016,0,1,0,0,0,0,0,0,0,0,0,1,0,0
4,1,1,1,0,0,0,1,3,4,5,275,370,369,4,214,210,209,1,1,10,2,1,2,1608,275,1,4,74,831614,573899,mid_time,275,274,274,216,369,368,297,216,369,368,297,216,275,274,274,216,208,483,208,481,208,480,71,381,1607,1606,1605,2016,0,1,0,0,1,0,0,1,0,0,0,0,0,0


In [26]:
x_train_lgbm.hour_band.head()

0    good_time
1     mid_time
2     mid_time
3     mid_time
4     mid_time
Name: hour_band, dtype: object

In [27]:
new_time = []
for time in x_train_lgbm['hour_band']:
    if time == 'good_time':
        new_time.append(3)
    elif time == 'mid_time':
        new_time.append(2)
    else:
        new_time.append(1)
x_train_lgbm['hour_band'] = new_time

In [28]:
new_time = []
for time in x_test_lgbm['hour_band']:
    if time == 'good_time':
        new_time.append(3)
    elif time == 'mid_time':
        new_time.append(2)
    else:
        new_time.append(1)
x_test_lgbm['hour_band'] = new_time

In [29]:
x_test_lgbm.hour_band.head()

0    1
1    1
2    1
3    2
4    2
Name: hour_band, dtype: int64

In [30]:
x_train_lgbm.hour_band.head()

0    3
1    2
2    2
3    2
4    2
Name: hour_band, dtype: int64

In [31]:
x_test_lgbm.hour_band.dtype

dtype('int64')

In [32]:
import lightgbm as lgb
from sklearn.metrics import f1_score

train_fold_pred = np.zeros((x_train.shape[0], 1))
test_pred = np.zeros((x_test.shape[0], n_splits))


for folder_counter, (train_idx, valid_idx) in enumerate(cv.split(x_train)) :
    # fold data
    train_df = x_train_lgbm.iloc[train_idx].copy()
    eval_df = x_train_lgbm.iloc[valid_idx].copy()
    
    # Dict 적용
    pdict, cdict = Get_dict(train_df)
    train_df = Variable_add(train_df, pdict, cdict)
    eval_df = Variable_add(eval_df, pdict, cdict)
    cols_drop = ["contents_rn", "person_rn"]
    train_df = train_df.drop(columns = cols_drop).copy()
    eval_df = eval_df.drop(columns = cols_drop).copy()
    
    x_test = Variable_add(x_test_lgbm, pdict, cdict)
    x_test = x_test.drop(columns = cols_drop)
    
    # cat_features = train_df.columns[train_df.nunique() > 2].tolist()
    
    model = lgb.LGBMClassifier(n_estimators = 3000)

    model.fit(train_df, y_train[train_idx], 
            eval_set=[(eval_df, y_train[valid_idx])], 
            eval_metric = "F1",
            early_stopping_rounds=patience ,
            verbose = 100)

    y_pred = model.predict(eval_df)
    f1 = f1_score(y_train[valid_idx], y_pred)    
    print(f1)
    
    # 폴드 세트 내부에서 다시 만들어지 검증 데이터로 기반 모델 예측 후 데이터 저장
    train_fold_pred[valid_idx, :] = model.predict_proba(eval_df)[:,1].reshape(-1, 1)
    # 입력된 원본 테스트 데이터를 폴드 세트내 학습된 기반 모델에서 예측 후 데이터 저장
    
    test_pred[:, folder_counter] = model.predict_proba(x_test)[:,1]

# 폴드 세트 내에서 원본 테스트 데이터를 예측한 데이터를 평균하여 테스트 데이터로 생성
test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)

# train_fold_pred는 최종 메타 모델이 사용하는 학습 데이터
# test_pred_mean은 테스트 데이터
    

Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.64532
[200]	valid_0's binary_logloss: 0.641654
[300]	valid_0's binary_logloss: 0.639621
[400]	valid_0's binary_logloss: 0.63841
[500]	valid_0's binary_logloss: 0.637511
[600]	valid_0's binary_logloss: 0.63694
[700]	valid_0's binary_logloss: 0.636415
[800]	valid_0's binary_logloss: 0.636078
[900]	valid_0's binary_logloss: 0.635534
Early stopping, best iteration is:
[941]	valid_0's binary_logloss: 0.635302
0.651602004403614
Training until validation scores don't improve for 50 rounds.
[100]	valid_0's binary_logloss: 0.646388
[200]	valid_0's binary_logloss: 0.642696
[300]	valid_0's binary_logloss: 0.640816
[400]	valid_0's binary_logloss: 0.639325
[500]	valid_0's binary_logloss: 0.63837
[600]	valid_0's binary_logloss: 0.637654
[700]	valid_0's binary_logloss: 0.637295
[800]	valid_0's binary_logloss: 0.636729
[900]	valid_0's binary_logloss: 0.636405
[1000]	valid_0's binary_logloss: 0.635942
[1100]

In [33]:
lgbm_stacking = pd.DataFrame(train_fold_pred, columns = ["proba"])
lgbm_stacking
# pd.DataFrame({"proba":train_fold_pred, "target" : y_train}, columns = ["proba","target"])

lgbm_stacking['target'] = y_train
lgbm_stacking.to_csv(f"{SUBMIT_PATH}lgbm_for_stacking_final.csv")

lgbm_test_stacking = pd.DataFrame(test_pred_mean, columns = ["proba"])
lgbm_test_stacking.to_csv(f"{SUBMIT_PATH}lgbm_for_stacking_test_final.csv")

### XGboost + stacking

In [34]:
xgb_x_train = x_train.copy()
xgb_x_test = x_test.copy()

In [35]:
# 나머지 범주형 변수들 모두 target encoding 변환
from category_encoders import TargetEncoder

for col in xgb_x_train.columns:
    if col not in bool_cols:
        if col == "target" or col == "person_rn" or col == "contents_rn":
            pass
        else :
            encoder = TargetEncoder()
            xgb_x_train[col] = encoder.fit_transform(xgb_x_train[col].astype('str'), y_train) 
            xgb_x_test[col] = encoder.transform(xgb_x_test[col].astype('str'))

ModuleNotFoundError: ignored

In [None]:
from xgboost import XGBClassifier

train_fold_pred = np.zeros((xgb_x_train.shape[0], 1))
test_pred = np.zeros((xgb_x_test.shape[0], n_splits))
ms_pred = []
models = []

for folder_counter, (train_idx, valid_idx) in enumerate(cv.split(xgb_x_train)) :
    
    # fold data
    train_df = xgb_x_train.iloc[train_idx]
    eval_df = xgb_x_train.iloc[valid_idx]
    
    # Dict 적용
    pdict, cdict = Get_dict(train_df)
    train_df = Variable_add(train_df, pdict, cdict)
    eval_df = Variable_add(eval_df, pdict, cdict)
    cols_drop = ["contents_rn", "person_rn"]
    train_df = train_df.drop(columns = cols_drop)
    eval_df = eval_df.drop(columns = cols_drop)
    
    evals = [(eval_df, y_train[valid_idx])]
    
    model = XGBClassifier(n_estimators=4000, learning_rate=0.05, max_depth=10, #objective = 'binary:logistic',
                                subsample=0.8,
                                colsample_bytree = 0.5,
                                reg_lambda = 10,
                                gamma=0.25)

    model.fit(train_df , y_train[train_idx],  
              early_stopping_rounds=200,
              eval_set=evals, 
              eval_metric="logloss",  
              verbose=100)
    
    y_pred = model.predict(eval_df)
    f1 = f1_score(y_train[valid_idx], y_pred)    
    print(f1)

    train_fold_pred[valid_idx, :] = model.predict_proba(eval_df)[:,1].reshape(-1, 1)
    
    test_pred[:, folder_counter] = model.predict_proba(xgb_x_test)[:,1]
    ms_pred.append(model.predict_proba(xgb_x_test)[:,1])
    models.append(model)
test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)
    
    
    

In [None]:
xgb_stacking = pd.DataFrame(train_fold_pred, columns = ["xgboost_proba"])

xgb_stacking['target'] = y_train
xgb_stacking.to_csv(f"{SUBMIT_PATH}xgboost_for_stacking_final.csv")

xgb_test_stacking = pd.DataFrame(test_pred_mean, columns = ["xgboost_proba"])
xgb_test_stacking.to_csv(f"{SUBMIT_PATH}xgboost_for_stacking_test_final.csv")

### stacking ensemble (LGBM model)

In [None]:
cat_pred = pd.read_csv("submit/catboost_for_stacking_final.csv", index_col = 0)
lgbm_pred = pd.read_csv("submit/lgbm_for_stacking_final.csv", index_col = 0)
xgb_pred = pd.read_csv("submit/xgboost_for_stacking_final.csv", index_col = 0)

train_y = cat_pred.target


df = pd.concat([cat_pred.proba, lgbm_pred.proba, xgb_pred.xgboost_proba], axis = 1)
df.columns = ['cat_pred', 'lgbm_pred', 'xgb_pred']
df

In [None]:
cat_pred_test = pd.read_csv("submit/catboost_for_stacking_test_final.csv", index_col = 0)
lgbm_pred_test = pd.read_csv("submit/lgbm_for_stacking_test_final.csv", index_col = 0)
xgb_pred_test = pd.read_csv("submit/xgboost_for_stacking_test_final.csv", index_col = 0)
df_test = pd.concat([cat_pred_test.proba, lgbm_pred_test.proba, xgb_pred_test.xgboost_proba], axis = 1)
df_test.columns = ['cat_pred', 'lgbm_pred', 'xgb_pred']
df_test

In [None]:
lgbm_models = []
for tri, val in cv.split(df):
    model2 = lgb.LGBMClassifier(n_estimators = 3000)

    model2.fit(df.iloc[tri], train_y[tri], 
            eval_set=[(df.iloc[val], train_y[val])], 
            eval_metric = "F1",
            early_stopping_rounds=patience ,
            verbose = 100)

    y_pred = model2.predict(df.iloc[val])
    f1 = f1_score(train_y[val], y_pred)    
    print(f1)
    
    lgbm_models.append(model2)

In [None]:
pred_list = []

for i in range(n_splits):
    pred_list.append(lgbm_models[i].predict_proba(df_test)[:,1])
    
final_pred = np.mean(pred_list, axis = 0)

In [None]:
threshold = 0.39
ans = np.where(final_pred > threshold, 1, 0)
sample_submission = pd.read_csv('data/sample_submission.csv')
sample_submission['target'] = ans
sample_submission
print('target 1 개수 : ',sum(sample_submission['target'] / len(sample_submission)))
sample_submission

In [None]:
sample_submission.to_csv("submit/job_recommendation_final-submission.csv", index = False)