In [1]:
import os
import sys
import platform
import random
import math
from typing import List, Dict, Tuple

import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score

from catboost import Pool, CatBoostClassifier

In [2]:
print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Linux-5.4.0-91-generic-x86_64-with-glibc2.10
- python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
- pandas: 1.1.3
- numpy: 1.19.2
- sklearn: 0.23.2


In [3]:
train_df = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/train.csv")
test_df = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/test.csv")

code_d = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_D_코드.csv")
code_h = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_H_코드.csv").iloc[:,:-1]
code_l = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_L_코드.csv")

print(train_df.shape, test_df.shape)

(501951, 35) (46404, 34)


In [4]:
code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
code_h.columns= ["attribute_h", "attribute_h_p"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

In [6]:
def Merge_codes(df:pd.DataFrame, df_code:pd.DataFrame, col:str) -> pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how = 'left', on = col)

### Feature_engineering과 Feature_enginerring2가 핵심적인 변화

1. Feature_engineering 함수를 활성화 할 경우
contents_rn이라는 변수를 제외하는 기존 모델과 다르게 변수로 사용함
다만 그대로 사용하지 않고
contents_rn을 몇번씩 등장했는지 카운팅해서 그 값을 변수로 사용

2. Feature_engineering2 함수를 활성화 할 경우
contents_rn과 같은 방식으로 person_rn변수를 전처리함
기존 모델은 person_rn 변수를 통으로 사용했음


In [89]:
#content_rn이라는 변수도 사용하되, 그대로 사용하는 것이 아니라 출현 빈도로 값을 재할당

def Feature_engineering(df):
    content_freq = df.groupby('contents_rn').count()['id']
    df = pd.merge(df, content_freq, how = 'left', on = 'contents_rn')
    # df.id_y = np.where(
    #     df['id_y'] == 1, 1,
    #     np.where(df['id_y'] < 5, 5,
    #     np.where(df['id_y'] < 10, 10, 'over10')))
    return df

In [7]:
## person_rn도 count해서 사용 (overfitting 방지)

def Feature_engineering2(df):
    content_freq = df.groupby('contents_rn').count()['id']
    df_temp = pd.merge(df, content_freq, how = 'left', on = 'contents_rn')
    user_freq = df.groupby('person_rn').count()['id']
    df = pd.merge(df_temp, user_freq, how = 'left', on = 'person_rn')
    return df

In [9]:
def Preprocess_data(
    df:pd.DataFrame, is_train:bool = True, cols_merge:List[Tuple[str, pd.DataFrame]] = [], cols_equi:List[Tuple[str, str]] = [],
    cols_drop:List[str] = ['id_x', 'person_prefer_f', 'person_perfer_g', 'contents_open_dt']) -> Tuple[pd.DataFrame, np.ndarray]:

    df = df.copy()

    #기존과의 차이점

    ########################################
    #####어떤 방식으로 모델을 학습시킬지는 여기서 결정
    df = Feature_engineering2(df)#########
    ######################################
    y_data = None
    if is_train:
        y_data = df['target'].to_numpy()
        df = df.drop(columns='target')

    for col, df_code in cols_merge:
        df = Merge_codes(df, df_code, col)
    
    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)
    df = df.drop(columns= cols_drop)
    
    
    return (df, y_data)

In [10]:
cols_merge = [
              ("person_prefer_d_1" , code_d),
              ("person_prefer_d_2" , code_d),
              ("person_prefer_d_3" , code_d),
              ("contents_attribute_d" , code_d),
              ("person_prefer_h_1" , code_h),
              ("person_prefer_h_2" , code_h),
              ("person_prefer_h_3" , code_h),
              ("contents_attribute_h" , code_h),
              ("contents_attribute_l" , code_l),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),

]
#########################################################################################################
#######################앞에서 선택한 모델 학습법에 따라서 3개의 line중 적합한 것을 주석 해제하여 사용####################
########################################################################################################

# 학습에 필요없는 컬럼 리스트
#cols_drop = ["id_x","person_prefer_f","person_prefer_g" ,"contents_open_dt", "contents_rn"] #FE 사용할 경우
#cols_drop = ["id","person_prefer_f","person_prefer_g" ,"contents_open_dt"] #content_rn을 통으로 넣을 경우
cols_drop = ["id_x","person_prefer_f","person_prefer_g" ,"contents_open_dt", "person_rn", "contents_rn"] #FE2를 사용할 경우

In [11]:
x_train, y_train = Preprocess_data(train_df, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _ = Preprocess_data(test_df,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop)
x_train.shape , y_train.shape , x_test.shape

((501951, 65), (501951,), (46404, 65))

In [12]:
cat_features = x_train.columns[x_train.nunique() > 2].tolist()

In [15]:
is_holdout = False
n_splits = 5
iterations = 3000
patience = 100
SEED = 42

cv = KFold(n_splits = n_splits, shuffle = True, random_state=SEED)

In [16]:
#Train

scores = []
models = []
epochs = 1

for train, validation in cv.split(x_train):
    print("====="*10)
    print(epochs)
    epochs += 1
    preds = []

    model = CatBoostClassifier(
        iterations, random_state = SEED, task_type = "GPU", eval_metric = 'F1',
        cat_features = cat_features, one_hot_max_size = 5
    )

    model.fit(
        x_train.iloc[train], y_train[train],
        eval_set = [(x_train.iloc[validation], y_train[validation])],
        early_stopping_rounds = patience, #########################무엇?
        verbose = 100 #################무엇?
    )

    models.append(model)
    scores.append(model.get_best_score()['validation']['F1'])

    if is_holdout:
        break

print("EOT")

Learning rate set to 0.027144
0:	learn: 0.5969243	test: 0.6001954	best: 0.6001954 (0)	total: 38ms	remaining: 1m 54s
100:	learn: 0.6362366	test: 0.6398923	best: 0.6398923 (100)	total: 3.66s	remaining: 1m 44s
200:	learn: 0.6495012	test: 0.6549794	best: 0.6550574 (198)	total: 7.23s	remaining: 1m 40s
300:	learn: 0.6569610	test: 0.6644349	best: 0.6644349 (300)	total: 10.8s	remaining: 1m 36s
400:	learn: 0.6611480	test: 0.6696163	best: 0.6696996 (398)	total: 14.4s	remaining: 1m 33s
500:	learn: 0.6643620	test: 0.6745153	best: 0.6745657 (498)	total: 17.9s	remaining: 1m 29s
600:	learn: 0.6671323	test: 0.6776968	best: 0.6777282 (599)	total: 21.3s	remaining: 1m 25s
700:	learn: 0.6693414	test: 0.6801721	best: 0.6802212 (687)	total: 24.8s	remaining: 1m 21s
800:	learn: 0.6710782	test: 0.6815557	best: 0.6816868 (797)	total: 28.2s	remaining: 1m 17s
900:	learn: 0.6723805	test: 0.6827085	best: 0.6827085 (900)	total: 31.6s	remaining: 1m 13s
1000:	learn: 0.6736406	test: 0.6838113	best: 0.6839564 (995)	tota

In [17]:
print(scores)

[0.6858979263906957, 0.6903441892420584, 0.6847310012756936, 0.6847138047138048, 0.6811853046801482]


In [19]:
#feature importance

for model in models:
    idx = model.get_feature_importance().argsort()[-10:][::-1]
    print(x_train.columns[idx])


### id_y  = contetent_rn을 count로 변환한 값
### id = person_rn을 count로 변환한 값

Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn',
       'person_prefer_d_1_attribute_d_s', 'contents_attribute_d',
       'contents_attribute_l', 'person_attribute_a_1'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn', 'contents_attribute_d',
       'contents_attribute_l', 'person_prefer_d_1_attribute_d_s',
       'person_attribute_a_1'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'contents_attribute_d', 'd_m_match_yn',
       'contents_attribute_l', 'person_attribute_a_1',
       'contents_attribute_h_attribute_h_p'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1', 'd_m_match_yn',
       'contents_attribute_d', 'contents_attribute_h',
       'person_prefer_d_1_attribute_d_s', 'contents_attribute_h_attribute_h_p',
       'contents_attr

In [20]:
#Find Best Threshold

pred_list = []
scores = []

thresholds = np.arange(0.35, 0.55, 0.02)
for threshold in thresholds:
    for i, (train, validation) in enumerate(cv.split(x_train)):
        pred = models[i].predict_proba(x_train.iloc[validation])[:,1]
        pred = np.where(pred >= threshold, 1, 0)
        score = f1_score(y_train[validation], pred)
        scores.append(score)
        pred = models[i].predict_proba(x_test)[:,1]
        pred_list.append(pred)
    #print(scores)
    print(f"{threshold} : {np.mean(scores)} ")

0.35 : 0.7144911610976059 
0.37 : 0.7147750731978745 
0.39 : 0.7148339556201475 
0.41000000000000003 : 0.7144862622079143 
0.43000000000000005 : 0.7137123541555728 
0.45000000000000007 : 0.712411878470429 
0.4700000000000001 : 0.7105170989682268 
0.4900000000000001 : 0.7080043474622492 
0.5100000000000001 : 0.704773229072693 
0.5300000000000001 : 0.7007380596965256 
0.5500000000000002 : 0.6957672681529266 


In [21]:
threshold = 0.4
pred = np.mean(pred_list, axis = 0)
pred = np.where(pred >= threshold, 1, 0)

In [22]:
sample_submission = pd.read_csv('/home/mglee/VSCODE/Dacon/Jobcare_data/sample_submission.csv')
sample_submission['target'] = pred
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,1
3,3,0
4,4,1
...,...,...
46399,46399,1
46400,46400,1
46401,46401,1
46402,46402,1


In [133]:
sample_submission.to_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/prediction_0116_.csv", index=False)