In [24]:
import os
import sys
import platform
import random
import math
from typing import List, Dict, Tuple
from datetime import datetime

import pandas as pd
import numpy as np

import sklearn
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score

from catboost import Pool, CatBoostClassifier

In [25]:
print(f"- os: {platform.platform()}")
print(f"- python: {sys.version}")
print(f"- pandas: {pd.__version__}")
print(f"- numpy: {np.__version__}")
print(f"- sklearn: {sklearn.__version__}")

- os: Linux-5.4.0-91-generic-x86_64-with-glibc2.10
- python: 3.8.5 (default, Sep  4 2020, 07:30:14) 
[GCC 7.3.0]
- pandas: 1.1.3
- numpy: 1.19.2
- sklearn: 0.23.2


In [26]:
train_df = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/train.csv")
test_df = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/test.csv")

code_d = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_D_코드.csv")
code_h = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_H_코드.csv").iloc[:,:-1]
code_l = pd.read_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/속성_L_코드.csv")

print(train_df.shape, test_df.shape)

(501951, 35) (46404, 34)


In [27]:
code_d.columns= ["attribute_d","attribute_d_d","attribute_d_s","attribute_d_m","attribute_d_l"]
code_h.columns= ["attribute_h", "attribute_h_p"]
code_l.columns= ["attribute_l","attribute_l_d","attribute_l_s","attribute_l_m","attribute_l_l"]

In [28]:
def Merge_codes(df:pd.DataFrame, df_code:pd.DataFrame, col:str) -> pd.DataFrame:
    df = df.copy()
    df_code = df_code.copy()
    df_code = df_code.add_prefix(f"{col}_")
    df_code.columns.values[0] = col
    return pd.merge(df, df_code, how = 'left', on = col)

### Feature_engineering과 Feature_enginerring2가 핵심적인 변화

1. Feature_engineering 함수를 활성화 할 경우
contents_rn이라는 변수를 제외하는 기존 모델과 다르게 변수로 사용함
다만 그대로 사용하지 않고
contents_rn을 몇번씩 등장했는지 카운팅해서 그 값을 변수로 사용

2. Feature_engineering2 함수를 활성화 할 경우
contents_rn과 같은 방식으로 person_rn변수를 전처리함
기존 모델은 person_rn 변수를 통으로 사용했음


In [29]:
## person_rn도 count해서 사용 (overfitting 방지)

def Feature_engineering2(df):
    content_dict = {}
    person_dict = {}
    for id in df['contents_rn']:
        try:
            content_dict[id] = content_dict[id] + 1
        except:
            content_dict[id] = 1

    for id in df['person_rn']:
        try:
            person_dict[id] = person_dict[id] + 1
        except:
            person_dict[id] = 1
    return content_dict, person_dict

In [30]:
def Date_pre(data):
    data['contents_open_dt'] = data['contents_open_dt'].astype('str')
    DATE = data['contents_open_dt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))
    
    DATE = pd.DataFrame(DATE)
    DATE = DATE.rename(columns = {'contents_open_dt': 'date'})
    
    DATE['Y'] = DATE['date'].apply(lambda x: x.timetuple()[0])
    DATE['M'] = DATE['date'].apply(lambda x: x.timetuple()[1])
    DATE['D'] = DATE['date'].apply(lambda x: x.timetuple()[2])
    DATE['id'] = data['id']
    
    data = data.merge(DATE, on = 'id', how = 'left')
    data = data.drop(columns = ['date', 'contents_open_dt'])
    return data

In [43]:
def Preprocess_data(
    df:pd.DataFrame, is_train:bool = True, cols_merge:List[Tuple[str, pd.DataFrame]] = [], cols_equi:List[Tuple[str, str]] = [],
    cols_drop:List[str] = ['id', 'person_prefer_f', 'person_perfer_g'], contents_dict = {}, person_dict = {}) -> Tuple[pd.DataFrame, np.ndarray]:

    df = df.copy()

    #기존과의 차이점

    df = Date_pre(df)

    if is_train:
        contents_dict, person_dict = Feature_engineering2(df)

    contents_lst = []
    for id in df['contents_rn']:
        try:
            freq = person_dict[id]
        except:
            freq = 1 
        contents_lst.append(freq)
    df['contents_freq'] = contents_lst

    person_lst = []
    for id in df['person_rn']:
        try:
            freq = person_dict[id]
        except:
            freq = 1 
        person_lst.append(freq)
    df['person_freq'] = person_lst

    y_data = None
    if is_train:
        y_data = df['target'].to_numpy()
        df = df.drop(columns='target')

    for col, df_code in cols_merge:
        df = Merge_codes(df, df_code, col)
    
    cols = df.select_dtypes(bool).columns.tolist()
    df[cols] = df[cols].astype(int)

    for col1, col2 in cols_equi:
        df[f"{col1}_{col2}"] = (df[col1] == df[col2]).astype(int)
    df = df.drop(columns= cols_drop)
    
    
    return (df, y_data, contents_dict, person_dict)

In [36]:
cols_merge = [
              ("person_prefer_d_1" , code_d),
              ("person_prefer_d_2" , code_d),
              ("person_prefer_d_3" , code_d),
              ("contents_attribute_d" , code_d),
              ("person_prefer_h_1" , code_h),
              ("person_prefer_h_2" , code_h),
              ("person_prefer_h_3" , code_h),
              ("contents_attribute_h" , code_h),
              ("contents_attribute_l" , code_l),
]

# 회원 속성과 콘텐츠 속성의 동일한 코드 여부에 대한 컬럼명 리스트
cols_equi = [

    ("contents_attribute_c","person_prefer_c"),
    ("contents_attribute_e","person_prefer_e"),

    ("person_prefer_d_2_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_2_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_2_attribute_d_l" , "contents_attribute_d_attribute_d_l"),
    ("person_prefer_d_3_attribute_d_s" , "contents_attribute_d_attribute_d_s"),
    ("person_prefer_d_3_attribute_d_m" , "contents_attribute_d_attribute_d_m"),
    ("person_prefer_d_3_attribute_d_l" , "contents_attribute_d_attribute_d_l"),

    ("person_prefer_h_1_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_2_attribute_h_p" , "contents_attribute_h_attribute_h_p"),
    ("person_prefer_h_3_attribute_h_p" , "contents_attribute_h_attribute_h_p"),

]
#########################################################################################################
#######################앞에서 선택한 모델 학습법에 따라서 3개의 line중 적합한 것을 주석 해제하여 사용####################
########################################################################################################

# 학습에 필요없는 컬럼 리스트
#cols_drop = ["id_x","person_prefer_f","person_prefer_g", "contents_rn"] #FE 사용할 경우
#cols_drop = ["id","person_prefer_f","person_prefer_g"] #content_rn을 통으로 넣을 경우
cols_drop = ["id", "person_prefer_f","person_prefer_g", "person_rn", "contents_rn"] #FE2를 사용할 경우

In [42]:
x_train, y_train, contents_dict, person_dict = Preprocess_data(train_df, cols_merge = cols_merge , cols_equi= cols_equi , cols_drop = cols_drop)
x_test, _, _, _ = Preprocess_data(test_df,is_train = False, cols_merge = cols_merge , cols_equi= cols_equi  , cols_drop = cols_drop, contents_dict = contents_dict ,person_dict = person_dict)
x_train.shape , y_train.shape , x_test.shape

contents_attribute_c person_prefer_c
contents_attribute_e person_prefer_e
person_prefer_d_2_attribute_d_s contents_attribute_d_attribute_d_s
person_prefer_d_2_attribute_d_m contents_attribute_d_attribute_d_m
person_prefer_d_2_attribute_d_l contents_attribute_d_attribute_d_l
person_prefer_d_3_attribute_d_s contents_attribute_d_attribute_d_s
person_prefer_d_3_attribute_d_m contents_attribute_d_attribute_d_m
person_prefer_d_3_attribute_d_l contents_attribute_d_attribute_d_l
person_prefer_h_1_attribute_h_p contents_attribute_h_attribute_h_p
person_prefer_h_2_attribute_h_p contents_attribute_h_attribute_h_p
person_prefer_h_3_attribute_h_p contents_attribute_h_attribute_h_p
contents_attribute_c person_prefer_c
contents_attribute_e person_prefer_e
person_prefer_d_2_attribute_d_s contents_attribute_d_attribute_d_s
person_prefer_d_2_attribute_d_m contents_attribute_d_attribute_d_m
person_prefer_d_2_attribute_d_l contents_attribute_d_attribute_d_l
person_prefer_d_3_attribute_d_s contents_attribu

((501951, 68), (501951,), (46404, 68))

In [44]:
cat_features = x_train.columns[x_train.nunique() > 2].tolist()

In [45]:
is_holdout = False
n_splits = 5
iterations = 3000
patience = 100
SEED = 42

cv = KFold(n_splits = n_splits, shuffle = True, random_state=SEED)

In [46]:
#Train

scores = []
models = []
epochs = 1

for train, validation in cv.split(x_train):
    print("====="*10)
    print(epochs)
    epochs += 1
    preds = []

    model = CatBoostClassifier(
        iterations, random_state = SEED, task_type = "GPU", eval_metric = 'F1',
        cat_features = cat_features, one_hot_max_size = 5
    )

    model.fit(
        x_train.iloc[train], y_train[train],
        eval_set = [(x_train.iloc[validation], y_train[validation])],
        early_stopping_rounds = patience, #########################무엇?
        verbose = 100 #################무엇?
    )

    models.append(model)
    scores.append(model.get_best_score()['validation']['F1'])

    if is_holdout:
        break

print("EOT")

1
Learning rate set to 0.027144
0:	learn: 0.6036887	test: 0.6071074	best: 0.6071074 (0)	total: 95.8ms	remaining: 4m 47s
100:	learn: 0.6434502	test: 0.6558652	best: 0.6558652 (100)	total: 14.6s	remaining: 6m 59s
200:	learn: 0.6524004	test: 0.6677098	best: 0.6677773 (197)	total: 29.1s	remaining: 6m 44s
300:	learn: 0.6571145	test: 0.6746992	best: 0.6747271 (297)	total: 43s	remaining: 6m 25s
400:	learn: 0.6608149	test: 0.6782788	best: 0.6783282 (399)	total: 57s	remaining: 6m 9s
500:	learn: 0.6637137	test: 0.6797272	best: 0.6797272 (500)	total: 1m 10s	remaining: 5m 50s
600:	learn: 0.6666587	test: 0.6794203	best: 0.6800190 (513)	total: 1m 23s	remaining: 5m 32s
bestTest = 0.6800190393
bestIteration = 513
Shrink model to first 514 iterations.
2
Learning rate set to 0.027144
0:	learn: 0.6131835	test: 0.6153311	best: 0.6153311 (0)	total: 85.7ms	remaining: 4m 17s
100:	learn: 0.6422923	test: 0.6554533	best: 0.6554533 (100)	total: 14.6s	remaining: 6m 58s
200:	learn: 0.6527827	test: 0.6680749	best: 

In [47]:
print(scores)

[0.6800190392638151, 0.6824480807927407, 0.6807524276475189, 0.6757392953464658, 0.6758724289345968]


In [None]:
############################여기까지 수정했음

In [19]:
#feature importance

for model in models:
    idx = model.get_feature_importance().argsort()[-10:][::-1]
    print(x_train.columns[idx])


### id_y  = contetent_rn을 count로 변환한 값
### id = person_rn을 count로 변환한 값

Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn',
       'person_prefer_d_1_attribute_d_s', 'contents_attribute_d',
       'contents_attribute_l', 'person_attribute_a_1'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn',
       'person_prefer_d_1_attribute_d_s', 'contents_attribute_d',
       'contents_attribute_l', 'person_attribute_a_1'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn', 'contents_attribute_d',
       'contents_attribute_l', 'person_attribute_a_1',
       'contents_attribute_h_attribute_h_p'],
      dtype='object')
Index(['id_y', 'id', 'd_l_match_yn', 'contents_attribute_j_1',
       'contents_attribute_h', 'd_m_match_yn',
       'person_prefer_d_1_attribute_d_s', 'contents_attribute_d',
       'contents_attribute_l', 'person_attribute_a_1'],
   

In [20]:
#Find Best Threshold

pred_list = []
scores = []

thresholds = np.arange(0.39, 0.43, 0.01)
for threshold in thresholds:
    for i, (train, validation) in enumerate(cv.split(x_train)):
        pred = models[i].predict_proba(x_train.iloc[validation])[:,1]
        pred = np.where(pred >= threshold, 1, 0)
        score = f1_score(y_train[validation], pred)
        scores.append(score)
        pred = models[i].predict_proba(x_test)[:,1]
        pred_list.append(pred)
    #print(scores)
    print(f"{threshold} : {np.mean(scores)} ")

0.39 : 0.7152005690696145 
0.4 : 0.7149087513082059 
0.41000000000000003 : 0.7144867389873769 
0.42000000000000004 : 0.7139532760644545 


In [40]:
threshold = 0.39

for i, (train, validation) in enumerate(cv.split(x_train)):
    pred = models[i].predict_proba(x_train.iloc[validation])[:,1]
    pred = np.where(pred >= threshold, 1, 0)
    score = f1_score(y_train[validation], pred)
    scores.append(score)
    pred = models[i].predict_proba(x_test)[:,1]
    pred_list.append(pred)
pred = np.mean(pred_list, axis = 0)

#pred

In [41]:
train_df['target'].mean()

0.4997400144635632

In [47]:
threshold = 0.5
ans = np.where(pred >= threshold, 1, 0)
print(pred.mean(), ans.mean())

0.4775806916713207 0.4860356865787432


##########LGBM이랑 섞어보기

In [22]:
lgbm = pd.read_pickle("lgbm.pkl")

final = []
for i in range(len(pred)):
    final.append(pred[i] + lgbm[i] / 2)

#final


In [26]:
ans = []
for item in final:
    if item >= 0.5:
        ans.append(1)
    else:
        ans.append(0)
sum(ans) / len(ans)

0.8257693302301525

In [48]:
sample_submission = pd.read_csv('/home/mglee/VSCODE/Dacon/Jobcare_data/sample_submission.csv')
sample_submission['target'] = ans
sample_submission

Unnamed: 0,id,target
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0
...,...,...
46399,46399,1
46400,46400,0
46401,46401,1
46402,46402,1


In [32]:
sample_submission.to_csv("/home/mglee/VSCODE/Dacon/Jobcare_data/prediction_0119_t5.csv", index=False)