# 라이브러리 및 데이터 로드

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression #로지스틱 회귀
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier #랜덤포레스트
import tensorflow as tf
from tensorflow.keras import layers
from catboost import Pool,CatBoostClassifier
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt
import seaborn as sns
pd.set_option('max_columns', 100)
pd.set_option('display.max_rows', 10000)
import optuna
from optuna import Trial, visualization
from optuna.samplers import TPESampler

In [2]:
# 인덱스 설정
tmp_train=pd.read_csv("train.csv").set_index(keys='id')
tmp_test=pd.read_csv("test.csv").set_index(keys='id')
d_code=pd.read_csv("속성_D_코드.csv").set_index(keys='속성 D 코드')
h_code=pd.read_csv("속성_H_코드.csv").set_index(keys='속성 H 코드')
l_code=pd.read_csv("속성_L_코드.csv").set_index(keys='속성 L 코드')


# DataFrame 전처리 - 회원과 컨텐츠 속성 매칭

## DF 통합 함수

In [3]:
def createDF(main_df,d_code=d_code,h_code=h_code,l_code=l_code): #n:세분류,s:소분류,m:중분류,l:대분류
    main_df=main_df.copy() #df 복사본 만들기

    
    # 하위 속성이 존재하는 속성에 대해 데이터프레임 통합
    main_df["person_prefer_d_1_n"]=main_df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    main_df["person_prefer_d_1_s"]=main_df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    main_df["person_prefer_d_1_m"]=main_df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    main_df["person_prefer_d_1_l"]=main_df["person_prefer_d_1"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    main_df["person_prefer_d_2_n"]=main_df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    main_df["person_prefer_d_2_s"]=main_df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    main_df["person_prefer_d_2_m"]=main_df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    main_df["person_prefer_d_2_l"]=main_df["person_prefer_d_2"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    main_df["person_prefer_d_3_n"]=main_df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    main_df["person_prefer_d_3_s"]=main_df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    main_df["person_prefer_d_3_m"]=main_df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    main_df["person_prefer_d_3_l"]=main_df["person_prefer_d_3"].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])
    
    main_df['person_prefer_h_1_m']=main_df['person_prefer_h_1'].apply(lambda x: h_code.loc[x,"속성 H 중분류코드"])
    main_df['person_prefer_h_1_l']=main_df['person_prefer_h_1'].apply(lambda x: h_code.loc[x,"속성 H 대분류코드"])
    
    main_df['person_prefer_h_2_m']=main_df['person_prefer_h_2'].apply(lambda x: h_code.loc[x,"속성 H 중분류코드"])
    main_df['person_prefer_h_2_l']=main_df['person_prefer_h_2'].apply(lambda x: h_code.loc[x,"속성 H 대분류코드"])
    
    main_df['person_prefer_h_3_m']=main_df['person_prefer_h_3'].apply(lambda x: h_code.loc[x,"속성 H 중분류코드"])
    main_df['person_prefer_h_3_l']=main_df['person_prefer_h_3'].apply(lambda x: h_code.loc[x,"속성 H 대분류코드"])
    
    main_df['contents_attribute_l_n']=main_df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 세분류코드"])
    main_df['contents_attribute_l_s']=main_df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 소분류코드"])
    main_df['contents_attribute_l_m']=main_df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 중분류코드"])
    main_df['contents_attribute_l_l']=main_df['contents_attribute_l'].apply(lambda x: l_code.loc[x,"속성 L 대분류코드"])
    
    main_df['contents_attribute_d_n']=main_df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 세분류코드"])
    main_df['contents_attribute_d_s']=main_df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 소분류코드"])
    main_df['contents_attribute_d_m']=main_df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 중분류코드"])
    main_df['contents_attribute_d_l']=main_df['contents_attribute_d'].apply(lambda x: d_code.loc[x,"속성 D 대분류코드"])

    main_df['contents_attribute_h_m']=main_df['contents_attribute_h'].apply(lambda x: h_code.loc[x,"속성 H 중분류코드"])
    main_df['contents_attribute_h_l']=main_df['contents_attribute_h'].apply(lambda x: h_code.loc[x,"속성 H 대분류코드"])
    
    return main_df
    

## DF 회원과 컨텐츠 속성 매칭 전처리 함수

In [4]:
def preprocessing(df:pd.DataFrame)->pd.DataFrame: #'->'는 Return 값이 어떤 상태인지를 명시하기 위함
    df=df.copy()
    #df 복사본 만들기
    df.drop(['person_prefer_f','person_prefer_g','contents_rn','person_rn'],axis=1,inplace=True)
    #person_prefer_f,g는 값이 모두 1임
    
    #회원 속성과 컨텐츠속성 매칭해서 True, False 명시
    
    #d 코드 매칭여부, d 세분류 매칭여부, d 소분류 매칭여부, d 중분류 매칭여부, d 대분류 매칭여부
    df['d_match_yn']=df['d_s_match_yn'] 
    df['d_n_match_yn']=df['d_m_match_yn'] 
    df['d_s_match_yn']=(df['person_prefer_d_1_s']==df['contents_attribute_d_s']) 
    df['d_m_matck_yn']=(df['person_prefer_d_1_m']==df['contents_attribute_d_m']) 
    #df['d_l_match_yn']은 이미 존재
    
    #d_2 코드 매칭여부, d_2 세분류 매칭여부, d_2 소분류 매칭여부, d_2 중분류 매칭여부, d_2 대분류 매칭여부
    df['d_2_match_yn']=(df['person_prefer_d_2']==df['contents_attribute_d'])
    df['d_2_n_match_yn']=(df['person_prefer_d_2_n']==df['contents_attribute_d_n'])
    df['d_2_s_match_yn']=(df['person_prefer_d_2_s']==df['contents_attribute_d_s'])
    df['d_2_m_match_yn']=(df['person_prefer_d_2_m']==df['contents_attribute_d_m'])
    df['d_2_l_match_yn']=(df['person_prefer_d_2_l']==df['contents_attribute_d_l'])
    
    #d_3 코드 매칭여부, d_3 세분류 매칭여부, d_3 소분류 매칭여부, d_3 중분류 매칭여부, d_3 대분류 매칭여부
    df['d_3_match_yn']=(df['person_prefer_d_3']==df['contents_attribute_d'])
    df['d_3_n_match_yn']=(df['person_prefer_d_3_n']==df['contents_attribute_d_n'])
    df['d_3_s_match_yn']=(df['person_prefer_d_3_s']==df['contents_attribute_d_s'])
    df['d_3_m_match_yn']=(df['person_prefer_d_3_m']==df['contents_attribute_d_m'])
    df['d_3_l_match_yn']=(df['person_prefer_d_3_l']==df['contents_attribute_d_l'])
    
    #h 코드 매칭여부, h 소분류 매칭여부, h 중분류 매칭여부, h 대분류 매칭여부
    df['h_match_yn']=df['h_s_match_yn']
    #df['h_s_match_yn'], df['h_m_match_yn'], df['h_l_match_yn'] 은 이미 존재
    
    #나머지 코드에 대해서도 모두 회원-컨텐츠 매칭
    df['h_2_match_yn']=(df['person_prefer_h_2']==df['contents_attribute_h'])
    df['h_2_m_match_yn']=(df['person_prefer_h_2_m']==df['contents_attribute_h_m'])
    df['h_2_l_match_yn']=(df['person_prefer_h_2_l']==df['contents_attribute_h_l'])
    df['h_3_match_yn']=(df['person_prefer_h_3']==df['contents_attribute_h'])
    df['h_3_m_match_yn']=(df['person_prefer_h_3_m']==df['contents_attribute_h_m'])
    df['h_3_l_match_yn']=(df['person_prefer_h_3_l']==df['contents_attribute_h_l'])
    df['a_match_yn']=(df['person_attribute_a']==df['contents_attribute_a'])
    df['c_match_yn']=(df['person_prefer_c']==df['contents_attribute_c'])
    df['e_match_yn']=(df['person_prefer_e']==df['contents_attribute_e'])
    
    for column in df.keys():
        if column=='contents_open_dt':
            df[column]=df[column].astype('datetime64').dt.dayofweek #0~6(0부터 월요일~)
            #dt.dayofweek 에서 dt는 접근자
        if df[column].dtypes!='datetime64':
            df[column]=df[column].astype('int64') #날짜 데이터를 제외한 모든 칼럼 타입 int로 단일화

    df['contents_attribute_k']-=1 # 기존 1,2로 되어있던 칼럼 값은 0,1로 맞춤
    df.drop(['h_s_match_yn','contents_open_dt'],axis=1,inplace=True) 
    # h_s_match_yn은 h_match_yn으로 대체했기 때문에 삭제, 요일 데이터는 EDA 과정에서 변별력이 없음을 확인했기 때문에 삭제
    return df


## 각각 데이터 전처리 실시(train, test)

In [5]:
train_set=preprocessing(createDF(tmp_train))
test_set=preprocessing(createDF(tmp_test))

--------start---------
---------mid_1----------
---------mid_2----------
---------mid_3----------
-----------finish------------
--------start---------
---------mid_1----------
---------mid_2----------
---------mid_3----------
-----------finish------------


## Target Data Split

In [8]:
# 타겟 칼럼 인덱스값 반환
key_list=list(train_set.keys())
target_idx=key_list.index('target')
# x,y 데이터 분리
y_train=train_set['target']
x_train=train_set[key_list[:target_idx]+key_list[target_idx+1:]]

# Optuna 파라미터 최적화

In [12]:
def objective(trial):
    op_x_train,op_x_val,op_y_train,op_y_val=train_test_split(x_train,y_train,test_size=0.2,shuffle=True,stratify=y_train,random_state=50)
    op_cat_features=op_x_train.columns[x_train.nunique()>2].to_list()
    
    # 튜닝할 파라미터
    param={
        'random_state':50,
        'learning_rate' : trial.suggest_loguniform('learning_rate', 0.01, 0.3),
        'objective':trial.suggest_categorical('objective',['Logloss','CrossEntropy']),
        "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),
        "depth": trial.suggest_int("depth", 4, 16),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]),
        "used_ram_limit": "3gb"
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 50)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        
    gbm = CatBoostClassifier(**param,
                             eval_metric='F1',
                            )

    gbm.fit(op_x_train, op_y_train,
            eval_set=[(op_x_val, op_y_val)], 
            verbose=100, early_stopping_rounds=100,
           cat_features=op_cat_features)
   
    preds = gbm.predict(op_x_val)
    pred_labels = np.rint(preds)
    accuracy = f1_score(op_y_val, pred_labels)
    return accuracy
    

F1스코어를 최대화하는 방향으로 optuna 진행

In [None]:
study = optuna.create_study(direction = "maximize", sampler = TPESampler(seed=50))
study.optimize(objective, n_trials = 10)

[32m[I 2022-01-27 14:48:42,854][0m A new study created in memory with name: no-name-2b8552f4-1d6e-48e1-a204-076566b50f70[0m


0:	learn: 0.6193824	test: 0.6179773	best: 0.6179773 (0)	total: 2.43s	remaining: 40m 29s
100:	learn: 0.6794778	test: 0.6874739	best: 0.6880381 (95)	total: 5m	remaining: 44m 33s
200:	learn: 0.6962243	test: 0.6894330	best: 0.6894531 (157)	total: 9m 55s	remaining: 39m 26s
300:	learn: 0.7112682	test: 0.6888960	best: 0.6896881 (255)	total: 14m 49s	remaining: 34m 26s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.689688061
bestIteration = 255

Shrink model to first 256 iterations.


[32m[I 2022-01-27 15:06:36,436][0m Trial 0 finished with value: 0.6896880610412927 and parameters: {'learning_rate': 0.19287629564761058, 'objective': 'Logloss', 'colsample_bylevel': 0.2673201088811613, 'depth': 9, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.4621291559327826}. Best is trial 0 with value: 0.6896880610412927.[0m


0:	learn: 0.6154070	test: 0.6143346	best: 0.6143346 (0)	total: 2.87s	remaining: 47m 42s
100:	learn: 0.6450055	test: 0.6479731	best: 0.6481370 (99)	total: 5m 7s	remaining: 45m 38s
200:	learn: 0.6596400	test: 0.6705277	best: 0.6706607 (199)	total: 11m 18s	remaining: 44m 55s
300:	learn: 0.6663644	test: 0.6781436	best: 0.6781436 (300)	total: 17m 13s	remaining: 39m 59s
400:	learn: 0.6710065	test: 0.6823174	best: 0.6823588 (398)	total: 23m 7s	remaining: 34m 32s
500:	learn: 0.6749002	test: 0.6849186	best: 0.6850012 (492)	total: 28m 59s	remaining: 28m 52s
600:	learn: 0.6786947	test: 0.6867109	best: 0.6867109 (600)	total: 35m 5s	remaining: 23m 17s
700:	learn: 0.6816437	test: 0.6879609	best: 0.6879609 (700)	total: 41m 17s	remaining: 17m 36s
800:	learn: 0.6846443	test: 0.6891047	best: 0.6891849 (794)	total: 47m 25s	remaining: 11m 46s
900:	learn: 0.6874353	test: 0.6901514	best: 0.6902475 (896)	total: 53m 34s	remaining: 5m 53s
999:	learn: 0.6904550	test: 0.6911827	best: 0.6911827 (999)	total: 59m 5

[32m[I 2022-01-27 16:06:59,437][0m Trial 1 finished with value: 0.6911827320017031 and parameters: {'learning_rate': 0.014688411186605884, 'objective': 'CrossEntropy', 'colsample_bylevel': 0.24578659346044443, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.5648520681926749}. Best is trial 1 with value: 0.6911827320017031.[0m


0:	learn: 0.6003132	test: 0.5964669	best: 0.5964669 (0)	total: 1.5s	remaining: 24m 55s
100:	learn: 0.6443802	test: 0.6665567	best: 0.6665567 (100)	total: 3m 7s	remaining: 27m 45s
200:	learn: 0.6513106	test: 0.6744742	best: 0.6744991 (197)	total: 5m 56s	remaining: 23m 37s
300:	learn: 0.6536372	test: 0.6753302	best: 0.6754212 (258)	total: 8m 21s	remaining: 19m 25s
400:	learn: 0.6552231	test: 0.6766924	best: 0.6767863 (398)	total: 11m	remaining: 16m 26s
500:	learn: 0.6564574	test: 0.6774746	best: 0.6774746 (500)	total: 13m 48s	remaining: 13m 45s
600:	learn: 0.6573804	test: 0.6781494	best: 0.6784194 (565)	total: 16m 38s	remaining: 11m 2s
700:	learn: 0.6582628	test: 0.6784424	best: 0.6787862 (650)	total: 19m 30s	remaining: 8m 19s
800:	learn: 0.6586992	test: 0.6788745	best: 0.6789241 (782)	total: 22m 22s	remaining: 5m 33s
900:	learn: 0.6590323	test: 0.6790089	best: 0.6791776 (893)	total: 25m 11s	remaining: 2m 46s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6791775855

[32m[I 2022-01-27 16:35:04,077][0m Trial 2 finished with value: 0.6791775855429382 and parameters: {'learning_rate': 0.03679667116366644, 'objective': 'Logloss', 'colsample_bylevel': 0.6031479468956237, 'depth': 4, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS'}. Best is trial 1 with value: 0.6911827320017031.[0m


0:	learn: 0.5820987	test: 0.5727469	best: 0.5727469 (0)	total: 6.59s	remaining: 1h 49m 44s
100:	learn: 0.6850719	test: 0.6254564	best: 0.6256528 (94)	total: 10m 31s	remaining: 1h 33m 43s
200:	learn: 0.7343812	test: 0.6311270	best: 0.6317819 (182)	total: 21m 31s	remaining: 1h 25m 32s
300:	learn: 0.7750993	test: 0.6314046	best: 0.6318927 (269)	total: 34m 27s	remaining: 1h 20m
400:	learn: 0.8071744	test: 0.6337974	best: 0.6339236 (399)	total: 49m 8s	remaining: 1h 13m 24s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6339235716
bestIteration = 399

Shrink model to first 400 iterations.


[32m[I 2022-01-27 17:39:10,095][0m Trial 3 finished with value: 0.6339235715527062 and parameters: {'learning_rate': 0.09752108587642991, 'objective': 'Logloss', 'colsample_bylevel': 0.5732370671182497, 'depth': 11, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'bagging_temperature': 15.462879281827357}. Best is trial 1 with value: 0.6911827320017031.[0m


0:	learn: 0.6187690	test: 0.6199395	best: 0.6199395 (0)	total: 2.59s	remaining: 43m 11s
100:	learn: 0.6681243	test: 0.6793589	best: 0.6793589 (100)	total: 6m 41s	remaining: 59m 33s
200:	learn: 0.6814408	test: 0.6858235	best: 0.6858235 (200)	total: 13m 31s	remaining: 53m 43s
300:	learn: 0.6938861	test: 0.6895850	best: 0.6897356 (297)	total: 20m 44s	remaining: 48m 9s
400:	learn: 0.7051247	test: 0.6907496	best: 0.6908902 (379)	total: 28m 31s	remaining: 42m 36s
500:	learn: 0.7156661	test: 0.6914175	best: 0.6914175 (500)	total: 36m 28s	remaining: 36m 19s
600:	learn: 0.7246995	test: 0.6918458	best: 0.6921067 (559)	total: 45m 20s	remaining: 30m 6s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.6921067068
bestIteration = 559

Shrink model to first 560 iterations.


[32m[I 2022-01-27 18:29:59,134][0m Trial 4 finished with value: 0.6921067068351512 and parameters: {'learning_rate': 0.059482101891859186, 'objective': 'Logloss', 'colsample_bylevel': 0.3658887011221209, 'depth': 10, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'subsample': 0.39793944042783425}. Best is trial 4 with value: 0.6921067068351512.[0m


# Modeling(KFold)

값이 2가지 이상인 칼럼을 cat_features로 지정

In [None]:
#캣부스트(CatBoost)
cat_features=x_train.columns[x_train.nunique()>2].to_list()
is_holdout=False

cv=StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=50
)
scores=[]
models=[]

for train_idx,val_idx in cv.split(x_train,y_train):
    print('='*60)
    preds=[]
    
    model=CatBoostClassifier(
        **study.best_params,
        eval_metric='F1',
    )
    
    model.fit(x_train.iloc[train_idx],y_train[train_idx],
              eval_set=[(x_train.iloc[val_idx],y_train[val_idx])],
              early_stopping_rounds=100,
              cat_features=cat_features,
              verbose=100)
    
    models.append(model)
    scores.append(model.get_best_score()['validation']['F1'])
    if is_holdout:
        break

print(scores)
print(np.mean(scores))

## 최적 threshold 탐색

In [None]:
N=61

threshold가 0.3~0.4일때 f1 스코어가 가장 높게 나와서 그 사이 값 중 리더보드에 가장 높게 나오는 값을 선정

In [None]:
thresholds = 0.3475
scores_2=[]
y_graph=[]
for threshold in thresholds:
    print(f'threshold:{threshold}')
    for i,(train_idx,val_idx) in enumerate(cv.split(x_train,y_train)):
        pred=models[i].predict_proba(x_train.iloc[val_idx])[:,1]
        pred=np.where(pred>=threshold,1,0)
        score=f1_score(y_train[val_idx],pred)
        scores_2.append(score)
    y_graph.append(scores_2)
    scores_2=[]
print(y_graph)

## 모델 저장

In [None]:
for i in range(1,11):
    models[i-1].save_model(f'cat_boost_model_{i}')

## 모델 불러오기

In [None]:
l_models=[CatBoostClassifier().load_model(f'cat_boost_model_{i}') for i in range(1,11)]

## 최적 threshold를 이용하여 test set 예측

In [None]:
threshold=0.38
pred_list=[]
scores_2=[]

for i,(train_idx,val_idx) in enumerate(cv.split(x_train,y_train)):
    pred=l_models[i].predict_proba(x_train.iloc[val_idx])[:,1]
    pred=np.where(pred>=threshold,1,0)
    score=f1_score(y_train[val_idx],pred)
    scores_2.append(score)
    pred=l_models[i].predict_proba(test_set)[:,1]
    pred_list.append(pred)

print(scores_2)
print(np.mean(scores_2))

# Submission

## 산술평균 앙상블

In [None]:
preds=np.mean(pred_list,axis=0)
preds=np.where(preds>=threshold,1,0)

## 제출

In [None]:
submission=pd.read_csv('sample_submission.csv')
submission['target']=preds.round()
submission.to_csv('D:/Users/user/Desktop/submission.csv',index=False)