In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold , KFold
from sklearn.metrics import f1_score 
import warnings

In [None]:
warnings.filterwarnings(action = 'ignore')

In [None]:
# Load data
# NOTE: machine-specific absolute paths; edit these two constants to relocate
# the data. Forward slashes are used throughout: the original literals
# contained "\D", an invalid escape sequence (DeprecationWarning since
# Python 3.6, SyntaxWarning since 3.12).
TRAIN_TEST_DIR = 'C:/Users/김민성/Desktop/PythonWorkspace/잡케어추천알고리즘공모전'
SUBMISSION_DIR = 'C:/Users/김민성/Desktop/공모전/잡케어 추천 알고리즘 경진대회/Jobcare_data'

train = pd.read_csv(TRAIN_TEST_DIR + '/train.csv', encoding='cp949')
test = pd.read_csv(TRAIN_TEST_DIR + '/test.csv', encoding='cp949')
# Submission template; its `target` column is overwritten at the end of the notebook.
ss = pd.read_csv(SUBMISSION_DIR + '/sample_submission.csv', encoding='utf-8')

In [None]:
# Remove the identifier column so `test` carries the same feature columns as X.
# Rebinding with errors="ignore" (instead of inplace=True) keeps the cell safe
# to re-run: the in-place version raised KeyError once "id" was already gone.
test = test.drop(columns="id", errors="ignore")

## X, y (features and target)

In [None]:
# Label column first, then the predictors: everything except the label and
# the row identifier.
y = train["target"]
X = train.drop(columns=["target", "id"])

In [None]:
# 80/20 hold-out split. The fixed random_state makes the split (and therefore
# the seeded Optuna study that is evaluated on it) reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Columns with more than two distinct values; passed to CatBoost as
# categorical features.
cat_cols = [col for col, n_distinct in X_train.nunique().items() if n_distinct > 2]

## Optuna hyperparameter search

In [None]:
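# NOTE: disabled copy of `objective`, kept for reference only; the active
# definition is in the next cell.
# def objective(trial):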
#     # Parameters
#     param = {
#         "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
#         "objective": trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"]),
#         "colsample_bylevel": trial.suggest_float("colsample_bylevel", 0.1, 1.0),
#         "depth": trial.suggest_int("depth", 4, 16),
#         "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
#         "bootstrap_type": trial.suggest_categorical(
#             "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
#         ),
#         "used_ram_limit": "3gb",
     
#     }

#     if param["bootstrap_type"] == "Bayesian":
#         param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 50)
#     elif param["bootstrap_type"] == "Bernoulli":
#         param["subsample"] = trial.suggest_float("subsample", 0.1, 1)
        
#     #Learning
    
#     cat = CatBoostClassifier(**param, eval_metric="F1", one_hot_max_size=3)
#     cat.fit(X_train, y_train,
            
#             eval_set=[(X_train, y_train), (X_val,y_val)],
#             early_stopping_rounds=100,cat_features=cat_cols,
#             verbose=100)
#     cat_pred = cat.predict(X_val)
#     score = f1_score(y_val, cat_pred)
        
#     return score

In [None]:
def objective(trial):
    """Optuna objective: fit one CatBoost model and score it on the hold-out split.

    Draws a CatBoost configuration from ``trial``, trains on the notebook-level
    ``X_train`` / ``y_train`` with early stopping, and scores the hard class
    predictions on ``X_val`` / ``y_val``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        F1 score of the fitted model on the validation split.
    """
    # Search space. The suggestions are drawn in a fixed order so a seeded
    # sampler produces the same sequence of configurations.
    params = {}
    params["learning_rate"] = trial.suggest_float("learning_rate", 0.01, 0.3)
    params["objective"] = trial.suggest_categorical("objective", ["Logloss", "CrossEntropy"])
    params["colsample_bylevel"] = trial.suggest_float("colsample_bylevel", 0.1, 1.0)
    params["depth"] = trial.suggest_int("depth", 4, 16)
    params["boosting_type"] = trial.suggest_categorical("boosting_type", ["Ordered", "Plain"])
    params["bootstrap_type"] = trial.suggest_categorical(
        "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
    )
    params["used_ram_limit"] = "3gb"

    # Extra knob that is only sampled for the matching bootstrap type.
    bootstrap = params["bootstrap_type"]
    if bootstrap == "Bayesian":
        params["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 50)
    elif bootstrap == "Bernoulli":
        params["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    # Train with early stopping, logging every 100 iterations.
    clf = CatBoostClassifier(eval_metric="F1", one_hot_max_size=3, **params)
    clf.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_val, y_val)],
        cat_features=cat_cols,
        early_stopping_rounds=100,
        verbose=100,
    )

    return f1_score(y_val, clf.predict(X_val))

In [None]:
# Seeded TPE sampler; the study maximizes the value returned by `objective`
# over 10 trials.
sampler = TPESampler(seed=42)
study = optuna.create_study(sampler=sampler, direction="maximize")
study.optimize(objective, n_trials=10)

In [None]:
# Hyperparameter importances estimated from the finished study.
optuna.visualization.plot_param_importances(study=study)

## K-Fold

In [None]:
# Put the training rows back into ascending index order.
X_train = X_train.sort_index()

In [None]:
# Renumber the rows 0..n-1 (old index discarded) so the positional indices
# produced by KFold.split line up with the index labels.
X_train1 = X_train.reset_index(drop=True)

In [None]:
# Same ascending index order as the features, so rows and labels stay aligned.
y_train = y_train.sort_index()

In [None]:
# Renumber the labels 0..n-1 (old index discarded) so the positional indices
# produced by KFold.split line up with the index labels.
y_train1 = y_train.reset_index(drop=True)

In [None]:
# Optuna version: train one CatBoost model per fold with the tuned hyperparameters.
is_holdout = False  # True -> stop after the first fold (single hold-out run)

# A fixed integer random_state makes every cv.split(...) call yield the same
# folds. Without it, shuffle=True reshuffles on each call, so a later
# cv.split(X_train1) would pair each model with rows it had been trained on.
cv = KFold(n_splits=10, shuffle=True, random_state=42)

scores = []  # best validation F1 per fold
models = []  # fitted model per fold, in fold order
for train_idx, val_idx in cv.split(X_train1):
    print('=' * 60)

    # Same categorical handling as in `objective` (cat_features and
    # one_hot_max_size=3), so the tuned values are applied to the kind of
    # model they were tuned for.
    model = CatBoostClassifier(**study.best_params, eval_metric='F1', one_hot_max_size=3)

    model.fit(
        X_train1.iloc[train_idx], y_train1.iloc[train_idx],
        eval_set=[(X_train1.iloc[val_idx], y_train1.iloc[val_idx])],
        cat_features=cat_cols,
        early_stopping_rounds=100,
        verbose=1,
    )

    models.append(model)
    # Best F1 reached on this fold's validation rows during training.
    scores.append(model.get_best_score()['validation']['F1'])
    if is_holdout:
        break

print(scores)
print(np.mean(scores))

In [None]:
threshold = 0.3475  # probability cut-off for predicting the positive class
pred_list = []      # positive-class probabilities on `test`, one array per model
scores_2 = []       # F1 at `threshold` on each model's validation fold

# NOTE: the folds regenerated here only match the ones each model was trained
# on if `cv` returns identical splits on every call (fixed integer
# random_state, or shuffle=False). Otherwise these scores are computed on rows
# the model has already seen.
# zip() stops at the shorter input, so this also works when fewer models than
# folds were trained (is_holdout=True) instead of raising IndexError.
for model, (_, val_idx) in zip(models, cv.split(X_train1)):
    val_proba = model.predict_proba(X_train1.iloc[val_idx])[:, 1]
    val_pred = np.where(val_proba >= threshold, 1, 0)
    scores_2.append(f1_score(y_train1.iloc[val_idx], val_pred))
    pred_list.append(model.predict_proba(test)[:, 1])
print(scores_2)
print(np.mean(scores_2))

In [None]:
# CatBoost ensemble: average the per-model probabilities element-wise, then
# label a row 1 when the mean reaches `threshold`, else 0.
preds = (np.mean(pred_list, axis=0) >= threshold).astype(int)

In [None]:
# Store the rounded predictions in the submission frame and write it to disk
# without the index column.
ss["target"] = np.round(preds)
ss.to_csv("submission2.csv", index=False)

In [None]:
# Display the submission frame as the cell output.
ss