In [1]:
# default_exp algo.ml.tree.catboost

# CatBoost

CatBoost is an algorithm for gradient boosting on decision trees.

* 官网: https://catboost.ai/
* https://github.com/catboost/catboost 是一个C++库  
* https://github.com/catboost/tutorials  
* 论文地址: https://arxiv.org/pdf/1706.09516.pdf

## install

In [3]:
# !pip install catboost -U
# List the installed packages and keep only the catboost line, i.e. show which
# catboost version this environment has.
!pip freeze | grep catboost

catboost==0.23


# encap

In [None]:
#export
import pandas as pd
from catboost import Pool, CatBoostClassifier
from sklearn.metrics import classification_report

In [None]:
def time_cost(func):
    """Decorator that prints how long each call of ``func`` takes.

    Args:
        func: The callable to time.

    Returns:
        A wrapper that forwards all positional and keyword arguments to
        ``func``, prints ``'<name> is called. <seconds>s is used.'`` and
        returns whatever ``func`` returned. The wrapper keeps the wrapped
        function's ``__name__`` and ``__doc__``.
    """
    import functools
    import time

    # functools.wraps keeps the metadata of `func` on the wrapper, so
    # introspection and stacked decorators still see the original name.
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter is monotonic, so the measured interval cannot go
        # negative or jump when the system clock is adjusted.
        tic = time.perf_counter()
        result = func(*args, **kwargs)
        toc = time.perf_counter()
        print('{} is called. {}s is used.'.format(func.__name__, toc - tic))
        return result

    return wrapper

In [None]:
@time_cost
def load_data(fp):
    """Placeholder; presumably meant to load a dataset from ``fp`` (TODO confirm).

    NOTE(review): the body is not implemented. ``fp`` is never read and ``df``
    is not assigned inside this function, so a call only succeeds if a
    module-level ``df`` happens to exist and raises ``NameError`` otherwise.
    Because of the ``time_cost`` decorator, each call also prints its duration.
    """
    return df

In [None]:
def trainset_input_fn(df):
    """Placeholder; presumably meant to split ``df`` into train/validation sets (TODO confirm).

    NOTE(review): the body is not implemented. ``df`` is never used and none of
    ``X_train``, ``X_val``, ``y_train``, ``y_val`` is assigned here, so a call
    only succeeds if module-level variables with those names exist.
    """
    return X_train, X_val, y_train, y_val

def predict_input_fn(df):
    """Placeholder; presumably meant to build the prediction features from ``df`` (TODO confirm).

    NOTE(review): the body is not implemented. ``df`` is never used and
    ``X_testa`` is not assigned here (possibly a typo for ``X_test`` -- confirm),
    so a call only succeeds if a module-level ``X_testa`` exists.
    """
    return X_testa

In [None]:
def train(df_train, y_train, cat_cols,df_val=None, y_val=None, iterations=500, custom_metric='F1', plot=False):
    """Fit a CatBoost binary classifier and rank its feature importances.

    Args:
        df_train: Training features, passed to ``Pool`` as ``data``.
        y_train: Training labels, passed to ``Pool`` as ``label``.
        cat_cols: Categorical feature columns, passed to ``Pool`` as
            ``cat_features``.
        df_val: Optional validation features. When ``None`` the model is
            fitted without an evaluation set.
        y_val: Labels belonging to ``df_val``.
        iterations: Number of boosting iterations; ``0`` is replaced by ``1``.
        custom_metric: Forwarded to CatBoost as ``custom_metric``.
        plot: When true, draw a horizontal bar chart of the features whose
            importance is greater than zero.

    Returns:
        A ``(model, df_features_importance, best_iteration)`` tuple: the
        fitted classifier, a DataFrame with ``name``/``value`` columns sorted
        by descending importance (index reset to 0..n-1), and the
        ``best_iteration_`` attribute of the object returned by ``fit``.
    """
    train_data = Pool(data=df_train, label=y_train, cat_features=cat_cols)
    val_data = Pool(data=df_val, label=y_val, cat_features=cat_cols) if df_val is not None else None

    # Zero iterations are bumped to one, as the original code did.
    if iterations == 0:
        iterations = 1

    params = {
        'iterations': iterations,
        'learning_rate': 0.05,
        'random_seed': 144,
        'custom_metric': custom_metric,
        'loss_function': 'Logloss',
        'class_weights': [1, 8],
        # 'use_best_model': True
    }
    print(params)

    model = CatBoostClassifier(**params)
    fitted = model.fit(train_data, eval_set=val_data, verbose=True)

    df_features_importance = pd.DataFrame({'name': model.feature_names_,
                                           'value': model.feature_importances_})
    df_features_importance = df_features_importance.sort_values('value', ascending=False)
    df_features_importance.reset_index(drop=True, inplace=True)
    print(df_features_importance.head(50))

    if plot:
        # Imported lazily so matplotlib is only required when a plot is requested.
        import matplotlib.pyplot as plt

        # Filter first, then sort: masking an already re-sorted frame makes
        # pandas reindex the boolean key and emit a warning.
        positive = df_features_importance[df_features_importance['value'] > 0].sort_values('value')
        plt.figure(figsize=(10, 20))
        plt.barh(positive['name'], positive['value'], height=0.5)
        plt.show()

    return model, df_features_importance, fitted.best_iteration_


In [None]:
#export
def val(model, df_val, y_val, cat_cols):
    """Evaluate ``model`` on a labelled validation set and rank rows by score.

    Prints the distribution of predicted labels, a classification report and
    the ranked rows whose true label is 1.

    Args:
        model: Fitted model exposing ``predict`` and ``predict_proba``.
        df_val: Validation features; must contain a ``CHANGE_ID`` column.
        y_val: True labels, in the same row order as ``df_val``.
        cat_cols: Categorical feature columns, passed to ``Pool``.

    Returns:
        DataFrame with columns ``true_label``, ``CHANGE_ID``, ``score``
        (second column of ``predict_proba``), ``predict_label`` and ``order``
        (1-based rank), sorted by descending ``score``.
    """
    test_data = Pool(data=df_val, cat_features=cat_cols)

    # Predict once and reuse the result instead of scoring the pool three times.
    predicted_labels = model.predict(test_data)
    print(pd.Series(predicted_labels).value_counts())
    print(classification_report(y_val, predicted_labels))

    dfr = pd.DataFrame(y_val)
    dfr.columns = ['true_label']
    # Attach the ids positionally, like the scores and predicted labels below;
    # label-based alignment would yield NaN/mismatched ids whenever y_val and
    # df_val do not share the same index.
    dfr['CHANGE_ID'] = df_val['CHANGE_ID'].to_numpy()
    dfr['score'] = model.predict_proba(test_data)[:, 1]
    dfr['predict_label'] = predicted_labels

    dfr = dfr.sort_values('score', ascending=False)
    dfr['order'] = range(1, dfr.shape[0] + 1)
    print(dfr[dfr.true_label == 1])
    return dfr

In [12]:
#export
def predict(model,df_predict,cat_cols, thr=0.5):
    """Score ``df_predict`` and label every row by thresholding its score.

    Args:
        model: Fitted model exposing ``predict_proba``.
        df_predict: Features to score; must contain a ``CHANGE_ID`` column.
        cat_cols: Categorical feature columns, passed to ``Pool``.
        thr: Rows whose score is ``>= thr`` get ``predict_label`` 1, others 0.

    Returns:
        DataFrame with columns ``CHANGE_ID``, ``score`` (second column of
        ``predict_proba``) and ``predict_label``, sorted by descending score.
        The label counts are printed as a side effect.
    """
    pool = Pool(data=df_predict, cat_features=cat_cols)
    positive_proba = model.predict_proba(pool)[:, 1]

    scored = pd.DataFrame(df_predict['CHANGE_ID'])
    scored['score'] = positive_proba
    scored['predict_label'] = (scored['score'] >= thr).astype('int64')
    scored = scored.sort_values("score", ascending=False)

    print('--------------------------------------------------')
    label_counts = scored['predict_label'].value_counts()
    print(label_counts)
    if 1 in label_counts:
        positives = label_counts.loc[1]
    else:
        positives = 0
    print(f'su sample num：{positives}')
    return scored

In [2]:
# export
def explain(model,df_predict,cat_cols,dfr):
    """Build a top-5 SHAP explanation for every row flagged in ``dfr``.

    Each flagged row (``predict_label == 1``) is matched to its own SHAP
    values through ``CHANGE_ID``. The previous implementation indexed the
    SHAP rows with a counter that was never advanced inside the loop, so
    every record received the explanation of the first row of ``df_predict``.

    Args:
        model: Fitted model exposing ``get_feature_importance``.
        df_predict: Features that were scored; must contain ``CHANGE_ID``.
        cat_cols: Categorical feature columns, passed to ``Pool``.
        dfr: Prediction result with ``CHANGE_ID``, ``score`` and
            ``predict_label`` columns (e.g. the output of ``predict``).

    Returns:
        A list with one dict per flagged row of ``dfr``, in ``dfr`` order,
        with keys ``change_id``, ``FS_SC_NM``, ``FS_SC_SCORE`` (score rounded
        to 2 decimals) and ``FS_SC_EXPLAIN`` (``name:value`` pairs of the five
        largest SHAP contributions, comma separated). ``FS_SC_EXPLAIN`` is an
        empty string when the id does not occur in ``df_predict``. The list is
        also printed.
    """
    test_data = Pool(data=df_predict, cat_features=cat_cols)
    shap_values = model.get_feature_importance(test_data, type='ShapValues')
    # Drop the trailing column: CatBoost's ShapValues output carries the
    # expected value there, not a per-feature contribution.
    contributions = shap_values[:, :-1]
    feature_names = df_predict.columns

    flagged = dfr[dfr.predict_label == 1]
    wanted_ids = set(flagged['CHANGE_ID'])

    # Top-5 contributions per flagged CHANGE_ID; the first occurrence wins if
    # an id is duplicated in df_predict.
    top_by_id = {}
    for change_id, row in zip(df_predict['CHANGE_ID'], contributions):
        if change_id in wanted_ids and change_id not in top_by_id:
            ranked = pd.Series(row, index=feature_names).sort_values(ascending=False)
            top_by_id[change_id] = ranked.iloc[:5]

    rr = []
    for line in flagged.itertuples():
        top = top_by_id.get(line.CHANGE_ID)
        if top is None:
            explanation = ''
        else:
            explanation = ','.join(
                f'{name}:{round(value, 2)}' for name, value in zip(top.index, top.values)
            )
        rr.append({
            "change_id": line.CHANGE_ID,
            "FS_SC_NM": "个险模型",
            "FS_SC_SCORE": round(line.score, 2),
            "FS_SC_EXPLAIN": explanation,
        })
    print(rr)
    return rr

# nb_export

In [4]:
# Import only the entry point that is used instead of star-importing
# everything from nbdev.export into the notebook namespace.
from nbdev.export import notebook2script

# Convert the project's notebooks (see the "Converted ..." lines in the output).
notebook2script()

Converted 00_core.ipynb.
Converted 00_template.ipynb.
Converted algo_ml_shallow_tree_catboost.ipynb.
Converted dl_keras.ipynb.
Converted engineering_nbdev.ipynb.
Converted engineering_panel.ipynb.
Converted index.ipynb.
