In [1]:
# 查看当前挂载的数据集目录, 该目录下的变更重启环境后会自动还原
# List the mounted dataset directory.
# Changes made under this directory are reverted automatically when the environment restarts.
!ls /home/aistudio/data

data60767


In [2]:
# 查看工作区文件, 该目录下的变更将会持久保存. 请及时清理不必要的文件, 避免加载过慢.
# List the personal work directory.
# Changes made under this directory persist across environment resets.
# Remove unneeded files promptly so that the environment loads quickly.
!ls /home/aistudio/work

In [3]:
# 如果需要进行持久化安装, 需要使用持久化路径, 如下方代码示例:
# To make installed packages persist across restarts,
# install them into a persistent path, as in the example below:
# !mkdir /home/aistudio/external-libraries
# !pip install beautifulsoup4 -t /home/aistudio/external-libraries
# !pip install lightgbm==2.3.1 -t /home/aistudio/external-libraries
# !pip install catboost==0.23 -t /home/aistudio/external-libraries
# !pip uninstall --yes pandas
# !pip install pandas==1.0.5

In [4]:
# Run this cell every time the environment (kernel) starts so that packages
# installed into the persistent directory can be imported.
import sys

EXTERNAL_LIB_DIR = '/home/aistudio/external-libraries'
# Guard against duplicates: re-running the cell must not keep growing sys.path.
if EXTERNAL_LIB_DIR not in sys.path:
    sys.path.append(EXTERNAL_LIB_DIR)

In [5]:
def count_encode(df, cat_cols):
    """
    Frequency-encode categorical columns.

    For every column in ``cat_cols`` a new ``<col>_count`` column is added to
    ``df``. Despite the suffix, it holds the relative frequency of the row's
    category among the non-null values of that column (float32), not a raw
    count. Rows whose category is missing receive NaN. The name of each
    processed column is printed.

    @param df: DataFrame to encode; it is modified in place.
    @param cat_cols: iterable of categorical column names present in ``df``.
    @return: the same ``df`` object, with the new columns added.
    """
    for name in cat_cols:
        print(name)
        column = df[name]
        # Share of each category among non-null values.
        shares = column.value_counts(normalize=True, dropna=True)
        encoded = column.map(shares)
        df[name + '_count'] = encoded.astype('float32')
    return df


def cat_num_stats(df, cat_cols, num_cols):
    """
    Add group-by statistics of numeric features per categorical feature.

    For every pair (f1 in ``cat_cols``, f2 in ``num_cols``) the max, min,
    median, mean, sum, skew and std of f2 within each f1 group are computed
    and left-joined back onto ``df`` as columns named ``<f1>_<f2>_<stat>``.

    The aggregation uses a list of function names followed by an explicit
    column rename. Passing a ``{new_name: func}`` dict to ``SeriesGroupBy.agg``
    (the previous approach) is rejected by pandas >= 1.0.

    @param df: input DataFrame.
    @param cat_cols: categorical column names used as group keys.
    @param num_cols: numeric column names to aggregate.
    @return: a new DataFrame containing the original columns plus the
             statistics columns.
    """
    stat_names = ['max', 'min', 'median', 'mean', 'sum', 'skew', 'std']
    for f1 in tqdm(cat_cols):
        g = df.groupby(f1)
        for f2 in tqdm(num_cols):
            tmp = g[f2].agg(stat_names)
            # Flatten to the <f1>_<f2>_<stat> naming used by the rest of the pipeline.
            tmp.columns = ['{}_{}_{}'.format(f1, f2, name) for name in stat_names]
            # Turn the group key back into a regular column so it can be merged on.
            tmp = tmp.reset_index()
            df = df.merge(tmp, on=f1, how='left')
            del tmp
            gc.collect()
    return df

In [6]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import gc
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Read the main table and every auxiliary table from the mounted dataset.
data = pd.read_csv('data/data60767/data.csv')
base_info = pd.read_csv('data/data60767/base_info.csv')
annual_report_info = pd.read_csv('data/data60767/annual_report_info.csv')
tax_info = pd.read_csv('data/data60767/tax_info.csv')
change_info = pd.read_csv('data/data60767/change_info.csv')
news_info = pd.read_csv('data/data60767/news_info.csv')
other_info = pd.read_csv('data/data60767/other_info.csv')

# Left-join each auxiliary table onto the main table by `id`,
# in the same order in which the tables were loaded.
for extra_table in (base_info, annual_report_info, tax_info,
                    change_info, news_info, other_info):
    data = data.merge(extra_table, on='id', how='left')

In [8]:
# Column groups used by the feature engineering below.
cat_cols = ['oplocdistrict', 'industryphy', 'industryco', 'enttype', 'enttypeitem', 'state', 'orgid', 'jobid', 'regtype', 'opform', 'venind', 'enttypeminu', 'oploc', 'enttypegb']
two_values = ['adbusign', 'townsign', 'compform', 'protype']
num_cols = ['empnum', 'parnum', 'exenum', 'regcap', 'reccap', 'forreccap', 'forregcap', 'congro']
many_cols = ['dom', 'opscope']
dt_cols = ['opfrom', 'opto']
# Columns removed from the groups above (the name suggests mostly-null
# columns -- TODO confirm). They are not dropped from `data` itself.
null_to_drop = ['midpreindcode', 'ptbusscope', 'protype', 'forreccap', 'congro', 'forregcap', 'exenum', 'parnum']
# Currently unused: filtering and dropping by this list is disabled.
imp_to_drop = ['adbusign', 'regtype', 'opform', 'venind', 'oploc', 'state']

# Remove the `null_to_drop` columns from every group, keeping the original order.
cat_cols, two_values, num_cols = (
    [name for name in group if name not in null_to_drop]
    for group in (cat_cols, two_values, num_cols)
)

# Frequency features for the raw categorical columns (adds `<col>_count`).
data = count_encode(data, cat_cols)

# Group-by statistics via cat_num_stats(data, cat_cols, num_cols) are disabled.

# Crossed categorical features: string values of the parts joined with '_'.
# Each new column is also registered as a categorical column.
cross_specs = [
    ('industryphy_industryco_enttypeminu', ['industryphy', 'industryco', 'enttypeminu']),
    ('enttype_enttypeitem', ['enttype', 'enttypeitem']),
    ('enttypegb_enttype', ['enttypegb', 'enttype']),
]
for new_col, parts in cross_specs:
    joined = data[parts[0]].astype(str)
    for part in parts[1:]:
        joined = joined + '_' + data[part].astype(str)
    data[new_col] = joined
    cat_cols.append(new_col)

data['regcap+reccap'] = data['regcap'] + data['reccap']

# Label-encode the string form of every categorical / high-cardinality column ...
for col in tqdm(cat_cols + many_cols):
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col].astype(str))

# ... and then store the integer codes as strings.
for col in tqdm(cat_cols + many_cols):
    data[col] = data[col].astype('str')

 68%|██████▊   | 13/19 [00:00<00:00, 61.81it/s]

oplocdistrict
industryphy
industryco
enttype
enttypeitem
state
orgid
jobid
regtype
opform
venind
enttypeminu
oploc
enttypegb


100%|██████████| 19/19 [00:00<00:00, 60.05it/s]
100%|██████████| 19/19 [00:00<00:00, 56.61it/s]


In [9]:
def stat(df, df_merge, group_by, agg):
    """
    Aggregate ``df`` by ``group_by`` and left-join the result onto ``df_merge``.

    The aggregated columns are named ``<key1>_<key2>..._<column>_<method>``,
    one per (column, method) pair in ``agg``, in the iteration order of ``agg``.

    @param df: DataFrame the statistics are computed from.
    @param df_merge: DataFrame that receives the statistics columns.
    @param group_by: list of key column names, present in both frames.
    @param agg: dict mapping a column name to a list of aggregation methods.
    @return: the merged DataFrame (``df_merge`` itself is not modified).
    """
    key_prefix = '_'.join(group_by)
    flat_names = [
        '{}_{}_{}'.format(key_prefix, source_col, method)
        for source_col, methods in agg.items()
        for method in methods
    ]

    aggregated = df.groupby(group_by).agg(agg)
    # Replace the (column, method) MultiIndex with flat names.
    aggregated.columns = flat_names
    aggregated = aggregated.reset_index()

    merged = df_merge.merge(aggregated, how='left', on=group_by)

    del aggregated
    gc.collect()
    return merged


def statis_feat(df_know, df_unknow):
    """
    Add mean-of-label features for several categorical key groups.

    For each key group the mean of ``label`` is computed on ``df_know`` and
    left-joined onto ``df_unknow`` through ``stat``.

    @param df_know: rows whose ``label`` is used to compute the statistics.
    @param df_unknow: rows that receive the statistics columns.
    @return: ``df_unknow`` extended with one ``<keys>_label_mean`` column
             per key group.
    """
    key_groups = (
        ['industryphy'],
        ['industryco'],
        ['industryphy', 'industryco'],
        ['enttypegb'],
        ['enttype'],
        ['enttypegb', 'enttype'],
    )
    for keys in key_groups:
        df_unknow = stat(df_know, df_unknow, keys, {'label': ['mean']})

    return df_unknow


# Labelled rows form the training set; rows without a label form the test set.
has_label = data['label'].notnull()
df_train = data[has_label]
df_test = data[~has_label]

# Out-of-fold statistics for the training rows: the label-mean features of
# each fold are computed from the other folds only.
kf = StratifiedKFold(n_splits=5, random_state=1024, shuffle=True)
fold_frames = []
for train_index, val_index in kf.split(df_train, df_train['label']):
    fold_frames.append(
        statis_feat(df_train.iloc[train_index], df_train.iloc[val_index])
    )
    gc.collect()
df_stas_feat = pd.concat(fold_frames, axis=0)

# Test rows use statistics computed from all labelled rows.
df_test = statis_feat(df_train, df_test)
# The index is left as produced by the merges (not reset).
df_feature = pd.concat([df_stas_feat, df_test], axis=0)

del fold_frames
del df_stas_feat
del df_train
del df_test
gc.collect()

0

In [10]:
# Columns removed from the feature table before training.
imp_drop = ['bgxmdm_cnt_median', 'UNENUM', 'positive_negtive_0', 'PUBSTATE', 'state_count', 'positive_negtive_mean', 'bgxmdm_190.0', 'compform', 'BUSSTNAME', 'UNEEMPLNUM', 'bgxmdm_117.0', 'regtype_count', 'DISPERNUM+DISEMPLNUM', 'ANCHEYEAR', 'COLGRANUM', 'bgxmdm_cnt_min', 'legal_judgment_num', 'positive_negtive_1', 'RETEMPLNUM', 'brand_num', 'RETSOLNUM+RETEMPLNUM', 'STOCKTRANSIGN', 'bgxmdm_129.0', 'FUNDAM', 'patent_num']
# Rebind instead of dropping in place, and skip columns that are already gone,
# so that re-running this cell does not raise KeyError.
df_feature = df_feature.drop(columns=imp_drop, errors='ignore')

In [11]:
# Split the feature table back into labelled (train) and unlabelled (test) rows.
train = df_feature[df_feature['label'].notnull()]
test = df_feature[df_feature['label'].isnull()]
# Take an independent copy: a column is added to `sub` at the end, and
# assigning into a slice of `test` triggers pandas' SettingWithCopy behaviour.
sub = test[['id']].copy()

# Every column except the identifier, the target and the two date columns is a feature.
used_cols = [i for i in train.columns if i not in ['id', 'label', 'opfrom', 'opto']]
y = train['label']
train = train[used_cols]
test = test[used_cols]


num_folds = 5
kfold = StratifiedKFold(n_splits=num_folds, random_state=1024, shuffle=True)

oof_probs = np.zeros(train.shape[0])
# One column of test predictions per fold; the width follows num_folds
# instead of a second hard-coded 5 so the two cannot drift apart.
output_probs = np.zeros((test.shape[0], num_folds))
offline_score = []
fold_importances = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train, y)):
    X_train, y_train = train.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="F1",
        task_type="GPU",
        learning_rate=0.01,
        iterations=100000,
        random_seed=2020,
        od_type="Iter",
        depth=8,
        early_stopping_rounds=500
    )

    clf = model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=500, cat_features=cat_cols)
    # Default predictions are scored with F1; the last probability column
    # (presumably the positive class -- confirm) is kept for OOF and test outputs.
    yy_pred_valid = clf.predict(X_valid)
    y_pred_valid = clf.predict(X_valid, prediction_type='Probability')[:, -1]
    oof_probs[valid_idx] = y_pred_valid
    offline_score.append(f1_score(y_valid, yy_pred_valid))
    output_probs[:, fold] = clf.predict(test, prediction_type='Probability')[:, -1]

    # Feature importance of this fold; collected and concatenated once after the loop.
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = model.feature_names_
    fold_importance_df["importance"] = model.feature_importances_
    fold_importance_df["fold"] = fold + 1
    fold_importances.append(fold_importance_df)

feature_importance_df = pd.concat(fold_importances, axis=0)

# Mean and standard deviation of the per-fold validation F1 scores.
print('OOF-MEAN-F1:%.6f, OOF-STD-F1:%.6f' % (np.mean(offline_score), np.std(offline_score)))
print('feature importance:')
# Average importance per feature across folds, sorted descending; features
# beyond 99% cumulative normalized importance are reported as drop candidates.
feature_importance_df_ = feature_importance_df.groupby('feature', as_index=False)['importance'].mean().sort_values(by='importance', ascending=False)
feature_importance_df_['normalized_importance'] = feature_importance_df_['importance'] / feature_importance_df_['importance'].sum()
feature_importance_df_['cumulative_importance'] = np.cumsum(feature_importance_df_['normalized_importance'])
record_low_importance = feature_importance_df_[feature_importance_df_['cumulative_importance'] > 0.99]
to_drop = list(record_low_importance['feature'])
print(to_drop)
print(feature_importance_df_)

# Final score: test probabilities averaged over the folds.
sub['score'] = np.mean(output_probs, axis=1)
# The file name carries the date and the mean per-fold F1.
sub.to_csv('cat_sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), np.mean(offline_score)), index=False)

0:	learn: 0.8002602	test: 0.8091603	best: 0.8091603 (0)	total: 120ms	remaining: 3h 20m 24s
500:	learn: 0.8719212	test: 0.8459658	best: 0.8480392 (331)	total: 49.3s	remaining: 2h 43m 3s
1000:	learn: 0.9003096	test: 0.8446602	best: 0.8536585 (771)	total: 1m 37s	remaining: 2h 40m 48s
bestTest = 0.8536585366
bestIteration = 771
Shrink model to first 772 iterations.
0:	learn: 0.7963446	test: 0.8060453	best: 0.8060453 (0)	total: 95.6ms	remaining: 2h 39m 22s
500:	learn: 0.8731026	test: 0.8390244	best: 0.8390244 (440)	total: 48.6s	remaining: 2h 40m 57s
1000:	learn: 0.9078624	test: 0.8472906	best: 0.8493827 (750)	total: 1m 38s	remaining: 2h 42m 18s
bestTest = 0.849382716
bestIteration = 750
Shrink model to first 751 iterations.
0:	learn: 0.7994809	test: 0.8040712	best: 0.8040712 (0)	total: 91.7ms	remaining: 2h 32m 53s
500:	learn: 0.8665425	test: 0.8402948	best: 0.8402948 (419)	total: 50.8s	remaining: 2h 48m 6s
bestTest = 0.8402948403
bestIteration = 419
Shrink model to first 420 iterations.
0:	