In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Read the seven input CSV files into DataFrames, one per table.
data, base_info, annual_report_info, tax_info, change_info, news_info, other_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/data.csv',
        '../input/base_info.csv',
        '../input/annual_report_info.csv',
        '../input/tax_info.csv',
        '../input/change_info.csv',
        '../input/news_info.csv',
        '../input/other_info.csv',
    )
)

In [3]:
# Left-join each auxiliary table onto `data` by `id`, in the listed order,
# so every row of `data` is kept whether or not a table has a match for it.
for aux_table in (base_info, annual_report_info, tax_info,
                  change_info, news_info, other_info):
    data = data.merge(aux_table, on='id', how='left')
data.shape

(24865, 87)

In [4]:
# Display the column index of `data` so the available feature names can be inspected.
data.columns

Index(['id', 'label', 'oplocdistrict', 'industryphy', 'industryco', 'dom',
       'opscope', 'enttype', 'enttypeitem', 'opfrom', 'opto', 'state', 'orgid',
       'jobid', 'adbusign', 'townsign', 'regtype', 'empnum', 'compform',
       'opform', 'venind', 'enttypeminu', 'oploc', 'regcap', 'reccap',
       'enttypegb', 'ANCHEYEAR', 'FUNDAM', 'EMPNUM', 'EMPNUMSIGN', 'BUSSTNAME',
       'COLGRANUM', 'UNENUM', 'COLEMPLNUM', 'RETEMPLNUM', 'UNEEMPLNUM',
       'WEBSITSIGN', 'FORINVESTSIGN', 'STOCKTRANSIGN', 'PUBSTATE',
       'COLGRANUM+COLEMPLNUM', 'RETSOLNUM+RETEMPLNUM', 'DISPERNUM+DISEMPLNUM',
       'UNENUM+UNEEMPLNUM', 'ALLNUM', 'TAX_AMOUNT_sum', 'bgxmdm_mode',
       'bgxmdm_cnt', 'bgxmdm_nunique', 'bgxmdm_110.0', 'bgxmdm_111.0',
       'bgxmdm_113.0', 'bgxmdm_115.0', 'bgxmdm_117.0', 'bgxmdm_118.0',
       'bgxmdm_120.0', 'bgxmdm_121.0', 'bgxmdm_129.0', 'bgxmdm_131.0',
       'bgxmdm_133.0', 'bgxmdm_137.0', 'bgxmdm_190.0', 'bgxmdm_930.0',
       'bgxmdm_939.0', 'bgxmdm_cnt_max', 'bgxmdm

In [5]:
# Columns excluded from the feature groups below.
# NOTE(review): the name suggests these are mostly-null columns — confirm.
null_to_drop = [
    'midpreindcode', 'ptbusscope', 'protype', 'forreccap',
    'congro', 'forregcap', 'exenum', 'parnum',
]
# Candidate columns for removal; only referenced by the disabled code at the
# bottom of this cell.
imp_to_drop = ['adbusign', 'regtype', 'opform', 'venind', 'oploc', 'state']

# Feature groups, named by how each column is meant to be treated.
cat_cols = [
    'oplocdistrict', 'industryphy', 'industryco', 'enttype', 'enttypeitem',
    'state', 'orgid', 'jobid', 'regtype', 'opform', 'venind', 'enttypeminu',
    'oploc', 'enttypegb',
]
two_values = ['adbusign', 'townsign', 'compform', 'protype']
num_cols = [
    'empnum', 'parnum', 'exenum', 'regcap', 'reccap',
    'forreccap', 'forregcap', 'congro',
]
many_cols = ['dom', 'opscope']
dt_cols = ['opfrom', 'opto']

# Strip every `null_to_drop` column out of the three groups, keeping order.
cat_cols, two_values, num_cols = (
    [col for col in group if col not in null_to_drop]
    for group in (cat_cols, two_values, num_cols)
)

# Disabled: the same filtering (and a column drop) for `imp_to_drop`.
# cat_cols = [i for i in cat_cols if i not in imp_to_drop]
# two_values = [i for i in two_values if i not in imp_to_drop]
# num_cols = [i for i in num_cols if i not in imp_to_drop]

# data.drop(imp_to_drop, axis=1, inplace=True)

In [6]:
# Disabled: drop the `many_cols` columns (`dom`, `opscope`) from `data`.
# data.drop(many_cols, axis=1, inplace=True)

In [7]:
# Categorical cross features: each new column is the '_'-joined string form of
# its source columns (missing values become the literal string 'nan').
cross_features = {
    'industryphy_industryco_enttypeminu': ['industryphy', 'industryco', 'enttypeminu'],
    'enttype_enttypeitem': ['enttype', 'enttypeitem'],
    'enttypegb_enttype': ['enttypegb', 'enttype'],
}
for cross_name, source_cols in cross_features.items():
    crossed = data[source_cols[0]].astype(str)
    for source_col in source_cols[1:]:
        crossed = crossed + '_' + data[source_col].astype(str)
    data[cross_name] = crossed
    # Guard the append so re-running this cell does not register the same
    # feature name in `cat_cols` more than once.
    if cross_name not in cat_cols:
        cat_cols.append(cross_name)

# Numeric sum of the two capital columns; NaN if either operand is missing.
data['regcap+reccap'] = data['regcap'] + data['reccap']

In [8]:
# Replace each categorical / high-cardinality column with integer codes.
# Values are stringified first, so missing values are encoded as the
# category 'nan' rather than raising.
cols_to_encode = cat_cols + many_cols
for col in tqdm(cols_to_encode):
    encoder = LabelEncoder()
    as_text = data[col].astype(str)
    data[col] = encoder.fit_transform(as_text)

100%|██████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 99.08it/s]


In [9]:
# Cast the encoded columns to strings (one column at a time, with a progress
# bar); `cat_cols` is later handed to CatBoost as `cat_features`.
cols_to_stringify = cat_cols + many_cols
for col in tqdm(cols_to_stringify):
    data[col] = data[col].astype('str')

100%|█████████████████████████████████████████████████████████████████████████████████| 19/19 [00:00<00:00, 216.49it/s]


In [10]:
# Rows with a label form the training set; rows without one are to be scored.
has_label = data['label'].notnull()
train = data[has_label]
test = data[~has_label]
# Explicit copy: `sub` gets a new `score` column assigned later, and assigning
# into a frame derived by slicing is the chained-assignment pattern pandas
# warns about (the warning is hidden by the global warnings filter).
sub = test[['id']].copy()
# train.shape, test.shape

# Model inputs: everything except the identifier, the target and the two
# date columns.
excluded_cols = ['id', 'label', 'opfrom', 'opto']
used_cols = [col for col in train.columns if col not in excluded_cols]
y = train['label']
train = train[used_cols]
test = test[used_cols]

In [11]:
# Stratified K-fold cross-validation of a CatBoost classifier (Logloss
# objective, F1 as the evaluation / early-stopping metric).
num_folds = 5
kfold = StratifiedKFold(n_splits=num_folds, random_state=1024, shuffle=True)

# Out-of-fold probability for every training row.
oof_probs = np.zeros(train.shape[0])
# One column of test-set probabilities per fold. Sized from `num_folds` (not a
# literal 5) so changing the fold count can neither raise an IndexError nor
# leave all-zero columns that would drag down the averaged prediction.
output_probs = np.zeros((test.shape[0], num_folds))
offline_score = []
# Per-fold importance frames, concatenated once after the loop instead of
# growing a DataFrame with pd.concat on every iteration.
fold_importance_frames = []

for fold, (train_idx, valid_idx) in enumerate(kfold.split(train, y)):
    X_train, y_train = train.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = train.iloc[valid_idx], y.iloc[valid_idx]

    # NOTE: task_type="GPU" means this cell needs a CUDA-capable device.
    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="F1",
        task_type="GPU",
        learning_rate=0.01,
        iterations=100000,
        random_seed=2020,
        od_type="Iter",
        depth=8,
        early_stopping_rounds=500
    )

    # NOTE(review): only `cat_cols` are declared categorical; the `many_cols`
    # columns were also cast to str earlier but are not listed here — confirm
    # that treating them as non-categorical is intended.
    clf = model.fit(X_train, y_train, eval_set=(X_valid, y_valid), verbose=500, cat_features=cat_cols)

    # Hard class predictions feed the fold F1; the last probability column
    # (presumably the positive class — confirm) feeds OOF and test scores.
    yy_pred_valid = clf.predict(X_valid)
    y_pred_valid = clf.predict(X_valid, prediction_type='Probability')[:, -1]
    oof_probs[valid_idx] = y_pred_valid
    offline_score.append(f1_score(y_valid, yy_pred_valid))
    output_probs[:, fold] = clf.predict(test, prediction_type='Probability')[:, -1]

    # feature importance
    fold_importance_df = pd.DataFrame({
        "feature": model.feature_names_,
        "importance": model.feature_importances_,
        "fold": fold + 1,
    })
    fold_importance_frames.append(fold_importance_df)

feature_importance_df = pd.concat(fold_importance_frames, axis=0)

print('OOF-MEAN-F1:%.6f, OOF-STD-F1:%.6f' % (np.mean(offline_score), np.std(offline_score)))
print('feature importance:')
# Mean importance of each feature across folds, most important first.
feature_importance_df_ = feature_importance_df.groupby(['feature'])['importance'].mean().sort_values(ascending=False)
print(feature_importance_df_.head(15))
# print(feature_importance_df_)
# feature_importance_df_.to_csv("./importance.csv")

0:	learn: 0.8188002	test: 0.8238213	best: 0.8238213 (0)	total: 274ms	remaining: 7h 36m 34s
500:	learn: 0.8754717	test: 0.8235294	best: 0.8295165 (315)	total: 1m 55s	remaining: 6h 22m 46s
1000:	learn: 0.9094364	test: 0.8247423	best: 0.8316327 (774)	total: 3m 49s	remaining: 6h 17m 34s
bestTest = 0.8316326531
bestIteration = 774
Shrink model to first 775 iterations.
0:	learn: 0.8309693	test: 0.8390244	best: 0.8390244 (0)	total: 236ms	remaining: 6h 33m 50s
500:	learn: 0.8776529	test: 0.8550000	best: 0.8578554 (249)	total: 1m 57s	remaining: 6h 27m 26s
1000:	learn: 0.9065831	test: 0.8606965	best: 0.8606965 (548)	total: 3m 50s	remaining: 6h 20m 5s
bestTest = 0.8606965174
bestIteration = 548
Shrink model to first 549 iterations.
0:	learn: 0.8023256	test: 0.7864583	best: 0.7864583 (0)	total: 201ms	remaining: 5h 35m 48s
500:	learn: 0.8807453	test: 0.8264059	best: 0.8296296 (63)	total: 1m 58s	remaining: 6h 32m 19s
bestTest = 0.8296296296
bestIteration = 63
Shrink model to first 64 iterations.
0:	

KeyboardInterrupt: 

In [None]:
# Final score per test row = mean of the per-fold predicted probabilities.
sub['score'] = output_probs.mean(axis=1)
print(sub['score'])
# NOTE(review): to_csv writes the DataFrame index as an extra leading column
# by default — confirm that matches the expected submission format.
sub.to_csv('../sub/cat_sub.csv')