In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm as lgb
import gc
from joblib import Parallel, delayed
from util import *
import warnings
warnings.filterwarnings('ignore')

In [2]:
def count_encode(df, cols=[]):
    """
    count编码
    @param df:
    @param cols:
    @return:
    """
    for col in cols:
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')
    return df

def label_encode(df, cols, verbose=True):
    """
    label encode
    @param df:
    @param cols:
    @param verbose:
    @return:
    """
    for col in cols:
        df[col], _ = df[col].astype(str).factorize(sort=True)
        if df[col].max() > 32000:
            df[col] = df[col].astype('int32')
        else:
            df[col] = df[col].astype('int16')
        if verbose:
            print(col)
    return df


def cross_cat_num(df, cat_cols, num_cols):
    """
    类别特征与数据特征groupby统计
    @param df:
    @param cat_col: 类别特征
    @param num_col: 数值特征
    @return:
    """
    def max_min(s):
        return s.max() - s.min()

    def quantile(s, q=0.25):
        return s.quantile(q)

    for f1 in tqdm(cat_cols):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_cols):
            tmp = g[f2].agg({
                '{}_{}_count'.format(f1, f2): 'count',
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_std'.format(f1, f2): 'std',
                '{}_{}_nunique'.format(f1, f2): 'nunique',
                '{}_{}_max_min'.format(f1, f2): max_min,
                '{}_{}_quantile_25'.format(f1, f2): lambda x: quantile(x, 0.25),
                '{}_{}_quantile_75'.format(f1, f2): lambda x: quantile(x, 0.75)
            })
            df = df.merge(tmp, on=f1, how='left')
            del tmp
            gc.collect()
    return df


def arithmetic(df, cross_features):
    """
    数值特征之间的加减乘除
    @param df:
    @param cross_features: 交叉用的数值特征
    @return:
    """
    for i in tqdm(range(len(cross_features))):
        for j in range(i + 1, len(cross_features)):
            colname_add = '{}_{}_add'.format(cross_features[i], cross_features[j])
            colname_substract = '{}_{}_subtract'.format(cross_features[i], cross_features[j])
            colname_multiply = '{}_{}c_multiply'.format(cross_features[i], cross_features[j])
            df[colname_add] = df[cross_features[i]] + df[cross_features[j]]
            df[colname_substract] = df[cross_features[i]] - df[cross_features[j]]
            df[colname_multiply] = df[cross_features[i]] * df[cross_features[j]]

    for f1 in tqdm(cross_features):
        for f2 in cross_features:
            if f1 != f2:
                colname_ratio = '{}_{}_ratio'.format(f1, f2)
                df[colname_ratio] = df[f1].values / (df[f2].values + 0.001)
    return df

In [3]:
# 数据集7：entprise_info.csv
# 带标注的企业数据。每一行代表一个企业，每一行2列，其中id列为企业唯一标识，label列为标注（1：有非法集资风险，0：无非法集资风险），列之间采用“,”分隔符分割。
# 训练集 id 及标签
entprise_info = pd.read_csv('../input/train/entprise_info.csv')
print(entprise_info.shape)
entprise_info.head()

(14865, 2)


Unnamed: 0,id,label
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0


In [4]:
# 数据集8（验证集）：entprise_evaluate.csv
# 未标注企业数据。参赛队伍需提交的最终结果数据集，每一行代表一个企业，每一行有 2 列, 其中id列为企业唯一标识，score列为空，列之间采用“,”分隔符分割。
# 测试集 id score
entprise_evaluate = pd.read_csv('../input/entprise_evaluate.csv')

print(entprise_evaluate.shape)
entprise_evaluate.head()

(10000, 2)


Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,


In [5]:
# 数据集1：base_info.csv
# 包含数据集7和8中涉及到的所有企业的基本信息，每一行代表一个企业的基本数据，每一行有33列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。
# 数据格式如下：
# [id:企业唯一标识, oplocdistrict:行政区划代码, industryphy:行业类别代码, industryco:行业细类代码, dom:经营地址, opscope:经营范围, enttype:企业类型, enttypeitem:企业类型小类, opfrom:经营期限起, opto:经营期限止, state:状态, orgid:机构标识, jobid:职位标识, adbusign:是否广告经营, townsign:是否城镇, regtype:主题登记类型, empnum:从业人数, compform:组织形式, parnum:合伙人数, exenum:执行人数, opform:经营方式, ptbusscope:兼营范围, venind:风险行业, enttypeminu:企业类型细类, midpreindcode:中西部优势产业代码, protype:项目类型, oploc:经营场所, regcap:注册资本（金）, reccap:实缴资本, forreccap:实缴资本（外方）, forregcap:注册资本（外方）, congro:投资总额, enttypegb:企业（机构）类型]
# 基础信息表
base_info = pd.read_csv('../input/train/base_info.csv')
print(base_info.shape)
base_info.info()

(24865, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   opto           8825 non-null   object 
 10  state          24865 non-null  int64  
 11  orgid          24865 non-null  int64  
 12  jobid          24865 non-null  int64  
 13  adbusign       24865 non-null  int64  
 14  townsign       24865 non-null  int64  
 15  regtype        24865 non-null  int64  
 16  empnum         19615 non-null  float64
 17  compform       10631 non-null  float64

In [6]:
single_cols = ['ptbusscope', 'midpreindcode']
many_cols = ['dom', 'opscope']
to_drop = single_cols + many_cols
base_info.drop(to_drop, axis=1, inplace=True)
gc.collect()

20

In [7]:
base_info['opform'] = base_info['opform'].replace('01', '01-以个人财产出资').replace('02', '02-以家庭共有财产作为个人出资').replace('   ', np.nan)

In [8]:
def identify_missing(df, missing_threshold):
    """
    缺失率
    @param df:
    @param missing_threshold:
    @return:
    """
    missing_rate = df.isnull().sum() / len(df)
    missing_rate = missing_rate.sort_values(ascending=False)
    print(missing_rate)
    to_drop = missing_rate[missing_rate > missing_threshold].index.to_list()
    print('{} features with greater than {} missing values.\n'.format(len(to_drop), missing_threshold))
    return to_drop

In [9]:
to_drop = identify_missing(base_info, missing_threshold=0.9)
base_info.drop(to_drop, axis=1, inplace=True)
to_drop

protype          0.998633
forreccap        0.990871
congro           0.989986
forregcap        0.989946
exenum           0.944581
parnum           0.905932
reccap           0.715102
enttypeminu      0.707621
venind           0.660688
opto             0.645083
opform           0.638086
compform         0.572451
enttypeitem      0.330344
empnum           0.211140
regcap           0.007681
industryco       0.000040
oplocdistrict    0.000000
industryphy      0.000000
enttype          0.000000
opfrom           0.000000
enttypegb        0.000000
state            0.000000
orgid            0.000000
jobid            0.000000
adbusign         0.000000
townsign         0.000000
regtype          0.000000
oploc            0.000000
id               0.000000
dtype: float64
6 features with greater than 0.9 missing values.



['protype', 'forreccap', 'congro', 'forregcap', 'exenum', 'parnum']

In [10]:
num_cols = [i for i in ['empnum', 'parnum', 'exenum', 'regcap', 'reccap', 'forreccap', 'forregcap', 'congro'] if i not in to_drop]
le_cols = [i for i in ['oplocdistrict', 'industryphy', 'industryco', 'enttype', 'enttypeitem', 'state',
           'orgid', 'jobid', 'regtype', 'compform', 'opform', 'venind', 'enttypeminu',
           'protype', 'oploc', 'enttypegb'] if i not in to_drop]
one_zero_cols = ['adbusign', 'townsign']
cat_cols = le_cols + one_zero_cols
dt_cols = ['opfrom', 'opto']

In [11]:
# 时间转换, 暂时先抽取年份特征
base_info['opfrom'] = pd.to_datetime(base_info.opfrom)
base_info['opfrom_year'] = base_info['opfrom'].dt.year.astype('int')

base_info['opto'] = pd.to_datetime(base_info.opto)
base_info['opto_year'] = base_info['opto'].dt.year.fillna(-1).astype('int')

del base_info['opfrom']
del base_info['opto']
gc.collect()

43

In [12]:
cat_cols += ['opfrom_year', 'opto_year']

In [13]:
# for col in tqdm(base_info.select_dtypes(['float64']).columns):
#     base_info[col] = base_info[col].fillna(base_info[col].median())

In [14]:
base_info = count_encode(base_info, cat_cols)

oplocdistrict
industryphy
industryco
enttype
enttypeitem
state
orgid
jobid
regtype
compform
opform
venind
enttypeminu
oploc
enttypegb
adbusign
townsign
opfrom_year
opto_year


In [15]:
base_info = cross_cat_num(base_info, cat_cols, num_cols)

  0%|                                                                                           | 0/19 [00:00<?, ?it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 12.69it/s][A
  5%|████▎                                                                              | 1/19 [00:00<00:04,  4.18it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 11.58it/s][A
 11%|████████▋                                                                          | 2/19 [00:00<00:04,  4.06it/s]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████

 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  5.14it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  5.09it/s][A
 79%|████████████████████████████████████████████████████████████████▋                 | 15/19 [00:24<00:15,  3.89s/it]
  0%|                                                                                            | 0/3 [00:00<?, ?it/s][A
 33%|████████████████████████████                                                        | 1/3 [00:00<00:00,  7.11it/s][A
 67%|████████████████████████████████████████████████████████                            | 2/3 [00:00<00:00,  6.92it/s][A
100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00,  6.67it/s][A
 84%|█████████████████████████████████████████████████████████████████████             | 16/19 [00:24<00:08,  2.86s/it]
  0%|                 

In [16]:
base_info = arithmetic(base_info, num_cols)

100%|████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 18.01it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 758.74it/s]


In [17]:
base_info = label_encode(base_info, cat_cols, verbose=True)

oplocdistrict
industryphy
industryco
enttype
enttypeitem
state
orgid
jobid
regtype
compform
opform
venind
enttypeminu
oploc
enttypegb
adbusign
townsign
opfrom_year
opto_year


In [18]:
base_info.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24865 entries, 0 to 24864
Columns: 741 entries, id to reccap_regcap_ratio
dtypes: float32(19), float64(624), int16(19), int64(78), object(1)
memory usage: 136.3+ MB


In [19]:
# 划分训练集和测试集
entprise_evaluate.columns = ['id', 'label']
data = pd.concat([entprise_info, entprise_evaluate])
df = pd.merge(data, base_info, on='id', how='left')

print(df.shape)
df.head()

(24865, 742)


Unnamed: 0,id,label,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,...,empnum_reccapc_multiply,regcap_reccap_add,regcap_reccap_subtract,regcap_reccapc_multiply,empnum_regcap_ratio,empnum_reccap_ratio,regcap_empnum_ratio,regcap_reccap_ratio,reccap_empnum_ratio,reccap_regcap_ratio
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1.0,3,9,122,9,17,4,32,150,...,75960.0,80185.0,4225.0,1602946000.0,4.7e-05,5.3e-05,21091.954023,1.111243,18980.509745,0.899893
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0.0,6,14,240,16,31,4,46,175,...,,,,,0.19998,,4.997501,,,
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0.0,7,12,206,0,2,4,40,424,...,,,,,0.03,,33.322226,,,
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0.0,3,12,198,0,4,4,1,367,...,,,,,0.002335,,428.190603,,,
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0.0,12,14,245,16,31,4,70,404,...,,,,,0.02,,49.975012,,,


In [20]:
train = df[df.label.notna()]
test = df[df.label.isna()]

print(train.shape, test.shape)

(14865, 742) (10000, 742)


In [21]:
y = train['label'].astype(int)
sub = test[['id']]
used_cols = [i for i in train.columns if i not in ['id', 'label']]
train = train[used_cols]
test = test[used_cols]

In [22]:
psi_res = Parallel(n_jobs=4)(delayed(get_psi)(c, train, test) for c in used_cols)
psi_df = pd.concat(psi_res)
psi_used_cols = list(psi_df[psi_df['PSI'] <= 0.2]['变量名'].values)
psi_not_used_cols = list(psi_df[psi_df['PSI'] > 0.2]['变量名'].values)
print('PSI used features: \n', psi_used_cols)
print('PSI drop features: \n', psi_not_used_cols)
print('Error drop features: \n', list(set(used_cols) - set(psi_used_cols)))

PSI used features: 
 ['oplocdistrict', 'industryphy', 'enttype', 'enttypeitem', 'state', 'orgid', 'adbusign', 'townsign', 'regtype', 'empnum', 'compform', 'opform', 'venind', 'enttypeminu', 'oploc', 'regcap', 'enttypegb', 'opto_year', 'oplocdistrict_count', 'industryphy_count', 'industryco_count', 'enttype_count', 'enttypeitem_count', 'state_count', 'orgid_count', 'jobid_count', 'regtype_count', 'compform_count', 'opform_count', 'venind_count', 'enttypeminu_count', 'oploc_count', 'enttypegb_count', 'adbusign_count', 'townsign_count', 'opto_year_count', 'oplocdistrict_empnum_count', 'oplocdistrict_empnum_max', 'oplocdistrict_empnum_min', 'oplocdistrict_empnum_median', 'oplocdistrict_empnum_mean', 'oplocdistrict_empnum_sum', 'oplocdistrict_empnum_std', 'oplocdistrict_empnum_nunique', 'oplocdistrict_empnum_max_min', 'oplocdistrict_empnum_quantile_25', 'oplocdistrict_empnum_quantile_75', 'oplocdistrict_regcap_count', 'oplocdistrict_regcap_max', 'oplocdistrict_regcap_min', 'oplocdistrict_re

In [23]:
train = train[psi_used_cols]
test = test[psi_used_cols]

In [24]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

cols = X_train.columns
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

oplocdistrict
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	training's auc: 0.800172	valid_0's auc: 0.784394
Evaluated only: auc
**********
0.784394131780055
industryphy
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.977366	valid_0's auc: 0.967944
Evaluated only: auc
**********
0.9679441641137959
enttype
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.866492	valid_0's auc: 0.846703
Evaluated only: auc
**********
0.8467026664715628
enttypeitem
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.910801	valid_0's auc: 0.88193
Evaluated only: auc
**********
0.8819299231070601
state
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.551893	valid_0's auc: 0.538579
Ev

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.529691	valid_0's auc: 0.526576
Evaluated only: auc
**********
0.5265762108631928
oplocdistrict_empnum_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[108]	training's auc: 0.799992	valid_0's auc: 0.782321
Evaluated only: auc
**********
0.7823213501814937
oplocdistrict_empnum_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.79929	valid_0's auc: 0.782426
Evaluated only: auc
**********
0.782426375836075
oplocdistrict_empnum_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.800172	valid_0's auc: 0.784153
Evaluated only: auc
**********
0.7841528087872252
oplocdistrict_empnum_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]

Early stopping, best iteration is:
[16]	training's auc: 0.974123	valid_0's auc: 0.963698
Evaluated only: auc
**********
0.9636982955162305
industryphy_regcap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[51]	training's auc: 0.976827	valid_0's auc: 0.967339
Evaluated only: auc
**********
0.9673387915205355
industryphy_regcap_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.977328	valid_0's auc: 0.967906
Evaluated only: auc
**********
0.9679058120489207
industryphy_regcap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[63]	training's auc: 0.977342	valid_0's auc: 0.967938
Evaluated only: auc
**********
0.9679376737643555
industryphy_regcap_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[54]	training's auc: 0.976807	valid_0's auc: 0.967264
Evaluated only: a

Early stopping, best iteration is:
[19]	training's auc: 0.982822	valid_0's auc: 0.971512
Evaluated only: auc
**********
0.9715120862107216
industryco_regcap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.982763	valid_0's auc: 0.975451
Evaluated only: auc
**********
0.9754511382892855
industryco_regcap_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[99]	training's auc: 0.985319	valid_0's auc: 0.971203
Evaluated only: auc
**********
0.9712029095646509
industryco_regcap_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	training's auc: 0.978605	valid_0's auc: 0.965774
Evaluated only: auc
**********
0.9657740272736284
industryco_regcap_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[29]	training's auc: 0.982571	valid_0's auc: 0.964998
Evaluated only:

[67]	training's auc: 0.8651	valid_0's auc: 0.845939
Evaluated only: auc
**********
0.8459391653646632
enttype_regcap_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.864717	valid_0's auc: 0.845585
Evaluated only: auc
**********
0.8455851463042771
enttype_regcap_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[55]	training's auc: 0.865226	valid_0's auc: 0.845939
Evaluated only: auc
**********
0.8459391653646632
enttype_regcap_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	training's auc: 0.865042	valid_0's auc: 0.845939
Evaluated only: auc
**********
0.8459391653646632
enttype_regcap_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[10]	training's auc: 0.865226	valid_0's auc: 0.845939
Evaluated only: auc
**********
0.8459391653646632

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	training's auc: 0.852452	valid_0's auc: 0.826519
Evaluated only: auc
**********
0.8265194498071776
enttypeitem_regcap_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[17]	training's auc: 0.845474	valid_0's auc: 0.820073
Evaluated only: auc
**********
0.8200727627175447
enttypeitem_regcap_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	training's auc: 0.851679	valid_0's auc: 0.824023
Evaluated only: auc
**********
0.8240230253996875
enttypeitem_reccap_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[67]	training's auc: 0.910449	valid_0's auc: 0.886317
Evaluated only: auc
**********
0.8863173993287798
enttypeitem_reccap_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration

Early stopping, best iteration is:
[1]	training's auc: 0.551964	valid_0's auc: 0.538566
Evaluated only: auc
**********
0.5385662464067066
state_reccap_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.551964	valid_0's auc: 0.538566
Evaluated only: auc
**********
0.5385662464067066
state_reccap_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
state_reccap_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.501729	valid_0's auc: 0.50144
Evaluated only: auc
**********
0.5014396775122373
state_reccap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.551964	valid_0's auc: 0.538566
Evaluated only: auc
**********
0.5385662464067066
state_reccap_sum
Tra

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.609882	valid_0's auc: 0.61345
Evaluated only: auc
**********
0.6134495381231325
orgid_reccap_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	training's auc: 0.700453	valid_0's auc: 0.703018
Evaluated only: auc
**********
0.7030175404643786
orgid_reccap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	training's auc: 0.822609	valid_0's auc: 0.804958
Evaluated only: auc
**********
0.8049579189343554
orgid_reccap_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	training's auc: 0.822609	valid_0's auc: 0.804958
Evaluated only: auc
**********
0.8049579189343554
orgid_reccap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.822609	valid

Early stopping, best iteration is:
[25]	training's auc: 0.929727	valid_0's auc: 0.921695
Evaluated only: auc
**********
0.921694524033174
jobid_reccap_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.931511	valid_0's auc: 0.923476
Evaluated only: auc
**********
0.9234758299386839
jobid_reccap_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[59]	training's auc: 0.91253	valid_0's auc: 0.905876
Evaluated only: auc
**********
0.9058763623833507
jobid_reccap_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[55]	training's auc: 0.927748	valid_0's auc: 0.91832
Evaluated only: auc
**********
0.918320132355926
jobid_reccap_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[15]	training's auc: 0.741383	valid_0's auc: 0.750499
Evaluated only: auc
**********


Early stopping, best iteration is:
[1]	training's auc: 0.582879	valid_0's auc: 0.57286
Evaluated only: auc
**********
0.5728600727863188
compform_empnum_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.582879	valid_0's auc: 0.57286
Evaluated only: auc
**********
0.5728600727863188
compform_empnum_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.582879	valid_0's auc: 0.57286
Evaluated only: auc
**********
0.5728600727863188
compform_empnum_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.582879	valid_0's auc: 0.57286
Evaluated only: auc
**********
0.5728600727863188
compform_empnum_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.582879	valid_0's auc: 0.57286
Evaluated only: auc
**********
0.572

Early stopping, best iteration is:
[45]	training's auc: 0.67588	valid_0's auc: 0.677449
Evaluated only: auc
**********
0.6774485138279845
opform_empnum_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.675391	valid_0's auc: 0.676759
Evaluated only: auc
**********
0.6767587666919987
opform_empnum_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.675439	valid_0's auc: 0.676759
Evaluated only: auc
**********
0.6767587666919987
opform_empnum_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.675487	valid_0's auc: 0.676759
Evaluated only: auc
**********
0.6767587666919987
opform_empnum_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's auc: 0.675784	valid_0's auc: 0.677449
Evaluated only: auc
**********
0.67744

[47]	training's auc: 0.664749	valid_0's auc: 0.660408
Evaluated only: auc
**********
0.6604083963880615
venind_empnum_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[47]	training's auc: 0.664749	valid_0's auc: 0.660408
Evaluated only: auc
**********
0.6604083963880615
venind_empnum_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.664733	valid_0's auc: 0.660352
Evaluated only: auc
**********
0.6603517533383998
venind_empnum_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]	training's auc: 0.664529	valid_0's auc: 0.660418
Evaluated only: auc
**********
0.6604178368963385
venind_empnum_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.664743	valid_0's auc: 0.66038
Evaluated only: auc
**********
0.6603800748632306
ve

[2]	training's auc: 0.849529	valid_0's auc: 0.824661
Evaluated only: auc
**********
0.82466084974015
enttypeminu_empnum_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[5]	training's auc: 0.849162	valid_0's auc: 0.825502
Evaluated only: auc
**********
0.8255016450085673
enttypeminu_empnum_quantile_25
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.703843	valid_0's auc: 0.690155
Evaluated only: auc
**********
0.6901554379687802
enttypeminu_empnum_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[7]	training's auc: 0.848837	valid_0's auc: 0.825368
Evaluated only: auc
**********
0.8253682978291551
enttypeminu_regcap_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.849656	valid_0's auc: 0.82484
Evaluated only: auc
**********
0.8248402

Early stopping, best iteration is:
[29]	training's auc: 0.59363	valid_0's auc: 0.583183
Evaluated only: auc
**********
0.5831832685871807
oploc_empnum_quantile_75
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	training's auc: 0.620081	valid_0's auc: 0.609865
Evaluated only: auc
**********
0.6098645051049548
oploc_regcap_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.64032	valid_0's auc: 0.641564
Evaluated only: auc
**********
0.6415639618037036
oploc_regcap_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.64032	valid_0's auc: 0.641564
Evaluated only: auc
**********
0.6415639618037036
oploc_regcap_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.64032	valid_0's auc: 0.641564
Evaluated only: auc
**********
0.6415639

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.942946	valid_0's auc: 0.916873
Evaluated only: auc
**********
0.9168733744624811
enttypegb_regcap_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.941692	valid_0's auc: 0.911542
Evaluated only: auc
**********
0.911542437444832
enttypegb_regcap_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.81844	valid_0's auc: 0.806777
Evaluated only: auc
**********
0.8067769868729733
enttypegb_regcap_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.941349	valid_0's auc: 0.906337
Evaluated only: auc
**********
0.9063365871618528
enttypegb_regcap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[2]	training's auc: 0.

Early stopping, best iteration is:
[1]	training's auc: 0.504803	valid_0's auc: 0.501185
Evaluated only: auc
**********
0.5011853738205265
adbusign_regcap_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.504803	valid_0's auc: 0.501185
Evaluated only: auc
**********
0.5011853738205265
adbusign_regcap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.504803	valid_0's auc: 0.501185
Evaluated only: auc
**********
0.5011853738205265
adbusign_regcap_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.504803	valid_0's auc: 0.501185
Evaluated only: auc
**********
0.5011853738205265
adbusign_regcap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.504803	valid_0's auc: 0.501185
Evaluated only: auc
**********


Early stopping, best iteration is:
[1]	training's auc: 0.743673	valid_0's auc: 0.749927
Evaluated only: auc
**********
0.7499268360608535
townsign_regcap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.743673	valid_0's auc: 0.749927
Evaluated only: auc
**********
0.7499268360608535
townsign_regcap_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.743673	valid_0's auc: 0.749927
Evaluated only: auc
**********
0.7499268360608535
townsign_regcap_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.743673	valid_0's auc: 0.749927
Evaluated only: auc
**********
0.7499268360608535
townsign_regcap_max_min
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.743673	valid_0's auc: 0.749927
Evaluated only: auc
*******

Early stopping, best iteration is:
[35]	training's auc: 0.853242	valid_0's auc: 0.837067
Evaluated only: auc
**********
0.8370674477113847
opto_year_regcap_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[53]	training's auc: 0.878983	valid_0's auc: 0.867038
Evaluated only: auc
**********
0.8670381113319141
opto_year_regcap_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[43]	training's auc: 0.87843	valid_0's auc: 0.863231
Evaluated only: auc
**********
0.8632306363374604
opto_year_regcap_skew
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[24]	training's auc: 0.880862	valid_0's auc: 0.866898
Evaluated only: auc
**********
0.8668976837712943
opto_year_regcap_std
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[27]	training's auc: 0.881181	valid_0's auc: 0.865721
Evaluated only: auc
******

In [25]:
X_train = X_train[useful_cols]
X_valid = X_valid[useful_cols]
test = test[useful_cols]

In [26]:
col_corr = correlation(X_train, useful_dict, threshold=0.98)
print('Corr drop cols: \n', col_corr)

Corr drop cols: 
 ['venind_reccap_quantile_75', 'opform_reccap_mean', 'enttypegb_reccap_max', 'industryco_empnum_count', 'industryco_reccap_max', 'industryphy_empnum_max', 'industryphy_empnum_max_min', 'compform_empnum_std', 'oploc_regcap_max', 'townsign_reccap_std', 'townsign_reccap_sum', 'industryphy_regcap_sum', 'opto_year_count', 'enttypeminu_empnum_nunique', 'venind_reccap_nunique', 'compform_empnum_mean', 'enttype_reccap_nunique', 'oploc_regcap_nunique', 'state_empnum_nunique', 'enttypeitem_regcap_quantile_25', 'enttypeitem_reccap_median', 'oplocdistrict_regcap_nunique', 'oplocdistrict_regcap_max', 'oploc_regcap_quantile_25', 'townsign_empnum_mean', 'venind_empnum_quantile_25', 'enttype_reccap_std', 'venind_count', 'opform_empnum_max', 'enttypeitem_empnum_quantile_75', 'industryphy_regcap_count', 'enttypeitem_reccap_quantile_25', 'enttype_empnum_quantile_25', 'enttypegb_empnum_max', 'enttypeminu_empnum_max_min', 'orgid_reccap_quantile_25', 'state_reccap_sum', 'enttypeminu_empnum_

In [27]:
X_train.drop(col_corr, axis=1, inplace=True)
X_valid.drop(col_corr, axis=1, inplace=True)
test.drop(col_corr, axis=1, inplace=True)

In [28]:
cols = X_train.columns.to_list()
print('Final cols: \n', len(cols))
train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
all_dataset = lgb.Dataset(train[cols], y, reference=train_dataset)


def lgb_f1_score(y_hat, data):
    y_true = data.get_label()
    y_hat = np.where(y_hat > 0.5, 1, 0)  
    return 'f1', f1_score(y_true, y_hat), True


def f1_loss(y, pred):
    beta = 2
    p = 1. / (1 + np.exp(-pred))
    grad = p * ((beta - 1) * y + 1) - beta * y
    hess = ((beta - 1) * y + 1) * p * (1.0 - p)
    return grad, hess


params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
#     'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

valid_model = lgb.train(
    params,
    train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    early_stopping_rounds=200,
    verbose_eval=300,
#     feval=lgb_f1_score,
#     fobj=f1_loss
    
    
)
pred = valid_model.predict(X_valid)
y_valid_pred = np.where(pred > 0.5, 1, 0)
F1 = np.round(f1_score(y_valid, y_valid_pred), 5)
print('Valid F1: ', F1)
print('Valid mean label: ', np.mean(y_valid_pred))

train_model = lgb.train(
    params,
    all_dataset,
    num_boost_round=valid_model.best_iteration+20,
#     feval=lgb_f1_score,
#     fobj=f1_loss
)
y_test_pred = np.where(train_model.predict(test) > 0.5, 1, 0)

print('Test mean label: ', np.mean(y_test_pred))
sub['score'] = y_test_pred
sub.to_csv('../sub/baseline_{}_{}.csv'.format(time.strftime('%Y%m%d'), str(F1)), index=False)

Final cols: 
 242
Training until validation scores don't improve for 200 rounds
Did not meet early stopping. Best iteration is:
[100]	training's f1: 0.931732	valid_1's f1: 0.805344
Evaluated only: f1
Valid F1:  0.80534
Valid mean label:  0.07532956685499058
Test mean label:  0.1155
