In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Load the base table; later cells join features onto it by `id` and split it on `label`.
data = pd.read_csv('../../input/data.csv')

In [3]:
# Dataset 4: change_info.csv
# Holds the change records of the enterprises that appear in datasets 7 and 8. Each row is one
# change record with 5 comma-separated columns; the `id` column is the unique enterprise identifier.
# Data format:
# [id: unique enterprise identifier, bgxmdm: change item code, bgq: value before the change, bgh: value after the change, bgrq: change date]
change_info = pd.read_csv('../../input/train/change_info.csv')
print(change_info.shape)
change_info.info()

(45940, 5)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45940 entries, 0 to 45939
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      45940 non-null  object 
 1   bgxmdm  45940 non-null  float64
 2   bgq     45940 non-null  object 
 3   bgh     45940 non-null  object 
 4   bgrq    45940 non-null  float64
dtypes: float64(2), object(3)
memory usage: 1.8+ MB


In [4]:
# Preview the first rows of the change records.
change_info.head()

Unnamed: 0,id,bgxmdm,bgq,bgh,bgrq
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,939.0,9dec12da51cdb672a91b4a8ae0e0895f7bfeb243dfa3e0c8,9dec12da51cdb672a91b4a8ae0e0895f4a56cbe3deca98...,20190600000000.0
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,112.0,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,20190600000000.0
2,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,111.0,54ca436ffb87f24c820178b45fcc3a7b,f80e3376abcf81ad2a279d6d99046153,20170130000000.0
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,128.0,f1fdb1c866dc96638cbfb8b788b91393,1eca8a0d8beca58d988f7dccab5dc868,20170130000000.0
4,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,925.0,54ca436ffb87f24c820178b45fcc3a7b,f80e3376abcf81ad2a279d6d99046153,20170130000000.0


In [5]:
def identify_missing(df, missing_threshold):
    """
    Report the columns of ``df`` whose share of missing values exceeds a threshold.

    The per-column missing rate is printed (highest rate first), followed by a
    one-line summary of how many columns are above the threshold.

    @param df: DataFrame to inspect; it is not modified.
    @param missing_threshold: a column is reported when its missing rate is strictly greater than this value.
    @return: list of the reported column names, ordered by descending missing rate.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    rates = (null_counts / n_rows).sort_values(ascending=False)
    print(rates)

    # Walk the sorted rates so the result keeps the "worst column first" order.
    to_drop = [column for column, rate in rates.items() if rate > missing_threshold]
    print('{} features with greater than {} missing values.\n'.format(len(to_drop), missing_threshold))
    return to_drop

In [6]:
# Remove every column whose missing rate is above 50%, then display which columns were removed.
to_drop = identify_missing(change_info, missing_threshold=0.5)
change_info.drop(columns=to_drop, inplace=True)
to_drop

bgrq      0.0
bgh       0.0
bgq       0.0
bgxmdm    0.0
id        0.0
dtype: float64
0 features with greater than 0.5 missing values.



[]

In [7]:
# Distinct-value counts for the enterprise id, the change code, and the before/after values.
tuple(change_info[col].nunique() for col in ('id', 'bgxmdm', 'bgq', 'bgh'))

(8726, 45, 28802, 30501)

In [8]:
# List the distinct change codes (`bgxmdm`); each one gets its own count column further down.
change_info['bgxmdm'].unique()

array([939., 112., 111., 128., 925., 120., 137., 121., 922., 131., 921.,
       117., 113., 930., 110., 190., 133., 115., 150., 129., 118., 907.,
       134., 135., 136., 172., 116., 144., 143., 138., 301., 132., 119.,
       902., 114., 908., 130., 125., 124., 903., 180., 126., 935., 901.,
       140.])

In [9]:
# One row per enterprise that has at least one change record; the features built below are merged onto it.
df = pd.DataFrame({'id': change_info['id'].unique()})
df.shape

(8726, 1)

In [10]:
# Per-enterprise summary statistics of the change code `bgxmdm`:
#   bgxmdm_mode    - most frequent code (mean of the modes when several codes tie)
#   bgxmdm_cnt     - number of non-null change records
#   bgxmdm_nunique - number of distinct codes
# Named aggregation (new_column=func keywords) is used rather than a {new_name: func} dict:
# current pandas rejects the dict-renaming form on a single selected column with
# "SpecificationError: nested renamer is not supported".
tmp = (
    change_info.groupby('id')['bgxmdm']
    .agg(
        bgxmdm_mode=lambda x: np.mean(pd.Series.mode(x)),
        bgxmdm_cnt='count',
        bgxmdm_nunique='nunique',
    )
    .reset_index()  # bring `id` back as a regular column so it can serve as the merge key
)
# Disabled experiment, kept for reference:
# tmp['cnt.nuique'] = tmp['bgxmdm_nunique'] / tmp['bgxmdm_cnt']

df = df.merge(tmp, on='id', how='left')

In [11]:
# Count, per enterprise, how many change records carry each change code (one column per code).
# The constant helper column makes "sum" behave as a row count; ids without a given code get 0.
change_info['tmp'] = 1
tmp = change_info.pivot_table(index='id', columns='bgxmdm', values='tmp', aggfunc=np.sum)
tmp = tmp.fillna(0)

# Prefix the code-valued column labels so they read as feature names.
cols = ['bgxmdm_{}'.format(f) for f in tmp.columns]
tmp.columns = cols
tmp = tmp.reset_index()

df = df.merge(tmp, how='left', on='id')

In [12]:
# for f in tqdm(cols):
#     df[f + '_rate'] = df[f] / df['bgxmdm_cnt']

In [13]:
# Attach the per-enterprise change features to the base table; the left join keeps every row of `data`.
# Feature columns left over from an earlier run of this cell are dropped first, so re-running the
# cell refreshes them instead of producing duplicated `_x` / `_y` columns.
feature_cols = [c for c in df.columns if c != 'id']
data = data.drop(columns=feature_cols, errors='ignore').merge(df, how='left', on='id')

In [14]:
# Rows with a label form the training set; rows without one are the test set to be predicted.
has_label = data['label'].notnull()
train = data[has_label]
test = data[~has_label]
train.shape, test.shape

((14865, 50), (10000, 50))

In [15]:
# Everything except the identifier and the target is a model feature.
non_feature_cols = ('id', 'label')
used_cols = [col for col in train.columns if col not in non_feature_cols]

# Pull the target out before narrowing both frames to the feature columns.
y = train['label']
train, test = train[used_cols], test[used_cols]

In [16]:
# Hold out 25% of the labelled rows for validation; the fixed random_state makes the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

cols = X_train.columns
# NOTE(review): `auc_select` is not defined in this notebook; presumably it comes from the
# `from util import *` star import. Judging by the logged output it appears to fit one model per
# feature and report its validation AUC, with `threshold` as the cut-off. Confirm against util.
# The selection result is only printed here; the visible cells below still train on all columns.
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

bgxmdm_mode
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[4]	training's auc: 0.782946	valid_0's auc: 0.761218
Evaluated only: auc
**********
0.7612176839601044
bgxmdm_cnt
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.741142	valid_0's auc: 0.735908
Evaluated only: auc
**********
0.7359082713013269
bgxmdm_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[13]	training's auc: 0.743564	valid_0's auc: 0.734871
Evaluated only: auc
**********
0.7348709954543953
bgxmdm_110.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	training's auc: 0.724706	valid_0's auc: 0.721169
Evaluated only: auc
**********
0.7211692777539143
bgxmdm_111.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[39]	training's auc: 0.733216	valid_0's auc

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.722258	valid_0's auc: 0.719087
Evaluated only: auc
**********
0.7190870556470761
bgxmdm_908.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.722258	valid_0's auc: 0.719087
Evaluated only: auc
**********
0.7190870556470761
bgxmdm_921.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	training's auc: 0.737345	valid_0's auc: 0.733847
Evaluated only: auc
**********
0.7338467003063445
bgxmdm_922.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[35]	training's auc: 0.7358	valid_0's auc: 0.732937
Evaluated only: auc
**********
0.7329368713211519
bgxmdm_925.0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[20]	training's auc: 0.724802	valid_0's auc: 0.72032
Eva

In [17]:
# Wrap both splits as LightGBM datasets; the validation set is created with the training set as its reference.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid, reference=dtrain)

In [18]:
# LightGBM settings: binary objective evaluated by AUC, with a fixed seed.
params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
#     'metric': 'None',  # set metric to 'None' when using a custom evaluation function
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,  # NOTE(review): hard-coded thread count, presumably matched to the author's machine
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

# Train with both splits registered as evaluation sets (logged as "training" and "valid_1"),
# stopping once the validation score has not improved for 50 rounds.
# No num_boost_round is passed, so the library default applies.
# NOTE(review): newer LightGBM releases reportedly no longer accept the `early_stopping_rounds`
# and `verbose_eval` keywords and expect callbacks instead - confirm against the installed version.
valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300 
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[12]	training's auc: 0.825697	valid_1's auc: 0.748583
Evaluated only: auc


In [19]:
# Gain-based importance of every feature in the trained model, paired with the feature names.
importance = valid_model.feature_importance(importance_type='gain')
feature_name = valid_model.feature_name()

# Rank features by gain, then express each as a share of the total and as a running total of shares.
df_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance})
df_importance = df_importance.sort_values(by='importance', ascending=False)
total_importance = df_importance['importance'].sum()
df_importance['normalized_importance'] = df_importance['importance'] / total_importance
df_importance['cumulative_importance'] = df_importance['normalized_importance'].cumsum()
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
35,bgxmdm_190.0,18300.291985,0.436262,0.436262
21,bgxmdm_131.0,4111.00457,0.098003,0.534264
0,bgxmdm_mode,3748.611426,0.089363,0.623628
14,bgxmdm_121.0,2887.1838,0.068828,0.692456
3,bgxmdm_110.0,1948.125786,0.046442,0.738897
19,bgxmdm_129.0,1666.700603,0.039733,0.77863
13,bgxmdm_120.0,1526.167997,0.036382,0.815012
1,bgxmdm_cnt,1212.316754,0.028901,0.843913
6,bgxmdm_113.0,1142.770395,0.027243,0.871155
4,bgxmdm_111.0,1047.086097,0.024962,0.896117
