In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
data = pd.read_csv('../../input/data.csv')

In [3]:
# 数据集2：annual_report_info.csv
# 包含数据集7和8中涉及到的企业的年报基本信息，每一行代表一个企业的年报基本数据，每一行有23列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。
# 数据格式如下：
# [id:企业唯一标识, ANCHEYEAR:年度, STATE:状态, FUNDAM:资金数额, MEMNUM:成员人数, FARNUM:农民人数, ANNNEWMEMNUM:本年度新增成员人数, ANNREDMEMNUM:本年度退出成员人数, EMPNUM:从业人数, EMPNUMSIGN:从业人数是否公示, BUSSTNAME:经营状态名称, COLGRANUM:其中高校毕业生人数经营者, RETSOLNUM:其中退役士兵人数经营者, DISPERNUM:其中残疾人人数经营者, UNENUM:其中下岗失业人数经营者, COLEMPLNUM:其中高校毕业生人数雇员, RETEMPLNUM:其中退役士兵人数雇员, DISEMPLNUM:其中残疾人人数雇员, UNEEMPLNUM:其中下岗失业人数雇员, WEBSITSIGN:是否有网站标志, FORINVESTSIGN:是否有对外投资企业标志, STOCKTRANSIGN:有限责任公司本年度是否发生股东股权转让标志, PUBSTATE:公示状态：1 全部公示，2部分公示,3全部不公示]
annual_report_info = pd.read_csv('../../input/train/annual_report_info.csv')
print(annual_report_info.shape)
annual_report_info.info()

(22550, 23)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22550 entries, 0 to 22549
Data columns (total 23 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             22550 non-null  object 
 1   ANCHEYEAR      22550 non-null  float64
 2   STATE          22545 non-null  float64
 3   FUNDAM         5702 non-null   float64
 4   MEMNUM         29 non-null     float64
 5   FARNUM         29 non-null     float64
 6   ANNNEWMEMNUM   29 non-null     float64
 7   ANNREDMEMNUM   29 non-null     float64
 8   EMPNUM         22535 non-null  float64
 9   EMPNUMSIGN     16833 non-null  float64
 10  BUSSTNAME      17680 non-null  object 
 11  COLGRANUM      20041 non-null  float64
 12  RETSOLNUM      20041 non-null  float64
 13  DISPERNUM      20041 non-null  float64
 14  UNENUM         20041 non-null  float64
 15  COLEMPLNUM     20041 non-null  float64
 16  RETEMPLNUM     20041 non-null  float64
 17  DISEMPLNUM     20041 non-null  float64

In [4]:
annual_report_info.head()

Unnamed: 0,id,ANCHEYEAR,STATE,FUNDAM,MEMNUM,FARNUM,ANNNEWMEMNUM,ANNREDMEMNUM,EMPNUM,EMPNUMSIGN,...,DISPERNUM,UNENUM,COLEMPLNUM,RETEMPLNUM,DISEMPLNUM,UNEEMPLNUM,WEBSITSIGN,FORINVESTSIGN,STOCKTRANSIGN,PUBSTATE
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,2017.0,2.0,5.0,,,,,10.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,2018.0,2.0,2.0,,,,,2.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0
2,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,2017.0,2.0,,,,,,4.0,2.0,...,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0
3,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,2018.0,2.0,,,,,,3.0,2.0,...,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0
4,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,2017.0,2.0,5.0,,,,,10.0,,...,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0


In [5]:
def identify_missing(df, missing_threshold):
    """
    缺失率
    @param df:
    @param missing_threshold:
    @return:
    """
    missing_rate = df.isnull().sum() / len(df)
    missing_rate = missing_rate.sort_values(ascending=False)
    print(missing_rate)
    to_drop = missing_rate[missing_rate > missing_threshold].index.to_list()
    print('{} features with greater than {} missing values.\n'.format(
        len(to_drop), missing_threshold))
    return to_drop

In [6]:
to_drop = identify_missing(annual_report_info, missing_threshold=0.9)
annual_report_info.drop(to_drop, axis=1, inplace=True)
to_drop

FARNUM           0.998714
ANNNEWMEMNUM     0.998714
ANNREDMEMNUM     0.998714
MEMNUM           0.998714
FUNDAM           0.747140
STOCKTRANSIGN    0.401020
FORINVESTSIGN    0.268780
EMPNUMSIGN       0.253525
BUSSTNAME        0.215965
RETSOLNUM        0.111264
COLGRANUM        0.111264
UNENUM           0.111264
COLEMPLNUM       0.111264
RETEMPLNUM       0.111264
DISEMPLNUM       0.111264
UNEEMPLNUM       0.111264
DISPERNUM        0.111264
WEBSITSIGN       0.001463
PUBSTATE         0.000887
EMPNUM           0.000665
STATE            0.000222
ANCHEYEAR        0.000000
id               0.000000
dtype: float64
4 features with greater than 0.9 missing values.



['FARNUM', 'ANNNEWMEMNUM', 'ANNREDMEMNUM', 'MEMNUM']

In [7]:
annual_report_info['id'].nunique()

8937

In [8]:
annual_report_info.head()

Unnamed: 0,id,ANCHEYEAR,STATE,FUNDAM,EMPNUM,EMPNUMSIGN,BUSSTNAME,COLGRANUM,RETSOLNUM,DISPERNUM,UNENUM,COLEMPLNUM,RETEMPLNUM,DISEMPLNUM,UNEEMPLNUM,WEBSITSIGN,FORINVESTSIGN,STOCKTRANSIGN,PUBSTATE
0,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,2017.0,2.0,5.0,10.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,2018.0,2.0,2.0,2.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0
2,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,2017.0,2.0,,4.0,2.0,开业,3.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0
3,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,2018.0,2.0,,3.0,2.0,开业,1.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0
4,9c7fa510616a68309e4badf2a7a3123c0462fb85bf28ef17,2017.0,2.0,5.0,10.0,,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,,,3.0


In [9]:
annual_report_info.nunique()

id               8937
ANCHEYEAR           4
STATE               2
FUNDAM            167
EMPNUM            270
EMPNUMSIGN          2
BUSSTNAME           4
COLGRANUM          43
RETSOLNUM          14
DISPERNUM           5
UNENUM             35
COLEMPLNUM         94
RETEMPLNUM         26
DISEMPLNUM         16
UNEEMPLNUM         72
WEBSITSIGN          2
FORINVESTSIGN       2
STOCKTRANSIGN       2
PUBSTATE            3
dtype: int64

In [10]:
annual_report_info['ANCHEYEAR'].unique(), annual_report_info['STATE'].unique()

(array([2017., 2018., 2016., 2015.]), array([ 2.,  1., nan]))

In [11]:
annual_report_info['EMPNUMSIGN'].unique()

array([nan,  2.,  1.])

In [12]:
annual_report_info['BUSSTNAME'].unique()

array([nan, '开业', '歇业', '停业', '清算'], dtype=object)

In [13]:
annual_report_info.sort_values(['id', 'ANCHEYEAR'], inplace=True)
df = annual_report_info.groupby('id', as_index=False).last()
df.shape

(8937, 19)

In [14]:
df['ANCHEYEAR'].unique()

array([2018., 2016., 2017., 2015.])

In [15]:
df.head()

Unnamed: 0,id,ANCHEYEAR,STATE,FUNDAM,EMPNUM,EMPNUMSIGN,BUSSTNAME,COLGRANUM,RETSOLNUM,DISPERNUM,UNENUM,COLEMPLNUM,RETEMPLNUM,DISEMPLNUM,UNEEMPLNUM,WEBSITSIGN,FORINVESTSIGN,STOCKTRANSIGN,PUBSTATE
0,175ebe5f059ec05007223e9af0a48b885f4cbfa833d93eed,2018.0,2.0,,10.0,2.0,开业,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,2.0,3.0
1,175ebe5f059ec05036d901021be6da41057ae3ee1fe6b8bb,2018.0,2.0,,19.0,2.0,开业,5.0,0.0,0.0,0.0,14.0,0.0,0.0,0.0,1.0,2.0,2.0,3.0
2,175ebe5f059ec050efe07058fc35890a2b8858a6795a2e24,2018.0,2.0,,30.0,2.0,开业,1.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0
3,216bd2aaf4d0792406c041069b786b3bcb5baa4cf80d5987,2018.0,2.0,,4.0,1.0,开业,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,2.0,2.0,2.0,2.0
4,216bd2aaf4d079240c2b8f7bbf3177618ad289f21af0221e,2018.0,2.0,,0.0,2.0,歇业,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2.0,,2.0


In [16]:
df['EMPNUM'].unique()

array([1.0000e+01, 1.9000e+01, 3.0000e+01, 4.0000e+00, 0.0000e+00,
       5.0000e+00, 2.0000e+00, 7.0000e+00, 1.0000e+00, 8.0000e+00,
       3.0000e+00, 1.1000e+01, 9.0000e+00, 6.0000e+00, 2.0000e+01,
       2.8000e+01, 1.6000e+02, 2.7000e+01, 1.5000e+01, 3.2000e+01,
       1.2000e+01, 2.6000e+01, 4.5000e+01, 1.6000e+01, 8.9000e+01,
       1.4000e+01, 3.5000e+01, 1.8000e+01, 1.7000e+01, 4.0000e+01,
       1.0600e+03, 2.5000e+01, 2.1000e+01, 1.2045e+04, 2.9000e+01,
       6.0000e+01, 3.7000e+01, 4.9000e+01, 5.0000e+01, 1.3000e+01,
       4.1000e+01, 4.4000e+01, 1.6100e+02, 5.0000e+02, 1.5200e+02,
       3.3000e+01, 2.2000e+01, 3.3300e+02, 3.3330e+03, 2.3000e+01,
       7.2000e+01, 2.4000e+01, 4.2000e+01, 8.0000e+01, 5.2000e+01,
       6.6000e+01, 9.4000e+01, 6.5000e+01, 6.2000e+01,        nan,
       7.0000e+01, 9.2000e+01, 5.5000e+01, 4.8000e+01, 1.2800e+02,
       3.6000e+01, 1.1800e+02, 5.8000e+01, 4.2500e+02, 3.5000e+02,
       7.5000e+01, 1.0700e+02, 8.8000e+01, 9.5000e+01, 1.4600e

In [17]:
df['COLGRANUM+COLEMPLNUM'] = df['COLGRANUM'] + df['COLEMPLNUM']
df['RETSOLNUM+RETEMPLNUM'] = df['RETSOLNUM'] + df['RETEMPLNUM']
df['DISPERNUM+DISEMPLNUM'] = df['DISPERNUM'] + df['DISEMPLNUM']
df['UNENUM+UNEEMPLNUM'] = df['COLGRANUM'] + df['UNEEMPLNUM']

df['ALLNUM'] = df['COLGRANUM+COLEMPLNUM'] + df['RETSOLNUM+RETEMPLNUM'] + df['DISPERNUM+DISEMPLNUM'] + df['UNENUM+UNEEMPLNUM']

In [18]:
le = LabelEncoder()
df['BUSSTNAME'] = le.fit_transform(df['BUSSTNAME'].astype(str))

In [19]:
data = data.merge(df, how='left', on='id')

In [20]:
train = data[data['label'].notnull()]
test = data[data['label'].isnull()]
# train.shape, test.shape

used_cols = [i for i in train.columns if i not in ['id', 'label']]
y = train['label']
train = train[used_cols]
test = test[used_cols]

X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

cols = X_train.columns
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
#     'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300 
)

ANCHEYEAR
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[91]	training's auc: 0.681508	valid_0's auc: 0.680962
Evaluated only: auc
**********
0.6809615629705503
STATE
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.673578	valid_0's auc: 0.669503
Evaluated only: auc
**********
0.6695031460493833
FUNDAM
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.527423	valid_0's auc: 0.524906
Evaluated only: auc
**********
0.5249064209617046
EMPNUM
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[11]	training's auc: 0.694713	valid_0's auc: 0.68918
Evaluated only: auc
**********
0.6891801154574162
EMPNUMSIGN
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.703215	valid_0's auc: 0.694598
Evaluated onl

In [21]:
importance = valid_model.feature_importance(importance_type='gain')
feature_name = valid_model.feature_name()

df_importance = pd.DataFrame({
    'feature_name': feature_name,
    'importance': importance
}).sort_values(by='importance', ascending=False)
df_importance['normalized_importance'] = df_importance['importance'] / df_importance['importance'].sum()
df_importance['cumulative_importance'] = np.cumsum(df_importance['normalized_importance'])
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
4,EMPNUMSIGN,21773.828596,0.57293,0.57293
16,STOCKTRANSIGN,5503.851317,0.144822,0.717752
0,ANCHEYEAR,2967.558207,0.078085,0.795836
10,COLEMPLNUM,2461.902155,0.06478,0.860616
3,EMPNUM,1813.504412,0.047718,0.908334
22,ALLNUM,864.862934,0.022757,0.931091
6,COLGRANUM,531.691917,0.01399,0.945082
15,FORINVESTSIGN,456.80642,0.01202,0.957101
21,UNENUM+UNEEMPLNUM,357.413823,0.009405,0.966506
18,COLGRANUM+COLEMPLNUM,333.103889,0.008765,0.975271


In [22]:
record_low_importance = df_importance[df_importance['cumulative_importance'] > 0.99]
to_drop = list(record_low_importance['feature_name'])
print(to_drop)

df.drop(to_drop, axis=1, inplace=True)
df.to_csv('../../input/annual_report_info.csv', index=False)

['UNENUM', 'PUBSTATE', 'DISPERNUM+DISEMPLNUM', 'FUNDAM', 'WEBSITSIGN', 'RETSOLNUM+RETEMPLNUM', 'STATE', 'DISEMPLNUM', 'RETSOLNUM', 'DISPERNUM']
