In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Master table of samples. Later cells join company attributes onto it by `id`
# and split it into train/test on whether `label` is null.
data = pd.read_csv('../../input/data.csv', sep=',')

In [3]:
# Dataset 1: base_info.csv
# Basic information for every company referenced by datasets 7 and 8. Each row
# describes one company and has 33 comma-separated columns; `id` is the unique
# company identifier.
# Columns:
#   id: unique company identifier
#   oplocdistrict: administrative division code
#   industryphy: industry category code
#   industryco: industry sub-category code
#   dom: business address
#   opscope: business scope
#   enttype: enterprise type
#   enttypeitem: enterprise sub-type
#   opfrom: operating period start
#   opto: operating period end
#   state: status
#   orgid: organisation identifier
#   jobid: position identifier
#   adbusign: whether advertising business
#   townsign: whether urban (town)
#   regtype: registration subject type
#   empnum: number of employees
#   compform: organisational form
#   parnum: number of partners
#   exenum: number of executors
#   opform: mode of operation
#   ptbusscope: concurrent business scope
#   venind: risk industry
#   enttypeminu: enterprise type (fine-grained)
#   midpreindcode: central/western advantaged-industry code
#   protype: project type
#   oploc: business premises
#   regcap: registered capital
#   reccap: paid-in capital
#   forreccap: paid-in capital (foreign party)
#   forregcap: registered capital (foreign party)
#   congro: total investment
#   enttypegb: enterprise (institution) type

base_info = pd.read_csv('../../input/train/base_info.csv', sep=',')
print(base_info.shape)
base_info.info()

(24865, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   opto           8825 non-null   object 
 10  state          24865 non-null  int64  
 11  orgid          24865 non-null  int64  
 12  jobid          24865 non-null  int64  
 13  adbusign       24865 non-null  int64  
 14  townsign       24865 non-null  int64  
 15  regtype        24865 non-null  int64  
 16  empnum         19615 non-null  float64
 17  compform       10631 non-null  float64

In [4]:
def identify_missing(df, missing_threshold):
    """
    Report per-column missing rates and select the columns that exceed a limit.

    Prints the missing rate of every column (highest first), then a one-line
    summary of how many columns were selected.

    @param df: DataFrame to inspect.
    @param missing_threshold: a column is selected when its fraction of null
        values is strictly greater than this value.
    @return: list of selected column names, ordered by descending missing rate.
    """
    n_rows = len(df)
    null_counts = df.isnull().sum()
    rate_by_column = (null_counts / n_rows).sort_values(ascending=False)
    print(rate_by_column)

    # Walk the already-sorted rates so the result keeps the printed order.
    to_drop = [
        column
        for column, rate in rate_by_column.items()
        if rate > missing_threshold
    ]
    summary = '{} features with greater than {} missing values.\n'
    print(summary.format(len(to_drop), missing_threshold))
    return to_drop

In [5]:
# Remove every column whose missing rate is above 90%, then display the names
# that were removed.
to_drop = identify_missing(df=base_info, missing_threshold=0.9)
base_info.drop(columns=to_drop, inplace=True)
to_drop

midpreindcode    1.000000
ptbusscope       1.000000
protype          0.998633
forreccap        0.990871
congro           0.989986
forregcap        0.989946
exenum           0.944581
parnum           0.905932
reccap           0.715102
enttypeminu      0.707621
venind           0.660688
opto             0.645083
opform           0.638045
compform         0.572451
enttypeitem      0.330344
empnum           0.211140
regcap           0.007681
industryco       0.000040
opfrom           0.000000
oplocdistrict    0.000000
industryphy      0.000000
dom              0.000000
opscope          0.000000
enttype          0.000000
enttypegb        0.000000
state            0.000000
orgid            0.000000
jobid            0.000000
adbusign         0.000000
townsign         0.000000
regtype          0.000000
oploc            0.000000
id               0.000000
dtype: float64
8 features with greater than 0.9 missing values.



['midpreindcode',
 'ptbusscope',
 'protype',
 'forreccap',
 'congro',
 'forregcap',
 'exenum',
 'parnum']

In [6]:
# Number of distinct company ids; compare with the row count printed when the
# file was loaded to confirm there is one row per company.
base_info['id'].nunique(dropna=True)

24865

In [7]:
# Expand the two bare opform codes into "code-description" labels
# ('01': funded with personal property, '02': family jointly-owned property
# used as the personal contribution). Other values are left untouched.
base_info['opform'] = base_info['opform'].replace({
    '01': '01-以个人财产出资',
    '02': '02-以家庭共有财产作为个人出资',
})

In [8]:
# Partition the remaining columns by dtype: object columns on one side,
# every other dtype on the other. The tuple is displayed as the cell output.
cat_cols = list(base_info.select_dtypes(include='object').columns)
num_cols = list(base_info.select_dtypes(exclude='object').columns)
(cat_cols, num_cols)

(['id', 'industryphy', 'dom', 'opscope', 'opfrom', 'opto', 'opform', 'oploc'],
 ['oplocdistrict',
  'industryco',
  'enttype',
  'enttypeitem',
  'state',
  'orgid',
  'jobid',
  'adbusign',
  'townsign',
  'regtype',
  'empnum',
  'compform',
  'venind',
  'enttypeminu',
  'regcap',
  'reccap',
  'enttypegb'])

In [9]:
# Remove dom, opscope, opfrom and opto from the feature table.
# 'dom' and 'opscope' take too many distinct values.
base_info.drop(
    columns=['dom', 'opscope', 'opfrom', 'opto'],
    inplace=True,
)

In [10]:
# Second batch of columns removed from the feature table.
base_info.drop(
    columns=['compform', 'regtype', 'state', 'adbusign', 'oploc', 'opform', 'venind'],
    inplace=True,
)

In [11]:
# Re-code each listed column to integer class ids. Values are cast to str
# first, so a missing value becomes the string 'nan' and is encoded as its
# own class.
for col in tqdm(['industryphy', 'orgid', 'jobid', 'oplocdistrict', 'enttypegb', 'industryco', 'enttype', 'enttypeitem']):
    as_text = base_info[col].astype(str)
    base_info[col] = LabelEncoder().fit_transform(as_text)

100%|███████████████████████████████████████████████████████████████████████████████████| 8/8 [00:00<00:00, 111.41it/s]


In [12]:
# Attach the encoded company attributes to every row of `data` (left join on
# `id`, so rows without a match keep NaN attributes).
# validate='many_to_one' makes pandas raise MergeError if `base_info` ever
# contains a duplicated id, instead of silently duplicating rows of `data`.
data = data.merge(base_info, how='left', on='id', validate='many_to_one')

In [13]:
# Rows with a label form the training set; rows without one are the test set.
train = data[data['label'].notnull()]
test = data[data['label'].isnull()]
# Take an independent copy: a `score` column is added to `sub` in a later
# cell, and writing into a slice of `test` is a chained-assignment hazard
# (the warning would be hidden by warnings.filterwarnings('ignore')).
sub = test[['id']].copy()
# train.shape, test.shape

# Every column except the key and the target is used as a feature.
used_cols = [i for i in train.columns if i not in ['id', 'label']]
y = train['label']
train = train[used_cols]
test = test[used_cols]

# Hold out 25% of the labelled rows for validation (fixed seed).
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

# Per-feature AUC screening. `auc_select` is not defined in this notebook
# (presumably it comes from `from util import *` -- TODO confirm). Only
# `useless_cols` is printed; the screening result is not used to filter the
# features passed to the model below.
cols = X_train.columns
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
#     'metric': 'None',  # set metric to 'None' when using a custom evaluation function
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

# NOTE(review): `early_stopping_rounds` / `verbose_eval` are keyword arguments
# of the LightGBM version this notebook was run with; newer releases expect
# callbacks instead -- confirm against the installed version before upgrading.
valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300
)

oplocdistrict
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[8]	training's auc: 0.800172	valid_0's auc: 0.784394
Evaluated only: auc
**********
0.784394131780055
industryphy
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	training's auc: 0.977366	valid_0's auc: 0.967944
Evaluated only: auc
**********
0.9679441641137959
industryco
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[104]	training's auc: 0.986598	valid_0's auc: 0.975303
Evaluated only: auc
**********
0.9753030403156906
enttype
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.866492	valid_0's auc: 0.846703
Evaluated only: auc
**********
0.8467026664715628
enttypeitem
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[14]	training's auc: 0.910801	valid_0's auc: 0.8

In [14]:
# Gain-based importance of every feature in the trained model.
importance = valid_model.feature_importance(importance_type='gain')
feature_name = valid_model.feature_name()

# Rank features by importance, then add each feature's share of the total and
# the running (cumulative) share down the ranking.
df_importance = (
    pd.DataFrame({'feature_name': feature_name, 'importance': importance})
    .sort_values(by='importance', ascending=False)
    .assign(normalized_importance=lambda d: d['importance'] / d['importance'].sum())
    .assign(cumulative_importance=lambda d: d['normalized_importance'].cumsum())
)
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
1,industryphy,115227.075478,0.859909,0.859909
10,regcap,4860.38528,0.036272,0.896181
11,reccap,2713.114652,0.020247,0.916428
2,industryco,1965.574881,0.014669,0.931097
8,empnum,1553.42659,0.011593,0.942689
6,jobid,1549.273748,0.011562,0.954251
12,enttypegb,1104.965834,0.008246,0.962497
9,enttypeminu,1084.066348,0.00809,0.970587
0,oplocdistrict,994.922867,0.007425,0.978012
4,enttypeitem,980.057485,0.007314,0.985326


In [15]:
# Features in the tail of the ranking, i.e. those whose cumulative importance
# share is beyond 99.9%.
record_low_importance = df_importance.loc[df_importance['cumulative_importance'] > 0.999]
to_drop = record_low_importance['feature_name'].tolist()
print(to_drop)

# The drop below is disabled, so the saved table still contains these columns.
# base_info.drop(to_drop, axis=1, inplace=True)
base_info.to_csv('../../input/base_info.csv', index=False)

['orgid']


In [16]:
# Turn the model's predicted scores for the test rows into hard 0/1 labels
# using a 0.5 cut-off.
y_test_pred = np.where(valid_model.predict(test) > 0.5, 1, 0)

print('Test mean label: ', np.mean(y_test_pred))
# Build a new frame with the `score` column instead of writing into `sub`
# in place: `sub` originates from a column selection on a filtered frame, and
# in-place assignment on such a slice is a chained-assignment hazard.
sub = sub.assign(score=y_test_pred)
# One submission file per calendar day (the date is part of the file name).
sub.to_csv('../../sub/baseline_{}.csv'.format(time.strftime('%Y%m%d')), index=False)

Test mean label:  0.1233
