In [None]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [None]:
# Load the combined table. Later cells join features onto it by `id` and split it
# into train/test according to whether `label` is null.
data = pd.read_csv('../../input/data.csv')

In [None]:
# Dataset 1: base_info.csv
# Basic information for every enterprise that appears in datasets 7 and 8. Each row
# describes one enterprise and has 33 comma-separated columns; `id` is the unique
# enterprise identifier.
# Column reference:
#   id: unique enterprise identifier
#   oplocdistrict: administrative division code
#   industryphy: industry category code
#   industryco: industry sub-category code
#   dom: business address
#   opscope: business scope
#   enttype: enterprise type
#   enttypeitem: enterprise sub-type
#   opfrom: operating period start
#   opto: operating period end
#   state: status
#   orgid: organisation identifier
#   jobid: position identifier
#   adbusign: whether the enterprise runs an advertising business
#   townsign: whether located in a town/urban area
#   regtype: subject registration type
#   empnum: number of employees
#   compform: organisational form
#   parnum: number of partners
#   exenum: number of executives
#   opform: mode of operation
#   ptbusscope: concurrent (secondary) business scope
#   venind: risk industry
#   enttypeminu: fine-grained enterprise type
#   midpreindcode: central/western advantaged-industry code
#   protype: project type
#   oploc: place of business
#   regcap: registered capital
#   reccap: paid-in capital
#   forreccap: paid-in capital (foreign party)
#   forregcap: registered capital (foreign party)
#   congro: total investment
#   enttypegb: enterprise (institution) type

base_info = pd.read_csv('../../input/train/base_info.csv')
print(base_info.shape)
base_info.info()

In [None]:
def identify_missing(df, missing_threshold):
    """
    Report per-column missing rates and select the columns above a threshold.

    Prints the missing rate of every column (highest first) followed by a
    one-line summary, then returns the names of the selected columns.

    @param df: DataFrame to inspect.
    @param missing_threshold: a column is selected when its fraction of null
        values is strictly greater than this value.
    @return: list of selected column names, ordered by descending missing rate.
    """
    null_fraction = df.isnull().sum().div(len(df)).sort_values(ascending=False)
    print(null_fraction)

    above_threshold = null_fraction > missing_threshold
    to_drop = null_fraction.loc[above_threshold].index.to_list()

    summary = '{} features with greater than {} missing values.\n'
    print(summary.format(len(to_drop), missing_threshold))
    return to_drop

In [None]:
# Remove every column whose missing rate is above 90%, then display the removed names.
to_drop = identify_missing(base_info, missing_threshold=0.9)
base_info = base_info.drop(columns=to_drop)
to_drop

In [None]:
# Number of distinct enterprise ids; compare with the row count printed when
# base_info was loaded to spot duplicated ids.
base_info['id'].nunique()

In [None]:
# Rewrite the two bare `opform` codes to their labelled spelling.
# NOTE(review): presumably the raw column mixes both spellings of the same
# category -- confirm against the data.
opform_labels = {
    '01': '01-以个人财产出资',
    '02': '02-以家庭共有财产作为个人出资',
}
base_info['opform'] = base_info['opform'].replace(opform_labels)

In [None]:
# Partition the remaining columns by dtype: object columns vs. everything else.
object_frame = base_info.select_dtypes(include=['object'])
non_object_frame = base_info.select_dtypes(exclude=['object'])
cat_cols = list(object_frame.columns)
num_cols = list(non_object_frame.columns)
cat_cols, num_cols

In [None]:
# Columns removed before encoding: dom, opscope, opfrom, opto.
# 'dom' and 'opscope' take too many distinct values; 'opfrom' and 'opto' are
# removed together with them.
base_info = base_info.drop(columns=['dom', 'opscope', 'opfrom', 'opto'])

In [None]:

# Integer-encode the code-like columns. Values are cast to str first, so missing
# values are encoded as their own category rather than raising.
label_encoded_cols = [
    'industryphy', 'opform', 'oploc', 'orgid', 'jobid',
    'oplocdistrict', 'enttypegb', 'industryco', 'enttype', 'enttypeitem',
]
for col in tqdm(label_encoded_cols):
    as_text = base_info[col].astype(str)
    base_info[col] = LabelEncoder().fit_transform(as_text)

In [None]:
# Attach the base_info features to every row of `data` by enterprise id.
# validate='many_to_one' makes pandas raise MergeError when `id` is duplicated in
# base_info, instead of silently duplicating rows of `data` (and their labels).
# NOTE: `data` is overwritten, so re-running this cell without reloading `data`
# merges the same columns again (they come back with _x/_y suffixes).
data = data.merge(base_info, how='left', on='id', validate='many_to_one')

In [None]:
# Rows with a known label are used for training; rows without one are to be predicted.
train = data[data['label'].notnull()]
test = data[data['label'].isnull()]
# Explicit copy: a `score` column is assigned into `sub` in the submission cell, and
# writing into a slice of `data` is chained assignment (the resulting
# SettingWithCopyWarning is hidden by the global warnings filter).
sub = test[['id']].copy()

# Every column except the identifier and the target is a model feature.
used_cols = [col for col in train.columns if col not in ['id', 'label']]
y = train['label']
train = train[used_cols]
test = test[used_cols]

# Hold out 25% of the labelled rows for validation, with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    train,
    y,
    test_size=0.25,
    random_state=2020,
)

# Per-feature screening with `auc_select` (not defined in this notebook;
# presumably provided by `from util import *`).
cols = X_train.columns
auc_selection = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
useful_dict, useless_dict, useful_cols, useless_cols = auc_selection
print('AUC useless_cols: \n', useless_cols)

# LightGBM datasets; the validation set is built with the training set as its reference.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid, reference=dtrain)

# Binary-classification GBDT settings.
params = dict(
    objective='binary',
    boosting='gbdt',
    metric='auc',  # switch to 'None' when a custom evaluation function is used
    learning_rate=0.1,
    num_leaves=31,
    lambda_l1=0,
    lambda_l2=1,
    num_threads=23,
    min_data_in_leaf=20,
    first_metric_only=True,
    is_unbalance=True,
    max_depth=-1,
    seed=2020,
)

# Train with early stopping (patience 50) and an evaluation log every 300 rounds.
# The `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# deprecated in LightGBM 3.3 and removed in 4.0, where passing them raises TypeError;
# the callbacks below are the equivalent that works on both old and new versions.
# `log_evaluation` was named `print_evaluation` before LightGBM 3.3, hence the fallback.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    callbacks=[
        # The keyword form took `first_metric_only` from params; pass it explicitly.
        lgb.early_stopping(
            stopping_rounds=50,
            first_metric_only=params.get('first_metric_only', False),
        ),
        _log_evaluation(period=300),
    ],
)

In [None]:
# Gain-based importance per feature, most important first, together with each
# feature's share of the total gain and the running total of those shares.
feature_name = valid_model.feature_name()
importance = valid_model.feature_importance(importance_type='gain')

df_importance = (
    pd.DataFrame({'feature_name': feature_name, 'importance': importance})
    .sort_values(by='importance', ascending=False)
    .assign(normalized_importance=lambda t: t['importance'] / t['importance'].sum())
    .assign(cumulative_importance=lambda t: t['normalized_importance'].cumsum())
)
df_importance

In [None]:
# Features in the tail of the ranking: everything after the point where the
# cumulative share of total gain exceeds 99.9%.
record_low_importance = df_importance[df_importance['cumulative_importance'] > 0.999]
to_drop = list(record_low_importance['feature_name'])
print(to_drop)

# The model was trained on `data` merged with base_info, so a listed feature is not
# guaranteed to be a base_info column, and after a first run the columns are already
# gone. errors='ignore' plus an out-of-place drop keeps the cell from raising KeyError
# in either case and makes it safe to re-run.
base_info = base_info.drop(columns=to_drop, errors='ignore')
# NOTE(review): this writes the processed base_info table to a file named
# annual_report_info.csv, which looks like a copy-paste slip and may overwrite the
# features of another dataset -- confirm the intended output path with downstream readers.
base_info.to_csv('../../input/annual_report_info.csv', index=False)

In [None]:
# Score the hold-out split with the same 0.5 threshold used for the submission.
# `F1` is required for the file name below but is not assigned anywhere in the visible
# notebook (`f1_score` was imported and left unused), so on a fresh kernel the cell
# failed with NameError unless `from util import *` happened to supply the name.
y_valid_pred = np.where(valid_model.predict(X_valid) > 0.5, 1, 0)
F1 = f1_score(y_valid, y_valid_pred)
print('Valid F1: ', F1)

# Hard 0/1 predictions for the unlabelled rows.
y_test_pred = np.where(valid_model.predict(test) > 0.5, 1, 0)

print('Test mean label: ', np.mean(y_test_pred))
# assign() returns a new frame, so nothing is written into a slice of `data`.
sub = sub.assign(score=y_test_pred)
sub.to_csv('../../sub/baseline_{}_{}.csv'.format(time.strftime('%Y%m%d'), str(F1)), index=False)