In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Load the base table. Later cells join on its `id` column and use `label`
# (non-null vs. null) to separate training rows from test rows.
data = pd.read_csv('../../input/data.csv')

In [3]:
# Dataset 6: other_info.csv
# Holds additional information about the enterprises that appear in datasets 7 and 8.
# Each row describes one enterprise and has 4 comma-separated columns; `id` is the
# unique enterprise identifier.
# Column layout:
# [id: unique enterprise identifier, legal_judgment_num: number of court judgment
#  documents, brand_num: number of registered trademarks, patent_num: number of patents]
other_info = pd.read_csv('../../input/train/other_info.csv')
print(other_info.shape)
other_info.info()

(1890, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1890 entries, 0 to 1889
Data columns (total 4 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  1890 non-null   object 
 1   legal_judgment_num  1006 non-null   float64
 2   brand_num           909 non-null    float64
 3   patent_num          396 non-null    float64
dtypes: float64(3), object(1)
memory usage: 59.2+ KB


In [4]:
# Count the distinct enterprise ids, to compare against the number of rows.
other_info['id'].nunique()

1888

In [5]:
# NOTE(review): this repeats the preceding cell's distinct-id count verbatim;
# it adds no new information and is a candidate for removal.
other_info['id'].nunique()

1888

In [6]:
# Count rows that are exact copies of an earlier row across all columns.
other_info.duplicated().sum()

0

In [7]:
# Remove rows that are exact copies across every column, then show the row
# count next to the number of distinct ids so any remaining gap is visible.
other_info = other_info.drop_duplicates()
(other_info.shape, other_info['id'].nunique())

((1890, 4), 1888)

In [8]:
# Preview the first rows of the table.
other_info.head()

Unnamed: 0,id,legal_judgment_num,brand_num,patent_num
0,f000950527a6feb6d340f91da09e61347d8200cd2f0d1602,4.0,,
1,f000950527a6feb608dd9322b74a99f60851207f36a3c94c,1.0,,
2,d8071a739aa75a3b9f23966f8dae78fd226c272515b9c255,2.0,,
3,216bd2aaf4d079242209b1496f81a36c7abed9dd0bb65ed3,,1.0,
4,e9f7b28ec10e0470de9631c789f49acdd4e7cf9ed6db094b,,2.0,


In [9]:
# Collapse repeated ids into a single row each.
# The rows sharing an id are not exact copies of one another (the full-row
# duplicate count is 0 while there are more rows than distinct ids), and most
# rows populate only some of the count columns. Keeping just the last row per
# id could therefore throw away a non-null value held by an earlier row.
# `GroupBy.last()` takes, per column, the last non-null value within each id:
# the last row still wins on conflicts, but gaps are filled from earlier rows.
# `sort=False` keeps ids in order of first appearance; `as_index=False` keeps
# `id` as a regular column so the frame can still be merged on it.
other_info = other_info.groupby('id', as_index=False, sort=False).last()
other_info.shape, other_info['id'].nunique()

((1888, 4), 1888)

In [10]:
# Attach the per-enterprise counts to the base table.
# `validate='many_to_one'` makes pandas raise a MergeError if `other_info`
# ever contains a repeated `id`, instead of silently duplicating rows of `data`.
# NOTE: this cell rebinds `data`; running it twice merges the same columns
# again and produces suffixed (`_x` / `_y`) copies, so run it once per kernel.
data = data.merge(other_info, how='left', on='id', validate='many_to_one')

# Rows with a label are the training set; rows without one are the test set.
train = data[data['label'].notnull()]
test = data[data['label'].isnull()]
train.shape, test.shape

((14865, 5), (10000, 5))

In [11]:
# Every column except the identifier and the target is used as a model input.
used_cols = [col for col in train.columns if col not in ('id', 'label')]

# Pull the target out before `train` is narrowed to the feature columns.
y = train['label']
train, test = train[used_cols], test[used_cols]

In [12]:
# Hold out 25% of the labeled rows for validation; the fixed `random_state`
# makes the split repeatable.
# NOTE(review): the split is not stratified. The training params later set
# `is_unbalance=True`, so the label is presumably imbalanced — consider
# `stratify=y` (this would change the recorded scores).
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

In [13]:
# Screen the features with the project helper `auc_select` (imported from
# `util`; its source is not shown here).
# NOTE(review): judging by the recorded log, it appears to fit one LightGBM
# model per column and report that column's validation AUC, and columns whose
# AUC does not exceed `threshold` seem to end up in `useless_cols` — confirm
# against util.py.
cols = X_train.columns
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.5)
print('AUC useless_cols: \n', useless_cols)

legal_judgment_num
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.550997	valid_0's auc: 0.541982
Evaluated only: auc
**********
0.5419819403076661
brand_num
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.505536	valid_0's auc: 0.501227
Evaluated only: auc
**********
0.5012272660760055
patent_num
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[1]	training's auc: 0.5	valid_0's auc: 0.5
Evaluated only: auc
**********
0.5
AUC useless_cols: 
 ['patent_num']


In [14]:
# Wrap the hold-out split in LightGBM datasets. The validation set is built
# with the training set as its `reference`.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid, reference=dtrain)

# Binary classifier scored by AUC.
params = dict(
    objective='binary',
    boosting='gbdt',
    metric='auc',  # set to 'None' instead when supplying a custom evaluation function
    learning_rate=0.1,
    num_leaves=31,
    lambda_l1=0,
    lambda_l2=1,
    num_threads=23,  # NOTE(review): hard-coded; presumably sized for the author's machine
    min_data_in_leaf=20,
    first_metric_only=True,
    is_unbalance=True,
    max_depth=-1,
    seed=2020,
)

# Train with both sets monitored; stop once the validation score has not
# improved for 50 rounds, logging every 300 rounds.
valid_model = lgb.train(
    params=params,
    train_set=dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300,
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[3]	training's auc: 0.558923	valid_1's auc: 0.547851
Evaluated only: auc


In [15]:
# Gain-based importance of each feature in the validation model.
feature_name = valid_model.feature_name()
importance = valid_model.feature_importance(importance_type='gain')

# Rank features by gain, then add each feature's share of the total gain and
# the running total of those shares down the ranking.
df_importance = (
    pd.DataFrame({'feature_name': feature_name, 'importance': importance})
    .sort_values(by='importance', ascending=False)
    .assign(
        normalized_importance=lambda frame: frame['importance'] / frame['importance'].sum(),
        cumulative_importance=lambda frame: frame['normalized_importance'].cumsum(),
    )
)
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
0,legal_judgment_num,3608.173866,0.87922,0.87922
1,brand_num,348.085629,0.08482,0.964039
2,patent_num,147.577002,0.035961,1.0
