In [1]:
import warnings
warnings.simplefilter('ignore')

import pandas as pd
# Display configuration: show every column, cap printed rows at 100 and render
# floats with two decimals. Fully-qualified option keys are used because short
# patterns such as 'max_columns' match more than one option in newer pandas
# releases (e.g. 'styler.render.max_columns') and then raise OptionError.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', lambda x: '%.2f' % x)
import numpy as np
np.set_printoptions(suppress=True)
from tqdm.notebook import tqdm

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

import gc

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, roc_auc_score

import lightgbm as lgb

In [3]:
# Dataset 7: entprise_info.csv
# Labelled enterprise data. Each row is one enterprise and has 2 columns: `id` is the unique enterprise identifier and `label` is the annotation (1: at risk of illegal fundraising, 0: no such risk). Columns are separated by ",".

# Training-set ids and labels

entprise_info = pd.read_csv('../input/train/entprise_info.csv')

print(entprise_info.shape)
entprise_info.head()

(14865, 2)


Unnamed: 0,id,label
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0


In [4]:
# Dataset 8 (validation set): entprise_evaluate.csv
# Unlabelled enterprise data. This is the final result set that participating teams must submit. Each row is one enterprise and has 2 columns: `id` is the unique enterprise identifier and `score` is empty. Columns are separated by ",".

# Test-set ids

entprise_evaluate = pd.read_csv('../input/entprise_evaluate.csv')

print(entprise_evaluate.shape)
entprise_evaluate.head()

(10000, 2)


Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,


In [5]:
# Dataset 1: base_info.csv
# Basic information for every enterprise that appears in datasets 7 and 8. Each row holds the basic data of one enterprise and has 33 columns; `id` is the unique enterprise identifier. Columns are separated by ",".
# The data format is as follows:
# [id: unique enterprise identifier, oplocdistrict: administrative division code, industryphy: industry category code, industryco: industry sub-category code, dom: business address, opscope: business scope, enttype: enterprise type, enttypeitem: enterprise sub-type, opfrom: operating period start, opto: operating period end, state: status, orgid: organization identifier, jobid: position identifier, adbusign: advertising business flag, townsign: urban/town flag, regtype: registration subject type, empnum: number of employees, compform: organizational form, parnum: number of partners, exenum: number of executors, opform: mode of operation, ptbusscope: concurrent business scope, venind: risk industry, enttypeminu: enterprise type (fine-grained), midpreindcode: central/western advantaged-industry code, protype: project type, oploc: business premises, regcap: registered capital, reccap: paid-in capital, forreccap: paid-in capital (foreign party), forregcap: registered capital (foreign party), congro: total investment, enttypegb: enterprise (institution) type]

# Basic information table

base_info = pd.read_csv('../input/train/base_info.csv')

print(base_info.shape)
base_info.head()

(24865, 33)


Unnamed: 0,id,oplocdistrict,industryphy,industryco,dom,opscope,enttype,enttypeitem,opfrom,opto,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,ptbusscope,venind,enttypeminu,midpreindcode,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,31487d8f256f16bd6244b7251be2ebb24d1db51663c654...,纳米新材料、机械设备、五金配件加工、销售及技术推广服务，道路货物运输。（依法须经批准的项目，...,1100,1150.0,2019-07-11 00:00:00,,6,340223010010000000,340200000000115392,0,0,1,5.0,,,,,,,1151.0,,,2367b4cac96d8598,50.0,,,,,1151
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,健身服务。（依法须经批准的项目，经相关部门批准后方可开展经营活动）,9600,,2017-09-06,,6,340222060010000000,340200000000112114,0,1,1,3.0,1.0,,,10,,3.0,,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,文化娱乐经纪人服务；境内文艺活动组织与策划；文化艺术交流活动组织策划；演出经纪；其他文化艺术...,1100,1150.0,2020-09-14 14:46:30,,6,340202010010000000,400000000000753910,0,0,1,2.0,,1.0,,,,,1151.0,,,2367b4cac96d8598,100.0,,,,,1151
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,746df9aaed8578571760c563abe882c8ba25209fc6d5db...,投资管理及咨询(证券、期货除外)；企业管理。（依法须经批准的项目，经相关部门批准后方可开展经...,4500,4540.0,2015-09-30,,6,340221010010000000,400000000000013538,0,1,1,2.0,,,,01-以个人财产出资,,,,,,2367b4cac96d8598,10.0,,,,,4540
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,境内文化艺术交流活动策划；企业形象策划；礼仪庆典服务；翻译服务；专利代理；广告设计、制作、代...,1100,1130.0,2017-12-01,2067-11-30,7,340200000000000000,400000000000283237,0,0,1,,,,,,,,,,,2367b4cac96d8598,100.0,,,,,1130


In [6]:
# Print the number of distinct values per column; NaN is counted as a value.
for column_name in tqdm(base_info.columns):
    distinct_count = base_info[column_name].nunique(dropna=False)
    print(column_name, distinct_count)

HBox(children=(FloatProgress(value=0.0, max=33.0), HTML(value='')))

id 24865
oplocdistrict 16
industryphy 20
industryco 346
dom 23278
opscope 20815
enttype 17
enttypeitem 32
opfrom 6620
opto 5747
state 6
orgid 78
jobid 434
adbusign 2
townsign 2
regtype 3
empnum 63
compform 3
parnum 52
exenum 51
opform 34
ptbusscope 1
venind 4
enttypeminu 27
midpreindcode 1
protype 3
oploc 5351
regcap 1144
reccap 598
forreccap 12
forregcap 39
congro 34
enttypegb 53



In [7]:
# Drop the columns that contain only a single distinct value.
for constant_column in ('ptbusscope', 'midpreindcode'):
    del base_info[constant_column]

gc.collect()

436

In [8]:
# Drop the columns with too many distinct values.
# `opscope` could later be tokenised and reduced with SVD to extract features,
# but that is not expected to help much.
for free_text_column in ('dom', 'opscope'):
    del base_info[free_text_column]

gc.collect()

20

In [9]:
# Preview the frame after the column drops above.
base_info.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,opfrom,opto,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,venind,enttypeminu,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,1100,1150.0,2019-07-11 00:00:00,,6,340223010010000000,340200000000115392,0,0,1,5.0,,,,,,1151.0,,2367b4cac96d8598,50.0,,,,,1151
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,9600,,2017-09-06,,6,340222060010000000,340200000000112114,0,1,1,3.0,1.0,,,10,3.0,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,1100,1150.0,2020-09-14 14:46:30,,6,340202010010000000,400000000000753910,0,0,1,2.0,,1.0,,,,1151.0,,2367b4cac96d8598,100.0,,,,,1151
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,4500,4540.0,2015-09-30,,6,340221010010000000,400000000000013538,0,1,1,2.0,,,,01-以个人财产出资,,,,2367b4cac96d8598,10.0,,,,,4540
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,1100,1130.0,2017-12-01,2067-11-30,7,340200000000000000,400000000000283237,0,0,1,,,,,,,,,2367b4cac96d8598,100.0,,,,,1130


In [10]:
# Date handling: for now only the year of the operating-period start/end is kept.
# Both columns go through the same path so that a missing date (NaT) becomes -1
# instead of making the integer cast fail; previously only `opto` was guarded.
# NOTE(review): `opfrom` mixes date-only and date-time strings; newer pandas
# versions infer a single format from the first value - confirm that parsing
# still succeeds on the pandas version in use.
for date_column in ('opfrom', 'opto'):
    parsed_dates = pd.to_datetime(base_info[date_column])
    base_info[f'{date_column}_year'] = parsed_dates.dt.year.fillna(-1).astype('int')
    # The raw timestamp column is not used as a feature once the year is taken.
    del base_info[date_column]

gc.collect()

62

In [11]:
# Preview the frame with the new opfrom_year / opto_year columns.
base_info.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,venind,enttypeminu,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb,opfrom_year,opto_year
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,1100,1150.0,6,340223010010000000,340200000000115392,0,0,1,5.0,,,,,,1151.0,,2367b4cac96d8598,50.0,,,,,1151,2019,-1
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,9600,,6,340222060010000000,340200000000112114,0,1,1,3.0,1.0,,,10,3.0,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600,2017,-1
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,1100,1150.0,6,340202010010000000,400000000000753910,0,0,1,2.0,,1.0,,,,1151.0,,2367b4cac96d8598,100.0,,,,,1151,2020,-1
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,4500,4540.0,6,340221010010000000,400000000000013538,0,1,1,2.0,,,,01-以个人财产出资,,,,2367b4cac96d8598,10.0,,,,,4540,2015,-1
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,1100,1130.0,7,340200000000000000,400000000000283237,0,0,1,,,,,,,,,2367b4cac96d8598,100.0,,,,,1130,2017,2067


In [12]:
# Inspect the dtype of each remaining column.
base_info.dtypes

id                object
oplocdistrict      int64
industryphy       object
industryco       float64
enttype            int64
enttypeitem      float64
state              int64
orgid              int64
jobid              int64
adbusign           int64
townsign           int64
regtype            int64
empnum           float64
compform         float64
parnum           float64
exenum           float64
opform            object
venind           float64
enttypeminu      float64
protype          float64
oploc             object
regcap           float64
reccap           float64
forreccap        float64
forregcap        float64
congro           float64
enttypegb          int64
opfrom_year        int32
opto_year          int32
dtype: object

In [13]:
# Category clean-up: these code columns were read as floats because of missing
# values; mark the missing entries with -1 and cast the codes to integers.
float_coded_categories = ['industryco', 'enttypeitem', 'compform', 'venind',
                          'enttypeminu', 'protype']
for code_column in tqdm(float_coded_categories):
    filled_codes = base_info[code_column].fillna(-1)
    base_info[code_column] = filled_codes.astype('int')

HBox(children=(FloatProgress(value=0.0, max=6.0), HTML(value='')))




In [14]:
# Numeric columns: summary statistics of every float64 column (one row per column).

base_info.select_dtypes(['float64']).describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
empnum,19615.0,4.4,15.39,0.0,2.0,3.0,5.0,1500.0
parnum,2339.0,4.59,8.75,0.0,2.0,2.0,3.0,100.0
exenum,1378.0,77.75,2693.74,0.0,1.0,1.0,3.0,100000.0
regcap,24674.0,5151.44,67770.86,0.0,15.0,80.0,500.0,5000100.0
reccap,7084.0,4198.17,36537.98,0.0,0.0,0.0,100.0,1278900.0
forreccap,227.0,292.64,1841.31,0.0,0.0,0.0,0.0,15428.17
forregcap,250.0,1212.58,8113.27,0.0,0.0,0.0,0.0,88817.92
congro,249.0,2805.26,18131.95,0.0,0.0,0.0,0.0,221453.76


In [15]:
# The numeric columns have many missing values and some of them have a large
# standard deviation, so as a first approach fill the gaps with the median.
numeric_columns = base_info.select_dtypes(['float64']).columns
for numeric_column in tqdm(numeric_columns):
    column_median = base_info[numeric_column].median()
    base_info[numeric_column] = base_info[numeric_column].fillna(column_median)

HBox(children=(FloatProgress(value=0.0, max=8.0), HTML(value='')))




In [16]:
# Object (string) columns: display them to see which ones still need encoding.

base_info.select_dtypes(['object'])

Unnamed: 0,id,industryphy,opform,oploc
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,M,,2367b4cac96d8598
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,O,10,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,R,,2367b4cac96d8598
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,L,01-以个人财产出资,2367b4cac96d8598
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,R,,2367b4cac96d8598
...,...,...,...,...
24860,f1c1045b13d18329a2bd99d2a7e2227688c0d69bf1d1e325,O,10,f67c1b92f52ac52e424308ab51241cdef9be3e39c8e1c6...
24861,f000950527a6feb6bde38216d7cbbf32e66d3a3a96d4dbda,J,,2367b4cac96d8598
24862,da8691b210adb3f65b43370d3a362f4aa1d3b16b5ba0c9d7,O,10,2367b4cac96d8598
24863,516ab81418ed215dcbbf0614a7b929e691f8eed153d7bb31,O,,2367b4cac96d8598


In [17]:
# Data clean-up: the bare codes '01' / '02' and their descriptive variants are
# the same category, so map the bare codes onto the descriptive spelling.
opform_aliases = {
    '01': '01-以个人财产出资',
    '02': '02-以家庭共有财产作为个人出资',
}
base_info['opform'] = base_info['opform'].replace(opform_aliases)

In [18]:
# The category distributions are long-tailed: encode every column as integer
# labels and add its relative frequency as a companion `<col>_freq` feature.
long_tail_columns = ['industryphy', 'opform', 'oploc', 'orgid', 'jobid', 'oplocdistrict',
                     'enttypegb', 'industryco', 'enttype', 'enttypeitem']
for category_column in tqdm(long_tail_columns):
    encoder = LabelEncoder()
    # Values are cast to str first, so a missing value becomes its own class.
    as_text = base_info[category_column].astype(str)
    base_info[category_column] = encoder.fit_transform(as_text)
    share_by_code = base_info[category_column].value_counts(dropna=True, normalize=True).to_dict()
    base_info[f'{category_column}_freq'] = base_info[category_column].map(share_by_code)

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




In [19]:
# Features whose split count was 0 in the importance output of the previous
# model run; they are removed before training again.
for unused_feature in ('forreccap', 'forregcap', 'protype', 'congro'):
    del base_info[unused_feature]

gc.collect()

8728

In [20]:
# Summary of the processed feature table (dtypes and non-null counts).
base_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 35 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   id                  24865 non-null  object 
 1   oplocdistrict       24865 non-null  int32  
 2   industryphy         24865 non-null  int32  
 3   industryco          24865 non-null  int32  
 4   enttype             24865 non-null  int32  
 5   enttypeitem         24865 non-null  int32  
 6   state               24865 non-null  int64  
 7   orgid               24865 non-null  int32  
 8   jobid               24865 non-null  int32  
 9   adbusign            24865 non-null  int64  
 10  townsign            24865 non-null  int64  
 11  regtype             24865 non-null  int64  
 12  empnum              24865 non-null  float64
 13  compform            24865 non-null  int32  
 14  parnum              24865 non-null  float64
 15  exenum              24865 non-null  float64
 16  opfo

In [21]:
# Prepare the train / test split: attach the label to every enterprise.
# The evaluation file's second column is renamed to `label` so both id tables
# stack cleanly; its values are empty, so those rows end up with a NaN label.
entprise_evaluate.columns = ['id', 'label']

id_label_tables = [entprise_info, entprise_evaluate]
labels = pd.concat(id_label_tables)
df = base_info.merge(labels, on='id', how='left')

print(df.shape)
df.head()

(24865, 36)


Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,venind,enttypeminu,oploc,regcap,reccap,enttypegb,opfrom_year,opto_year,industryphy_freq,opform_freq,oploc_freq,orgid_freq,jobid_freq,oplocdistrict_freq,enttypegb_freq,industryco_freq,enttype_freq,enttypeitem_freq,label
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,10,12,198,0,5,6,62,112,0,0,1,5.0,-1,2.0,1.0,4,-1,1151,108,50.0,0.0,6,2019,-1,0.28,0.64,0.76,0.04,0.01,0.07,0.19,0.01,0.57,0.23,0.0
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,9,14,244,16,0,6,58,61,0,1,1,3.0,1,2.0,1.0,3,3,-1,1989,10.0,0.0,52,2017,-1,0.41,0.33,0.01,0.02,0.01,0.05,0.33,0.12,0.33,0.33,
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,3,17,337,0,5,6,32,422,0,0,1,2.0,-1,1.0,1.0,4,-1,1151,108,100.0,0.0,6,2020,-1,0.12,0.64,0.76,0.15,0.01,0.22,0.19,0.0,0.57,0.23,0.0
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,8,11,140,9,19,6,50,321,0,1,1,2.0,-1,2.0,1.0,1,-1,-1,108,10.0,0.0,39,2015,-1,0.06,0.03,0.76,0.05,0.0,0.08,0.03,0.03,0.09,0.03,0.0
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,3,17,312,0,3,7,1,346,0,0,1,3.0,-1,2.0,1.0,4,-1,-1,108,100.0,0.0,4,2017,2067,0.12,0.64,0.76,0.09,0.0,0.22,0.3,0.01,0.57,0.3,0.0


In [22]:
# Missing-value fill: replace any remaining NaN in the non-label columns with -1.
# The result is assigned back to df. Calling fillna(..., inplace=True) on the
# temporary frame produced by df[[...]] only modifies that copy and leaves df
# untouched. `label` is excluded because its NaNs identify the test rows.
feature_columns = [col for col in df.columns if col != 'label']
df[feature_columns] = df[feature_columns].fillna(-1)

In [23]:

# Rows with a label form the training set; rows without one form the test set.
has_label = df['label'].notna()
train = df[has_label]
test = df[~has_label]

print(train.shape, test.shape)

(14865, 36) (10000, 36)


In [24]:
# Preview the labelled (training) rows.
train.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,venind,enttypeminu,oploc,regcap,reccap,enttypegb,opfrom_year,opto_year,industryphy_freq,opform_freq,oploc_freq,orgid_freq,jobid_freq,oplocdistrict_freq,enttypegb_freq,industryco_freq,enttype_freq,enttypeitem_freq,label
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,10,12,198,0,5,6,62,112,0,0,1,5.0,-1,2.0,1.0,4,-1,1151,108,50.0,0.0,6,2019,-1,0.28,0.64,0.76,0.04,0.01,0.07,0.19,0.01,0.57,0.23,0.0
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,3,17,337,0,5,6,32,422,0,0,1,2.0,-1,1.0,1.0,4,-1,1151,108,100.0,0.0,6,2020,-1,0.12,0.64,0.76,0.15,0.01,0.22,0.19,0.0,0.57,0.23,0.0
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,8,11,140,9,19,6,50,321,0,1,1,2.0,-1,2.0,1.0,1,-1,-1,108,10.0,0.0,39,2015,-1,0.06,0.03,0.76,0.05,0.0,0.08,0.03,0.03,0.09,0.03,0.0
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,3,17,312,0,3,7,1,346,0,0,1,3.0,-1,2.0,1.0,4,-1,-1,108,100.0,0.0,4,2017,2067,0.12,0.64,0.76,0.09,0.0,0.22,0.3,0.01,0.57,0.3,0.0
6,9c7fa510616a6830b878f3c8c4317d93e1b022e7f22ae231,9,14,239,16,0,6,60,37,0,1,1,5.0,1,2.0,1.0,3,3,-1,3960,20.0,0.0,52,2020,-1,0.41,0.33,0.0,0.0,0.0,0.05,0.33,0.01,0.33,0.33,0.0


In [25]:
ycol = 'label'
# Every column except the target and the enterprise id is used as a feature.
feature_names = [name for name in train.columns if name not in (ycol, 'id')]

# LightGBM hyper-parameters for the binary risk classifier.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` is > 0 - confirm whether bagging was
# intended here.
lgb_params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'tree_learner': 'serial',
    'num_leaves': 64,
    'max_depth': 8,
    'learning_rate': 0.02,
    'n_estimators': 10000,
    'subsample': 0.8,
    'feature_fraction': 0.8,
    'reg_alpha': 0.3,
    'reg_lambda': 0.5,
    'random_state': 2020,
    'is_unbalance': True,
}
model = lgb.LGBMClassifier(**lgb_params)


# Per-fold out-of-fold prediction frames, collected for the overall CV score.
oof = []
# Test-set predictions, averaged over the folds. An explicit copy is taken so
# the new columns are added to an independent frame rather than to a slice of
# `test`, and the accumulators start as floats because probabilities are added.
prediction = test[['id']].copy()
prediction[f'{ycol}_0'] = 0.0
prediction[f'{ycol}_1'] = 0.0
# Per-fold feature-importance frames.
df_importance_list = []

def f1_score_custom(y_true, y_pred):
    """Evaluation metric used during training: F1 on rounded predictions.

    Args:
        y_true: ground-truth labels.
        y_pred: predicted scores; they are rounded to the nearest integer
            before the F1 score is computed.

    Returns:
        A tuple ``('f1', value, True)``; the trailing ``True`` presumably tells
        LightGBM that a higher value is better (verify against its docs).
    """
    hard_labels = y_pred.round()
    f1_value = f1_score(y_true, hard_labels)
    return 'f1', f1_value, True

# 5-fold stratified cross-validation. Each fold refits `model`, stores its
# out-of-fold predictions, adds its share of the averaged test prediction and
# records its feature importances.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=2020)
for fold_id, (trn_idx, val_idx) in enumerate(kfold.split(train[feature_names], train[ycol])):
    fold_train = train.iloc[trn_idx]
    fold_valid = train.iloc[val_idx]
    X_train, Y_train = fold_train[feature_names], fold_train[ycol]
    X_val, Y_val = fold_valid[feature_names], fold_valid[ycol]

    print('\nFold_{} Training ================================\n'.format(fold_id+1))

    # Stop once the validation scores have not improved for 50 rounds.
    lgb_model = model.fit(X_train,
                          Y_train,
                          eval_names=['train', 'valid'],
                          eval_set=[(X_train, Y_train), (X_val, Y_val)],
                          verbose=500,
                          eval_metric=f1_score_custom,
                          early_stopping_rounds=50)
    best_iteration = lgb_model.best_iteration_

    # Out-of-fold class probabilities for the held-out part of this fold.
    pred_val = lgb_model.predict_proba(X_val, num_iteration=best_iteration)
    df_oof = fold_valid[['id', ycol]].copy()
    for class_id in (0, 1):
        df_oof[f'{ycol}_{class_id}'] = pred_val[:, class_id]
    oof.append(df_oof)

    # Each fold contributes 1/n_splits of the final test prediction.
    pred_test = lgb_model.predict_proba(test[feature_names], num_iteration=best_iteration)
    for class_id in (0, 1):
        prediction[f'{ycol}_{class_id}'] += pred_test[:, class_id] / kfold.n_splits

    df_importance = pd.DataFrame({
        'column': feature_names,
        'importance': lgb_model.feature_importances_,
    })
    df_importance_list.append(df_importance)

    # Release the per-fold objects before the next iteration.
    del lgb_model, pred_val, pred_test, X_train, Y_train, X_val, Y_val
    del fold_train, fold_valid, best_iteration, class_id
    gc.collect()
    
    
# Average each feature's importance over the folds, most important first.
importance_all_folds = pd.concat(df_importance_list)
mean_importance = importance_all_folds.groupby(['column'])['importance'].mean()
df_importance = mean_importance.sort_values(ascending=False).reset_index()
df_importance



Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[36]	train's binary_logloss: 0.0961808	train's f1: 0.843458	valid's binary_logloss: 0.0960928	valid's f1: 0.855721


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	train's binary_logloss: 0.0982822	train's f1: 0.838471	valid's binary_logloss: 0.102158	valid's f1: 0.830918


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[38]	train's binary_logloss: 0.0933097	train's f1: 0.850746	valid's binary_logloss: 0.0970162	valid's f1: 0.826966


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[33]	train's binary_logloss: 0.0989391	train's f1: 0.825377	valid's binary_logloss: 0.102725	valid's f1: 0.826733


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[37]	train's binary_logloss: 0.0947528	train's f1: 0

Unnamed: 0,column,importance
0,industryco,89.8
1,regcap,62.0
2,jobid_freq,47.6
3,enttypegb_freq,45.8
4,enttypegb,40.0
5,reccap,40.0
6,opto_year,36.0
7,jobid,35.4
8,opfrom_year,35.2
9,enttypeitem,33.2


In [26]:
# Overall cross-validated F1: stack the fold frames and take the class with the
# larger predicted probability as the hard prediction.
df_oof = pd.concat(oof)
oof_truth = df_oof[ycol].astype('int')
oof_probabilities = df_oof[['label_0', 'label_1']].values
oof_hard_pred = np.argmax(oof_probabilities, axis=1).astype('int')
score = f1_score(oof_truth, oof_hard_pred)
print('f1:', score)

f1: 0.835394862036156


In [27]:
# Build the submission: the positive-class probability becomes `score`, joined
# back onto the evaluation ids so the rows keep the order of that file.
sub = prediction[['id', 'label_1']].rename(columns={'label_1': 'score'})
sub = entprise_evaluate.merge(sub, on='id', how='left').drop(columns=['label'])

sub.head()

Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,0.05
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,0.52
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,0.03
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,0.03
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,0.08


In [28]:
# Write the submission file; the value of `score` is embedded in the file name.
sub.to_csv(f'../sub/baseline_{score}.csv', index=False)