In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import lightgbm
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Dataset 7: entprise_info.csv
# Labelled enterprise data. Each row is one enterprise and has 2 columns: `id` is the unique enterprise identifier and `label` is the annotation (1: at risk of illegal fundraising, 0: no risk of illegal fundraising). Columns are separated by ",".
# Training-set ids and labels
entprise_info = pd.read_csv('../input/train/entprise_info.csv')
print(entprise_info.shape)
entprise_info.head()

(14865, 2)


Unnamed: 0,id,label
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0


In [3]:
# Dataset 8 (validation set): entprise_evaluate.csv
# Unlabelled enterprise data. This is the final result set that participating teams must submit. Each row is one enterprise and has 2 columns: `id` is the unique enterprise identifier and `score` is empty. Columns are separated by ",".
# Test-set ids and (empty) scores
entprise_evaluate = pd.read_csv('../input/entprise_evaluate.csv')

print(entprise_evaluate.shape)
entprise_evaluate.head()

(10000, 2)


Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,


In [4]:
# Dataset 1: base_info.csv
# Basic information for every enterprise that appears in datasets 7 and 8. Each row holds the basic data of one enterprise and has 33 columns; `id` is the unique enterprise identifier. Columns are separated by ",".
# Data format:
# [id: unique enterprise identifier, oplocdistrict: administrative division code, industryphy: industry category code, industryco: industry sub-category code, dom: business address, opscope: business scope, enttype: enterprise type, enttypeitem: enterprise type sub-class, opfrom: operating period start, opto: operating period end, state: status, orgid: organization identifier, jobid: position identifier, adbusign: whether it runs an advertising business, townsign: whether it is urban, regtype: subject registration type, empnum: number of employees, compform: organizational form, parnum: number of partners, exenum: number of executives, opform: mode of operation, ptbusscope: sideline business scope, venind: risk industry, enttypeminu: enterprise type fine-grained class, midpreindcode: central/western advantageous industry code, protype: project type, oploc: business premises, regcap: registered capital, reccap: paid-in capital, forreccap: paid-in capital (foreign party), forregcap: registered capital (foreign party), congro: total investment, enttypegb: enterprise (institution) type]
# Basic information table
base_info = pd.read_csv('../input/train/base_info.csv')
print(base_info.shape)
base_info.head()

(24865, 33)


Unnamed: 0,id,oplocdistrict,industryphy,industryco,dom,opscope,enttype,enttypeitem,opfrom,opto,...,enttypeminu,midpreindcode,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,31487d8f256f16bd6244b7251be2ebb24d1db51663c654...,纳米新材料、机械设备、五金配件加工、销售及技术推广服务，道路货物运输。（依法须经批准的项目，...,1100,1150.0,2019-07-11 00:00:00,,...,1151.0,,,2367b4cac96d8598,50.0,,,,,1151
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,健身服务。（依法须经批准的项目，经相关部门批准后方可开展经营活动）,9600,,2017-09-06,,...,,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,文化娱乐经纪人服务；境内文艺活动组织与策划；文化艺术交流活动组织策划；演出经纪；其他文化艺术...,1100,1150.0,2020-09-14 14:46:30,,...,1151.0,,,2367b4cac96d8598,100.0,,,,,1151
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,746df9aaed8578571760c563abe882c8ba25209fc6d5db...,投资管理及咨询(证券、期货除外)；企业管理。（依法须经批准的项目，经相关部门批准后方可开展经...,4500,4540.0,2015-09-30,,...,,,,2367b4cac96d8598,10.0,,,,,4540
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,31487d8f256f16bd6244b7251be2ebb2ae36cd652943e8...,境内文化艺术交流活动策划；企业形象策划；礼仪庆典服务；翻译服务；专利代理；广告设计、制作、代...,1100,1130.0,2017-12-01,2067-11-30,...,,,,2367b4cac96d8598,100.0,,,,,1130


In [5]:
# Summarise base_info: per-column non-null counts and dtypes.
base_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   opto           8825 non-null   object 
 10  state          24865 non-null  int64  
 11  orgid          24865 non-null  int64  
 12  jobid          24865 non-null  int64  
 13  adbusign       24865 non-null  int64  
 14  townsign       24865 non-null  int64  
 15  regtype        24865 non-null  int64  
 16  empnum         19615 non-null  float64
 17  compform       10631 non-null  float64
 18  parnum

In [6]:
# Columns removed from base_info at this step.
# NOTE(review): the name suggests these hold a single value each — confirm.
single_cols = ['ptbusscope', 'midpreindcode']
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell on a live kernel does not raise KeyError.
base_info = base_info.drop(columns=single_cols, errors='ignore')
gc.collect()

0

In [7]:
# Columns removed from base_info at this step.
# NOTE(review): the name suggests these have too many distinct values — confirm.
many_cols = ['dom', 'opscope']
# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so re-running this cell on a live kernel does not raise KeyError.
base_info = base_info.drop(columns=many_cols, errors='ignore')
gc.collect()

20

In [8]:
# for col in tqdm(base_info.columns):
#     print('-' * 20)
#     print(col)
#     print(base_info[col].nunique(dropna=False))
#     print(base_info[col].unique())
#     print(base_info[col].isnull().sum())
#     print('\n')

In [9]:
def identify_missing(df, missing_threshold):
    """
    Report the missing-value rate of every column and select columns to drop.

    Prints the fraction of null values per column, ordered from highest to
    lowest, followed by a one-line summary of how many columns were selected.

    @param df: DataFrame whose columns are inspected.
    @param missing_threshold: columns whose missing rate is strictly greater
        than this value are selected.
    @return: list of the selected column names, in descending order of
        missing rate.
    """
    null_fraction = (df.isnull().sum() / len(df)).sort_values(ascending=False)
    print(null_fraction)

    # Strict comparison: a column sitting exactly on the threshold is kept.
    exceeds = null_fraction > missing_threshold
    to_drop = null_fraction[exceeds].index.to_list()

    summary = '{} features with greater than {} missing values.\n'
    print(summary.format(len(to_drop), missing_threshold))
    return to_drop

In [11]:
# Columns of base_info whose missing rate is strictly greater than 0.9.
# NOTE(review): the output saved with this cell contains a literal '%0.2f',
# which the current identify_missing format string cannot produce, so the
# saved output looks stale — re-run to refresh it.
to_drop = identify_missing(base_info, missing_threshold=0.9)
to_drop

4 features with greater than %0.2f missing values.



['protype', 'forreccap', 'congro', 'forregcap']

In [None]:
# # Too many missing values
# drop = ['enttypeitem', 'opto', 'empnum', 'compform', 'parnum',
#        'exenum', 'opform', 'ptbusscope', 'venind', 'enttypeminu',
#        'midpreindcode', 'protype', 'reccap', 'forreccap',
#        'forregcap', 'congro']

In [13]:
# Fraction of null values in each base_info column, highest first.
missing_rate = (base_info.isnull().sum() / len(base_info)).sort_values(ascending=False)
missing_rate

protype          0.998633
forreccap        0.990871
congro           0.989986
forregcap        0.989946
exenum           0.944581
parnum           0.905932
reccap           0.715102
enttypeminu      0.707621
venind           0.660688
opto             0.645083
opform           0.638045
compform         0.572451
enttypeitem      0.330344
empnum           0.211140
regcap           0.007681
industryco       0.000040
oplocdistrict    0.000000
industryphy      0.000000
enttype          0.000000
opfrom           0.000000
enttypegb        0.000000
state            0.000000
orgid            0.000000
jobid            0.000000
adbusign         0.000000
townsign         0.000000
regtype          0.000000
oploc            0.000000
id               0.000000
dtype: float64