In [1]:
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import lightgbm
import gc
import warnings
warnings.filterwarnings('ignore')

In [2]:
def count_encode(df, cols=[]):
    """
    count编码
    @param df:
    @param cols:
    @return:
    """
    for col in cols:
        print(col)
        vc = df[col].value_counts(dropna=True, normalize=True)
        df[col + '_count'] = df[col].map(vc).astype('float32')


def label_encode(df, cols, verbose=True):
    """
    label encode
    @param df:
    @param cols:
    @param verbose:
    @return:
    """
    for col in cols:
        df[col], _ = df[col].astype(str).factorize(sort=True)
        if df[col].max() > 32000:
            df[col] = df[col].astype('int32')
        else:
            df[col] = df[col].astype('int16')
        if verbose:
            print(col)


def cross_cat_num(df, cat_cols, num_cols):
    """
    类别特征与数据特征groupby统计
    @param df:
    @param cat_col: 类别特征
    @param num_col: 数值特征
    @return:
    """
    def max_min(s):
        return s.max() - s.min()

    def quantile(s, q=0.25):
        return s.quantile(q)

    for f1 in tqdm(cat_cols):
        g = df.groupby(f1, as_index=False)
        for f2 in tqdm(num_cols):
            tmp = g[f2].agg({
                '{}_{}_count'.format(f1, f2): 'count',
                '{}_{}_max'.format(f1, f2): 'max',
                '{}_{}_min'.format(f1, f2): 'min',
                '{}_{}_median'.format(f1, f2): 'median',
                '{}_{}_mean'.format(f1, f2): 'mean',
                '{}_{}_sum'.format(f1, f2): 'sum',
                '{}_{}_skew'.format(f1, f2): 'skew',
                '{}_{}_std'.format(f1, f2): 'std',
                '{}_{}_nunique'.format(f1, f2): 'nunique',
                '{}_{}_max_min'.format(f1, f2): max_min,
                '{}_{}_quantile_25'.format(f1, f2): lambda x: quantile(x, 0.25),
                '{}_{}_quantile_75'.format(f1, f2): lambda x: quantile(x, 0.75)
            })
            df = df.merge(tmp, on=f1, how='left')
            del tmp
            gc.collect()
    return df


def arithmetic(df, cross_features):
    """
    数值特征之间的加减乘除
    @param df:
    @param cross_features: 交叉用的数值特征
    @return:
    """
    for i in tqdm(range(len(cross_features))):
        for j in range(i + 1, len(cross_features)):
            colname_add = '{}_{}_add'.format(cross_features[i], cross_features[j])
            colname_substract = '{}_{}_subtract'.format(cross_features[i], cross_features[j])
            colname_multiply = '{}_{}c_multiply'.format(cross_features[i], cross_features[j])
            df[colname_add] = df[cross_features[i]] + df[cross_features[j]]
            df[colname_substract] = df[cross_features[i]] - df[cross_features[j]]
            df[colname_multiply] = df[cross_features[i]] * df[cross_features[j]]

    for f1 in tqdm(cross_features):
        for f2 in cross_features:
            if f1 != f2:
                colname_ratio = '{}_{}_ratio'.format(f1, f2)
                df[colname_ratio] = df[f1].values / (df[f2].values + 0.001)
    return df

In [3]:
# 数据集7：entprise_info.csv
# 带标注的企业数据。每一行代表一个企业，每一行2列，其中id列为企业唯一标识，label列为标注（1：有非法集资风险，0：无非法集资风险），列之间采用“,”分隔符分割。
# 训练集 id 及标签
entprise_info = pd.read_csv('../input/train/entprise_info.csv')
print(entprise_info.shape)
entprise_info.head()

(14865, 2)


Unnamed: 0,id,label
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0


In [4]:
# 数据集8（验证集）：entprise_evaluate.csv
# 未标注企业数据。参赛队伍需提交的最终结果数据集，每一行代表一个企业，每一行有 2 列, 其中id列为企业唯一标识，score列为空，列之间采用“,”分隔符分割。
# 测试集 id score
entprise_evaluate = pd.read_csv('../input/entprise_evaluate.csv')

print(entprise_evaluate.shape)
entprise_evaluate.head()

(10000, 2)


Unnamed: 0,id,score
0,82750f1b9d1223508ee329d47e27d35176c93eb9f35e9c1a,
1,f000950527a6feb670cc1c87c2025f3922aaa4a0206a0a33,
2,e9f7b28ec10e04700ef4db75a494f9a1e8e8b09555e6afa1,
3,beb4aaaa89e0a0ae9d77bd5d7665be6342f552f51840cf19,
4,e9f7b28ec10e0470ee4172cec0133b6826c34f27d3dff204,


In [5]:
# 数据集1：base_info.csv
# 包含数据集7和8中涉及到的所有企业的基本信息，每一行代表一个企业的基本数据，每一行有33列，其中id列为企业唯一标识，列之间采用“,”分隔符分割。
# 数据格式如下：
# [id:企业唯一标识, oplocdistrict:行政区划代码, industryphy:行业类别代码, industryco:行业细类代码, dom:经营地址, opscope:经营范围, enttype:企业类型, enttypeitem:企业类型小类, opfrom:经营期限起, opto:经营期限止, state:状态, orgid:机构标识, jobid:职位标识, adbusign:是否广告经营, townsign:是否城镇, regtype:主题登记类型, empnum:从业人数, compform:组织形式, parnum:合伙人数, exenum:执行人数, opform:经营方式, ptbusscope:兼营范围, venind:风险行业, enttypeminu:企业类型细类, midpreindcode:中西部优势产业代码, protype:项目类型, oploc:经营场所, regcap:注册资本（金）, reccap:实缴资本, forreccap:实缴资本（外方）, forregcap:注册资本（外方）, congro:投资总额, enttypegb:企业（机构）类型]
# 基础信息表
base_info = pd.read_csv('../input/train/base_info.csv')
print(base_info.shape)
base_info.info()

(24865, 33)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24865 entries, 0 to 24864
Data columns (total 33 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             24865 non-null  object 
 1   oplocdistrict  24865 non-null  int64  
 2   industryphy    24865 non-null  object 
 3   industryco     24864 non-null  float64
 4   dom            24865 non-null  object 
 5   opscope        24865 non-null  object 
 6   enttype        24865 non-null  int64  
 7   enttypeitem    16651 non-null  float64
 8   opfrom         24865 non-null  object 
 9   opto           8825 non-null   object 
 10  state          24865 non-null  int64  
 11  orgid          24865 non-null  int64  
 12  jobid          24865 non-null  int64  
 13  adbusign       24865 non-null  int64  
 14  townsign       24865 non-null  int64  
 15  regtype        24865 non-null  int64  
 16  empnum         19615 non-null  float64
 17  compform       10631 non-null  float64

In [6]:
single_cols = ['ptbusscope', 'midpreindcode']
many_cols = ['dom', 'opscope']
to_drop = single_cols + many_cols
base_info.drop(to_drop, axis=1, inplace=True)
gc.collect()

20

In [7]:
base_info['opform'] = base_info['opform'].replace('01', '01-以个人财产出资').replace('02', '02-以家庭共有财产作为个人出资')

In [8]:
num_cols = ['empnum', 'parnum', 'exenum', 'regcap', 'reccap', 'forreccap', 'forregcap', 'congro']
le_cols = ['oplocdistrict', 'industryphy', 'industryco', 'enttype', 'enttypeitem', 'state',
           'orgid', 'jobid', 'regtype', 'compform', 'opform', 'venind', 'enttypeminu',
           'protype', 'oploc', 'enttypegb']
one_zero_cols = ['adbusign', 'townsign']
cat_cols = le_cols + one_zero_cols
dt_cols = ['opfrom', 'opto']

In [9]:
# 时间转换, 暂时先抽取年份特征
base_info['opfrom'] = pd.to_datetime(base_info.opfrom)
base_info['opfrom_year'] = base_info['opfrom'].dt.year.astype('int')

base_info['opto'] = pd.to_datetime(base_info.opto)
base_info['opto_year'] = base_info['opto'].dt.year.fillna(-1).astype('int')

del base_info['opfrom']
del base_info['opto']
gc.collect()

23

In [10]:
# ['industryphy', 'opform', 'oploc', 'orgid', 'jobid', 'oplocdistrict',
#                  'enttypegb', 'industryco', 'enttype', 'enttypeitem']

In [11]:
base_info.head()

Unnamed: 0,id,oplocdistrict,industryphy,industryco,enttype,enttypeitem,state,orgid,jobid,adbusign,...,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb,opfrom_year,opto_year
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,1100,1150.0,6,340223010010000000,340200000000115392,0,...,,2367b4cac96d8598,50.0,,,,,1151,2019,-1
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,9600,,6,340222060010000000,340200000000112114,0,...,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600,2017,-1
2,59b38c56de3836838082cfcb1a298951abfe15e6940c49ba,340202,R,9053.0,1100,1150.0,6,340202010010000000,400000000000753910,0,...,,2367b4cac96d8598,100.0,,,,,1151,2020,-1
3,e9f7b28ec10e047000d16ab79e1b5e6da434a1697cce7818,340221,L,7212.0,4500,4540.0,6,340221010010000000,400000000000013538,0,...,,2367b4cac96d8598,10.0,,,,,4540,2015,-1
4,f000950527a6feb63ee1ce82bb22ddd1ab8b8fdffa3b91fb,340202,R,8810.0,1100,1130.0,7,340200000000000000,400000000000283237,0,...,,2367b4cac96d8598,100.0,,,,,1130,2017,2067


In [12]:
base_info = count_encode(base_info, cat_cols)

oplocdistrict
industryphy
industryco
enttype
enttypeitem
state
orgid
jobid
regtype
compform
opform
venind
enttypeminu
protype
oploc
enttypegb
adbusign
townsign


In [13]:
base_info.head()

AttributeError: 'NoneType' object has no attribute 'head'

In [None]:
base_info = cross_cat_num(base_info, cat_cols, num_cols)

In [None]:
base_info = arithmetic(base_info, num_cols)

In [None]:
for i in le_cols:
    print(i)
    le = LabelEncoder()
    base_info[i] = le.fit_transform(base_info[i].astype(str))
    base_info[i] = base_info[i].astype('category')

In [None]:
data.head()

In [None]:
base_info.head()

In [None]:
# 划分训练集和测试集
entprise_evaluate.columns = ['id', 'label']
data = pd.concat([entprise_info, entprise_evaluate])
df = pd.merge(data, base_info, on='id', how='left')

print(df.shape)
df.head()

In [None]:
train = df[df.label.notna()]
test = df[df.label.isna()]

print(train.shape, test.shape)

In [None]:
used_cols = [i for i in train.columns if i not in ['id', 'label']]
train = train[used_cols]
test = test[used_cols]
y = train['label']
sub = test[['id']]

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

train_dataset = lgb.Dataset(X_train, y_train)
valid_dataset = lgb.Dataset(X_valid, y_valid, reference=train_dataset)
all_dataset = lgb.Dataset(train, y, reference=train_dataset)

# def self_metric(preds, train_data):
#     # 自定义评估函数（特定阈值下的f1）
#     labels = train_data.get_label()
#     y_preds = np.where(preds > 0.5, 1, 0)
#     f1 = f1_score(labels, y_preds)
#     return 'self_metric', f1, True

params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
    # 'metric': 'None',  # 用自定义评估函数是将metric设置为'None'
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

valid_model = lgb.train(
    params,
    train_dataset,
    valid_sets=[train_dataset, valid_dataset],
    early_stopping_rounds=200,
    verbose_eval=300
)
pred = valid_model.predict(X_valid)
y_valid_pred = np.where(pred > 0.5, 1, 0)
print('Valid F1: ', np.round(f1_score(y_valid, y_valid_pred), 5))
print('Valid mean label: ', np.mean(y_valid_pred))

train_model = lgb.train(
    params,
    all_dataset,
    num_boost_round=valid_model.best_iteration+20
)
y_test_pred = np.where(train_model.predict(test) > 0.5, 1, 0)

print('Test mean label: ', np.mean(y_test_pred))
sub['score'] = y_test_pred
sub.to_csv('../sub/baseline.csv', index=False)