In [1]:
import time
import numpy as np
import pandas as pd
from tqdm import tqdm
import os
import gc
import missingno as msno
from datetime import datetime
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,precision_recall_fscore_support,roc_curve,auc,roc_auc_score
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import warnings
warnings.filterwarnings('ignore')
pd.set_option('max_columns', None)
pd.set_option('max_rows', None)

In [2]:
base_info = pd.read_csv('../input/train/base_info.csv')
entprise_evaluate = pd.read_csv('../input/train/entprise_evaluate.csv')
entprise_info = pd.read_csv('../input/train/entprise_info.csv')

## base_info

In [3]:
base_info.head(2)

Unnamed: 0,id,oplocdistrict,industryphy,industryco,dom,opscope,enttype,enttypeitem,opfrom,opto,state,orgid,jobid,adbusign,townsign,regtype,empnum,compform,parnum,exenum,opform,ptbusscope,venind,enttypeminu,midpreindcode,protype,oploc,regcap,reccap,forreccap,forregcap,congro,enttypegb
0,47645761dc56bb8c5fae00114b768b5d9b6e917c3aec07c4,340223,M,7513.0,31487d8f256f16bd6244b7251be2ebb24d1db51663c654...,纳米新材料、机械设备、五金配件加工、销售及技术推广服务，道路货物运输。（依法须经批准的项目，...,1100,1150.0,2019-07-11 00:00:00,,6,340223010010000000,340200000000115392,0,0,1,5.0,,,,,,,1151.0,,,2367b4cac96d8598,50.0,,,,,1151
1,9c7fa510616a683058ce97d0bc768a621cd85ab1e87da2a3,340222,O,8090.0,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,健身服务。（依法须经批准的项目，经相关部门批准后方可开展经营活动）,9600,,2017-09-06,,6,340222060010000000,340200000000112114,0,1,1,3.0,1.0,,,10.0,,3.0,,,,31487d8f256f16bd6244b7251be2ebb27b17bdfd95c8f3...,10.0,,,,,9600


In [4]:
base_info.drop(['midpreindcode', 'ptbusscope'], axis=1, inplace=True)

In [5]:
def get_concat(df, col1, col2):
    col = col1 + '_' + col2
    df[col] = df[col1].astype(str) + '_' + df[col2].astype(str)
    return df, col


def count_encode(df, cat_cols, freq):
    for i in cat_cols:
        name_dict = dict(zip(*np.unique(df[i], return_counts=True)))
        df['{}_count'.format(i)] = df[i].apply(lambda x: -999 if name_dict[x] < freq else name_dict[x])
    return df


def label_encode(df, cat_cols, verbose=True):
    for col in cat_cols:
        df[col], _ = df[col].factorize(sort=True)
        if df[col].max() > 32000:
            df[col] = df[col].astype('int32')
        else:
            df[col] = df[col].astype('int16')
        if verbose:
            print(col)
    return df

In [6]:
le_cols = ['industryphy', 'dom', 'opform', 'oploc']
cat_cols = ['industryphy', 'industryco',
            'enttype', 'enttypeitem', 'enttypeminu', 'enttypegb',
            'oplocdistrict', 'dom', 'oploc', 'state']

# 时间相关
base_info['opfrom'] = pd.to_datetime(base_info['opfrom'])
base_info['opto'] = pd.to_datetime(base_info['opto'])
base_info['opfrom_TONOW'] = (datetime.now() - base_info['opfrom']).dt.days
base_info['opfrom_TIME'] = (base_info['opto'] - base_info['opfrom']).dt.days
base_info.drop(['opfrom', 'opto'], axis=1, inplace=True)


# 企业人数相关
base_info['person_SUM'] = base_info[['empnum', 'parnum', 'exenum']].sum(1)
base_info['person_NULL_SUM'] = base_info[['empnum', 'parnum', 'exenum']].isnull().astype(int).sum(1)


# 地址相关
district_cols = ['oplocdistrict', 'dom', 'oploc', 'townsign']
# le_cols += district_cols
# cat_cols += ['oplocdistrict', 'dom', 'oploc']
base_info['district_flag1'] = (base_info['oplocdistrict'].astype(str)[:6] == base_info['orgid'].astype(str)[:6]).astype(int)
base_info['district_flag2'] = (base_info['oplocdistrict'].astype(str)[:6] == base_info['jobid'].astype(str)[:6]).astype(int)
base_info['district_flag3'] = (base_info['orgid'].astype(str)[:6] == base_info['jobid'].astype(str)[:6]).astype(int)
base_info, col = get_concat(base_info, 'oplocdistrict', 'oploc')
le_cols.append(col)
cat_cols.append(col)
# base_info, col = get_concat(base_info, 'oplocdistrict', 'townsign')
# le_cols.append(col)
# cat_cols.append(col)
# base_info, col = get_concat(base_info, 'oploc', 'townsign')
# le_cols.append(col)
# cat_cols.append(col)


# 企业类型相关
enttype_cols = ['enttype', 'enttypeitem', 'enttypeminu', 'enttypegb']
# le_cols += enttype_cols
# cat_cols += enttype_cols
base_info, col = get_concat(base_info, 'enttype', 'enttypeitem')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'enttype', 'enttypeminu')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'enttype', 'enttypegb')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'enttypeitem', 'enttypeminu')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'enttypeitem', 'enttypegb')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'enttypeminu', 'enttypegb')
le_cols.append(col)
cat_cols.append(col)


# 行业类型相关
industry_cols = ['industryphy', 'industryco', 'venind']
# le_cols += industry_cols
# cat_cols += industry_cols
base_info, col = get_concat(base_info, 'industryphy', 'industryco')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'industryphy', 'venind')
le_cols.append(col)
cat_cols.append(col)
base_info, col = get_concat(base_info, 'industryco', 'venind')
le_cols.append(col)
cat_cols.append(col)
base_info['industryphy_industryco_venind'] = base_info['industryphy'].astype(str) + '_' + base_info['industryco'].astype(str) + '_' + base_info['venind'].astype(str)
le_cols.append('industryphy_industryco_venind')
cat_cols.append('industryphy_industryco_venind')

# 标识相关
id_cols = ['orgid', 'jobid']
# le_cols += id_cols
# cat_cols += id_cols
base_info, col = get_concat(base_info, 'orgid', 'jobid')
le_cols.append(col)
cat_cols.append(col)


# 其他
other_cols = ['adbusign', 'opform', 'state', 'protype', 'regtype', 'compform', 'opscope']
# le_cols += other_cols
# cat_cols += ['opform', 'state', 'protype', 'regtype', 'compform', 'opscope']


# Count编码
# base_info = count_encode(base_info, le_cols, 10)
for col in cat_cols:
    base_info[col + '_COUNT'] = base_info[col].map(base_info[col].value_counts())
    col_idx = base_info[col].value_counts()
    for idx in col_idx[col_idx < 10].index:
        base_info[col] = base_info[col].replace(idx, -1)  


# Label Encoder

base_info = label_encode(base_info, le_cols + ['opscope'], verbose=True)

industryphy
dom
opform
oploc
oplocdistrict_oploc
enttype_enttypeitem
enttype_enttypeminu
enttype_enttypegb
enttypeitem_enttypeminu
enttypeitem_enttypegb
enttypeminu_enttypegb
industryphy_industryco
industryphy_venind
industryco_venind
industryphy_industryco_venind
orgid_jobid
opscope


In [7]:
# train = pd.merge(base_info, entprise_info, on='id')

# entprise_evaluate = entprise_evaluate[['id']]
# test = pd.merge(base_info, entprise_evaluate, on='id')


# used_cols = [i for i in train.columns if i not in ['id', 'label']]
# y = train['label']
# train = train[used_cols]
# test = test[used_cols]

# num_folds=5
# kfold = StratifiedKFold(n_splits=num_folds, random_state=1024, shuffle=True)

# oof_probs = np.zeros(train.shape[0])
# output_probs = np.zeros((test.shape[0], 5))
# offline_score = []
# feature_importance_df = pd.DataFrame()

# for fold, (train_idx, valid_idx) in enumerate(kfold.split(train, y)):
#     X_train, y_train = train.iloc[train_idx], y.iloc[train_idx]
#     X_valid, y_valid = train.iloc[valid_idx], y.iloc[valid_idx]
    
#     model=CatBoostClassifier(
#         loss_function="Logloss",
#         eval_metric="F1",
#         task_type="GPU",
#         learning_rate=0.01,
#         iterations=100000,
#         random_seed=2020,
#         od_type="Iter",
#         depth=8,
#         early_stopping_rounds=500
#     )

#     clf = model.fit(X_train, y_train, eval_set=(X_valid,y_valid), verbose=500, cat_features=le_cols)
#     yy_pred_valid=clf.predict(X_valid)
#     y_pred_valid = clf.predict(X_valid, prediction_type='Probability')[:, -1]
#     oof_probs[valid_idx] = y_pred_valid
#     offline_score.append(f1_score(y_valid, yy_pred_valid))
#     output_probs[:, fold] = clf.predict(test, prediction_type='Probability')[:,-1]
    
#     # feature importance
#     fold_importance_df = pd.DataFrame()
#     fold_importance_df["feature"] = model.feature_names_
#     fold_importance_df["importance"] = model.feature_importances_
#     fold_importance_df["fold"] = fold + 1
#     feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)

# print('OOF-MEAN-F1:%.6f, OOF-STD-F1:%.6f' % (np.mean(offline_score), np.std(offline_score)))
# print('feature importance:')
# feature_importance_df_ = feature_importance_df.groupby('feature', as_index=False)['importance'].mean().sort_values(by='importance', ascending=False)
# feature_importance_df_['normalized_importance'] = feature_importance_df_['importance'] / feature_importance_df_['importance'].sum()
# feature_importance_df_['cumulative_importance'] = np.cumsum(feature_importance_df_['normalized_importance'])
# record_low_importance = feature_importance_df_[feature_importance_df_['cumulative_importance'] > 0.99]
# to_drop = list(record_low_importance['feature'])
# print(to_drop)
# # print(feature_importance_df_.head(15))
# print(feature_importance_df_)
# # feature_importance_df_.to_csv("./importance.csv")

# sub['score'] = np.mean(output_probs, axis=1)
# # print(sub['score'])
# sub.to_csv('cat_sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), np.mean(offline_score)), index=False)

In [8]:
train_data = pd.merge(base_info, entprise_info, on='id')

entprise_evaluate = entprise_evaluate[['id']]
test_data = pd.merge(base_info, entprise_evaluate, on='id')

In [9]:
def eval_score(y_test,y_pre):
    _,_,f_class,_=precision_recall_fscore_support(y_true=y_test,y_pred=y_pre,labels=[0,1],average=None)
    fper_class={'合法':f_class[0],'违法':f_class[1],'f1':f1_score(y_test,y_pre)}
    return fper_class


def k_fold_serachParmaters(model,train_val_data,train_val_kind, test_kind):
    mean_f1=0
    mean_f1Train=0
    n_splits=5
    
    cat_features = cat_cols
    
    sk = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=2020)
    pred_Test = np.zeros(len(test_kind))
    for train, test in sk.split(train_val_data, train_val_kind):
        x_train = train_val_data.iloc[train]
        y_train = train_val_kind.iloc[train]
        x_test = train_val_data.iloc[test]
        y_test = train_val_kind.iloc[test]

        model.fit(x_train, y_train, 
                  eval_set=[(x_test, y_test)], 
                  categorical_feature = cat_features,
                 early_stopping_rounds=100,
                 verbose=False)
        
        pred = model.predict(x_test)
        fper_class = eval_score(y_test,pred)
        
        pred_Train = model.predict(x_train)
        pred_Test += model.predict_proba(test_kind)[:, 1]/n_splits
        fper_class_train = eval_score(y_train,pred_Train)

        mean_f1 += fper_class['f1']/n_splits
        mean_f1Train+=fper_class_train['f1']/n_splits
        # print(mean_f1, mean_f1Train)
        
        
    return mean_f1, pred_Test

In [10]:
score_tta = None
score_list = []

tta_fold = 20
for _ in range(tta_fold):
    clf = lgb.LGBMClassifier(
        num_leaves=np.random.randint(6, 10), min_child_samples= np.random.randint(2,5),
        max_depth=5,learning_rate=0.03,
        n_estimators=150,n_jobs=-1)

    score, test_pred = k_fold_serachParmaters(clf,
                           train_data.drop(['id', 'label'], axis=1),
                           train_data['label'],
                           test_data.drop(['id'], axis=1),
                          )

    if score_tta is None:
        score_tta = test_pred/tta_fold
    else:
        score_tta += test_pred/tta_fold
    # print(score)
    score_list.append(score)
    
print(np.array(score_list).mean(), np.array(score_list).std())
# 0.8478168974849689 0.83884757

# 0.8420447002972562 0.00198977186270193
# 0.8430490420761639 0.0022246925904664443
# 0.8359080768530107 0.0014376075368214521
# 0.8353250245440478 0.0011930088571253086
# 0.8368273175940117 0.0018629674140329677

0.8368273175940117 0.0018629674140329677


In [11]:
test_data['score'] = score_tta
test_data[['id', 'score']].to_csv('lgb_sub_{}_{}.csv'.format(time.strftime('%Y%m%d'), np.array(score_list).mean()), index=None)