## 准备工作

### Import所需的库

In [None]:
import pandas as pd
import os
import gc
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
from sklearn.linear_model import SGDRegressor, LinearRegression, Ridge
from sklearn.preprocessing import MinMaxScaler
import math
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score, log_loss
import matplotlib.pyplot as plt
import time
import warnings
warnings.filterwarnings('ignore')

### 读取文件

In [175]:
# Raw string literal: the path mixes in Windows backslashes, and sequences such
# as "\8" and "\O" are invalid escapes in a normal string literal (deprecated,
# and reported as a SyntaxWarning by recent Python versions). The resulting
# path text is exactly the same as before.
DATA_DIR = r'C:/Users\86136\OneDrive - stu2019.jnu.edu.cn/mechrevo_of_nardo_86account/Desktop/tianchi_2'

# Labelled training set and the unlabelled test set to be scored.
train = pd.read_csv(DATA_DIR + '/train.csv')
testA = pd.read_csv(DATA_DIR + '/test.csv')

In [176]:
# Stack train and test rows into one frame (fresh 0..n-1 index) so that every
# cleaning step below is applied to both sets at once.
data = pd.concat((train, testA), ignore_index=True)

### 定义基础模型，方便后续调用

In [177]:
def cv_model(clf, train_x, train_y, test_x, clf_name):
    """Cross-validate one boosting backend and collect its predictions.

    Runs a 6-fold shuffled ``KFold`` (seed 2020). In every fold a model is
    trained on the training part, scored with ROC AUC on the held-out part and
    used to predict ``test_x``.

    Args:
        clf: Backend matching ``clf_name``: the ``lightgbm`` module for
            ``"lgb"``, the ``xgboost`` module for ``"xgb"``, or an estimator
            class that is called as ``clf(iterations=..., **params)`` for
            ``"cat"``.
        train_x: Training features; must support ``.iloc`` and ``.shape``.
        train_y: Training labels, aligned row by row with ``train_x``.
        test_x: Test features.
        clf_name: One of ``"lgb"``, ``"xgb"`` or ``"cat"``.

    Returns:
        A tuple ``(oof_pred, test_pred)`` of NumPy arrays: the out-of-fold
        prediction for every training row, and the test prediction averaged
        over all folds.

    Raises:
        ValueError: If ``clf_name`` is not one of the supported names.
    """
    if clf_name not in ("lgb", "xgb", "cat"):
        raise ValueError("unsupported clf_name: %r" % (clf_name,))

    folds = 6
    seed = 2020
    kf = KFold(n_splits=folds, shuffle=True, random_state=seed)

    # KFold yields positional indices, so the labels are indexed positionally
    # as well; this stays correct even if train_y carries a non-default index.
    labels = np.asarray(train_y)

    oof_pred = np.zeros(train_x.shape[0])
    test_avg = np.zeros(test_x.shape[0])

    cv_scores = []

    for i, (train_index, valid_index) in enumerate(kf.split(train_x, train_y)):
        print('************************************ {} ************************************'.format(str(i+1)))
        trn_x, val_x = train_x.iloc[train_index], train_x.iloc[valid_index]
        trn_y, val_y = labels[train_index], labels[valid_index]

        if clf_name == "lgb":
            train_matrix = clf.Dataset(trn_x, label=trn_y)
            valid_matrix = clf.Dataset(val_x, label=val_y)

            # NOTE(review): 'nthread' and 'n_jobs' both look like thread-count
            # settings with different values (28 vs 24) - confirm which one is
            # intended.
            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 0.001,
                'min_child_samples': 19,
                'num_leaves': 10,
                'max_depth': 3,
                'lambda_l2': 10,
                'feature_fraction': 0.95,
                'bagging_fraction': 0.85,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'seed': 2020,
                'nthread': 28,
                'n_jobs':24,
                'silent': True,
                'verbose': -1,
            }

            # Up to 50000 rounds; stops once the validation metric has not
            # improved for 200 rounds, then predicts with the best iteration.
            model = clf.train(params, train_matrix, 50000, valid_sets=[train_matrix, valid_matrix], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(val_x, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)

        elif clf_name == "xgb":
            train_matrix = clf.DMatrix(trn_x , label=trn_y)
            valid_matrix = clf.DMatrix(val_x , label=val_y)
            test_matrix = clf.DMatrix(test_x)

            # NOTE(review): XGBoost reports 'min_child_samples', 'num_leaves'
            # and 'silent' as "might not be used" when this runs; they are kept
            # here unchanged.
            params = {'booster': 'gbtree',
                      'objective': 'binary:logistic',
                      'eval_metric': 'auc',
                      'gamma': 1,
                      'min_child_samples': 15,
                      'min_child_weight': 0.01,
                      'max_depth': 7,
                      'num_leaves': 15,
                      'lambda': 10,
                      'subsample': 0.7,
                      'colsample_bytree': 0.7,
                      'colsample_bylevel': 0.7,
                      'eta': 0.04,
                      'tree_method': 'exact',
                      'seed': 2020,
                      'nthread': 36,
                      "silent": True,
                      }

            watchlist = [(train_matrix, 'train'),(valid_matrix, 'eval')]

            model = clf.train(params, train_matrix, num_boost_round=50000, evals=watchlist, verbose_eval=200, early_stopping_rounds=200)
            val_pred  = model.predict(valid_matrix, ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(test_matrix , ntree_limit=model.best_ntree_limit)

        else:  # clf_name == "cat"
            params = {'learning_rate': 0.05, 'depth': 5, 'l2_leaf_reg': 10, 'bootstrap_type': 'Bernoulli',
                      'od_type': 'Iter', 'od_wait': 50, 'random_seed': 11, 'allow_writing_files': False}

            model = clf(iterations=20000, **params)
            model.fit(trn_x, trn_y, eval_set=(val_x, val_y),
                      cat_features=[], use_best_model=True, verbose=500)

            val_pred  = model.predict(val_x)
            test_pred = model.predict(test_x)

        oof_pred[valid_index] = val_pred
        # Accumulate each fold's share so the result is the mean over all
        # folds. (Plain assignment here would keep only the last fold's
        # prediction, scaled down by the number of folds.)
        test_avg += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(val_y, val_pred))

        print(cv_scores)

    print("%s_score_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return oof_pred, test_avg

In [178]:
def lgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the LightGBM backend.

    Returns the ``(train predictions, test predictions)`` pair produced by
    ``cv_model`` for the ``"lgb"`` branch.
    """
    return cv_model(lgb, x_train, y_train, x_test, "lgb")

def xgb_model(x_train, y_train, x_test):
    """Run ``cv_model`` with the XGBoost backend.

    Returns the ``(train predictions, test predictions)`` pair produced by
    ``cv_model`` for the ``"xgb"`` branch.
    """
    return cv_model(xgb, x_train, y_train, x_test, "xgb")

def cat_model(x_train, y_train, x_test):
    """Run ``cv_model`` with ``CatBoostRegressor`` as the estimator class.

    Returns the ``(train predictions, test predictions)`` pair produced by
    ``cv_model`` for the ``"cat"`` branch.
    """
    return cv_model(CatBoostRegressor, x_train, y_train, x_test, "cat")

## 数据清洗

### 观察category数据的情况

In [179]:
# String-valued categorical columns whose cardinality is inspected here.
cate_features = [
    'policy_state',
    'insured_sex',
    'insured_education_level',
    'insured_occupation',
    'insured_hobbies',
    'insured_relationship',
    'incident_type',
    'collision_type',
    'incident_severity',
    'authorities_contacted',
    'incident_city',
    'property_damage',
    'police_report_available',
    'auto_make',
    'auto_model',
]

# Report how many distinct values each categorical column takes.
for column in cate_features:
    distinct_count = data[column].nunique()
    print(column, '类型数：', distinct_count)

policy_state 类型数： 3
insured_sex 类型数： 2
insured_education_level 类型数： 7
insured_occupation 类型数： 14
insured_hobbies 类型数： 20
insured_relationship 类型数： 6
incident_type 类型数： 4
collision_type 类型数： 4
incident_severity 类型数： 4
authorities_contacted 类型数： 5
incident_city 类型数： 7
property_damage 类型数： 3
police_report_available 类型数： 3
auto_make 类型数： 14
auto_model 类型数： 39


### 对低维category数据进行one hot encode，方便进行分析

In [180]:
# One-hot encode exactly the columns listed in cate_features (the list built
# for the cardinality check) instead of repeating the same 15 names here, so
# the inspected and the encoded columns cannot drift apart.
# drop_first=True drops the first level of every column.
data = pd.get_dummies(data, columns=cate_features, drop_first=True)
data.head()

Unnamed: 0,policy_id,age,customer_months,policy_bind_date,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,...,auto_model_Pathfinder,auto_model_RAM,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6
0,122576,37,189,2013-08-21,500/1000,1000,1465.71,5000000,455456,62203,...,0,0,0,0,0,0,0,0,0,0
1,937713,44,234,1998-01-04,250/500,500,821.24,0,591805,31606,...,0,0,0,0,0,0,0,0,0,0
2,680237,33,23,1996-02-06,500/1000,1000,1844.0,0,442490,0,...,0,0,0,0,0,0,0,1,0,0
3,513080,42,210,2008-11-14,500/1000,500,1867.29,0,439408,0,...,0,0,0,0,0,0,0,0,0,0
4,192875,29,81,2002-01-08,100/300,1000,816.25,0,640575,75296,...,0,0,0,0,0,0,0,0,0,0


### incident_state为s+数字，去掉前面的s

In [181]:
# Drop the leading letter of every incident_state value and store the
# remaining digits as integers.
state_digits = data['incident_state'].str.slice(1)
data['incident_state'] = state_digits.astype(int)

In [182]:
# Display the column to check the result of the conversion above.
data['incident_state']

0      5
1      5
2      3
3      3
4      2
      ..
995    3
996    3
997    1
998    4
999    7
Name: incident_state, Length: 1000, dtype: int32

### 对日期数据进行拆分：policy_bind_date仅保留年份，incident_date拆分为年、月、日三个变量，更加符合现实情况

In [183]:
# Replace the bind date string by its first four characters (the year) as int.
data['policy_bind_date'] = data['policy_bind_date'].map(lambda bind_date: int(bind_date[:4]))
data['policy_bind_date']

0      2013
1      1998
2      1996
3      2008
4      2002
       ... 
995    1999
996    2009
997    1999
998    2010
999    1990
Name: policy_bind_date, Length: 1000, dtype: int64

In [184]:
# New column: characters 0-3 of the incident date string (the year) as int.
data['incident_year'] = data['incident_date'].map(lambda incident: int(incident[:4]))
data['incident_year']

0      2014
1      2015
2      2015
3      2015
4      2015
       ... 
995    2015
996    2015
997    2014
998    2015
999    2015
Name: incident_year, Length: 1000, dtype: int64

In [185]:
# New column: characters 5-6 of the incident date string (the month) as int.
data['incident_month'] = data['incident_date'].map(lambda incident: int(incident[5:7]))
data['incident_month']

0      12
1       2
2       1
3       2
4       2
       ..
995     1
996     2
997    12
998     1
999     1
Name: incident_month, Length: 1000, dtype: int64

In [186]:
# Overwrite incident_date with everything from character 8 on (the day) as
# int. This must run after the year and month columns have been derived,
# because the original date string is lost here.
data['incident_date'] = data['incident_date'].map(lambda incident: int(incident[8:]))
data['incident_date']

0      22
1      18
2      18
3       2
4       9
       ..
995    14
996     9
997    21
998    27
999    29
Name: incident_date, Length: 1000, dtype: int64

### 对保险额数据进行处理

In [187]:
# policy_csl values look like "500/1000"; keep the number before the slash.
# Splitting on "/" (rather than slicing a fixed three characters) also stays
# correct when that number does not have exactly three digits.
data['policy_csl'] = data['policy_csl'].apply(lambda s: int(s.split('/')[0]))

In [188]:
# Display the first rows to check the cleaned columns.
data.head()

Unnamed: 0,policy_id,age,customer_months,policy_bind_date,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,...,auto_model_RSX,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6,incident_year,incident_month
0,122576,37,189,2013,500,1000,1465.71,5000000,455456,62203,...,0,0,0,0,0,0,0,0,2014,12
1,937713,44,234,1998,250,500,821.24,0,591805,31606,...,0,0,0,0,0,0,0,0,2015,2
2,680237,33,23,1996,500,1000,1844.0,0,442490,0,...,0,0,0,0,0,1,0,0,2015,1
3,513080,42,210,2008,500,500,1867.29,0,439408,0,...,0,0,0,0,0,0,0,0,2015,2
4,192875,29,81,2002,100,1000,816.25,0,640575,75296,...,0,0,0,0,0,0,0,0,2015,2


### policy_id、issueDate、fraud不入模

In [189]:
# Columns that never enter the model.
excluded_columns = ('policy_id', 'issueDate', 'fraud')
features = [column for column in data.columns if column not in excluded_columns]

# Rows with a known fraud label form the training set; rows without one are
# the test rows to be predicted.
has_label = data['fraud'].notnull()
train = data[has_label].reset_index(drop=True)
test = data[~has_label].reset_index(drop=True)

x_train, x_test = train[features], test[features]
y_train = train['fraud']

## 简单建模

### 分别调用三个模型进行训练（调参见下一节）

In [190]:
# Cross-validated LightGBM run: training-set and test-set predictions.
lgb_train, lgb_test = lgb_model(x_train=x_train, y_train=y_train, x_test=x_test)

************************************ 1 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.990727	valid_1's auc: 0.838745
Early stopping, best iteration is:
[66]	training's auc: 0.943942	valid_1's auc: 0.852092
[0.8520923520923522]
************************************ 2 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.991182	valid_1's auc: 0.812071
Early stopping, best iteration is:
[29]	training's auc: 0.895843	valid_1's auc: 0.862826
[0.8520923520923522, 0.8628257887517147]
************************************ 3 ************************************
Training until validation scores don't improve for 200 rounds
[200]	training's auc: 0.990505	valid_1's auc: 0.763603
Early stopping, best iteration is:
[38]	training's auc: 0.927079	valid_1's auc: 0.793199
[0.8520923520923522, 0.8628257887517147, 0.7931985294117647]
*******************************

In [191]:
# Cross-validated XGBoost run: training-set and test-set predictions.
xgb_train, xgb_test = xgb_model(x_train=x_train, y_train=y_train, x_test=x_test)

************************************ 1 ************************************
Parameters: { "min_child_samples", "num_leaves", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-auc:0.66194	eval-auc:0.66595
[200]	train-auc:0.98489	eval-auc:0.84235
[235]	train-auc:0.98824	eval-auc:0.84127
[0.8571428571428571]
************************************ 2 ************************************
Parameters: { "min_child_samples", "num_leaves", "silent" } might not be used.

  This could be a false alarm, with some parameters getting used by language bindings but
  then being mistakenly passed down to XGBoost core, or some parameter actually being used
  but getting flagged wrongly here. Please open an issue if you find any such cases.


[0]	train-

In [192]:
# Cross-validated CatBoost run: training-set and test-set predictions.
cat_train, cat_test = cat_model(x_train=x_train, y_train=y_train, x_test=x_test)

************************************ 1 ************************************
0:	learn: 0.4302908	test: 0.4469042	best: 0.4469042 (0)	total: 7.04ms	remaining: 2m 20s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3644039894
bestIteration = 227

Shrink model to first 228 iterations.
[0.8625541125541125]
************************************ 2 ************************************
0:	learn: 0.4272191	test: 0.4611853	best: 0.4611853 (0)	total: 1.39ms	remaining: 27.8s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3713179729
bestIteration = 344

Shrink model to first 345 iterations.
[0.8625541125541125, 0.8545953360768176]
************************************ 3 ************************************
0:	learn: 0.4305578	test: 0.4427615	best: 0.4427615 (0)	total: 1.33ms	remaining: 26.6s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 0.3892175902
bestIteration = 132

Shrink model to first 133 iterations.
[0.8625541125541125, 0.85459533607

### 模型融合以及最终结果输出

In [193]:
# Weighted blend of the three models' test predictions.
blend_weights = {'lgb': 0.3, 'xgb': 0.4, 'cat': 0.3}
rh_test = (
    blend_weights['lgb'] * lgb_test
    + blend_weights['xgb'] * xgb_test
    + blend_weights['cat'] * cat_test
)

In [194]:
# Store the blended predictions as the 'fraud' column of the raw test frame.
testA['fraud'] = rh_test

In [195]:
# Write the submission file: policy id and predicted fraud score, no index column.
testA[['policy_id','fraud']].to_csv('sub.csv', index=False)

## 模型调参

### 首先对max_depth和num_leaves进行调节，提高模型拟合程度

In [None]:
from sklearn.model_selection import GridSearchCV

# max_depth is an integer parameter of LightGBM, so only whole numbers are
# searched (a value such as 6.5 cannot be used as a tree depth).
params_test2 = {
    'max_depth': [6, 7, 8],
    'num_leaves': [20, 25, 30, 35],
}

# The estimator has to exist before it is handed to GridSearchCV. It uses the
# same constructor settings as the LightGBM estimator in the later tuning
# step; max_depth and num_leaves are overridden by the grid for every fit.
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=10,
                              learning_rate=0.1, n_estimators=43, max_depth=3,
                              metric='rmse', bagging_freq=5, min_child_samples=19)

gsearch2 = GridSearchCV(estimator=model_lgb, param_grid=params_test2, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch2.fit(x_train, y_train)
gsearch2.best_params_, gsearch2.best_score_


### 反复调整max_depth和num_leaves直至结果基本不再变化后，再调节min_child_samples和min_child_weight以降低过拟合

In [None]:
# min_child_samples is a LightGBM parameter that XGBoost does not use, so
# searching over it only repeats identical fits; min_child_weight is the
# XGBoost setting searched here.
params_test3 = {
    'min_child_weight': [0.001, 0.02],
}

# Row and column sampling are spelled subsample / colsample_bytree in XGBoost
# (bagging_fraction / feature_fraction are the LightGBM names and would not be
# used). The LightGBM-only num_leaves and metric arguments are dropped for the
# same reason.
model_xgb = xgb.XGBRegressor(learning_rate=0.1, n_estimators=43, max_depth=7,
                             subsample=0.8, colsample_bytree=0.8)
gsearch3 = GridSearchCV(estimator=model_xgb, param_grid=params_test3, scoring='neg_mean_squared_error', cv=5, verbose=1, n_jobs=4)
gsearch3.fit(x_train, y_train)
gsearch3.best_params_, gsearch3.best_score_

### 继续调节feature_fraction和bagging_fraction，降低过拟合程度

In [None]:
# Grid over the two sampling ratios tuned in this step.
params_test4 = dict(
    feature_fraction=[0.75, 0.8, 0.85, 0.875, 0.9],
    bagging_fraction=[0.85, 0.9, 0.95, 0.975, 1.0],
)

# LightGBM regressor with the tree-shape settings fixed; only the ratios in
# the grid above vary between fits.
model_lgb = lgb.LGBMRegressor(
    objective='regression',
    metric='rmse',
    learning_rate=0.1,
    n_estimators=43,
    num_leaves=10,
    max_depth=3,
    min_child_samples=19,
    bagging_freq=5,
)

# 5-fold grid search scored by negative mean squared error.
gsearch4 = GridSearchCV(
    estimator=model_lgb,
    param_grid=params_test4,
    scoring='neg_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=4,
)
gsearch4.fit(x_train, y_train)
gsearch4.best_params_, gsearch4.best_score_

### 依次使用上述调参方法对三个模型进行调节

### 将最终参数输入到前面建模步骤，输出最终结果