# 数据预处理

数据预处理中，主要目的是了解数据，并将数据转化为可用于模型训练的形式。

首先需要对数据进行总览，了解分布情况，缺失值情况，类型情况等。

然后对数据进行针对性处理，转换，填充，增添，删除等操作，即特征工程。

In [3]:
import pandas as pd
import numpy as np

In [4]:
train = pd.read_csv('downloads/107537/train.csv')
test = pd.read_csv('downloads/107537/testA.csv')

In [5]:
train['term']

0         5
1         5
2         5
3         3
4         3
         ..
799995    3
799996    3
799997    3
799998    3
799999    3
Name: term, Length: 800000, dtype: int64

In [6]:
print(train.shape, test.shape)

(800000, 47) (200000, 46)


In [7]:
train.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,0,35000.0,5,19.52,917.97,E,E2,320.0,2 years,2,...,9.0,8.0,4.0,12.0,2.0,7.0,0.0,0.0,0.0,2.0
1,1,18000.0,5,18.49,461.9,D,D2,219843.0,5 years,0,...,,,,,,13.0,,,,
2,2,12000.0,5,16.99,298.17,D,D3,31698.0,8 years,0,...,0.0,21.0,4.0,5.0,3.0,11.0,0.0,0.0,0.0,4.0
3,3,11000.0,3,7.26,340.96,A,A4,46854.0,10+ years,1,...,16.0,4.0,7.0,21.0,6.0,9.0,0.0,0.0,0.0,1.0
4,4,3000.0,3,12.99,101.07,C,C2,54.0,,1,...,4.0,9.0,10.0,15.0,7.0,12.0,0.0,0.0,0.0,4.0


In [8]:
test.head()

Unnamed: 0,id,loanAmnt,term,interestRate,installment,grade,subGrade,employmentTitle,employmentLength,homeOwnership,...,n5,n6,n7,n8,n9,n10,n11,n12,n13,n14
0,800000,14000.0,3,10.99,458.28,B,B3,7027.0,10+ years,0,...,8.0,4.0,15.0,19.0,6.0,17.0,0.0,0.0,1.0,3.0
1,800001,20000.0,5,14.65,472.14,C,C5,60426.0,10+ years,0,...,1.0,3.0,3.0,9.0,3.0,5.0,0.0,0.0,2.0,2.0
2,800002,12000.0,3,19.99,445.91,D,D4,23547.0,2 years,1,...,1.0,36.0,5.0,6.0,4.0,12.0,0.0,0.0,0.0,7.0
3,800003,17500.0,5,14.31,410.02,C,C4,636.0,4 years,0,...,7.0,2.0,8.0,14.0,2.0,10.0,0.0,0.0,0.0,3.0
4,800004,35000.0,3,17.09,1249.42,D,D1,368446.0,< 1 year,1,...,11.0,3.0,16.0,18.0,11.0,19.0,0.0,0.0,0.0,1.0


In [9]:
train.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'isDefault',
       'purpose', 'postCode', 'regionCode', 'dti', 'delinquency_2years',
       'ficoRangeLow', 'ficoRangeHigh', 'openAcc', 'pubRec',
       'pubRecBankruptcies', 'revolBal', 'revolUtil', 'totalAcc',
       'initialListStatus', 'applicationType', 'earliesCreditLine', 'title',
       'policyCode', 'n0', 'n1', 'n2', 'n3', 'n4', 'n5', 'n6', 'n7', 'n8',
       'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [10]:
test.columns

Index(['id', 'loanAmnt', 'term', 'interestRate', 'installment', 'grade',
       'subGrade', 'employmentTitle', 'employmentLength', 'homeOwnership',
       'annualIncome', 'verificationStatus', 'issueDate', 'purpose',
       'postCode', 'regionCode', 'dti', 'delinquency_2years', 'ficoRangeLow',
       'ficoRangeHigh', 'openAcc', 'pubRec', 'pubRecBankruptcies', 'revolBal',
       'revolUtil', 'totalAcc', 'initialListStatus', 'applicationType',
       'earliesCreditLine', 'title', 'policyCode', 'n0', 'n1', 'n2', 'n3',
       'n4', 'n5', 'n6', 'n7', 'n8', 'n9', 'n10', 'n11', 'n12', 'n13', 'n14'],
      dtype='object')

In [11]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 47 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  800000 non-null  int64  
 1   loanAmnt            800000 non-null  float64
 2   term                800000 non-null  int64  
 3   interestRate        800000 non-null  float64
 4   installment         800000 non-null  float64
 5   grade               800000 non-null  object 
 6   subGrade            800000 non-null  object 
 7   employmentTitle     799999 non-null  float64
 8   employmentLength    753201 non-null  object 
 9   homeOwnership       800000 non-null  int64  
 10  annualIncome        800000 non-null  float64
 11  verificationStatus  800000 non-null  int64  
 12  issueDate           800000 non-null  object 
 13  isDefault           800000 non-null  int64  
 14  purpose             800000 non-null  int64  
 15  postCode            799999 non-nul

In [12]:
numerical_cols = list(train.select_dtypes(exclude=['object']).columns)
numerical_cols.remove('isDefault')

In [13]:
object_cols = list(train.select_dtypes(include=['object']).columns)

In [14]:
len(numerical_cols)+len(object_cols)

46

## 1.1 object类型处理

在这里对一些object类型特征进行一些针对性处理。例如成绩，日期等。

对于成绩，有相对顺序，可以依据专家知识将其转换为数值型特征。

对于日期，可以将object类型的数据拆解为年、月、日，同时可以增加距离当前的时间间隔。

### grade & subGrade

In [15]:
train['grade'].value_counts(dropna=False).sort_index()

A    139661
B    233690
C    227118
D    119453
E     55661
F     19053
G      5364
Name: grade, dtype: int64

In [16]:
train['subGrade'].value_counts(dropna=False).sort_index()

A1    25909
A2    22124
A3    22655
A4    30928
A5    38045
B1    42382
B2    44227
B3    48600
B4    49516
B5    48965
C1    50763
C2    47068
C3    44751
C4    44272
C5    40264
D1    30538
D2    26528
D3    23410
D4    21139
D5    17838
E1    14064
E2    12746
E3    10925
E4     9273
E5     8653
F1     5925
F2     4340
F3     3577
F4     2859
F5     2352
G1     1759
G2     1231
G3      978
G4      751
G5      645
Name: subGrade, dtype: int64

In [17]:
def grade_trans(x):
    grade_map = {'A':1,'B':2,'C':3,'D':4,'E':5,'F':6,'G':7}
    return grade_map[x]

def subGrade_trans(x):
    return grade_trans(x[0])*5 + int(x[1])

for d in [train, test]:
    d['grade'] = d['grade'].apply(lambda x: grade_trans(x))
    d['subGrade'] = d['subGrade'].apply(lambda x: subGrade_trans(x))

### employmentLength

In [18]:
train['employmentLength'].value_counts(dropna=False).sort_index()

1 year        52489
10+ years    262753
2 years       72358
3 years       64152
4 years       47985
5 years       50102
6 years       37254
7 years       35407
8 years       36192
9 years       30272
< 1 year      64237
NaN           46799
Name: employmentLength, dtype: int64

In [19]:
def employmentLength_trans(x):
    if pd.isnull(x):
        return x
    elif x == '< 1 year':
        return 0.5
    elif x == '10+ years':
        return 10
    else:
        return int(x.split()[0])

In [20]:
for d in [train, test]:
    d['employmentLength'] = d['employmentLength'].apply(lambda x: employmentLength_trans(x))

### issueDate

In [21]:
import datetime

In [22]:
train['issueDate']

0         2014-07-01
1         2012-08-01
2         2015-10-01
3         2015-08-01
4         2016-03-01
             ...    
799995    2016-07-01
799996    2013-04-01
799997    2015-10-01
799998    2015-02-01
799999    2018-08-01
Name: issueDate, Length: 800000, dtype: object

In [23]:
startdate = datetime.datetime.strptime(str(train['issueDate'].min()), '%Y-%m-%d')

In [24]:
def interval_trans(x):
    enddate = datetime.datetime.strptime(x, '%Y-%m-%d')
    return (enddate-startdate).days

In [25]:
for d in [train, test]:
    d['issueDate_year'] = d['issueDate'].apply(lambda x: int(x.split('-')[0]))
    d['issueDate_month'] = d['issueDate'].apply(lambda x: int(x.split('-')[1]))
    d['interval'] = d['issueDate'].apply(lambda x: interval_trans(x))
    del d['issueDate']

### earliesCreditLine

In [26]:
train['earliesCreditLine']

0         Aug-2001
1         May-2002
2         May-2006
3         May-1999
4         Aug-1977
            ...   
799995    Aug-2011
799996    May-1989
799997    Jul-2002
799998    Jan-1994
799999    Feb-2002
Name: earliesCreditLine, Length: 800000, dtype: object

In [27]:
train['earliesCreditLine'].max()

'Sep-2015'

In [28]:
def earliesCreditLine_month_trans(x):
    month_map = {'Jan': 1, 'Feb': 2, 'Mar': 3, 'Apr': 4, 'May': 5, 'Jun': 6, 'Jul': 7, 'Aug': 8, 'Sep': 9, 'Oct': 10, 'Nov': 11, 'Dec': 12}
    return month_map[x.split('-')[0]]

for d in [train, test]:
    d['earliesCreditLine_year'] = d['earliesCreditLine'].apply(lambda x: 2015 - (int(x.split('-')[-1])))
    d['earliesCreditLine_month'] = d['earliesCreditLine'].apply(lambda x: earliesCreditLine_month_trans(x))
    del d['earliesCreditLine']

## 1.2 缺失值填充

对于类别型特征，利用众数填充缺失值；对于数值型特征，利用中位数填充缺失值。

In [29]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 800000 entries, 0 to 799999
Data columns (total 50 columns):
 #   Column                   Non-Null Count   Dtype  
---  ------                   --------------   -----  
 0   id                       800000 non-null  int64  
 1   loanAmnt                 800000 non-null  float64
 2   term                     800000 non-null  int64  
 3   interestRate             800000 non-null  float64
 4   installment              800000 non-null  float64
 5   grade                    800000 non-null  int64  
 6   subGrade                 800000 non-null  int64  
 7   employmentTitle          799999 non-null  float64
 8   employmentLength         753201 non-null  float64
 9   homeOwnership            800000 non-null  int64  
 10  annualIncome             800000 non-null  float64
 11  verificationStatus       800000 non-null  int64  
 12  isDefault                800000 non-null  int64  
 13  purpose                  800000 non-null  int64  
 14  post

In [30]:
cols = list(train.columns)
cols.remove('isDefault')

In [31]:
categorical_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']

In [32]:
numerical_features = list(set(cols)-set(categorical_features))

In [33]:
train[numerical_features] = train[numerical_features].fillna(train[numerical_features].median())
test[numerical_features] = test[numerical_features].fillna(train[numerical_features].median())
for fea in categorical_features:
    train[fea] = train[fea].fillna(train[fea].mode().values[0])
    test[fea] = test[fea].fillna(train[fea].mode().values[0])

In [34]:
train.isnull().sum()
test.isnull().sum()

id                         0
loanAmnt                   0
term                       0
interestRate               0
installment                0
grade                      0
subGrade                   0
employmentTitle            0
employmentLength           0
homeOwnership              0
annualIncome               0
verificationStatus         0
purpose                    0
postCode                   0
regionCode                 0
dti                        0
delinquency_2years         0
ficoRangeLow               0
ficoRangeHigh              0
openAcc                    0
pubRec                     0
pubRecBankruptcies         0
revolBal                   0
revolUtil                  0
totalAcc                   0
initialListStatus          0
applicationType            0
title                      0
policyCode                 0
n0                         0
n1                         0
n2                         0
n3                         0
n4                         0
n5            

## 1.3特征编码

在类别型特征的处理中，我们可以将类别型特征分为以下两种：
* 存在内在顺序
* 不存在内在顺序

对于存在内在顺序的特征，可以人为依据含义进行编码，将object类型转为numerical类型。例如上文中的grade和subGrade。

对于不存在内在顺序的特征，可以按特征维度，对高纬特征进行label encoding/target encoding，对低纬特征进行onehot encoding。
之所以不对高维特征进行onehot encoding，是为了防止编码之后特征空间过大，影响模型训练。

此外，还有一种方式可以考虑讲类别型特征（无论高维低维）均经过onehot encoding，然后利用神经网络进行稠密编码（降维）获取embedding，然后用于下游的模型中。

### onehot encoding

In [35]:
categorical_features = ['grade', 'subGrade', 'employmentTitle', 'homeOwnership', 'verificationStatus', 'purpose', 'postCode', 'regionCode', 'applicationType', 'initialListStatus', 'title', 'policyCode']

In [36]:
for fea in categorical_features:
    print(fea, train[fea].nunique())

grade 7
subGrade 35
employmentTitle 248683
homeOwnership 6
verificationStatus 3
purpose 14
postCode 932
regionCode 51
applicationType 2
initialListStatus 2
title 39644
policyCode 1


In [37]:
for d in [train, test]:
    d = pd.get_dummies(d, columns=['homeOwnership', 'verificationStatus', 'purpose', 'regionCode'], drop_first=True)

### label encode

In [38]:
from sklearn.preprocessing import LabelEncoder

In [39]:
for col in ['employmentTitle', 'postCode', 'title']:
    le = LabelEncoder()
    le.fit(list(train[col].astype(str).values) + list(test[col].astype(str).values))
    train[col] = le.transform(list(train[col].astype(str).values))
    test[col] = le.transform(list(test[col].astype(str).values))

# 模型训练

模型训练，即基于处理后的数据用于模型的生成，同时对目标数据进行预测。

在模型训练中一般采用cross validation的方式，即将数据分为train set和valid set，多次划分，进行多轮的训练。这样做的好处一是可以进行离线的模型效果评估，二是得到更为稳定的模型。

在这里我们用到的模型包括：
* logistic regression：是广义线性模型的一种，能够处理数值和离散型特征，具有简单、易解释、易实现等特点。
* svm：是一种二分类模型，其基本思想是在特征空间中寻找最优的超平面，使得不同类别的样本点之间的间隔最大化。SVM 通过将数据转化到高维空间，并使用核函数来解决非线性分类问题。
* xgboost：是一种基于GBDT（Gradient Boosting Decision Tree）的机器学习框架。xgboost基于二阶泰勒展开，并对特征进行预排序和buffer的预存储。
* lightgbm：是一种基于GBDT（Gradient Boosting Decision Tree）的机器学习框架。在xgboost的基础上实现了精度，效率和功能上的提升。基于直方图算法。
* MLP（neural network）：Multilayer Perceptron是一种最基本的前馈神经网络，一般包含输入层，隐藏层，输出层。MLP通过每层的非线性映射，将输入数据映射到高维的隐空间，在隐空间进行计算后，再将结果映射回输出空间，给出预测结果。

In [40]:
import xgboost as xgb
import lightgbm as lgb
from sklearn import svm
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [41]:
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

In [42]:
import tensorflow as tf
class nn(object):
    def __init__(self, hidden_size = [32, 4]):
        self.layers = []
        for size in hidden_size:
            self.layers.append(tf.keras.layers.Dense(size, activation="relu"))
        
        self.output_layer = tf.keras.layers.Dense(1, activation="linear")
    
    def __call__(self, input_tensor):
        x = input_tensor
        for layer in self.layers:
            x = layer(x)
        out = self.output_layer(x)
        return out

In [43]:
def standardization(mean, std, x):
    return (x-mean)/std

def cv_model(clf, train_x, train_y, test_x, clf_name):
    folds = 3
    kf = KFold(n_splits=folds, shuffle=True, random_state=100)

    test = np.zeros(test_x.shape[0])

    cv_scores = []
    
    mean_x = pd.concat([train_x, test_x]).mean()
    std_x = pd.concat([train_x, test_x]).std()

    for i, (train_idx, valid_idx) in enumerate(kf.split(train_x, train_y)):
        print('iteration {}'.format(str(i+1)))
        train_x_i, train_y_i = train_x.iloc[train_idx], train_y[train_idx]
        valid_x_i, valid_y_i = train_x.iloc[valid_idx], train_y[valid_idx]
        
        
        if clf_name == "lgb":
            train_D = clf.Dataset(train_x_i, label=train_y_i)
            valid_D = clf.Dataset(valid_x_i, label=valid_y_i)

            params = {
                'boosting_type': 'gbdt',
                'objective': 'binary',
                'metric': 'auc',
                'min_child_weight': 5,
                'num_leaves': 2 ** 5,
                'lambda_l2': 10,
                'feature_fraction': 0.8,
                'bagging_fraction': 0.8,
                'bagging_freq': 4,
                'learning_rate': 0.1,
                'nthread': 28,
                'n_jobs':24,
                'verbose': -1,
            }

            model = clf.train(params, train_D, 50000, valid_sets=[train_D, valid_D], verbose_eval=200,early_stopping_rounds=200)
            val_pred = model.predict(valid_x_i, num_iteration=model.best_iteration)
            test_pred = model.predict(test_x, num_iteration=model.best_iteration)


        if clf_name == 'svm':
            for x in [train_x_i, valid_x_i, test_x]:
                x.iloc[:] = standardization(mean_x, std_x, x)

            # model = clf.SVC(kernel='linear')
            model = clf(C=1,dual=False,max_iter=1000,verbose=1,class_weight='balanced')#max_iter=10000
            model.fit(train_x_i, train_y_i)
            val_pred = model.predict(valid_x_i)
            test_pred = model.predict(test_x)
        

        if clf_name == 'lr':
            for x in [train_x_i, valid_x_i, test_x]:
                x.iloc[:] = standardization(mean_x, std_x, x)
            model = clf(verbose=1,class_weight='balanced')
            model.fit(train_x_i, train_y_i)
            val_pred = model.predict_proba(valid_x_i)[:,1]
            test_pred = model.predict_proba(test_x)[:,1]
    

        if clf_name == 'nn':
            for x in [train_x_i, valid_x_i, test_x]:
                x.iloc[:] = standardization(mean_x, std_x, x)
            
            n_epochs = 10
            batch_size = 256
            num_steps = train_x_i.shape[0] // batch_size + 1

            x = tf.placeholder("float", [None, train_x_i.shape[1]])
            y_true = tf.placeholder("float", [None, 1])

            model = clf()
            y_logits = model(x)
            y_pred = tf.sigmoid(y_logits)
            loss = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=y_logits, labels=y_true))
            optimizer = tf.train.AdamOptimizer(3e-3).minimize(loss)
            init = tf.global_variables_initializer()
            sess = tf.Session()
            sess.run(init)
            for epoch in range(n_epochs):
                print("epoch", epoch)
                for step in range(num_steps):
                    
                    if step % 200 == 0:
                        print(step)
                        val_pred = np.squeeze(sess.run(y_pred, feed_dict={x: valid_x_i}))
                        print('auc: ', roc_auc_score(valid_y_i, val_pred))

                    offset = (step * batch_size)
                    batch_train = (train_x_i.values)[offset:(offset + batch_size), :]
                    batch_labels = (train_y_i.values)[:,np.newaxis][offset:(offset + batch_size), :]
                    _, l = sess.run([optimizer, loss], feed_dict={
                                    x: batch_train, y_true: batch_labels})
                    
            val_pred = np.squeeze(sess.run(y_pred, feed_dict={x: valid_x_i}))
            test_pred = np.squeeze(sess.run(y_pred, feed_dict={x: test_x}))
            

        if clf_name == "xgb":
            train_D = clf.DMatrix(train_x_i, label=train_y_i)
            valid_D = clf.DMatrix(valid_x_i, label=valid_y_i)
            params = {
                    'eval_metric': 'auc',
                    'objective': 'binary:logistic',
                    'gamma': 1,
                    'max_depth': 5
            }
            model = clf.train(params, train_D, num_boost_round=200, evals=[(train_D, 'train'),(valid_D, 'eval')], verbose_eval=2,early_stopping_rounds=10)
            val_pred = model.predict(clf.DMatrix(valid_x_i), ntree_limit=model.best_ntree_limit)
            test_pred = model.predict(clf.DMatrix(test_x), ntree_limit=model.best_ntree_limit)

        test += test_pred / kf.n_splits
        cv_scores.append(roc_auc_score(valid_y_i, val_pred))
        
        print("cv_socres:", cv_scores)
        
            
        
    print("%s_scotrainre_list:" % clf_name, cv_scores)
    print("%s_score_mean:" % clf_name, np.mean(cv_scores))
    print("%s_score_std:" % clf_name, np.std(cv_scores))
    return test

In [44]:
def lgb_model(x_train, y_train, x_test):
    lgb_test = cv_model(lgb, x_train, y_train, x_test, "lgb")
    return lgb_test

def xgb_model(x_train, y_train, x_test):
    xgb_test = cv_model(xgb, x_train, y_train, x_test, "xgb")
    return xgb_test

def svm_model(x_train, y_train, x_test):
    svm_test = cv_model(LinearSVC, x_train, y_train, x_test, "svm")
    return svm_test

def nn_model(x_train, y_train, x_test):
    nn_test = cv_model(nn, x_train, y_train, x_test, "nn")
    return nn_test

def lr_model(x_train, y_train, x_test):
    lr_test = cv_model(LogisticRegression, x_train, y_train, x_test, "lr")
    return lr_test

In [45]:
features = [f for f in train.columns if f not in ['id','isDefault','policyCode']]
x_train = train[features]
x_test = test[features]
y_train = train['isDefault']

In [46]:
# lgb_test = lgb_model(x_train, y_train, x_test) # test auc 0.7337
# svm_test = svm_model(x_train, y_train, x_test) # test auc 0.6507
# nn_test = nn_model(x_train, y_train, x_test) # test auc 0.7238
# lr_test = lr_model(x_train, y_train, x_test) # test auc 0.6937
xgb_test = xgb_model(x_train, y_train, x_test) # test auc 0.7314

iteration 1
[0]	train-auc:0.70071	eval-auc:0.70257
[2]	train-auc:0.70898	eval-auc:0.71029
[4]	train-auc:0.71219	eval-auc:0.71336
[6]	train-auc:0.71502	eval-auc:0.71591
[8]	train-auc:0.71754	eval-auc:0.71836
[10]	train-auc:0.72097	eval-auc:0.72142
[12]	train-auc:0.72303	eval-auc:0.72324
[14]	train-auc:0.72475	eval-auc:0.72461
[16]	train-auc:0.72637	eval-auc:0.72578
[18]	train-auc:0.72739	eval-auc:0.72642
[20]	train-auc:0.72848	eval-auc:0.72701
[22]	train-auc:0.72984	eval-auc:0.72782
[24]	train-auc:0.73070	eval-auc:0.72822
[26]	train-auc:0.73156	eval-auc:0.72865
[28]	train-auc:0.73258	eval-auc:0.72924
[30]	train-auc:0.73340	eval-auc:0.72968
[32]	train-auc:0.73413	eval-auc:0.72978
[34]	train-auc:0.73508	eval-auc:0.73025
[36]	train-auc:0.73588	eval-auc:0.73065
[38]	train-auc:0.73653	eval-auc:0.73079
[40]	train-auc:0.73703	eval-auc:0.73102
[42]	train-auc:0.73768	eval-auc:0.73128
[44]	train-auc:0.73838	eval-auc:0.73149
[46]	train-auc:0.73881	eval-auc:0.73157
[48]	train-auc:0.73952	eval-auc:0



cv_socres: [0.7343723639111743]
iteration 2
[0]	train-auc:0.70141	eval-auc:0.70018
[2]	train-auc:0.71006	eval-auc:0.70767
[4]	train-auc:0.71323	eval-auc:0.71035
[6]	train-auc:0.71637	eval-auc:0.71329
[8]	train-auc:0.71906	eval-auc:0.71567
[10]	train-auc:0.72228	eval-auc:0.71848
[12]	train-auc:0.72428	eval-auc:0.72001
[14]	train-auc:0.72624	eval-auc:0.72153
[16]	train-auc:0.72799	eval-auc:0.72270
[18]	train-auc:0.72902	eval-auc:0.72339
[20]	train-auc:0.73029	eval-auc:0.72420
[22]	train-auc:0.73124	eval-auc:0.72478
[24]	train-auc:0.73220	eval-auc:0.72541
[26]	train-auc:0.73298	eval-auc:0.72609
[28]	train-auc:0.73395	eval-auc:0.72667
[30]	train-auc:0.73505	eval-auc:0.72743
[32]	train-auc:0.73576	eval-auc:0.72780
[34]	train-auc:0.73643	eval-auc:0.72794
[36]	train-auc:0.73719	eval-auc:0.72827
[38]	train-auc:0.73779	eval-auc:0.72862
[40]	train-auc:0.73840	eval-auc:0.72882
[42]	train-auc:0.73919	eval-auc:0.72917
[44]	train-auc:0.73983	eval-auc:0.72932
[46]	train-auc:0.74046	eval-auc:0.72951
[



cv_socres: [0.7343723639111743, 0.7311354121156903]
iteration 3
[0]	train-auc:0.70208	eval-auc:0.69908
[2]	train-auc:0.71048	eval-auc:0.70679
[4]	train-auc:0.71371	eval-auc:0.70964
[6]	train-auc:0.71668	eval-auc:0.71224
[8]	train-auc:0.71952	eval-auc:0.71478
[10]	train-auc:0.72207	eval-auc:0.71680
[12]	train-auc:0.72461	eval-auc:0.71876
[14]	train-auc:0.72667	eval-auc:0.72035
[16]	train-auc:0.72837	eval-auc:0.72137
[18]	train-auc:0.72957	eval-auc:0.72210
[20]	train-auc:0.73050	eval-auc:0.72241
[22]	train-auc:0.73187	eval-auc:0.72328
[24]	train-auc:0.73297	eval-auc:0.72388
[26]	train-auc:0.73376	eval-auc:0.72432
[28]	train-auc:0.73456	eval-auc:0.72463
[30]	train-auc:0.73539	eval-auc:0.72502
[32]	train-auc:0.73643	eval-auc:0.72548
[34]	train-auc:0.73714	eval-auc:0.72569
[36]	train-auc:0.73769	eval-auc:0.72588
[38]	train-auc:0.73847	eval-auc:0.72615
[40]	train-auc:0.73904	eval-auc:0.72624
[42]	train-auc:0.73996	eval-auc:0.72673
[44]	train-auc:0.74066	eval-auc:0.72685
[46]	train-auc:0.7412



cv_socres: [0.7343723639111743, 0.7311354121156903, 0.7293684562845655]
xgb_scotrainre_list: [0.7343723639111743, 0.7311354121156903, 0.7293684562845655]
xgb_score_mean: 0.7316254107704768
xgb_score_std: 0.0020720114052417606


# 代码提交

In [49]:
sub = test[['id']].copy()

In [50]:
sub['isDefault']=xgb_test#lr_test#svm_test#nn_test#svm_test#lgb_test

In [51]:
sub.to_csv('prediction.csv', index=False)