In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn import cross_validation, metrics

import matplotlib.pylab as plt
%matplotlib inline

  from numpy.core.umath_tests import inner1d


In [2]:
#导入数据
train = pd.read_csv('train_modified.csv')

In [3]:
print(train.shape)
print(train.head())

(20000, 51)
   Disbursed  Existing_EMI           ID  Loan_Amount_Applied  \
0          0           0.0  ID000002C20               300000   
1          0           0.0  ID000004E40               200000   
2          0           0.0  ID000007H20               600000   
3          0           0.0  ID000008I30              1000000   
4          0       25000.0  ID000009J40               500000   

   Loan_Tenure_Applied  Monthly_Income  Var4  Var5  Age  \
0                    5           20000     1     0   37   
1                    2           35000     3    13   30   
2                    4           22500     1     0   34   
3                    5           35000     3    10   28   
4                    2          100000     3    17   31   

   EMI_Loan_Submitted_Missing    ...     Var2_2  Var2_3  Var2_4  Var2_5  \
0                           1    ...          0       0       0       0   
1                           0    ...          0       0       0       0   
2                      

In [4]:
#类别分布
target='Disbursed' # Disbursed的值就是二元分类的输出
IDcol = 'ID'
train['Disbursed'].value_counts() 

0    19680
1      320
Name: Disbursed, dtype: int64

In [5]:
#生成训练数据
x_columns = [x for x in train.columns if x not in [target, IDcol]]
X = train[x_columns]
y = train['Disbursed']

In [7]:
#训练模型-使用默认参数
rf0 = RandomForestClassifier(oob_score=True, random_state=10)
rf0.fit(X,y)
print (rf0.oob_score_)
y_predprob = rf0.predict_proba(X)[:,1]
print ("AUC Score (Train): %f" % metrics.roc_auc_score(y, y_predprob))

0.98005
AUC Score (Train): 0.999833


  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])


In [8]:
#调参-n_estimators
param_test1 = {'n_estimators':list(range(10,71,10))}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=100,
                                                           min_samples_leaf=20,
                                                           max_depth=8,
                                                           max_features='sqrt',
                                                           random_state=10), 
                        param_grid = param_test1, 
                        scoring='roc_auc',cv=5)
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

([mean: 0.80681, std: 0.02236, params: {'n_estimators': 10},
  mean: 0.81600, std: 0.03275, params: {'n_estimators': 20},
  mean: 0.81818, std: 0.03136, params: {'n_estimators': 30},
  mean: 0.81838, std: 0.03118, params: {'n_estimators': 40},
  mean: 0.82034, std: 0.03001, params: {'n_estimators': 50},
  mean: 0.82113, std: 0.02966, params: {'n_estimators': 60},
  mean: 0.81992, std: 0.02836, params: {'n_estimators': 70}],
 {'n_estimators': 60},
 0.8211334476626017)

In [9]:
#调参-对决策树最大深度max_depth和内部节点再划分所需最小样本数min_samples_split进行网格搜索
param_test2 = {'max_depth':list(range(3,14,2)), 'min_samples_split':list(range(50,201,20))}
gsearch2 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           min_samples_leaf=20,
                                                           max_features='sqrt' ,
                                                           oob_score=True, 
                                                           random_state=10),
                        param_grid = param_test2, 
                        scoring='roc_auc',
                        iid=False, 
                        cv=5)
gsearch2.fit(X,y)
gsearch2.grid_scores_, gsearch2.best_params_, gsearch2.best_score_

([mean: 0.79379, std: 0.02347, params: {'max_depth': 3, 'min_samples_split': 50},
  mean: 0.79339, std: 0.02410, params: {'max_depth': 3, 'min_samples_split': 70},
  mean: 0.79350, std: 0.02462, params: {'max_depth': 3, 'min_samples_split': 90},
  mean: 0.79367, std: 0.02493, params: {'max_depth': 3, 'min_samples_split': 110},
  mean: 0.79387, std: 0.02521, params: {'max_depth': 3, 'min_samples_split': 130},
  mean: 0.79373, std: 0.02524, params: {'max_depth': 3, 'min_samples_split': 150},
  mean: 0.79378, std: 0.02532, params: {'max_depth': 3, 'min_samples_split': 170},
  mean: 0.79349, std: 0.02542, params: {'max_depth': 3, 'min_samples_split': 190},
  mean: 0.80960, std: 0.02602, params: {'max_depth': 5, 'min_samples_split': 50},
  mean: 0.80920, std: 0.02629, params: {'max_depth': 5, 'min_samples_split': 70},
  mean: 0.80888, std: 0.02522, params: {'max_depth': 5, 'min_samples_split': 90},
  mean: 0.80923, std: 0.02777, params: {'max_depth': 5, 'min_samples_split': 110},
  mean: 0.

In [11]:
#根据调参结果训练模型
rf1 = RandomForestClassifier(n_estimators= 60, 
                             max_depth=13, 
                             min_samples_split=110,
                             min_samples_leaf=20,
                             max_features='sqrt' ,
                             oob_score=True, 
                             random_state=10)
rf1.fit(X,y)
print (rf1.oob_score_)

0.984


In [12]:
#内部节点再划分所需最小样本数min_samples_split和叶子节点最少样本数min_samples_leaf一起调参
param_test3 = {'min_samples_split':list(range(80,150,20)), 'min_samples_leaf':list(range(10,60,10))}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           max_depth=13,
                                                           max_features='sqrt',
                                                           oob_score=True, 
                                                           random_state=10),
                        param_grid = param_test3, 
                        scoring='roc_auc',
                        iid=False, 
                        cv=5)
gsearch3.fit(X,y)
gsearch3.grid_scores_, gsearch3.best_params_, gsearch3.best_score_

([mean: 0.82093, std: 0.02287, params: {'min_samples_leaf': 10, 'min_samples_split': 80},
  mean: 0.81913, std: 0.02141, params: {'min_samples_leaf': 10, 'min_samples_split': 100},
  mean: 0.82048, std: 0.02328, params: {'min_samples_leaf': 10, 'min_samples_split': 120},
  mean: 0.81798, std: 0.02099, params: {'min_samples_leaf': 10, 'min_samples_split': 140},
  mean: 0.82094, std: 0.02535, params: {'min_samples_leaf': 20, 'min_samples_split': 80},
  mean: 0.82097, std: 0.02327, params: {'min_samples_leaf': 20, 'min_samples_split': 100},
  mean: 0.82487, std: 0.02110, params: {'min_samples_leaf': 20, 'min_samples_split': 120},
  mean: 0.82169, std: 0.02406, params: {'min_samples_leaf': 20, 'min_samples_split': 140},
  mean: 0.82352, std: 0.02271, params: {'min_samples_leaf': 30, 'min_samples_split': 80},
  mean: 0.82164, std: 0.02381, params: {'min_samples_leaf': 30, 'min_samples_split': 100},
  mean: 0.82070, std: 0.02528, params: {'min_samples_leaf': 30, 'min_samples_split': 120},
  

In [13]:
#最大特征数max_features做调参
param_test4 = {'max_features':list(range(3,11,2))}
gsearch4 = GridSearchCV(estimator = RandomForestClassifier(n_estimators= 60, 
                                                           max_depth=13, 
                                                           min_samples_split=120,
                                                           min_samples_leaf=20 ,
                                                           oob_score=True, 
                                                           random_state=10),
                        param_grid = param_test4, 
                        scoring='roc_auc',
                        iid=False, 
                        cv=5)
gsearch4.fit(X,y)
gsearch4.grid_scores_, gsearch4.best_params_, gsearch4.best_score_

([mean: 0.81981, std: 0.02586, params: {'max_features': 3},
  mean: 0.81639, std: 0.02533, params: {'max_features': 5},
  mean: 0.82487, std: 0.02110, params: {'max_features': 7},
  mean: 0.81704, std: 0.02209, params: {'max_features': 9}],
 {'max_features': 7},
 0.8248650279471544)

In [14]:
#根据调参结果训练模型
rf2 = RandomForestClassifier(n_estimators= 60, 
                             max_depth=13, 
                             min_samples_split=120,
                             min_samples_leaf=20,
                             max_features=7 ,
                             oob_score=True, 
                             random_state=10)
rf2.fit(X,y)
print (rf2.oob_score_)

0.984
