scikit-learn随机森林调参小结

In [4]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn import model_selection, metrics

import matplotlib.pylab as plt
%matplotlib inline

In [7]:
# Column roles: `target` names the column holding the binary classification output,
# `IDcol` names the ID column.
IDcol = 'ID'
target = 'Disbursed'

train = pd.read_csv('dataset/train_modified.csv')

# Display how many rows fall into each class of the target column.
train[target].value_counts()

0    19680
1      320
Name: Disbursed, dtype: int64

In [8]:
# Every column other than the target and the ID column is used as a feature.
x_columns = [col for col in train.columns if col not in (target, IDcol)]

y = train[target]
X = train[x_columns]

In [9]:
# Baseline: a forest with library-default hyperparameters (only the seed and
# out-of-bag scoring are set), used as the reference point for later tuning.
rf0 = RandomForestClassifier(random_state=10, oob_score=True).fit(X, y)
print(rf0.oob_score_)

# AUC is computed on the same rows the model was fitted on, so it measures
# training fit only, not generalisation.
y_predprob = rf0.predict_proba(X)[:, 1]
train_auc = metrics.roc_auc_score(y, y_predprob)
print("AUC Score (Train): %f" % train_auc)

0.98315
AUC Score (Train): 0.999994


In [11]:
# Step 1: search over the number of trees (10, 20, ..., 70) while the other
# tree-shape settings are held at fixed starting values.
param_test1 = {'n_estimators': list(range(10, 71, 10))}

rf_step1 = RandomForestClassifier(
    min_samples_split=100,
    min_samples_leaf=20,
    max_depth=8,
    max_features='sqrt',
    random_state=10,
)
gsearch1 = GridSearchCV(
    estimator=rf_step1,
    param_grid=param_test1,
    scoring='roc_auc',  # candidates are ranked by cross-validated ROC AUC
    cv=5,
)
gsearch1.fit(X, y)

# Best setting and its mean cross-validated score.
gsearch1.best_params_, gsearch1.best_score_

({'n_estimators': 60}, 0.8211334476626015)

In [12]:
# Step 2: with n_estimators fixed at 60, jointly search the maximum tree depth
# (3, 5, ..., 13) and the minimum samples required to split a node (50, 70, ..., 190).
param_test2 = {
    'max_depth': list(range(3, 14, 2)),
    'min_samples_split': list(range(50, 191, 20)),
}

rf_step2 = RandomForestClassifier(
    n_estimators=60,
    min_samples_leaf=20,
    max_features='sqrt',
    oob_score=True,
    random_state=10,
)
gsearch2 = GridSearchCV(
    estimator=rf_step2,
    param_grid=param_test2,
    scoring='roc_auc',
    cv=5,
)
gsearch2.fit(X, y)

# Best combination and its mean cross-validated ROC AUC.
gsearch2.best_params_, gsearch2.best_score_

({'max_depth': 13, 'min_samples_split': 110}, 0.8242016800050813)

In [13]:
# Refit on the full data with the depth / split settings chosen so far and
# report the out-of-bag score.
rf1_params = dict(
    n_estimators=60,
    max_depth=13,
    min_samples_split=110,
    min_samples_leaf=20,
    max_features='sqrt',
    oob_score=True,
    random_state=10,
)
rf1 = RandomForestClassifier(**rf1_params).fit(X, y)
print(rf1.oob_score_)

0.984


In [14]:
# Step 3: with n_estimators=60 and max_depth=13 fixed, jointly search the minimum
# samples needed to split an internal node and the minimum samples per leaf.
#
# NOTE: the `iid` argument is deliberately not passed to GridSearchCV. It was
# deprecated in scikit-learn 0.22 and removed in 0.24, where passing it raises
# TypeError. Omitting it also matches the other grid searches in this notebook.
param_test3 = {'min_samples_split':[80,100,120,140], 'min_samples_leaf':[10,20,30,40,50]}
gsearch3 = GridSearchCV(estimator = RandomForestClassifier(n_estimators=60, max_depth=13, max_features='sqrt',
                                                           oob_score=True, random_state=10),
                        param_grid = param_test3, scoring='roc_auc', cv=5)
gsearch3.fit(X,y)
# Best combination and its mean cross-validated ROC AUC.
gsearch3.best_params_, gsearch3.best_score_

([mean: 0.82093, std: 0.02287, params: {'min_samples_leaf': 10, 'min_samples_split': 80},
  mean: 0.81913, std: 0.02141, params: {'min_samples_leaf': 10, 'min_samples_split': 100},
  mean: 0.82048, std: 0.02328, params: {'min_samples_leaf': 10, 'min_samples_split': 120},
  mean: 0.81798, std: 0.02099, params: {'min_samples_leaf': 10, 'min_samples_split': 140},
  mean: 0.82094, std: 0.02535, params: {'min_samples_leaf': 20, 'min_samples_split': 80},
  mean: 0.82097, std: 0.02327, params: {'min_samples_leaf': 20, 'min_samples_split': 100},
  mean: 0.82487, std: 0.02110, params: {'min_samples_leaf': 20, 'min_samples_split': 120},
  mean: 0.82169, std: 0.02406, params: {'min_samples_leaf': 20, 'min_samples_split': 140},
  mean: 0.82352, std: 0.02271, params: {'min_samples_leaf': 30, 'min_samples_split': 80},
  mean: 0.82164, std: 0.02381, params: {'min_samples_leaf': 30, 'min_samples_split': 100},
  mean: 0.82070, std: 0.02528, params: {'min_samples_leaf': 30, 'min_samples_split': 120},
  

In [15]:
# Step 4: with the other settings fixed, search how many features are
# considered at each split (3, 5, 7, 9).
param_test4 = {'max_features': list(range(3, 10, 2))}

rf_step4 = RandomForestClassifier(
    n_estimators=60,
    max_depth=13,
    min_samples_split=120,
    min_samples_leaf=20,
    oob_score=True,
    random_state=10,
)
gsearch4 = GridSearchCV(
    estimator=rf_step4,
    param_grid=param_test4,
    scoring='roc_auc',
    cv=5,
)
gsearch4.fit(X, y)

# Best setting and its mean cross-validated ROC AUC.
gsearch4.best_params_, gsearch4.best_score_

({'max_features': 7}, 0.8248650279471545)

In [16]:
# Final model: refit on the full data with all tuned settings and report the
# out-of-bag score.
rf2_params = dict(
    n_estimators=60,
    max_depth=13,
    min_samples_split=120,
    min_samples_leaf=20,
    max_features=7,
    oob_score=True,
    random_state=10,
)
rf2 = RandomForestClassifier(**rf2_params).fit(X, y)
print(rf2.oob_score_)

0.984
