In [1]:
import numpy as np
import pandas as pd
np.random.seed(10)
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV

X, y = make_classification(n_samples=800)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4)

In [3]:
X.shape,y.shape

((800, 20), (800,))

### 采用默认参数

In [5]:
rf0 = RandomForestClassifier(random_state=10)
rf0.fit(X_train,y_train)
y_trainprob = rf0.predict_proba(X_train)[:,1]
y_testprob = rf0.predict_proba(X_test)[:,1]
print ("AUC Score (Train): %f" % roc_auc_score(y_train, y_trainprob))
print ("AUC Score (Test): %f" % roc_auc_score(y_test, y_testprob))

AUC Score (Train): 0.999931
AUC Score (Test): 0.973297


In [6]:
rf0.get_params

<bound method RandomForestClassifier.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=10,
            verbose=0, warm_start=False)>

In [218]:
##  n_estimators：10， max_depth：None， min_samples_split， min_samples_leaf， max_features：

# The number of features to consider when looking for the best split:
# If int, then consider max_features features at each split.
# If float, then max_features is a percentage and int(max_features * n_features) features are considered at each split.
# If “auto”, then max_features=sqrt(n_features).
# If “sqrt”, then max_features=sqrt(n_features) (same as “auto”).
# If “log2”, then max_features=log2(n_features).
# If None, then max_features=n_features.

In [8]:
n_estimators=10
max_depth=None
min_samples_split=2
min_samples_leaf=1
max_features='auto'

In [11]:
param_test1 = {'n_estimators':range(10,101,10)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(min_samples_split=min_samples_split,
                                  min_samples_leaf=min_samples_leaf,max_depth=max_depth,
                                    max_features=max_features,random_state=10),
                                    param_grid = param_test1, scoring='roc_auc',cv=3)

In [12]:
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.96902, std: 0.00832, params: {'n_estimators': 10},
  mean: 0.97642, std: 0.00934, params: {'n_estimators': 20},
  mean: 0.97686, std: 0.01098, params: {'n_estimators': 30},
  mean: 0.97807, std: 0.00981, params: {'n_estimators': 40},
  mean: 0.97849, std: 0.01029, params: {'n_estimators': 50},
  mean: 0.97841, std: 0.00988, params: {'n_estimators': 60},
  mean: 0.97884, std: 0.00989, params: {'n_estimators': 70},
  mean: 0.97861, std: 0.01010, params: {'n_estimators': 80},
  mean: 0.97821, std: 0.00931, params: {'n_estimators': 90},
  mean: 0.97771, std: 0.01030, params: {'n_estimators': 100}],
 {'n_estimators': 70},
 0.9788427926158679)

In [13]:
n_estimators=70

# n_estimators 查看

In [14]:
rf0 = RandomForestClassifier(n_estimators=n_estimators,random_state=10)
rf0.fit(X_train,y_train)
y_trainprob = rf0.predict_proba(X_train)[:,1]
y_testprob = rf0.predict_proba(X_test)[:,1]
print ("AUC Score (Train): %f" % roc_auc_score(y_train, y_trainprob))
print ("AUC Score (Test): %f" % roc_auc_score(y_test, y_testprob))

AUC Score (Train): 1.000000
AUC Score (Test): 0.983611


# max_depth , min_samples_split 查看

In [15]:
param_test1 = {'max_depth':range(10,41,10),'min_samples_split':range(2,11,2)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(
                                  min_samples_leaf=min_samples_leaf,n_estimators=n_estimators,max_features=max_features,random_state=10),
                                    param_grid = param_test1, scoring='roc_auc',cv=3)

In [16]:
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.97886, std: 0.00932, params: {'min_samples_split': 2, 'max_depth': 10},
  mean: 0.97910, std: 0.01029, params: {'min_samples_split': 4, 'max_depth': 10},
  mean: 0.97987, std: 0.01060, params: {'min_samples_split': 6, 'max_depth': 10},
  mean: 0.97970, std: 0.01209, params: {'min_samples_split': 8, 'max_depth': 10},
  mean: 0.97838, std: 0.01280, params: {'min_samples_split': 10, 'max_depth': 10},
  mean: 0.97884, std: 0.00989, params: {'min_samples_split': 2, 'max_depth': 20},
  mean: 0.97967, std: 0.01005, params: {'min_samples_split': 4, 'max_depth': 20},
  mean: 0.98015, std: 0.01082, params: {'min_samples_split': 6, 'max_depth': 20},
  mean: 0.97945, std: 0.01215, params: {'min_samples_split': 8, 'max_depth': 20},
  mean: 0.97843, std: 0.01256, params: {'min_samples_split': 10, 'max_depth': 20},
  mean: 0.97884, std: 0.00989, params: {'min_samples_split': 2, 'max_depth': 30},
  mean: 0.97967, std: 0.01005, params: {'min_samples_split': 4, 'max_depth': 30},
  mean: 0.9801

In [17]:
max_depth=20
min_samples_split=6

In [19]:
rf0 = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,
                             min_samples_split=min_samples_split,random_state=10)
rf0.fit(X_train,y_train)
y_trainprob = rf0.predict_proba(X_train)[:,1]
y_testprob = rf0.predict_proba(X_test)[:,1]
print ("AUC Score (Train): %f" % roc_auc_score(y_train, y_trainprob))
print ("AUC Score (Test): %f" % roc_auc_score(y_test, y_testprob))

AUC Score (Train): 0.999931
AUC Score (Test): 0.979762


# min_samples_leaf 

In [20]:
param_test1 = {'min_samples_leaf':range(1,5,1)}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(max_depth=max_depth,min_samples_split=min_samples_split,
                                  n_estimators=n_estimators,max_features=max_features,random_state=10),
                                    param_grid = param_test1, scoring='roc_auc',cv=3)

In [21]:
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.98015, std: 0.01082, params: {'min_samples_leaf': 1},
  mean: 0.97903, std: 0.01132, params: {'min_samples_leaf': 2},
  mean: 0.97603, std: 0.01211, params: {'min_samples_leaf': 3},
  mean: 0.97607, std: 0.01161, params: {'min_samples_leaf': 4}],
 {'min_samples_leaf': 1},
 0.9801527606329256)

In [22]:
min_samples_leaf=1

In [23]:
rf0 = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split,min_samples_leaf=min_samples_leaf,random_state=10)
rf0.fit(X_train,y_train)
y_trainprob = rf0.predict_proba(X_train)[:,1]
y_testprob = rf0.predict_proba(X_test)[:,1]
print ("AUC Score (Train): %f" % roc_auc_score(y_train, y_trainprob))
print ("AUC Score (Test): %f" % roc_auc_score(y_test, y_testprob))

AUC Score (Train): 0.999931
AUC Score (Test): 0.979762


# max_features

In [24]:
param_test1 = {'max_features':[3,4,5,6]}
gsearch1 = GridSearchCV(estimator = RandomForestClassifier(max_depth=max_depth,min_samples_split=min_samples_split,
                                  n_estimators=n_estimators,min_samples_leaf=min_samples_leaf,random_state=10),
                                    param_grid = param_test1, scoring='roc_auc',cv=3)

In [25]:
gsearch1.fit(X,y)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_



([mean: 0.97921, std: 0.00900, params: {'max_features': 3},
  mean: 0.98015, std: 0.01082, params: {'max_features': 4},
  mean: 0.97905, std: 0.01231, params: {'max_features': 5},
  mean: 0.97781, std: 0.01277, params: {'max_features': 6}],
 {'max_features': 4},
 0.9801527606329256)

In [26]:
max_features=4

In [27]:
rf0 = RandomForestClassifier(n_estimators=n_estimators,max_depth=max_depth,min_samples_split=min_samples_split,
                             min_samples_leaf=min_samples_leaf,max_features=max_features,random_state=10)
rf0.fit(X_train,y_train)
y_trainprob = rf0.predict_proba(X_train)[:,1]
y_testprob = rf0.predict_proba(X_test)[:,1]
print ("AUC Score (Train): %f" % roc_auc_score(y_train, y_trainprob))
print ("AUC Score (Test): %f" % roc_auc_score(y_test, y_testprob))

AUC Score (Train): 0.999931
AUC Score (Test): 0.979762


In [28]:
print("n_estimators:",n_estimators)
print("max_depth:",max_depth)
print("min_samples_split:",min_samples_split)
print("min_samples_leaf:",min_samples_leaf)
print("max_features:",max_features)

('n_estimators:', 70)
('max_depth:', 20)
('min_samples_split:', 6)
('min_samples_leaf:', 1)
('max_features:', 4)


In [29]:
rf0.get_params

<bound method RandomForestClassifier.get_params of RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=20, max_features=4, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=6, min_weight_fraction_leaf=0.0,
            n_estimators=70, n_jobs=1, oob_score=False, random_state=10,
            verbose=0, warm_start=False)>