In [2]:
# Pandas is used for data manipulation
import pandas as pd

# Read in data as a dataframe
features = pd.read_csv('./datalab/62821/temps_extended.csv')

In [3]:
# One Hot Encoding
features = pd.get_dummies(features)

# Extract features and labels
labels = features['actual']
features = features.drop('actual', axis = 1)

# List of features for later use
feature_list = list(features.columns)

# Convert to numpy arrays
import numpy as np

features = np.array(features)
labels = np.array(labels)

# Training and Testing Sets
from sklearn.model_selection import train_test_split

train_features, test_features, train_labels, test_labels = train_test_split(features, labels, 
                                                                            test_size = 0.25, random_state = 42)

In [4]:
print('Training Features Shape:', train_features.shape)
print('Training Labels Shape:', train_labels.shape)
print('Testing Features Shape:', test_features.shape)
print('Testing Labels Shape:', test_labels.shape)

Training Features Shape: (1643, 17)
Training Labels Shape: (1643,)
Testing Features Shape: (548, 17)
Testing Labels Shape: (548,)


In [5]:
print('{:0.1f} years of data in the training set'.format(train_features.shape[0] / 365.))
print('{:0.1f} years of data in the test set'.format(test_features.shape[0] / 365.))

4.5 years of data in the training set
1.5 years of data in the test set


## 选择那6个重要性比较高的特征


In [6]:
# Names of five importances accounting for 95% of total importance
important_feature_names = ['temp_1', 'average', 'ws_1', 'temp_2', 'friend', 'year']

# Find the columns of the most important features
important_indices = [feature_list.index(feature) for feature in important_feature_names]

# Create training and testing sets with only the important features
important_train_features = train_features[:, important_indices]
important_test_features = test_features[:, important_indices]

# Sanity check on operations
print('Important train features shape:', important_train_features.shape)
print('Important test features shape:', important_test_features.shape)

Important train features shape: (1643, 6)
Important test features shape: (548, 6)


In [7]:
# Use only the most important features
train_features = important_train_features[:]
test_features = important_test_features[:]

# Update feature list for visualizations
feature_list = important_feature_names[:]

### 开始调节新的参数

In [8]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(random_state = 42)

from pprint import pprint

# 打印所有参数
pprint(rf.get_params())

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': 42,
 'verbose': 0,
 'warm_start': False}


# 开始尝试

RandomizedSearchCV

In [9]:
from sklearn.model_selection import RandomizedSearchCV

# 建立树的个数
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# 最大特征的选择方式
max_features = ['auto', 'sqrt']
# 树的最大深度
max_depth = [int(x) for x in np.linspace(10, 20, num = 2)]
max_depth.append(None)
# 节点最小分裂所需样本个数
min_samples_split = [2, 5, 10]
# 叶子节点最小样本数，任何分裂不能让其子节点样本数少于此值
min_samples_leaf = [1, 2, 4]
# 样本采样方法
bootstrap = [True, False]

# Random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [15]:
# 随机选择最合适的参数组合
rf = RandomForestRegressor()

rf_random = RandomizedSearchCV(estimator=rf, param_distributions=random_grid,
                              n_iter = 100, scoring='neg_mean_absolute_error', 
                              cv = 3, verbose=2, random_state=42, n_jobs=-1)

# 执行寻找操作
rf_random.fit(train_features, train_labels)

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] n_estimators=800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=True 
[CV] n_estimators=800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=True 
[CV] n_estimators=800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=True 
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False 
[CV]  n_estimators=800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=True, total=   3.4s
[CV]  n_estimators=800, min_samples_split=2, min_samples_leaf=4, max_features=auto, max_depth=None, bootstrap=True, total=   3.4s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=10, bootstrap=False 
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=1, max_features=auto, max_depth=10, boot

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   47.3s


[CV]  n_estimators=400, min_samples_split=5, min_samples_leaf=2, max_features=auto, max_depth=10, bootstrap=False, total=   2.8s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False, total=   6.4s
[CV] n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False 
[CV]  n_estimators=1800, min_samples_split=10, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=False, total=   6.7s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimators=1000, min_samples_split=2, min_samples_leaf=1, max_features=sqrt, max_depth=10, bootstrap=False, total=   4.4s
[CV] n_estimators=1800, min_samples_split=2, min_samples_leaf=4, max_features=sqrt, max_depth=10, bootstrap=True 
[CV]  n_estimator

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  3.6min


[CV]  n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   3.0s
[CV] n_estimators=400, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True, total=   2.4s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=800, min_samples_split=5, min_samples_leaf=2, max_features=sqrt, max_depth=None, bootstrap=True, total=   3.0s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=400, min_samples_split=10, min_samples_leaf=1, max_features=auto, max_depth=None, bootstrap=True, total=   2.0s
[CV] n_estimators=2000, min_samples_split=5, min_samples_leaf=1, max_features=auto, max_depth=20, bootstrap=True 
[CV]  n_estimators=4

[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  6.9min finished


RandomizedSearchCV(cv=3, error_score='raise',
          estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
          fit_params=None, iid=True, n_iter=100, n_jobs=-1,
          param_distributions={'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]},
          pre_dispatch='2*n_jobs', random_state=42, refit=True,
          return_train_score='warn', scoring='neg_mean_absolute_error',
          verbose=2)

这里给大家解释一下RandomizedSearchCV中常用的参数，其实在API文档中都给出了说明，还是建议大家养成这个查阅文档的习惯。

- Estimator：RandomizedSearchCV这个方法是一个通用的，并不是专为随机森林设计的，所以我们需要指定选择的算法模型是什么。
- Distributions：参数的候选空间，我们之间已经用字典格式给出了所需的参数分布。
- n_iter：随机寻找参数组合的个数，比如在这里我们赋值了100代表接下来要随机找100组参数的组合，在其中找到最好的一个。
- Scoring：评估方法，按照该方法去找到最好的参数组合
- Cv：交叉验证，咱们之前已经唠过了。
- Verbose：打印信息的数量，看自己的需求了。
- random_state：随机种子，为了使得咱们的结果能够一致，排除掉随机成分的干扰，一般我们都会指定成一个值，用你自己的幸运数字就好。
- n_jobs：多线程来跑这个程序，如果是-1就会用所有的，但是可能会有点卡。

即便我把n_jobs设置成了-1，程序运行的还是很慢，因为我们建立100次模型来选择参数，并且还是带有3折交叉验证的，那就相当于300个任务了，结果如下图所示：

In [16]:
rf_random.best_params_

{'n_estimators': 1800,
 'min_samples_split': 10,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

## 评估函数

接下来就对比一下，经过调参后的结果和用默认参数结果的差异，所有默认参数在API中都有说明，比如n_estimators : integer, optional (default=10)，这里就说明在随机森林模型中，默认要建立树的个数是10个。先给出评估标准：

In [17]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    errors = abs(predictions - test_labels)
    mape = 100 * np.mean(errors / test_labels)
    accuracy = 100 - mape

    print('平均气温误差.',np.mean(errors))
    print('Accuracy = {:0.2f}%.'.format(accuracy))

#### 看看效果吧

老模型

In [18]:
base_model = RandomForestRegressor( random_state = 42)
base_model.fit(train_features, train_labels)
evaluate(base_model, test_features, test_labels)

平均气温误差. 3.91989051095
Accuracy = 93.36%.


#### 新配方（最好的参数）

In [19]:
best_random = rf_random.best_estimator_
evaluate(best_random, test_features, test_labels)

平均气温误差. 3.72006719575
Accuracy = 93.73%.


# Grid Search ，之前不是找到差不多的方案了嘛，再来微调！

可以看到模型的效果提升了一些，但是这已经是上限了嘛？还有没有可以进步的空间了呢？接下来我们又要介绍下位参选选手了：GridSearchCV()，它的意思是进行网络搜索，说白了就是一个一个的遍历，就像我们之前说的组合有多少种，就全部走一遍，其所需的参数都是类似的，没记住的话赶紧先翻一遍API文档：

In [20]:
from sklearn.model_selection import GridSearchCV

# 网络搜索
param_grid = {
    'bootstrap': [True],
    'max_depth': [8,10,12],
    'max_features': ['auto'],
    'min_samples_leaf': [2,3, 4, 5,6],
    'min_samples_split': [3, 5, 7],
    'n_estimators': [800, 900, 1000, 1200]
}

# 选择基本算法模型
rf = RandomForestRegressor()

# 网络搜索
grid_search = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           scoring = 'neg_mean_absolute_error', cv = 3, 
                           n_jobs = -1, verbose = 2)

In [21]:
# 执行搜索
grid_search.fit(train_features, train_labels)

Fitting 3 folds for each of 180 candidates, totalling 540 fits
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800 
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800 
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800 
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800, total=   3.4s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800, total=   3.4s
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800, total=   3

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   37.2s


[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=7, n_estimators=1200, total=   5.2s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=800 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=800, total=   3.7s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=800 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=7, n_estimators=1200, total=   5.6s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=2, min_samples_split=7, n_estimators=1200, total=   5.4s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=3, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_fe

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.9min


[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=1200, total=   5.1s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=800 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=800, total=   3.2s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=3, n_estimators=1200, total=   4.9s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=800, total=   3.0s
[CV] bootstrap=True, max_depth=8, max_features=auto, min_samples_leaf=6, min_samples_split=5, n_estimators=900 
[CV]  bootstrap=True, max_depth=8, max_fea

[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  6.9min


[CV]  bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   5.2s
[CV] bootstrap=True, max_depth=12, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800 
[CV]  bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   4.7s
[CV] bootstrap=True, max_depth=12, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800 
[CV]  bootstrap=True, max_depth=12, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=800, total=   3.9s
[CV] bootstrap=True, max_depth=12, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=10, max_features=auto, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   4.8s
[CV] bootstrap=True, max_depth=12, max_features=auto, min_samples_leaf=2, min_samples_split=3, n_estimators=900 
[CV]  bootstrap=True, max_depth=1

[Parallel(n_jobs=-1)]: Done 540 out of 540 | elapsed: 10.7min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [8, 10, 12], 'max_features': ['auto'], 'min_samples_leaf': [2, 3, 4, 5, 6], 'min_samples_split': [3, 5, 7], 'n_estimators': [800, 900, 1000, 1200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=2)

In [22]:
grid_search.best_params_

{'bootstrap': True,
 'max_depth': 8,
 'max_features': 'auto',
 'min_samples_leaf': 6,
 'min_samples_split': 5,
 'n_estimators': 900}

In [23]:
best_grid = grid_search.best_estimator_
evaluate(best_grid, test_features, test_labels)

平均气温误差. 3.67701957407
Accuracy = 93.79%.


## 另一组参赛选手 Grid Search

经过了再调整之后我们的算法模型效果又有了一点提升，虽然只是一小点，但是把每一小步累计在一次就是一个大成绩了。再用网络搜索的时候，遍历的次数太多，我们通常并不把所有的可能性都放进去，而是分成不同的组来分别执行，下面我们再来看看另外一组网络搜索的参赛选手:

In [24]:
param_grid = {
    'bootstrap': [True],
    'max_depth': [12, 15, None],
    'max_features': [3, 4,'auto'],
    'min_samples_leaf': [5, 6, 7],
    'min_samples_split': [7,10,13],
    'n_estimators': [900, 1000, 1200]
}

# 选择算法模型
rf = RandomForestRegressor()

# 继续寻找
grid_search_ad = GridSearchCV(estimator = rf, param_grid = param_grid, 
                           scoring = 'neg_mean_absolute_error', cv = 3, 
                           n_jobs = -1, verbose = 2)

grid_search_ad.fit(train_features, train_labels)

Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=900 
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=900 
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=900 
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=900, total=   2.6s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=900, total=   2.7s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=5, min_samples_split=7, n_estimators=1000 
[CV]  bootstrap=True, ma

[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   31.5s


[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   3.4s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=10, n_estimators=900 
[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   3.7s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=10, n_estimators=900 
[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   3.5s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=10, n_estimators=1000 
[CV]  bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=10, n_estimators=900, total=   2.7s
[CV] bootstrap=True, max_depth=12, max_features=3, min_samples_leaf=6, min_samples_split=10, n_estimators=1000 
[CV]  bootstrap=True, max_depth=12, max_features=3

[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:  2.4min


[CV]  bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=900, total=   3.7s
[CV] bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000 
[CV]  bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=900, total=   4.1s
[CV] bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000, total=   4.1s
[CV] bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000, total=   4.2s
[CV] bootstrap=True, max_depth=12, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_depth=12, max_featur

[Parallel(n_jobs=-1)]: Done 357 tasks      | elapsed:  5.9min


[CV]  bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   3.9s
[CV] bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=10, n_estimators=900 
[CV]  bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   4.0s
[CV] bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=10, n_estimators=900 
[CV]  bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=7, n_estimators=1200, total=   3.8s
[CV] bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=10, n_estimators=1000 
[CV]  bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=10, n_estimators=900, total=   3.0s
[CV] bootstrap=True, max_depth=15, max_features=4, min_samples_leaf=6, min_samples_split=10, n_estimators=1000 
[CV]  bootstrap=True, max_depth=15, max_features=4

[Parallel(n_jobs=-1)]: Done 640 tasks      | elapsed: 10.4min


[CV]  bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=900, total=   3.0s
[CV] bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000 
[CV]  bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=900, total=   2.9s
[CV] bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000, total=   3.3s
[CV] bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1000, total=   3.7s
[CV] bootstrap=True, max_depth=None, max_features=4, min_samples_leaf=7, min_samples_split=13, n_estimators=1200 
[CV]  bootstrap=True, max_dept

[Parallel(n_jobs=-1)]: Done 729 out of 729 | elapsed: 12.1min finished


GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=None, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'bootstrap': [True], 'max_depth': [12, 15, None], 'max_features': [3, 4, 'auto'], 'min_samples_leaf': [5, 6, 7], 'min_samples_split': [7, 10, 13], 'n_estimators': [900, 1000, 1200]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='neg_mean_absolute_error', verbose=2)

In [25]:
grid_search_ad.best_params_

{'bootstrap': True,
 'max_depth': 15,
 'max_features': 4,
 'min_samples_leaf': 7,
 'min_samples_split': 10,
 'n_estimators': 900}

In [27]:
best_grid_ad = grid_search_ad.best_estimator_
evaluate(best_grid_ad, test_features, test_labels)

平均气温误差. 3.66093700651
Accuracy = 93.82%.


93.82%最优

## 最终模型

In [28]:
print('最终模型参数:\n')
pprint(best_grid_ad.get_params())

最终模型参数:

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 15,
 'max_features': 4,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 7,
 'min_samples_split': 10,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 900,
 'n_jobs': 1,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}
