## 训练集：N个样本/M个特征

In [None]:
# 训练集随机 - N个样本中随机有放回的抽样N个
# Bootstrap 随机有放回抽样
# [1, 2, 3, 4, 5]
# 新的树的训练集
# [2, 2, 3, 1, 5]

# 特征随机 - 从M个特征中随机抽取m个特征
# M >> m
# 降维

## 参数
RandomForestClassifier()

n_estimators 随机森林里决策树的个数，默认是10

criterion 决策树分裂的标准，默认是基尼指数（cart），也可以选择entropy（id3）

max_depth 决策树最大深度，默认是none，不限制，也可以自行限制

n_jobs 拟合和预测的时候CPU核数

In [6]:
# 1、获取数据
import pandas as pd

data = pd.read_csv('/Users/zhangqihao/Desktop/PYTHON/机器学习/机器学习资料2/02-代码/titanic.csv')

In [7]:
# 2、数据处理

x = data[["pclass", "age", "sex"]]
y = data["survived"]

In [8]:
# 缺失值处理
x["age"].fillna(x["age"].mean(), inplace=True)

In [9]:
# 特征值->字典类型
x = x.to_dict(orient="records")

In [10]:
# 3、数据集划分
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test  = train_test_split(x, y, random_state = 2)

In [11]:
# 4、字典特征抽取
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeClassifier

transfer = DictVectorizer()
x_train = transfer.fit_transform(x_train)
x_test = transfer.transform(x_test)

In [13]:
# 5、随机森林/网格搜索
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

estimator = RandomForestClassifier()
param_dict = {"n_estimators":[120,200,300,500,800,1200],
              "max_depth":[5,8,15,25,30]}
estimator = GridSearchCV(estimator, param_grid=param_dict, cv=3)
estimator.fit(x_train, y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False,
                                              random_state=None, verbose=0,
                                              warm_start=False),
             iid

In [14]:
# 6、模型预估

# 方法1:直接比对真实值和预测值
y_predict = estimator.predict(x_test)

print("y_predict:\n",y_predict)
print("直接比对真实值和预测值:\n", y_test == y_predict)

# 方法2:计算准确率
score = estimator.score(x_test, y_test)
print("准确率为:\n",score)

# 查看参数

print("最佳参数：/n", estimator.best_params_)
print("最佳结果：/n", estimator.best_score_)
print("最佳估计器：/n", estimator.best_estimator_)
print("交叉验证结果：/n", estimator.cv_results_)

y_predict:
 [0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0 0 0
 0 0 0 0 0 0 1 0 0 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0
 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 0 0 0 1 0 1 0 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0
 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 1 0 0 0 1 1 0 0 0 0 1 0 0 1 0 1 0 0 1
 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 1 0 1 0 0 1
 0 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1
 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 0 1 0 0
 0 0 0 0 0 0 0 1 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 1 1 0
 1 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0]
直接比对真实值和预测值:
 811     True
681     True
757     True
1223    True
846     True
        ... 
874     True
322     True
707     True
740     True
705     True
Name: survived, Length: 329, dtype: bool
准确率为:
 0.8206686930091185
最佳参数：/n {'max_depth': 5, 'n_estimators': 200}
最佳结果：/n 0.8262195121951219
最佳估计器：/n RandomForestCl