# owenliang/introduction-to-machine-learning-with-python


# Sample datasets

## Two low-dimensional datasets

```
import mglearn
import matplotlib.pyplot as plt

# generate the forge dataset's features X and targets y
X, y = mglearn.datasets.make_forge()

# use feature column 0 as the x-axis and feature column 1 as the y-axis, with the target y as the marker
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# draw a legend for the two classes in the lower-right corner
plt.legend(["Class 0", "Class 1"], loc=4)
# x-axis label
plt.xlabel("First feature")
# y-axis label
plt.ylabel("Second feature")
# number of samples and number of feature dimensions
print("X.shape: {}".format(X.shape))
```

```
import mglearn
import matplotlib.pyplot as plt

# generate 40 samples
X, y = mglearn.datasets.make_wave(n_samples=40)
# X has only one feature, so a scatter plot can be drawn directly
plt.plot(X, y, 'o')
# range of the continuous target values
plt.ylim(-3, 3)
# x-axis label
plt.xlabel("Feature")
# y-axis label
plt.ylabel("Target")
```

## Two high-dimensional datasets

```
from sklearn.datasets import load_breast_cancer
import numpy as np

# load the dataset
cancer = load_breast_cancer()
# print the number of samples and number of features
print(cancer.data.shape)
# print the number of samples per class: np.bincount counts each class,
# then zip pairs the counts 1:1 with the class names
print("Sample counts per class:\n{}".format(
    {n: v for n, v in zip(cancer.target_names, np.bincount(cancer.target))}))
```
```
(569, 30)
Sample counts per class:
{'malignant': 212, 'benign': 357}
```

```
from sklearn.datasets import load_boston

# load the boston housing dataset
boston = load_boston()
print("Data shape: {}".format(boston.data.shape))
```

```
import mglearn

# the extended boston dataset adds engineered interaction features to the original ones
X, y = mglearn.datasets.load_extended_boston()
print("X.shape: {}".format(X.shape))
```

# K-Nearest Neighbors (KNN)

## Classifying the forge dataset

```
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import mglearn

X, y = mglearn.datasets.make_forge()
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# predict the label that occurs most often among the 3 nearest points
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
print("Test set accuracy: {:.2f}".format(clf.score(X_test, y_test)))
```

## Classifying the cancer dataset

```
from sklearn.datasets import load_breast_cancer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

# load the data
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)

# record how training-set and test-set accuracy change with n_neighbors
training_accuracy = []
test_accuracy = []

# n_neighbors from 1 to 10
neighbors_settings = range(1, 11)
for n_neighbors in neighbors_settings:
    # build the model
    clf = KNeighborsClassifier(n_neighbors=n_neighbors)
    # train
    clf.fit(X_train, y_train)
    # record training set accuracy
    training_accuracy.append(clf.score(X_train, y_train))
    # record test set accuracy
    test_accuracy.append(clf.score(X_test, y_test))

# plot two curves: number of neighbors on the x-axis, training and test accuracy on the y-axis
plt.plot(neighbors_settings, training_accuracy, label="training accuracy")
plt.plot(neighbors_settings, test_accuracy, label="test accuracy")
plt.ylabel("Accuracy")
plt.xlabel("n_neighbors")
plt.legend()
```

## Regression on the wave dataset

```
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
import mglearn

# load the dataset
X, y = mglearn.datasets.make_wave(n_samples=40)

# split the wave dataset into a training and a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# instantiate the model with the number of neighbors set to 3
reg = KNeighborsRegressor(n_neighbors=3)
# fit the model on the training data and training targets
reg.fit(X_train, y_train)
# predict on the test set
print("Test set predictions:\n{}".format(reg.predict(X_test)))
# evaluate the model
print("Test set R^2: {:.2f}".format(reg.score(X_test, y_test)))
```

## Conclusions

- The KNN model has two important parameters: the number of neighbors and the distance metric between data points (both appear in the sketch below).
- Choosing 3 to 5 neighbors is a good starting point.
- KNN is slow on large datasets, and many features or many zero-valued features both hurt its performance.
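
Both knobs are set on the constructor. A minimal sketch, reusing the cancer split from above; `metric="manhattan"` is just an illustrative alternative to the default euclidean distance:

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=66)

# n_neighbors controls how many neighbors vote; metric controls the distance measure
# ("manhattan" is L1 distance, an alternative to the default euclidean distance)
clf = KNeighborsClassifier(n_neighbors=3, metric="manhattan")
clf.fit(X_train, y_train)
print("Test set accuracy: {:.2f}".format(clf.score(X_test, y_test)))
```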

# Linear models

## Linear regression

### Regression on the wave dataset

```
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn

# generate 60 samples with a single feature
X, y = mglearn.datasets.make_wave(n_samples=60)
# split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
# train the linear regression model
lr = LinearRegression().fit(X_train, y_train)

# coef_ is the slope w: one weight per feature
print("lr.coef_: {}".format(lr.coef_))
# intercept_ is the offset b
print("lr.intercept_: {}".format(lr.intercept_))

# training set score
print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
# test set score
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))
```
```
Training set score: 0.67
Test set score: 0.66
```

### Regression on the extended_boston dataset

```
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
import mglearn

# extended boston dataset
X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lr = LinearRegression().fit(X_train, y_train)

print("Training set score: {:.2f}".format(lr.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lr.score(X_test, y_test)))
```
```
Training set score: 0.95
Test set score: 0.61
```

## Ridge regression

```
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split
import mglearn

# extended boston dataset
X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

ridge = Ridge().fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge.score(X_test, y_test)))
```
```
Training set score: 0.89
Test set score: 0.75
```

```
ridge10 = Ridge(alpha=10).fit(X_train, y_train)
print("Training set score: {:.2f}".format(ridge10.score(X_train, y_train)))
print("Test set score: {:.2f}".format(ridge10.score(X_test, y_test)))
```
```
Training set score: 0.79
Test set score: 0.64
```

## Lasso regression

```
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np
import mglearn

# extended boston dataset
X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

lasso = Lasso().fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso.score(X_test, y_test)))
# lasso.coef_ is the weight vector w; count how many features have a non-zero coefficient
print("Number of features used: {}".format(np.sum(lasso.coef_ != 0)))
```
```
Training set score: 0.29
Test set score: 0.21
Number of features used: 4
```

```
from sklearn.linear_model import Lasso
from sklearn.model_selection import train_test_split
import numpy as np
import mglearn

# extended boston dataset
X, y = mglearn.datasets.load_extended_boston()

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# increase max_iter, otherwise the model warns that max_iter should be increased
lasso001 = Lasso(alpha=0.01, max_iter=100000).fit(X_train, y_train)
print("Training set score: {:.2f}".format(lasso001.score(X_train, y_train)))
print("Test set score: {:.2f}".format(lasso001.score(X_test, y_test)))
print("Number of features used: {}".format(np.sum(lasso001.coef_ != 0)))
```

```
Training set score: 0.90
Test set score: 0.77
Number of features used: 33
```

## Linear classification

A linear classifier predicts with the rule (applied by hand in the sketch after these lists):

ŷ = w[0] * x[0] + w[1] * x[1] + ... + w[p] * x[p] + b > 0

Linear classification algorithms differ in two ways:

- how they measure how well w and b fit the training set (the loss function);
- whether they use regularization, and which kind.

The two most common linear classifiers are:

- LogisticRegression: the logistic regression classifier (regression in name only; it is a classification algorithm)
- LinearSVC: the linear support vector machine classifier
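
A minimal sketch of the decision rule itself on the forge dataset (not from the book): the sign of `X @ coef_ + intercept_` reproduces LogisticRegression's predictions.

```
from sklearn.linear_model import LogisticRegression
import mglearn
import numpy as np

X, y = mglearn.datasets.make_forge()
logreg = LogisticRegression().fit(X, y)

# apply the rule by hand: w . x + b > 0 selects class 1
manual = (X @ logreg.coef_.ravel() + logreg.intercept_) > 0
# the hand-computed rule agrees with predict()
print(np.array_equal(manual.astype(int), logreg.predict(X)))
```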

### Classifying the forge dataset

```
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
import mglearn
import matplotlib.pyplot as plt

X, y = mglearn.datasets.make_forge()

# subplots(m, n, figsize) draws n plots in m rows; figsize sets the size of each figure
# the second return value holds the axes to draw on, used below
fig, axes = plt.subplots(1, 2, figsize=(10, 3))

# zip the models with the axes: LinearSVC goes in the first plot, LogisticRegression in the second
for model, ax in zip([LinearSVC(), LogisticRegression()], axes):
    # train the model
    clf = model.fit(X, y)
    # draw the linear model's decision boundary, a straight line
    mglearn.plots.plot_2d_separator(clf, X, fill=False, eps=0.5,
                                    ax=ax, alpha=.7)
    # use the dataset's first and second features as the plot's axes and draw the label distribution
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{}".format(clf.__class__.__name__))
    ax.set_xlabel("Feature 0")
    ax.set_ylabel("Feature 1")
axes[0].legend()
```

### Classifying the cancer dataset

```
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)
logreg = LogisticRegression().fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
```
```
Training set score: 0.955
Test set score: 0.958
```

```
logreg = LogisticRegression(C=100).fit(X_train, y_train)
print("Training set score: {:.3f}".format(logreg.score(X_train, y_train)))
print("Test set score: {:.3f}".format(logreg.score(X_test, y_test)))
```
```
Training set score: 0.974
Test set score: 0.965
```

### Specifying L1 regularization

```
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()

X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# different C values are plotted with different marker symbols
for C, marker in zip([0.001, 1, 100], ['o', '^', 'v']):
    # C controls the regularization strength; penalty selects L1 regularization
    # (recent scikit-learn versions require a solver that supports L1, e.g. liblinear)
    lr_l1 = LogisticRegression(C=C, penalty="l1", solver="liblinear").fit(X_train, y_train)
    # training set accuracy
    print("Training accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
        C, lr_l1.score(X_train, y_train)))
    # test set accuracy
    print("Test accuracy of l1 logreg with C={:.3f}: {:.2f}".format(
        C, lr_l1.score(X_test, y_test)))
```
```
Training accuracy of l1 logreg with C=0.001: 0.91
Test accuracy of l1 logreg with C=0.001: 0.92
Training accuracy of l1 logreg with C=1.000: 0.96
Test accuracy of l1 logreg with C=1.000: 0.96
Training accuracy of l1 logreg with C=100.000: 0.99
Test accuracy of l1 logreg with C=100.000: 0.98
```

### Multiclass classification

```
from sklearn.svm import LinearSVC
import mglearn
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
import numpy as np

# dataset with 2 features and 3 classes
X, y = make_blobs(random_state=42)

# train
linear_svm = LinearSVC().fit(X, y)
# 3 rows of coefficients, one per one-vs-rest classifier; each row holds 2 weights, one per feature
print("Coefficient shape: ", linear_svm.coef_.shape)
# 3 intercepts b, one per one-vs-rest classifier
print("Intercept shape: ", linear_svm.intercept_.shape)

# plot the samples, with the 2 features as the plot's axes and the classes as markers
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)
# evenly spaced values between -15 and 15, used as feature-0 values
line = np.linspace(-15, 15)
print(line)
# draw the line of each of the 3 one-vs-rest classifiers
for coef, intercept, color in zip(linear_svm.coef_, linear_svm.intercept_, ['b', 'r', 'g']):
    # line holds the feature-0 values (x-axis); -(line * coef[0] + intercept) / coef[1] gives feature 1 (y-axis)
    plt.plot(line, -(line * coef[0] + intercept) / coef[1], c=color)
# y-axis range
plt.ylim(-10, 15)
```

## Summary

- The main parameters of linear models are the regularization settings: L1 vs. L2, plus alpha for regression and C for classification.
- A larger alpha or a smaller C means stronger regularization: the weights w stay small, the model is simpler, and training set accuracy drops as well.
- Linear models are fast to train and to predict, but on large datasets consider solver='sag' to speed up training (see the sketch below).
- L1 regularization drives many of the weights w to exactly zero, which makes the model's behavior easier to analyze.
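
A minimal sketch of the solver='sag' point, reusing the extended boston data; max_iter=10000 is an illustrative value chosen to let sag converge:

```
from sklearn.linear_model import Ridge
import mglearn

X, y = mglearn.datasets.load_extended_boston()

# "sag" (stochastic average gradient) can train faster than the default solver on large datasets
ridge_sag = Ridge(alpha=1.0, solver="sag", max_iter=10000).fit(X, y)
print("Training set R^2: {:.2f}".format(ridge_sag.score(X, y)))
```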

# Naive Bayes classifiers

- GaussianNB: features can be arbitrary continuous data
- BernoulliNB: features must be binary
- MultinomialNB: features are count data

GaussianNB suits high-dimensional data; the latter two suit sparse data such as text. A minimal usage sketch follows.
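
These notes have no code here, so here is a minimal GaussianNB sketch, reusing the cancer dataset from the other sections (the split parameters are illustrative):

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# GaussianNB models each feature as a per-class gaussian, so arbitrary continuous features work
nb = GaussianNB().fit(X_train, y_train)
print("Test set accuracy: {:.3f}".format(nb.score(X_test, y_test)))
```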

# Decision trees

- Regression: DecisionTreeRegressor
- Classification: DecisionTreeClassifier

## Decision tree classification

```
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# dataset
cancer = load_breast_cancer()
# split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# decision tree classifier
tree = DecisionTreeClassifier(random_state=0)
# train
tree.fit(X_train, y_train)
# accuracy
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
```
```
Accuracy on training set: 1.000
Accuracy on test set: 0.937
```

```
# decision tree classifier, pre-pruned to a maximum depth of 4
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
# train
tree.fit(X_train, y_train)
# accuracy
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))
```
```
Accuracy on training set: 0.988
Accuracy on test set: 0.951
```

The feature_importances_ attribute records how important each feature is to the decision, and can be plotted:

```
from sklearn.datasets import load_breast_cancer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import numpy as np

# dataset
cancer = load_breast_cancer()
# split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, stratify=cancer.target, random_state=42)

# decision tree classifier
tree = DecisionTreeClassifier(max_depth=4, random_state=0)
# train
tree.fit(X_train, y_train)
# accuracy
print("Accuracy on training set: {:.3f}".format(tree.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(tree.score(X_test, y_test)))

def plot_feature_importances_cancer(model):
    # number of features in the samples
    n_features = cancer.data.shape[1]
    # plot how important each feature is to the decision
    plt.barh(range(n_features), model.feature_importances_, align='center')
    plt.yticks(np.arange(n_features), cancer.feature_names)
    plt.xlabel("Feature importance")
    plt.ylabel("Feature")

plot_feature_importances_cancer(tree)
```

## Decision tree regression

DecisionTreeRegressor is used for regression, but it is quite special: it cannot predict outside the range of the training data, i.e. it cannot extrapolate. The sketch below illustrates this.
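
A minimal sketch of the no-extrapolation behavior; the sine data here is hypothetical, not from the book:

```
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# hypothetical 1-d data: train only on x in [0, 10]
rng = np.random.RandomState(0)
X_train = np.sort(rng.uniform(0, 10, size=40)).reshape(-1, 1)
y_train = np.sin(X_train).ravel()

tree = DecisionTreeRegressor(max_depth=3).fit(X_train, y_train)

# outside the training range the tree just repeats the value of its last leaf:
# the predictions for x = 11, 15 and 20 are all identical
print(tree.predict(np.array([[11.0], [15.0], [20.0]])))
```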

## Summary

- Decision tree regression cannot extrapolate, but classification has no such problem.
- Model complexity is controlled through pre-pruning: stopping tree construction early via max_depth, max_leaf_nodes, or min_samples_leaf.
- Decision trees are easy to visualize: the tree is just a sequence of yes/no questions.
- Decision trees need no feature preprocessing (e.g. normalization/standardization): every question concerns a single feature, so differing feature scales do not affect the model.
- Even pruned decision trees tend to overfit and generalize poorly.

# Ensembles of decision trees

- Random forests
- Gradient boosted decision trees

## Random forests

The n_estimators parameter sets the number of trees; randomness between the trees is injected in 2 ways:

- Each tree is trained on different data, obtained by randomly sampling the training set with replacement to the original size, so the same sample may appear several times for one tree (see the bootstrap sketch below).
- The features each tree node may consider are limited via the max_features parameter; the smaller it is, the more the trees differ.
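
A minimal numpy sketch of the bootstrap sampling described in the first bullet; the toy array is illustrative:

```
import numpy as np

rng = np.random.RandomState(0)
samples = np.array(["a", "b", "c", "d", "e"])

# a bootstrap sample has the same size as the original but is drawn WITH replacement,
# so some points repeat and, on average, roughly a third are left out
bootstrap = rng.choice(samples, size=len(samples), replace=True)
print(bootstrap)
```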

Random forest classification with 5 trees:

```
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=100, noise=0.25, random_state=3)
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
forest = RandomForestClassifier(n_estimators=5, random_state=2)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
```
```
Accuracy on training set: 0.960
Accuracy on test set: 0.920
```

```
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

forest = RandomForestClassifier(n_estimators=100, random_state=0)
forest.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(forest.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
```
```
Accuracy on training set: 1.000
Accuracy on test set: 0.972
```

### Summary

- Random forests need essentially no tuning; the defaults already work well.
- They need no feature scaling either.
- Training is computationally expensive; n_jobs can train the trees in parallel across threads (see the sketch below).
- More trees give a better model, but also slower training and a higher memory cost.
- Tunable parameters: n_estimators, max_features, and pre-pruning parameters such as max_depth.
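
A minimal sketch of the n_jobs point, reusing the cancer setup from above; n_jobs=-1 means all available CPU cores:

```
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# n_jobs=-1 trains the individual trees in parallel on all cores
forest = RandomForestClassifier(n_estimators=100, n_jobs=-1, random_state=0)
forest.fit(X_train, y_train)
print("Accuracy on test set: {:.3f}".format(forest.score(X_test, y_test)))
```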

## Gradient boosted regression trees

```
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# default settings: 100 trees of depth 3 with learning rate 0.1
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
```
```
Accuracy on training set: 1.000
Accuracy on test set: 0.958
```

```
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
```
``````Accuracy on training set: 0.988
Accuracy on test set: 0.965
``````

```
gbrt = GradientBoostingClassifier(max_depth=1, random_state=0)
gbrt.fit(X_train, y_train)
print("Accuracy on training set: {:.3f}".format(gbrt.score(X_train, y_train)))
print("Accuracy on test set: {:.3f}".format(gbrt.score(X_test, y_test)))
```
```
Accuracy on training set: 0.991
Accuracy on test set: 0.972
```

### Summary

- Gradient boosted decision trees are among the most powerful and most widely used supervised models.
- Downsides: they are sensitive to parameter tuning, which strongly affects results, and training takes a long time.
- They are not suited to sparse data.
- Supported parameters: n_estimators, learning_rate, max_depth; a lower learning_rate needs a larger n_estimators to compensate with more boosting iterations.
- The xgboost library is faster and often more accurate, and worth trying (see the sketch below).
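
A minimal sketch of the xgboost alternative, assuming the xgboost package is installed; XGBClassifier is its scikit-learn-style wrapper, and the parameter values are illustrative:

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier  # assumes xgboost is installed (pip install xgboost)

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# the same knobs as sklearn's gradient boosting: tree count, learning rate, tree depth
xgb = XGBClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
xgb.fit(X_train, y_train)
print("Accuracy on test set: {:.3f}".format(xgb.score(X_test, y_test)))
```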

# Kernelized support vector machines

## The kernel trick

- Polynomial kernel: computes polynomial interactions of the original features, e.g. feature1 ** 2 * feature2 ** 5
- Gaussian (RBF) kernel: harder to interpret (a numeric sketch follows this list)
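
A minimal numeric sketch of the gaussian kernel, k(x, x') = exp(-gamma * ||x - x'||^2), checked against scikit-learn's rbf_kernel:

```
import numpy as np
from sklearn.metrics.pairwise import rbf_kernel

x1 = np.array([[1.0, 2.0]])
x2 = np.array([[2.0, 0.5]])
gamma = 0.1

# the kernel value measures similarity: 1.0 for identical points, falling toward 0 with distance
manual = np.exp(-gamma * np.sum((x1 - x2) ** 2))
print(manual)
print(rbf_kernel(x1, x2, gamma=gamma))  # same value
```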

```
from sklearn.svm import SVC
import mglearn
import matplotlib.pyplot as plt

# load the data
X, y = mglearn.tools.make_handcrafted_dataset()

# train, letting the RBF kernel do the high-dimensional mapping
svm = SVC(kernel='rbf', C=10, gamma=0.1).fit(X, y)

# draw the classification boundary
mglearn.plots.plot_2d_separator(svm, X, eps=.5)

# draw the samples
mglearn.discrete_scatter(X[:, 0], X[:, 1], y)

# find the support vectors; their class labels are given by the sign of dual_coef_
sv = svm.support_vectors_
sv_labels = svm.dual_coef_.ravel() > 0

# draw the support vector points
mglearn.discrete_scatter(sv[:, 0], sv[:, 1], sv_labels, s=15, markeredgewidth=3)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")
```

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# load the data
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

# train
svc = SVC()
svc.fit(X_train, y_train)

# accuracy
print("Accuracy on training set: {:.2f}".format(svc.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(svc.score(X_test, y_test)))
```
```
Accuracy on training set: 1.00
Accuracy on test set: 0.63
```

### Preprocessing the data

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# load the data
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(
    cancer.data, cancer.target, random_state=0)

#### feature preprocessing
# compute the minimum of each feature in the training set; axis=0 computes per-column minima
min_on_training = X_train.min(axis=0)
# compute the range of each feature in the training set = max - min
range_on_training = (X_train - min_on_training).max(axis=0)

# rescale every feature into the range 0-1
X_train_scaled = (X_train - min_on_training) / range_on_training
X_test_scaled = (X_test - min_on_training) / range_on_training

# print the min and max of each feature after scaling
#print("Minimum for each feature\n{}".format(X_train_scaled.min(axis=0)))
#print("Maximum for each feature\n {}".format(X_train_scaled.max(axis=0)))

# train
svc = SVC()
svc.fit(X_train_scaled, y_train)

# accuracy
print("Accuracy on training set: {:.2f}".format(svc.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.2f}".format(svc.score(X_test_scaled, y_test)))
```
```
Accuracy on training set: 0.95
Accuracy on test set: 0.95
```

```
# train with weaker regularization (larger C)
svc = SVC(C=1000)
svc.fit(X_train_scaled, y_train)

# accuracy
print("Accuracy on training set: {:.2f}".format(svc.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.2f}".format(svc.score(X_test_scaled, y_test)))
```
```
Accuracy on training set: 0.99
Accuracy on test set: 0.97
```

## Summary

- SVMs perform well on both low- and high-dimensional data (presumably because the kernel maps the data into a high-dimensional space).
- Features must be preprocessed and scaled to the same range; the sketch below does this with scikit-learn's MinMaxScaler.
- They are sensitive to parameter tuning and hard to interpret.
- Important parameters: C, the kernel, and the kernel's own parameters; the rbf kernel has the parameter gamma.
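
The manual min/max scaling above can also be done with scikit-learn's MinMaxScaler; a minimal sketch reusing the cancer split:

```
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC

cancer = load_breast_cancer()
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# fit the scaler on the training set only, then apply the same transform to both sets
scaler = MinMaxScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

svc = SVC().fit(X_train_scaled, y_train)
print("Accuracy on test set: {:.2f}".format(svc.score(X_test_scaled, y_test)))
```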

# Neural networks

## Small dataset

```
from sklearn.neural_network import MLPClassifier
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split
import mglearn
import matplotlib.pyplot as plt

# dataset
X, y = make_moons(n_samples=100, noise=0.25, random_state=3)

# split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

# train the neural network
mlp = MLPClassifier(solver='lbfgs', random_state=0).fit(X_train, y_train)

# draw the model's decision boundary
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)

# draw the samples
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

# accuracy
print(mlp.score(X_train, y_train))
print(mlp.score(X_test, y_test))
```
```
1.0
0.88
```

```
# train the neural network with a single hidden layer of 10 units
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[10], random_state=0).fit(X_train, y_train)

# draw the model's decision boundary
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)

# draw the samples
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

print(mlp.score(X_train, y_train))
print(mlp.score(X_test, y_test))
```
```
0.9866666666666667
0.88
```

```
# train the neural network: 2 hidden layers of 100 units each, with a weak L2 penalty alpha=0.0001
mlp = MLPClassifier(solver='lbfgs', hidden_layer_sizes=[100, 100], alpha=0.0001, random_state=0).fit(X_train, y_train)

# draw the model's decision boundary
mglearn.plots.plot_2d_separator(mlp, X_train, fill=True, alpha=.3)

# draw the samples
mglearn.discrete_scatter(X_train[:, 0], X_train[:, 1], y_train)
plt.xlabel("Feature 0")
plt.ylabel("Feature 1")

# accuracy
print(mlp.score(X_train, y_train))
print(mlp.score(X_test, y_test))
```
```
1.0
0.88
```

## Larger dataset

```
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# dataset
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# train the neural network
mlp = MLPClassifier(random_state=0).fit(X_train, y_train)

# accuracy
print("Accuracy on training set: {:.2f}".format(mlp.score(X_train, y_train)))
print("Accuracy on test set: {:.2f}".format(mlp.score(X_test, y_test)))
```
```
Accuracy on training set: 0.91
Accuracy on test set: 0.91
```

```
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# dataset
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# compute the mean of each feature in the training set
mean_on_train = X_train.mean(axis=0)
# compute the standard deviation of each feature in the training set
std_on_train = X_train.std(axis=0)

# subtract the mean and divide by the standard deviation; afterwards mean=0 and std=1
X_train_scaled = (X_train - mean_on_train) / std_on_train
# apply the same transformation to the test set (using the training set's mean and std)
X_test_scaled = (X_test - mean_on_train) / std_on_train

# train
mlp = MLPClassifier(random_state=0, max_iter=100)
mlp.fit(X_train_scaled, y_train)

# accuracy
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test_scaled, y_test)))
```
```
Accuracy on training set: 0.991
Accuracy on test set: 0.972
```

```
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split

# dataset
cancer = load_breast_cancer()

# split
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, random_state=0)

# compute the mean of each feature in the training set
mean_on_train = X_train.mean(axis=0)
# compute the standard deviation of each feature in the training set
std_on_train = X_train.std(axis=0)

# subtract the mean and divide by the standard deviation; afterwards mean=0 and std=1
X_train_scaled = (X_train - mean_on_train) / std_on_train
# apply the same transformation to the test set (using the training set's mean and std)
X_test_scaled = (X_test - mean_on_train) / std_on_train

# train with stronger L2 regularization (alpha=1)
mlp = MLPClassifier(random_state=0, alpha=1, max_iter=100)
mlp.fit(X_train_scaled, y_train)

# accuracy
print("Accuracy on training set: {:.3f}".format(mlp.score(X_train_scaled, y_train)))
print("Accuracy on test set: {:.3f}".format(mlp.score(X_test_scaled, y_test)))
```
```
Accuracy on training set: 0.986
Accuracy on test set: 0.972
```

## Summary

- Neural networks are state-of-the-art models able to capture the information contained in large amounts of data.
- Training takes long, the data needs preprocessing, and tuning is sensitive.
- Model complexity is high: with 100 features, 1 hidden layer, and 100 hidden units, the network learns 100 * 100 weights from input to hidden layer plus another 100 from hidden layer to output (verified in the sketch below).
- Tuning focuses on: the number of hidden layers, the number of units per layer, regularization, and the activation function.
- Activation functions such as relu and tanh perform the non-linear transformation inside the hidden units.
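
A minimal sketch verifying the weight-count arithmetic via the fitted coefs_ matrices; cancer has 30 features, so the first matrix is 30x100 rather than the 100x100 of the example above:

```
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier

cancer = load_breast_cancer()

# one hidden layer with 100 units (the default); max_iter raised to reduce convergence warnings
mlp = MLPClassifier(hidden_layer_sizes=[100], max_iter=1000, random_state=0)
mlp.fit(cancer.data, cancer.target)

# coefs_[0]: input -> hidden weights, shape (n_features, 100)
# coefs_[1]: hidden -> output weights, shape (100, 1) for binary classification
print([coef.shape for coef in mlp.coefs_])  # [(30, 100), (100, 1)]
```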

# Uncertainty estimates from classifiers

- decision_function: the decision function
- predict_proba: predicted class probabilities

## The binary case

### The decision function

```
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
import numpy as np

# dataset
X, y = make_circles(noise=0.25, factor=0.5, random_state=1)

# for illustration we rename the two classes "blue" and "red"
y_named = np.array(["blue", "red"])[y]

# split
X_train, X_test, y_train_named, y_test_named, y_train, y_test = \
    train_test_split(X, y_named, y, random_state=0)

# build the gradient boosting model
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train_named)

# the decision function expresses the preference between the 2 classes
print(gbrt.decision_function(X_test))
print(gbrt.classes_)
```
```
[ 4.13592629 -1.7016989  -3.95106099 -3.62599351  4.28986668  3.66166106
-7.69097177  4.11001634  1.10753883  3.40782247 -6.46262729  4.28986668
3.90156371 -1.20031192  3.66166106 -4.17231209 -1.23010022 -3.91576275
4.03602808  4.11001634  4.11001634  0.65708962  2.69826291 -2.65673325
-1.86776597]
['blue' 'red']
```

### Predicting probabilities

```
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_circles
from sklearn.model_selection import train_test_split
import numpy as np

# dataset
X, y = make_circles(noise=0.25, factor=0.5, random_state=1)

# for illustration we rename the two classes "blue" and "red"
y_named = np.array(["blue", "red"])[y]

# split
X_train, X_test, y_train_named, y_test_named, y_train, y_test = \
    train_test_split(X, y_named, y, random_state=0)

# build the gradient boosting model
gbrt = GradientBoostingClassifier(random_state=0)
gbrt.fit(X_train, y_train_named)

# predicted probabilities
print("Predicted probabilities:\n{}".format(gbrt.predict_proba(X_test[:6])))
```
```
Predicted probabilities:
[[0.01573626 0.98426374]
[0.84575649 0.15424351]
[0.98112869 0.01887131]
[0.97406775 0.02593225]
[0.01352142 0.98647858]
[0.02504637 0.97495363]]
```

## The multiclass case

### The decision function

```
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42)

# gradient boosting model (learning_rate=0.01 matches the values printed below)
gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)

# decision function
print(gbrt.decision_function(X_test[:6]))
print(gbrt.classes_)
```
```
[[-0.52931069  1.46560359 -0.50448467]
[ 1.51154215 -0.49561142 -0.50310736]
[-0.52379401 -0.4676268   1.51953786]
[-0.52931069  1.46560359 -0.50448467]
[-0.53107259  1.28190451  0.21510024]
[ 1.51154215 -0.49561142 -0.50310736]]
[0 1 2]
```

### Predicting probabilities

```
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=42)

gbrt = GradientBoostingClassifier(learning_rate=0.01, random_state=0)
gbrt.fit(X_train, y_train)

# predicted probabilities
print(gbrt.predict_proba(X_test[:6]))
print(gbrt.classes_)
```
```
[[0.10664722 0.7840248  0.10932798]
[0.78880668 0.10599243 0.10520089]
[0.10231173 0.10822274 0.78946553]
[0.10664722 0.7840248  0.10932798]
[0.10825347 0.66344934 0.22829719]
[0.78880668 0.10599243 0.10520089]]
[0 1 2]
```
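
The columns of predict_proba line up with classes_ and each row sums to 1; a minimal sketch, reusing gbrt and X_test from the block above, recovers predict() from the probabilities:

```
import numpy as np

proba = gbrt.predict_proba(X_test[:6])
# each row is a probability distribution over the 3 classes and sums to 1
print(proba.sum(axis=1))
# the most likely column, mapped through classes_, reproduces predict()
print(gbrt.classes_[np.argmax(proba, axis=1)])
print(gbrt.predict(X_test[:6]))
```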

# Chapter summary

- K-Nearest Neighbors: good for small datasets; a solid baseline that is easy to explain.
- Linear models: a very reliable first algorithm to try; suitable for very large datasets and for high-dimensional data.
- Naive Bayes: classification only. Even faster than linear models; suitable for very large, high-dimensional datasets, but usually less accurate than linear models.
- Decision trees: very fast, need no data scaling, can be visualized, and are easy to explain.
- Random forests: almost always better than a single decision tree; very robust and powerful. No data scaling needed. Not suitable for high-dimensional sparse data.
- Gradient boosted decision trees: usually slightly more accurate than random forests; slower to train than random forests but faster to predict and smaller in memory; need more parameter tuning.
- Support vector machines: powerful on medium-sized datasets whose features have similar meaning; need data scaling and are sensitive to parameters.
- Neural networks: can build very complex models, particularly on large datasets. Sensitive to data scaling and parameter choices; large networks need a long training time.
