# Chapter 3 - Classification (分类)

_This notebook contains all the sample code and solutions to the exercises in chapter 3_

# Setup
First, let's make sure this notebook works well in both Python 2 and Python 3, import a few common modules, ensure Matplotlib plots figures inline, and prepare a function to save the figures:

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Where to save the figures: save_fig() joins these into
# PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "classification"

def save_fig(fig_id, tight_layout=True):
    """Save the current Matplotlib figure as a 300-dpi PNG.

    The file is written to PROJECT_ROOT_DIR/images/CHAPTER_ID/<fig_id>.png.
    The target directory is created when it does not exist yet, so the first
    call on a fresh checkout does not fail inside ``plt.savefig``.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (without the ".png" extension).
    tight_layout : bool, default True
        When true, ``plt.tight_layout()`` is applied before saving.
    """
    target_dir = os.path.join(PROJECT_ROOT_DIR, "images", CHAPTER_ID)
    # os.makedirs(..., exist_ok=True) is Python 3 only; this notebook also
    # targets Python 2, hence the explicit existence check.
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    path = os.path.join(target_dir, fig_id + ".png")
    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(path, format='png', dpi=300)

In [None]:
def sort_by_target(mnist, n_train=60000):
    """Sort the train and test partitions of ``mnist`` by label, in place.

    The first ``n_train`` samples (training partition) and the remaining
    samples (test partition) are each reordered so that labels are ascending.
    Samples never move between partitions, and samples sharing a label keep
    their original relative order.

    Parameters
    ----------
    mnist : object with ``data`` and ``target`` attributes
        Both attributes must be NumPy arrays indexable along axis 0;
        they are overwritten in place.
    n_train : int, default 60000
        Number of leading samples that form the training partition.
    """
    labels = np.asarray(mnist.target)
    # "mergesort" is NumPy's stable sort: ties keep their original order, which
    # reproduces the ordering of sorting (label, index) pairs.
    reorder_train = np.argsort(labels[:n_train], kind="mergesort")
    reorder_test = np.argsort(labels[n_train:], kind="mergesort") + n_train
    # Fancy indexing yields copies, so assigning back into the slices is safe.
    mnist.data[:n_train] = mnist.data[reorder_train]
    mnist.target[:n_train] = mnist.target[reorder_train]
    mnist.data[n_train:] = mnist.data[reorder_test]
    mnist.target[n_train:] = mnist.target[reorder_test]

# Load MNIST. Newer scikit-learn provides fetch_openml; older releases only
# have fetch_mldata, which is used as a fallback when the import fails.
try:
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, cache=True)
    # Recent scikit-learn releases may return pandas objects here, which breaks
    # the positional slicing used by sort_by_target() and by X[30000] later on.
    # np.asarray() is a no-op on releases that already return NumPy arrays.
    mnist.data = np.asarray(mnist.data)
    # fetch_openml() returns the labels as strings; convert them to small integers
    mnist.target = np.asarray(mnist.target).astype(np.int8)
    sort_by_target(mnist)
except ImportError:
    from sklearn.datasets import fetch_mldata
    mnist = fetch_mldata("MNIST original")

X, y = mnist["data"], mnist["target"]


In [None]:
# Show the shapes of the feature matrix X and the label vector y
X.shape, y.shape

In [None]:
def plot_digit(data):
    """Display a single image.

    ``data`` is reshaped to 28x28 and drawn with the binary colormap and
    nearest-neighbor interpolation; the axes are hidden.
    """
    pixels = data.reshape(28, 28)
    render_options = {"cmap": mpl.cm.binary, "interpolation": "nearest"}
    plt.imshow(pixels, **render_options)
    plt.axis("off")


In [None]:
# Take the sample at index 30000, draw it, and print its label
some_digit = X[30000]
plot_digit(some_digit)
print(y[30000])

In [None]:
def plot_digits(instances, images_per_row=10, **options):
    """Draw several 28x28 images as one grid image.

    Each instance is reshaped to 28x28 and the tiles are laid out row by row,
    ``images_per_row`` per row (capped at the number of instances). Unused
    cells in the last row are filled with zeros. Extra keyword arguments are
    forwarded to ``plt.imshow``.
    """
    size = 28
    n_instances = len(instances)
    images_per_row = min(n_instances, images_per_row)
    tiles = [instance.reshape(size, size) for instance in instances]
    n_rows = (n_instances - 1) // images_per_row + 1
    # One zero-filled tile, wide enough to complete the last row
    n_empty = n_rows * images_per_row - n_instances
    tiles.append(np.zeros((size, size * n_empty)))
    row_images = [
        np.concatenate(tiles[start:start + images_per_row], axis=1)
        for start in range(0, n_rows * images_per_row, images_per_row)
    ]
    grid = np.concatenate(row_images, axis=0)
    plt.imshow(grid, cmap=mpl.cm.binary, **options)
    plt.axis("off")

In [None]:
# Collect a strided selection of rows from X and draw them as a grid, 10 images per row
plt.figure(figsize=(9,9))
example_images = np.r_[X[:12000:600],X[13000:30600:600],X[30600:60000:590]]
plot_digits(example_images, images_per_row=10)
plt.show()

***对数据初步探索后，在深入研究这些数据之前，先创建测试集***

In [None]:
# The first 60000 rows form the training set; the remaining rows form the test set
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]


In [None]:
# Shuffle the training data; the same permutation is applied to X_train and
# y_train so samples and labels stay aligned
import numpy as np

shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]


In [None]:
# Start by training a binary classifier to simplify the problem, e.g. only
# distinguish 4 from not-4 (the tutorial uses 5, but the data here shows a 4)
# Create the target vectors
y_train_4 = (y_train == 4)
y_test_4 = (y_test == 4)

# Pick a classifier and train it; an SGD (stochastic gradient descent) classifier is a good choice
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(random_state=42)
sgd_clf.fit(X_train, y_train_4)

In [None]:
# Check: predict on the sample digit
sgd_clf.predict([some_digit])

In [None]:
# Evaluate model performance
# using 3-fold cross-validation scored by accuracy
from sklearn.model_selection import cross_val_score
cross_val_score(sgd_clf, X_train, y_train_4, cv=3, scoring="accuracy")

In [None]:
# Manual cross-validation with StratifiedKFold, mirroring the cross_val_score call above
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# random_state is deliberately omitted: the folds are not shuffled, so it would
# have no effect, and recent scikit-learn releases raise a ValueError when
# random_state is set while shuffle=False.
skfolds = StratifiedKFold(n_splits=3)
for train_index, test_index in skfolds.split(X_train, y_train_4):
    # Train a fresh, unfitted copy of the classifier for every fold
    clone_clf = clone(sgd_clf)
    X_train_folds = X_train[train_index]
    y_train_folds = y_train_4[train_index]
    X_test_fold = X_train[test_index]
    y_test_fold = y_train_4[test_index]

    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_test_fold)
    # Fraction of correct predictions on the held-out fold
    n_correct = np.sum(y_pred == y_test_fold)
    print(n_correct/len(y_pred))


In [None]:
# 一个更简单的分类器，预测出来的准确率也达到了90%+，说明使用准确率无法作为分类器的首选性能指标
from sklearn.base import BaseEstimator
class Never5Classifier(BaseEstimator):
    """Baseline classifier that predicts the negative class for every sample.

    It learns nothing from the training data. The name is kept for
    compatibility even though this notebook scores it against the
    4-vs-not-4 targets.
    """
    def fit(self, X, y=None):
        # Nothing to learn; return self, as scikit-learn estimators are expected to.
        return self
    def predict(self, X):
        # One False per input sample, as a column of shape (len(X), 1)
        return np.zeros((len(X),1), dtype=bool)

# Score the always-negative baseline with 3-fold cross-validated accuracy
never_5_clf = Never5Classifier()
cross_val_score(never_5_clf, X_train, y_train_4, cv=3, scoring="accuracy")



In [None]:
# A better way to evaluate a classifier: the confusion matrix
# The general idea is to count how many times instances of class A are classified as class B
# Computing it requires a set of predictions to compare against the actual targets;
# the cross_val_predict function can provide them
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(sgd_clf, X_train, y_train_4, cv=3)

# Use confusion_matrix() to get the confusion matrix; rows are actual classes, columns are predicted classes
from sklearn.metrics import confusion_matrix
confusion_matrix(y_train_4, y_train_pred)

**行表示实际类别，列表示预测类别**

实际非4\[53633,   525], 真负例TN，假正例FP

  实际4\[  998,  4844]  假负例FN，真正例TP

  其中：TN=53633，FP=525，FN=998，TP=4844

  精度 = TP/(TP+FP)  **正类**预测的准确率；(在所有判断为恐怖分子中，真正的恐怖分子比例)

  召回率 = TP/(TP+FN)  **分类器正确检测**到的正类实例的比例（也就是正确判为恐怖分子占实际所有恐怖分子的比例）

## 表格-混淆矩阵 ##
![混淆矩阵](./混淆矩阵.jpg)

In [None]:
# Precision and recall functions
from sklearn.metrics import precision_score, recall_score
precision_score(y_train_4, y_train_pred)


In [None]:
# Recall of the cross-validated predictions for the 4-vs-not-4 targets
recall_score(y_train_4, y_train_pred)

In [None]:
# F1 score
from sklearn.metrics import f1_score
f1_score(y_train_4, y_train_pred)

In [None]:
# Precision/recall trade-off: ask the classifier for the decision score of the
# sample digit instead of a class prediction
y_scores = sgd_clf.decision_function([some_digit])
y_scores

In [None]:
# Turn the score into a boolean prediction by comparing it with a threshold of 0
threshold = 0
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
# Adjust the threshold (raising the threshold can lower recall)
threshold = 20000
y_some_digit_pred = (y_scores > threshold)
y_some_digit_pred

In [None]:
# cross_val_predict can return decision scores instead of predictions
y_scores = cross_val_predict(sgd_clf, X_train, y_train_4, cv=3, method="decision_function")
# Use precision_recall_curve to compute precision and recall for all possible thresholds
from sklearn.metrics import precision_recall_curve
precisions, recalls, thresholds = precision_recall_curve(y_train_4, y_scores)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall as functions of the decision threshold.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting, so each curve has one point per entry of ``thresholds``.
    """
    curves = (
        (precisions, "b--", "Precision"),
        (recalls, "g-", "Recall"),
    )
    for values, line_format, curve_label in curves:
        plt.plot(thresholds, values[:-1], line_format, label=curve_label)
    plt.xlabel("Threshold")
    plt.legend(loc="upper left")
    plt.ylim([0, 1])
# Draw both curves from the precision_recall_curve results
plot_precision_recall_vs_threshold(precisions, recalls, thresholds)  

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y axis) against recall (x axis), both axes spanning [0, 1]."""
    plt.plot(recalls, precisions, "b-", linewidth=2)
    axis_titles = ((plt.xlabel, "Recall"), (plt.ylabel, "Precision"))
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)
    plt.axis([0, 1, 0, 1])

# Draw the precision-versus-recall curve on an 8x6 figure
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions, recalls)
plt.show()

In [None]:
# ROC (receiver operating characteristic) curve and AUC (area under the curve)
# NOTE(review): this rebinds `thresholds`, replacing the array that
# precision_recall_curve returned earlier in the notebook.
from sklearn.metrics import roc_curve
fpr, tpr, thresholds = roc_curve(y_train_4, y_scores)
def plot_roc_curve(fpr, tpr, label=None):
    """Plot a ROC curve: true positive rate against false positive rate.

    A dashed diagonal from (0, 0) to (1, 1) is drawn as a visual reference,
    and both axes are limited to [0, 1]. ``label`` is passed to ``plt.plot``
    for use in a legend.
    """
    unit_interval = [0, 1]
    plt.plot(fpr, tpr, linewidth=2, label=label)
    plt.plot(unit_interval, unit_interval, 'k--')
    plt.axis(unit_interval + unit_interval)
    axis_titles = (
        (plt.xlabel, 'False Positive Rate'),
        (plt.ylabel, 'True Positive Rate'),
    )
    for set_title, text in axis_titles:
        set_title(text, fontsize=16)

# Draw the ROC curve on an 8x6 figure
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr, tpr)
plt.show()

In [None]:
# Area under the ROC curve for the cross-validated SGD decision scores
from sklearn.metrics import roc_auc_score
roc_auc_score(y_train_4, y_scores)

In [None]:
# Use the random forest algorithm
from sklearn.ensemble import RandomForestClassifier
forest_clf = RandomForestClassifier(n_estimators=10, random_state=42)
# method="predict_proba" asks for class probabilities instead of class predictions
y_probas_forest = cross_val_predict(forest_clf, X_train, y_train_4, cv=3, method="predict_proba")

In [None]:
# Use the second probability column (index 1) as the score
# (presumably the positive-class probability; verify against the classifier's classes_ order)
y_scores_forest = y_probas_forest[:,1]
fpr_forest, tpr_forest, thresholds_forest = roc_curve(y_train_4, y_scores_forest)

In [None]:
# Overlay the SGD ROC curve (dotted) and the random forest ROC curve for comparison
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, "b:", linewidth=2, label="SGD")
plot_roc_curve(fpr_forest, tpr_forest, "Random Forest")
plt.legend(loc="lower right", fontsize=16)
plt.show()

In [None]:
# Area under the ROC curve for the random forest scores
roc_auc_score(y_train_4, y_scores_forest)

In [None]:
# Cross-validated class predictions of the random forest, then their precision
y_train_pred_forest = cross_val_predict(forest_clf, X_train, y_train_4, cv=3)
precision_score(y_train_4, y_train_pred_forest)


In [None]:
# Recall of the random forest's cross-validated predictions
recall_score(y_train_4, y_train_pred_forest)

# Multiclass classification 

一般多类别分类器使用OvA或者OvO，用的最多的是OvA，其组合数较少；而支持向量机使用OvO

In [None]:
# Verify with SGDClassifier
# NOTE: this refits the same sgd_clf object on the full digit labels (y_train),
# replacing the model fitted earlier on y_train_4
sgd_clf.fit(X_train, y_train)
sgd_clf.predict([some_digit])

In [None]:
# Decision scores of the multiclass classifier for the sample digit
some_digit_scores = sgd_clf.decision_function([some_digit])
some_digit_scores

In [None]:
# Index of the highest score
np.argmax(some_digit_scores)

In [None]:
# Class labels stored on the fitted classifier
sgd_clf.classes_

In [None]:
# Class label stored at index 4
sgd_clf.classes_[4]

In [None]:
# Force scikit-learn to use OvO (or OvA) by wrapping the base classifier
from sklearn.multiclass import OneVsOneClassifier
ovo_clf = OneVsOneClassifier(SGDClassifier(random_state=42))
ovo_clf.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so print the prediction
# explicitly instead of silently discarding it.
print(ovo_clf.predict([some_digit]))
# Number of underlying binary estimators trained by the OvO wrapper
len(ovo_clf.estimators_)

In [None]:
# A random forest classifies instances into multiple classes directly
forest_clf.fit(X_train, y_train)
# Only the last expression of a cell is displayed, so print the prediction
# explicitly instead of silently discarding it.
print(forest_clf.predict([some_digit]))
# Per-class probabilities for the sample digit
forest_clf.predict_proba([some_digit])

In [None]:
# Evaluate the classifier
cross_val_score(sgd_clf, X_train, y_train, cv=3, scoring="accuracy")

In [None]:
# Standardize the data, then evaluate again (the accuracy is expected to improve)
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train.astype(np.float64))
cross_val_score(sgd_clf, X_train_scaled, y_train, cv=3, scoring="accuracy")

# 错误分析

In [None]:
# Analyze the kinds of errors using the confusion matrix (rows are actual values, columns are predicted values)
# NOTE: y_train_pred is rebound here to the multiclass predictions on the scaled data
from sklearn.metrics import confusion_matrix
y_train_pred = cross_val_predict(sgd_clf, X_train_scaled, y_train, cv=3)
conf_mx = confusion_matrix(y_train, y_train_pred)
conf_mx

In [None]:
# Show the confusion matrix as an image
plt.matshow(conf_mx, cmap=plt.cm.gray)
plt.show()

数字 5 对应的格子比较暗，说明可能是 5 的样本数量较少，或者分类器在 5 上的表现不够好，需要进一步验证（对角线反映的是分类正确的情况，越亮越好）。

越暗代表正确的数量越少


In [None]:
# Error rates are more informative than absolute error counts, so divide each
# entry by the number of samples in its actual class before plotting.
# keepdims=True keeps row_sums as a column of shape (n_classes, 1), so the
# division broadcasts per row. Without it, entry [i, j] would be divided by
# the total of class j instead of class i.
row_sums = conf_mx.sum(axis=1, keepdims=True)  # totals per row, i.e. per actual class
norm_conf_mx = conf_mx / row_sums
np.fill_diagonal(norm_conf_mx, 0)  # zero the diagonal so only the errors remain visible
plt.matshow(norm_conf_mx, cmap=plt.cm.gray)
plt.show()

In [None]:
# Take a closer look at the digits 3 and 5
cl_a, cl_b = 3, 5
# Split the training images by (actual class, predicted class)
X_aa = X_train[(y_train==cl_a) & (y_train_pred==cl_a)]  # actual 3, predicted 3
X_ab = X_train[(y_train==cl_a) & (y_train_pred==cl_b)]  # actual 3, predicted 5
X_ba = X_train[(y_train==cl_b) & (y_train_pred==cl_a)]  # actual 5, predicted 3
X_bb = X_train[(y_train==cl_b) & (y_train_pred==cl_b)]  # actual 5, predicted 5

# 2x2 layout: up to 25 examples from each group, 5 images per row
plt.figure(figsize = (8, 8))
plt.subplot(221); plot_digits(X_aa[:25], images_per_row=5)
plt.subplot(222); plot_digits(X_ab[:25], images_per_row=5)
plt.subplot(223); plot_digits(X_ba[:25], images_per_row=5)
plt.subplot(224); plot_digits(X_bb[:25], images_per_row=5)

plt.show()

In [None]:
# Multilabel classification
from sklearn.neighbors import KNeighborsClassifier
y_train_large = (y_train >= 7)     # first label: the digit is 7 or greater
y_train_odd = (y_train % 2 == 1)   # second label: the digit is odd
# Stack both labels as columns, one row per training sample
y_mutilabel = np.c_[y_train_large, y_train_odd]

knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_mutilabel)
knn_clf.predict([some_digit])

In [None]:
# Compute the F1 score, macro-averaged over the labels
y_train_knn_pred = cross_val_predict(knn_clf, X_train, y_mutilabel, cv=3)
f1_score(y_mutilabel, y_train_knn_pred, average="macro")


In [None]:
# Multioutput-multiclass classification, or multioutput classification for short
# Random integer noise in [0, 100) for each of the 784 pixels of every image
noise_train = np.random.randint(0,100, (len(X_train), 784))
noise_test = np.random.randint(0,100, (len(X_test), 784))
# Inputs are the noisy images; targets are the original clean images
X_train_mod = X_train + noise_train
X_test_mod = X_test + noise_test
y_train_mod = X_train
y_test_mod = X_test


In [None]:
# Show one noisy test image (left) next to its clean target (right)
some_index = 5500
plt.subplot(121); plot_digit(X_test_mod[some_index])
plt.subplot(122); plot_digit(y_test_mod[some_index])
plt.show()