# 实现混淆矩阵以及精准率和召回率
![癌症预测为例对比精准率和召回率](../02-精准率和召回率/images/癌症预测为例对比精准率和召回率.png)

In [1]:
import numpy as np
from sklearn import datasets

In [2]:
digits = datasets.load_digits() # load the handwritten-digits dataset
X = digits.data
# Work on a copy so that relabelling y later leaves digits.target untouched.
y = digits.target.copy()

In [3]:
# Turn the multi-class problem into a binary one: digit 9 -> 1, any other digit -> 0.
# The classes are skewed at roughly 1:9, so a model that always answers "not 9"
# would already reach about 90% accuracy under the usual accuracy metric.
y[:] = np.where(digits.target == 9, 1, 0)

In [4]:
from sklearn.model_selection import train_test_split
# Split features and binary labels into train and test sets; random_state is
# fixed so the same split is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=666)

In [5]:
from sklearn.linear_model import LogisticRegression
# Fit a logistic-regression classifier with default settings on the training split.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
# Score on the held-out test split.
log_reg.score(X_test, y_test)



0.9755555555555555

In [6]:
# Predicted labels for the test split; reused by every metric computed below.
y_log_predict = log_reg.predict(X_test)
y_log_predict

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [7]:
def TN(y_true, y_predict):
    """
    Count the true negatives of a binary (0/1) classification.

    A sample is a true negative when both its true label and its predicted
    label are 0.

    @param y_true    true labels (array-like of 0/1)
    @param y_predict predicted labels (array-like of 0/1, same length)
    @return number of samples with true label 0 and predicted label 0
    """
    assert len(y_true) == len(y_predict)
    # Coerce to arrays: on plain Python lists `y_true == 0` is a single bool,
    # which would silently make the count 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 0) & (y_predict == 0))

In [8]:
# True negatives on the test split.
tn = TN(y_test, y_log_predict)
tn

403

In [9]:
def FP(y_true, y_predict):
    """
    Count the false positives of a binary (0/1) classification.

    A sample is a false positive when its true label is 0 but it was
    predicted as 1.

    @param y_true    true labels (array-like of 0/1)
    @param y_predict predicted labels (array-like of 0/1, same length)
    @return number of samples with true label 0 and predicted label 1
    """
    assert len(y_true) == len(y_predict)
    # Coerce to arrays: on plain Python lists `y_true == 0` is a single bool,
    # which would silently make the count 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 0) & (y_predict == 1))

In [10]:
# False positives on the test split.
fp = FP(y_test, y_log_predict)
fp

2

In [11]:
def FN(y_true, y_predict):
    """
    Count the false negatives of a binary (0/1) classification.

    A sample is a false negative when its true label is 1 but it was
    predicted as 0.

    @param y_true    true labels (array-like of 0/1)
    @param y_predict predicted labels (array-like of 0/1, same length)
    @return number of samples with true label 1 and predicted label 0
    """
    assert len(y_true) == len(y_predict)
    # Coerce to arrays: on plain Python lists `y_true == 1` is a single bool,
    # which would silently make the count 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 1) & (y_predict == 0))

In [12]:
# False negatives on the test split.
fn = FN(y_test, y_log_predict)
fn

9

In [13]:
def TP(y_true, y_predict):
    """
    Count the true positives of a binary (0/1) classification.

    A sample is a true positive when both its true label and its predicted
    label are 1.

    @param y_true    true labels (array-like of 0/1)
    @param y_predict predicted labels (array-like of 0/1, same length)
    @return number of samples with true label 1 and predicted label 1
    """
    assert len(y_true) == len(y_predict)
    # Coerce to arrays: on plain Python lists `y_true == 1` is a single bool,
    # which would silently make the count 0.
    y_true = np.asarray(y_true)
    y_predict = np.asarray(y_predict)
    return np.sum((y_true == 1) & (y_predict == 1))

In [14]:
# True positives on the test split.
tp = TP(y_test, y_log_predict)
tp

36

In [15]:
def confusion_matrix(y_true, y_predict):
    """
    Build the 2x2 confusion matrix of a binary (0/1) classification.

    Rows correspond to the true label and columns to the predicted label:

        [[TN, FP],
         [FN, TP]]

    @param y_true    true labels
    @param y_predict predicted labels
    @return 2x2 numpy array of counts laid out as shown above
    """
    # Compute every cell from the arguments. Reading the notebook-level
    # y_test / y_log_predict here would return the same matrix for any input.
    return np.array([
        [TN(y_true, y_predict), FP(y_true, y_predict)],
        [FN(y_true, y_predict), TP(y_true, y_predict)]
    ])

In [16]:
# Confusion matrix of the logistic-regression predictions on the test split.
confusion_matrix(y_test, y_log_predict)

array([[403,   2],
       [  9,  36]])

In [17]:
def precision_score(y_true, y_predict):
    """
    Precision of a binary (0/1) classification: TP / (FP + TP).

    @param y_true    true labels
    @param y_predict predicted labels
    @return precision, or 0.0 when no sample was predicted as 1
    """
    conf_matrix = confusion_matrix(y_true, y_predict)
    true_pos = conf_matrix[1][1]
    false_pos = conf_matrix[0][1]
    predicted_pos = false_pos + true_pos
    # Check the denominator explicitly: dividing NumPy integers by zero does
    # not raise (it yields nan plus a RuntimeWarning), so a try/except around
    # the division would never provide the 0.0 fallback.
    if predicted_pos == 0:
        return 0.0
    return true_pos / predicted_pos

In [18]:
precision_score(y_test, y_log_predict) # precision

0.9473684210526315

In [19]:
def recall_score(y_true, y_predict):
    """
    Recall of a binary (0/1) classification: TP / (FN + TP).

    @param y_true    true labels
    @param y_predict predicted labels
    @return recall, or 0.0 when no sample has true label 1
    """
    conf_matrix = confusion_matrix(y_true, y_predict)
    true_pos = conf_matrix[1][1]
    false_neg = conf_matrix[1][0]
    actual_pos = false_neg + true_pos
    # Check the denominator explicitly: dividing NumPy integers by zero does
    # not raise (it yields nan plus a RuntimeWarning), so a try/except around
    # the division would never provide the 0.0 fallback.
    if actual_pos == 0:
        return 0.0
    return true_pos / actual_pos

In [20]:
recall_score(y_test, y_log_predict) # recall

0.8

## scikit-learn中的精准率和召回率
### 1.混淆矩阵

In [21]:
from sklearn.metrics import confusion_matrix

In [22]:
# The name now refers to sklearn.metrics.confusion_matrix (imported just above).
confusion_matrix(y_test, y_log_predict) # same result as the hand-written version above

array([[403,   2],
       [  9,  36]], dtype=int64)

### 2.精准率

In [23]:
from sklearn.metrics import precision_score

In [24]:
precision_score(y_test, y_log_predict) # same result as the custom implementation above

0.9473684210526315

### 3.召回率

In [25]:
from sklearn.metrics import recall_score

In [26]:
recall_score(y_test, y_log_predict) # same result as the custom implementation above

0.8