In [None]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings

In [None]:
# Matplotlib defaults: font sizes for axis labels and tick labels
matplotlib.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})
# Silence every warning for the rest of the session
warnings.filterwarnings('ignore')
# Seed NumPy's global (singleton) RandomState instance
np.random.seed(42)

In [None]:
from sklearn.datasets import fetch_openml

# Fetch the 'mnist_784' dataset from OpenML. as_frame=True is requested
# explicitly because the two conversions below rely on pandas objects.
mnist = fetch_openml('mnist_784', as_frame=True, parser='auto')
X = mnist['data'].to_numpy()
# Convert the label *values* to integers. Taking the categorical `.codes`
# instead would yield positional indices into the category list, which only
# coincide with the digits when the categories happen to be ordered '0'..'9'.
y = mnist['target'].astype(int).to_numpy()
print(X.shape, y.shape)

In [None]:
# Display the first sample: reshape its flat feature row into a 28x28 image
x_0 = np.reshape(X[0], (28, 28))
plt.gca().imshow(x_0, cmap="gray_r")
plt.gca().set_axis_off()
plt.show()

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples for testing. The split is stratified on y and
# seeded so that it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier on the training split.
# NOTE(review): the iteration budget is capped at 50 and warnings are silenced
# globally earlier in this notebook, so if the solver stops before converging
# any warning about it would not be shown -- confirm this is intended.
log_clf = LogisticRegression(random_state=42, max_iter=50)
log_clf.fit(X_train, y_train)

# Predict a single test sample and display it
m = 3500
x_3500 = X_test[m].reshape(28, 28)
# Slice (rather than index) so the estimator receives a 2-D, one-row input
y_pred_0 = log_clf.predict(X_test[m:m + 1])
print("预测的值：", y_pred_0, "实际的值：", y_test[m])
plt.gca().imshow(x_3500, cmap="gray_r")
plt.gca().set_axis_off()
plt.show()

In [None]:
# 用sklearn自带的交叉验证函数
from sklearn.model_selection import cross_val_score

# 5-fold cross-validated accuracy on the training split, using scikit-learn's
# built-in helper
cross_score = cross_val_score(
    log_clf,
    X_train,
    y_train,
    scoring='accuracy',
    cv=5,
)
print(cross_score)

In [None]:
from sklearn.model_selection import StratifiedKFold
from sklearn.base import clone

# Manual cross-validation: build the stratified folds and score each by hand
sk_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for train_index, val_index in sk_folds.split(X_train, y_train):
    X_train_folds, y_train_folds = X_train[train_index], y_train[train_index]
    X_val_folds, y_val_folds = X_train[val_index], y_train[val_index]

    # Work on a fresh copy of the estimator for every fold
    clone_clf = clone(log_clf)
    clone_clf.fit(X_train_folds, y_train_folds)
    y_pred = clone_clf.predict(X_val_folds)

    # Fraction of validation samples whose prediction matches the label
    n_correct = (y_pred == y_val_folds).sum()
    print(n_correct / len(y_pred))

In [None]:
# Confusion matrix
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix

# The ten digit classes, in the order used for the matrix rows/columns
labels = list(range(10))
# Score the classifier that was fitted on the training split against the
# held-out test split. Cross-validating on the test split (as was done before)
# would train new models on test data and report on those instead of log_clf.
y_test_pred = log_clf.predict(X_test)
y_confusion_matrix = confusion_matrix(y_test, y_test_pred, labels=labels)
print(y_confusion_matrix)

In [None]:
# Per-class evaluation metrics computed from the confusion matrix
import pandas as pd

# Diagonal = correctly classified samples per class
TP = y_confusion_matrix.diagonal()
# Column total minus the diagonal entry
FP = y_confusion_matrix.sum(axis=0) - TP
# Row total minus the diagonal entry
FN = y_confusion_matrix.sum(axis=1) - TP
# Everything that is neither TP, FP nor FN for the class. (The sum of the other
# classes' diagonal entries is NOT the true-negative count: it leaves out the
# off-diagonal cells that do not involve this class.)
TN = y_confusion_matrix.sum() - (TP + FP + FN)

# Overall accuracy
Acc = TP.sum() / y_confusion_matrix.sum()
print("Accuracy:", Acc)

# Precision
Ppv = TP / (TP + FP)
# Recall
Tpr = TP / (TP + FN)
# Specificity
Tnr = TN / (TN + FP)
# F1 score
F1 = TP / (TP + (FN + FP) / 2)
df = pd.DataFrame({'Precision': Ppv, 'Recall': Tpr, 'Specificity': Tnr, 'F1 score': F1}, index=labels)
print(df)

In [None]:
# Decision-function scores for test sample `m`; slicing keeps the input 2-D
y_pred_scores_0 = log_clf.decision_function(X_test[m:m + 1])
print(y_pred_scores_0)

In [None]:
# Decision-function scores for the training split, obtained through 5-fold
# cross-validated prediction
y_train_scores = cross_val_predict(
    log_clf,
    X_train,
    y_train,
    method="decision_function",
    cv=5,
)

In [None]:
from sklearn.preprocessing import label_binarize

# Binarize the training labels against the class list `labels`, then show the
# first label next to its encoded row
y_train_one_hot = label_binarize(
    y_train,
    classes=labels,
)
print(y_train[0], "--->", y_train_one_hot[0])

In [None]:
from sklearn.metrics import precision_recall_curve

# Precision-recall curve over all classes at once: both the one-hot labels and
# the score matrix are flattened so every (sample, class) pair is one entry
precisions, recalls, thresholds = precision_recall_curve(
    y_train_one_hot.ravel(),
    y_train_scores.ravel(),
)

In [None]:
def plot_precision_recall_vs_threshold(precisions, recalls, thresholds):
    """Plot precision and recall against the decision threshold on the current axes.

    The last element of ``precisions`` and of ``recalls`` is dropped before
    plotting so that both are drawn against ``thresholds``.

    Args:
        precisions: Sequence of precision values.
        recalls: Sequence of recall values.
        thresholds: Sequence of thresholds used as the x coordinates.
    """
    ax = plt.gca()
    curves = (
        (precisions, 'b--', "Precision"),
        (recalls, 'g-', "Recall"),
    )
    for values, fmt, name in curves:
        ax.plot(thresholds, values[:-1], fmt, label=name)
    ax.set_xlabel("Threshold", fontsize=16)
    ax.legend(loc="upper left", fontsize=16)
    ax.set_ylim([0, 1])

# Precision and recall versus threshold, with the x-axis limited to [-30, 30]
plt.figure(figsize=(8, 4))
plot_precision_recall_vs_threshold(
    precisions=precisions,
    recalls=recalls,
    thresholds=thresholds,
)
plt.gca().set_xlim([-30, 30])
plt.show()

In [None]:
def plot_precision_vs_recall(precisions, recalls):
    """Plot precision (y) against recall (x) on the current axes.

    Both axes are fixed to the range [0, 1].

    Args:
        precisions: Sequence of precision values (y coordinates).
        recalls: Sequence of recall values (x coordinates).
    """
    ax = plt.gca()
    ax.plot(recalls, precisions, 'b-', linewidth=2)
    ax.set_xlabel("Recall", fontsize=16)
    ax.set_ylabel("Precision", fontsize=16)
    ax.axis([0, 1, 0, 1])

# Precision-versus-recall curve on a new 8x6 figure
plt.figure(figsize=(8, 6))
plot_precision_vs_recall(precisions=precisions, recalls=recalls)
plt.show()

In [None]:
from sklearn.metrics import roc_curve

# ROC curve over the flattened one-hot labels and flattened score matrix.
# NOTE(review): this rebinds `thresholds`, which the precision-recall cell also
# assigns; re-running the precision/recall-vs-threshold plot after this cell
# would pair those curves with the ROC thresholds instead.
fpr, tpr, thresholds = roc_curve(
    y_train_one_hot.ravel(),
    y_train_scores.ravel(),
)

In [None]:
def plot_roc_curve(fpr, tpr, label=None):
    """Plot an ROC curve together with a dashed diagonal on the current axes.

    Both axes are fixed to the range [0, 1].

    Args:
        fpr: Sequence of false positive rates (x coordinates).
        tpr: Sequence of true positive rates (y coordinates).
        label: Optional legend label attached to the curve.
    """
    ax = plt.gca()
    ax.plot(fpr, tpr, linewidth=2, label=label)
    # Dashed black diagonal from (0, 0) to (1, 1) as a visual reference
    ax.plot([0, 1], [0, 1], 'k--')
    ax.axis([0, 1, 0, 1])
    ax.set_xlabel("False Positive Rate", fontsize=16)
    ax.set_ylabel("True Positive Rate", fontsize=16)
    
# ROC curve on a new 8x6 figure
plt.figure(figsize=(8, 6))
plot_roc_curve(fpr=fpr, tpr=tpr)
plt.show()

In [None]:
from sklearn.metrics import roc_auc_score

# Area under the ROC curve, computed on the flattened one-hot labels and
# flattened score matrix
Roc_auc = roc_auc_score(
    y_train_one_hot.ravel(),
    y_train_scores.ravel(),
)
print("ROC AUC:", Roc_auc)