In [25]:
from sklearn.datasets import load_iris
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
import numpy as np

In [9]:
data = load_iris()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
gbdt_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
gbdt_clf.fit(X_train, y_train)

In [13]:
y_pred = gbdt_clf.predict(X_test)

In [14]:
y_pred

array([1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2,
       0, 2, 2, 2, 2, 2, 0, 0, 0, 0, 1, 0, 0, 2, 1, 0, 0, 0, 2, 1, 1, 0,
       0])

In [17]:
scores = gbdt_clf.predict_proba(X_test)

In [20]:
scores

array([[4.30284264e-05, 9.99571472e-01, 3.85499133e-04],
       [9.99866446e-01, 1.30140009e-04, 3.41433220e-06],
       [1.18544236e-05, 1.06834768e-04, 9.99881311e-01],
       [3.11252838e-05, 9.99360342e-01, 6.08532418e-04],
       [1.33978048e-05, 9.99798156e-01, 1.88445962e-04],
       [9.99866446e-01, 1.30140009e-04, 3.41433220e-06],
       [3.99373646e-05, 9.99729745e-01, 2.30317716e-04],
       [5.65654755e-06, 8.41598844e-04, 9.99152745e-01],
       [8.63572040e-05, 9.96545257e-01, 3.36838602e-03],
       [2.26947186e-05, 9.99846461e-01, 1.30843883e-04],
       [4.97101103e-06, 3.92143771e-04, 9.99602885e-01],
       [9.99930217e-01, 6.63655059e-05, 3.41782101e-06],
       [9.99866446e-01, 1.30140009e-04, 3.41433220e-06],
       [9.99930217e-01, 6.63655059e-05, 3.41782101e-06],
       [9.99913132e-01, 8.34503513e-05, 3.41776262e-06],
       [1.44344187e-05, 9.99834852e-01, 1.50713290e-04],
       [2.16235018e-06, 3.01690697e-05, 9.99967669e-01],
       [2.26947186e-05, 9.99846

In [21]:
scores = gbdt_clf.predict_proba(X_test).max(axis=1)

In [22]:
scores

array([0.99957147, 0.99986645, 0.99988131, 0.99936034, 0.99979816,
       0.99986645, 0.99972974, 0.99915274, 0.99654526, 0.99984646,
       0.99960289, 0.99993022, 0.99986645, 0.99993022, 0.99991313,
       0.99983485, 0.99996767, 0.99984646, 0.99960344, 0.99998176,
       0.99993022, 0.99847931, 0.99991313, 0.99998176, 0.99995713,
       0.99996248, 0.99987233, 0.99992807, 0.99993022, 0.99993022,
       0.99993022, 0.99986645, 0.99995377, 0.99993022, 0.99993022,
       0.99960546, 0.99968097, 0.99988748, 0.99991313, 0.99988748,
       0.99920579, 0.99981914, 0.99991243, 0.99986645, 0.99986645])

In [23]:
scores.shape

(45,)

In [28]:
# 获取排名前 1% 样本的索引 (ix)
top_1_percent = int(len(scores) * 0.1)  # 计算前 10% 样本的数量
if top_1_percent == 0:  # 如果样本数量太少，至少取 1 个
    top_1_percent = 1
ix = np.argsort(scores)[-top_1_percent:]  # 获取概率最高的样本索引

In [29]:
ix

array([25, 16, 23, 19], dtype=int64)

In [30]:
# 获取这些样本的预测类别 (best_pred)
best_pred = gbdt_clf.predict(X_test)[ix]

In [31]:
best_pred

array([2, 2, 2, 2])

In [32]:
# 获取这些样本的真实标签
best_true = y_test[ix]

In [33]:
best_true

array([2, 2, 2, 2])

In [34]:
# 计算这些样本的预测是否正确
accuracy = accuracy_score(best_true, best_pred)
print("Accuracy of top 1% samples:", accuracy)

# 输出分类报告
print("\nClassification Report for top 1% samples:\n")
print(classification_report(best_true, best_pred))

Accuracy of top 1% samples: 1.0

Classification Report for top 1% samples:

              precision    recall  f1-score   support

           2       1.00      1.00      1.00         4

    accuracy                           1.00         4
   macro avg       1.00      1.00      1.00         4
weighted avg       1.00      1.00      1.00         4



---

In [48]:
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [35]:
# 加载数据
data = fetch_california_housing()
X, y = data.data, data.target

print(data.DESCR)  # 查看数据集描述

.. _california_housing_dataset:

California Housing dataset
--------------------------

**Data Set Characteristics:**

    :Number of Instances: 20640

    :Number of Attributes: 8 numeric, predictive attributes and the target

    :Attribute Information:
        - MedInc        median income in block group
        - HouseAge      median house age in block group
        - AveRooms      average number of rooms per household
        - AveBedrms     average number of bedrooms per household
        - Population    block group population
        - AveOccup      average number of household members
        - Latitude      block group latitude
        - Longitude     block group longitude

    :Missing Attribute Values: None

This dataset was obtained from the StatLib repository.
https://www.dcc.fc.up.pt/~ltorgo/Regression/cal_housing.html

The target variable is the median house value for California districts,
expressed in hundreds of thousands of dollars ($100,000).

This dataset was derived

In [36]:
X.shape

(20640, 8)

In [38]:
y.shape

(20640,)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
reg = GradientBoostingRegressor(
    n_estimators=100,  # 弱学习器的数量
    learning_rate=0.1,  # 学习率
    max_depth=3,  # 每棵树的最大深度
    random_state=42
)

In [43]:
reg.fit(X_train, y_train)

In [59]:
predictions = reg.predict(X_test)
mean_prediction = np.mean(predictions)
std_prediction = np.std(predictions)

# 计算每个预测值的 Z 分数（标准化偏离程度）
scores = np.abs((predictions - mean_prediction) / std_prediction)

In [62]:
# 对 Z 分数进行排序（Z 分数越小，预测值越接近均值，越“正常”）
# 1. 计算前 10% 的样本数量
num_top_1_percent = int(len(scores) * 0.1)

if num_top_1_percent == 0:  # 如果样本数量太少，至少取 1 个
    num_top_1_percent = 1

# 2. 对 Z 分数进行排序，获取从小到大的索引
sorted_indices = np.argsort(scores)

# 3. 提取前 1% 的样本索引
top_1_percent_indices = sorted_indices[:num_top_1_percent]

# 打印结果
print(f"前 1% 样本的索引: {top_1_percent_indices}")

前 1% 样本的索引: [2020  778 1912 1970 2389 3419  196 2885 3319 4019   26 2200  899 2244
 3999 3980 3574 3405  691 1553 2064 1367 3580  311 1301 3988 1584 3472
  324 3024  333 3676  871 1120 3661 2766  664  958 4113  669  119 3338
 3300 1795 2476  836 1377 2448 1642 2089  326 3005 2878 2906  409 2103
 1718  390 1038 2690 2522  707  502 3804 2000 1019 2006 2009 3992  431
 1770  671 1207 3289  422 1182  260  692 3374 3191 2561 3785 1672 3397
 2489 3892 2642  889 2707 3946 3714   77 1798  204 2421 3638 2257 1407
 1635 1773 1127 3402 2848   75  375  361 3840 2879 1559  529 3277  351
 2798 3354 1289  478 2413 1871 1400  482 2746 3854 3326 2418 2444  702
 2385 3800  626 3067 3786  134  670 3149 3522 1991 3311 3432 2002  500
 2634 4036 3143 2008 2957   17 4123  350 3117 4022 3879 3809 3330 3408
 2136  952 2514 1032 2303 2351 2710 2810 3898  362 2270 3362  106 2453
 1044 3969  115 3994  967 3329 3989 2828 3811  724 3719 3782 1809  774
  251 2347 1709 3158 3106 1383 2255 1883  293 2782 2143 3489 2075

In [63]:
best_pred = predictions[ix]

In [64]:
best_pred

array([0.45352044, 2.18407834, 4.1201981 , 1.05476902])

In [65]:
best_true = y_test[ix]

In [66]:
best_true

array([0.514  , 1.784  , 5.00001, 1.375  ])

In [67]:
# 6. 评估模型性能
mse = mean_squared_error(best_true, best_pred)  # 均方误差
r2 = r2_score(best_true, best_pred)  # R^2 分数

print(f"测试集上的均方误差 (MSE): {mse:.4f}")
print(f"测试集上的 R^2 分数: {r2:.4f}")

测试集上的均方误差 (MSE): 0.2601
测试集上的 R^2 分数: 0.9098


In [56]:
np.mean(predictions)

2.055199732989832

In [46]:
y_test

array([0.477  , 0.458  , 5.00001, ..., 5.00001, 0.723  , 1.515  ])

In [49]:
# 5. 在测试集上进行预测
y_pred = reg.predict(X_test)

# 6. 评估模型性能
mse = mean_squared_error(y_test, y_pred)  # 均方误差
r2 = r2_score(y_test, y_pred)  # R^2 分数

print(f"测试集上的均方误差 (MSE): {mse:.4f}")
print(f"测试集上的 R^2 分数: {r2:.4f}")

测试集上的均方误差 (MSE): 0.2940
测试集上的 R^2 分数: 0.7756
