In [22]:
# 导入所需的库
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import time

# 加载鸢尾花数据集
iris = load_iris()
X = iris.data
y = iris.target

# 划分数据集为训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=38)

# 创建随机森林分类器
# 这里使用100个决策树
rf_classifier = RandomForestClassifier(n_estimators=100)

# 训练模型
# 记录训练时间
start_time = time.time()
rf_classifier.fit(X_train, y_train)
rf_training_time = time.time() - start_time

# 对测试集进行预测
# 记录预测时间
start_time = time.time()
y_pred = rf_classifier.predict(X_test)
rf_prediction_time = time.time() - start_time

# 评估模型性能
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

# 打印结果
print("Accuracy:", accuracy)
print("Training Time:", rf_training_time)
print("Prediction Time:", rf_prediction_time)
print("Classification Report:")
print(report)


Accuracy: 0.9777777777777777
Training Time: 0.06540107727050781
Prediction Time: 0.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      0.92      0.96        13
           2       0.94      1.00      0.97        15

    accuracy                           0.98        45
   macro avg       0.98      0.97      0.98        45
weighted avg       0.98      0.98      0.98        45



参数调整和交叉验证

In [13]:
from sklearn.model_selection import cross_val_score
import numpy as np

# Random Forest with different number of trees
rf_10 = RandomForestClassifier(n_estimators=10)
rf_50 = RandomForestClassifier(n_estimators=50)
rf_100 = RandomForestClassifier(n_estimators=100)
rf_200 = RandomForestClassifier(n_estimators=200)

cv_scores_rf_10 = cross_val_score(rf_10, X, y, cv=5)
cv_scores_rf_50 = cross_val_score(rf_50, X, y, cv=5)
cv_scores_rf_100 = cross_val_score(rf_100, X, y, cv=5)
cv_scores_rf_200 = cross_val_score(rf_200, X, y, cv=5)

cv_results = {
    "RF 10 Trees": np.mean(cv_scores_rf_10),
    "RF 50 Trees": np.mean(cv_scores_rf_50),
    "RF 100 Trees": np.mean(cv_scores_rf_100),
    "RF 200 Trees": np.mean(cv_scores_rf_200)

}

cv_results

{'RF 10 Trees': 0.9399999999999998,
 'RF 50 Trees': 0.9466666666666667,
 'RF 100 Trees': 0.96,
 'RF 200 Trees': 0.9666666666666668}