<a href="https://colab.research.google.com/github/oorange-ocean/thermx-data/blob/main/%E8%81%9A%E7%B1%BB.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 多步 K-Means 聚类

In [17]:
# 导入必要的库
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
from google.colab import drive
from ipywidgets import interact, IntSlider  # 用于交互式选择

# Mount Google Drive at /content/drive so files stored there can be read.
drive.mount('/content/drive')

# Load the dataset from the mounted Drive into a DataFrame.
data = pd.read_csv('/content/drive/MyDrive/steady_state_data.csv')

# Preprocessing: impute missing (NaN) values.
def fill_na_with_group_mean(df, group_col):
    """Return a copy of ``df`` with NaNs in numeric columns imputed.

    Each missing value in a numeric column is replaced by the mean of that
    column within the row's ``group_col`` group. Anything still missing
    afterwards (for example when a whole group is NaN for that column) falls
    back to the column's overall mean. The fallback step applies to every
    numeric column, ``group_col`` included. Non-numeric columns are returned
    unchanged.

    Args:
        df: Input DataFrame. It is not modified.
        group_col: Name of the column whose values define the groups.

    Returns:
        A new DataFrame with the same rows and columns as ``df``.
    """
    result = df.copy()
    # Group on the untouched input; each column is imputed independently, so
    # one groupby object can be reused for all of them.
    grouped = df.groupby(group_col)
    for col in result.select_dtypes(include='number').columns:
        if col == group_col or not result[col].isna().any():
            continue
        group_means = grouped[col].transform('mean')
        # Fill only the gaps instead of overwriting the column, so existing
        # values are never replaced by the transform output.
        result[col] = result[col].fillna(group_means)
    return result.fillna(result.mean(numeric_only=True))

# Impute missing values, grouping rows by the steady-state interval ID
# column ('稳态区间编号').
data = fill_na_with_group_mean(data, '稳态区间编号')

# Group by steady-state interval ID and take the mean of every numeric
# column, giving one row per interval.
grouped_data = data.groupby('稳态区间编号').mean(numeric_only=True).reset_index()

# Single-step clustering: based on unit load ('机组负荷') only.
feature = '机组负荷'
X = grouped_data[[feature]].values  # double brackets select a DataFrame, so X is 2-D
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Compute the silhouette score for each candidate K as a reference.
silhouette_scores = []
K_range = range(2, 10)  # candidate cluster counts 2..9
for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    labels = kmeans.fit_predict(X_scaled)
    score = silhouette_score(X_scaled, labels)
    silhouette_scores.append(score)

# Recommended K: the candidate with the highest silhouette score.
# silhouette_scores is in K_range order, so the argmax position indexes K_range.
optimal_k = K_range[np.argmax(silhouette_scores)]
print(f"基于轮廓系数推荐的最佳聚类数: {optimal_k}")

# Visualization: cluster with the user-selected K and display the result.
def plot_clusters(selected_k):
    """Run KMeans with ``selected_k`` clusters and draw a two-panel figure.

    The left panel plots the precomputed ``silhouette_scores`` over
    ``K_range`` with a vertical marker at the current K. The right panel is a
    scatter plot of unit load ('机组负荷') against turbine heat rate q
    ('汽轮机热耗率q'), coloured by cluster label.

    Side effect: the cluster labels are written to ``grouped_data['cluster']``.

    Args:
        selected_k: Number of clusters to fit.
    """
    model = KMeans(n_clusters=selected_k, random_state=42, n_init=10)
    grouped_data['cluster'] = model.fit_predict(X_scaled)

    fig, (score_ax, scatter_ax) = plt.subplots(1, 2, figsize=(12, 5))
    grid_style = {'linestyle': '--', 'alpha': 0.7}

    # Left panel: silhouette curve, shown for reference.
    score_ax.plot(K_range, silhouette_scores, marker='o', linestyle='-', color='b', label='轮廓系数')
    score_ax.axvline(x=selected_k, color='r', linestyle='--', label=f'当前K={selected_k}')
    score_ax.set_title('轮廓系数 vs 聚类数', fontsize=12)
    score_ax.set_xlabel('聚类数 K', fontsize=10)
    score_ax.set_ylabel('轮廓系数', fontsize=10)
    score_ax.legend()
    score_ax.grid(True, **grid_style)

    # Right panel: clustering result on unit load vs. turbine heat rate q.
    cluster_colors = sns.color_palette("tab10", selected_k)
    sns.scatterplot(
        data=grouped_data,
        x='机组负荷',
        y='汽轮机热耗率q',
        hue='cluster',
        palette=cluster_colors,
        s=50,
        alpha=0.7,
        edgecolor='k',
        ax=scatter_ax,
    )
    scatter_ax.set_title(f'聚类结果 (K={selected_k})', fontsize=12)
    scatter_ax.set_xlabel('机组负荷', fontsize=10)
    scatter_ax.set_ylabel('汽轮机热耗率q', fontsize=10)
    # Place the legend outside the axes so it does not cover the points.
    scatter_ax.legend(title='簇', bbox_to_anchor=(1.05, 1), loc='upper left')
    scatter_ax.grid(True, **grid_style)

    fig.tight_layout()
    plt.show()

# Interactive slider that lets the user pick K. The bounds are taken from
# K_range rather than hard-coded, so the slider always matches the range the
# silhouette scores were computed for.
interact(
    plot_clusters,
    selected_k=IntSlider(
        min=min(K_range),
        max=max(K_range),
        step=1,
        value=optimal_k,
        description='选择聚类数 K:',
    ),
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
基于轮廓系数推荐的最佳聚类数: 2


interactive(children=(IntSlider(value=2, description='选择聚类数 K:', max=9, min=2), Output()), _dom_classes=('widg…