## 02 Traditional Machine Learning – Decision Tree
- Name: Yiling Li
- Matrikelnummer: 108980

 1. Data Loading
 2. Signal Processing
 3. Manual Feature Extraction
 4. Feature Matrix Construction
 5. Decision Tree Training
 6. Evaluation & Discussion

In [2]:
# ---------------------------------------------------------
# Signal Processing + Manual Feature Extraction (Starter)
# ---------------------------------------------------------

import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, balanced_accuracy_score
from sklearn.utils.class_weight import compute_class_weight
SEED = 42

In [5]:
# 1. Data Loading
# Loader helpers come from the project's `src.data_loader` module (not shown here).
from src.data_loader import load_sensor_data, load_target

# Iterated further down with `.items()` as (sensor_name, DataFrame) pairs —
# presumably a dict of per-sensor frames; confirm against src.data_loader.
sensor_data = load_sensor_data()
# Target labels; the preview printed below shows an int64 pandas Series.
y = load_target()
y.head()  # last expression of the cell: displays the first rows of the target

0    100
1    100
2    100
3    100
4    100
Name: 1, dtype: int64

In [9]:
# ---------------------------------------------------------
# 1) (Optional) simple signal processing: moving-average smoothing
#    - applies a rolling mean along each row (one row per measurement)
#    - window=1 means "no processing"; start with 1 and tune it later
# ---------------------------------------------------------
def smooth_rolling_rowwise(df: pd.DataFrame, window: int = 1) -> pd.DataFrame:
    """Smooth every row of ``df`` with a trailing moving average.

    Parameters
    ----------
    df : pd.DataFrame
        One row per measurement, one column per time step.
    window : int, default 1
        Number of consecutive time steps averaged together. Values ``<= 1``
        disable smoothing.

    Returns
    -------
    pd.DataFrame
        Frame with the same index and columns as ``df``. When ``window <= 1``
        the input object itself is returned (no copy is made).
    """
    if window <= 1:
        return df
    # `DataFrame.rolling(axis=1)` is deprecated since pandas 2.1, so roll down
    # the rows of the transposed frame and transpose back. `min_periods=1`
    # keeps the leading columns defined (averaged over the samples seen so far)
    # instead of turning them into NaN.
    return df.T.rolling(window=window, min_periods=1).mean().T

# ---------------------------------------------------------
# 2) Feature extraction: summarise each sensor's time series (one row per
#    measurement) with a small set of baseline statistics:
#    mean / std / min / max / rms / slope / quartiles
# ---------------------------------------------------------
def extract_features_from_sensor(df: pd.DataFrame, prefix: str) -> pd.DataFrame:
    """Compute per-row summary statistics for one sensor.

    Parameters
    ----------
    df : pd.DataFrame
        Shape ``(n_samples, n_timesteps)``; every row is one time series.
    prefix : str
        Sensor name; each output column is named ``f"{prefix}__<stat>"``.

    Returns
    -------
    pd.DataFrame
        Shape ``(n_samples, 9)`` with columns mean, std, min, max, rms, slope,
        q25, q50, q75 (in that order). The row index of ``df`` is preserved so
        the features stay aligned with labels and with other sensors' blocks.

    Raises
    ------
    ValueError
        If ``df`` has no columns (there is nothing to summarise).
    """
    x = df.to_numpy(dtype=float)
    n, t = x.shape
    if t == 0:
        # min/max of an empty axis would fail with an opaque NumPy error.
        raise ValueError(
            f"cannot extract features for {prefix!r}: input has no time steps"
        )

    mean = x.mean(axis=1)
    std = x.std(axis=1)  # population standard deviation (NumPy default ddof=0)
    vmin = x.min(axis=1)
    vmax = x.max(axis=1)
    rms = np.sqrt(np.mean(x**2, axis=1))

    # Linear trend, simplest possible estimate: (last - first) / (t - 1).
    # A single time step has no trend, so the slope is defined as 0.
    if t > 1:
        slope = (x[:, -1] - x[:, 0]) / (t - 1)
    else:
        slope = np.zeros(n)

    # Quartiles in one pass; the result has shape (3, n_samples).
    q25, q50, q75 = np.quantile(x, [0.25, 0.50, 0.75], axis=1)

    return pd.DataFrame(
        {
            f"{prefix}__mean": mean,
            f"{prefix}__std": std,
            f"{prefix}__min": vmin,
            f"{prefix}__max": vmax,
            f"{prefix}__rms": rms,
            f"{prefix}__slope": slope,
            f"{prefix}__q25": q25,
            f"{prefix}__q50": q50,
            f"{prefix}__q75": q75,
        },
        index=df.index,
    )

# ---------------------------------------------------------
# 3) Assemble the feature matrix X: one feature block per sensor,
#    concatenated side by side (column-wise)
# ---------------------------------------------------------
SMOOTH_WINDOW = 1  # no smoothing for now; change to 5/11/21 etc. to experiment

# Smooth each sensor's frame, then turn it into a block of summary features
# whose columns are prefixed with the sensor name.
feature_blocks = [
    extract_features_from_sensor(
        smooth_rolling_rowwise(raw, window=SMOOTH_WINDOW),
        prefix=name,
    )
    for name, raw in sensor_data.items()
]

X = pd.concat(feature_blocks, axis=1)

# Sanity check: both should report the same number of rows.
print("X shape:", X.shape)    # (2205, n_features)
print("y shape:", y.shape)    # (2205,)

# ---------------------------------------------------------
# 4) Baseline: Decision Tree (assignment requirement: train every model on
#    3 different train/test splits)
# ---------------------------------------------------------
def run_decision_tree_3splits(
    X: pd.DataFrame,
    y: pd.Series,
    base_seed: int = 42,
    *,
    n_splits: int = 3,
    test_size: float = 0.2,
) -> np.ndarray:
    """Train and evaluate a class-weighted decision tree on repeated splits.

    For split ``k`` (0-based) the seed ``base_seed + k`` drives both the
    stratified train/test split and the tree itself. A per-split report and a
    final mean/std summary are printed.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix, one row per sample.
    y : pd.Series
        Class labels aligned with the rows of ``X``.
    base_seed : int, default 42
        Seed of the first split; later splits use consecutive seeds.
    n_splits : int, default 3
        Number of independent train/test splits to run.
    test_size : float, default 0.2
        Fraction of the samples held out for testing in every split.

    Returns
    -------
    np.ndarray
        Balanced accuracy of each split, shape ``(n_splits,)``.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1.
    """
    if n_splits < 1:
        raise ValueError("n_splits must be >= 1")

    results = []

    for k in range(n_splits):
        rs = base_seed + k

        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=rs,
            stratify=y,
        )

        clf = DecisionTreeClassifier(
            random_state=rs,
            # "balanced" derives the class weights from the labels passed to
            # fit(), i.e. from the training fold only, so the held-out labels
            # never influence the model.
            class_weight="balanced",
            max_depth=None,          # baseline first; tune hyper-parameters later
            min_samples_leaf=1,
        )
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        bal_acc = balanced_accuracy_score(y_test, y_pred)
        results.append(bal_acc)

        print(f"\n--- Split {k+1}/{n_splits} | random_state={rs} ---")
        print("Balanced Acc:", round(bal_acc, 4))
        print(classification_report(y_test, y_pred, digits=4))

    results = np.array(results)
    # The sample standard deviation is undefined (NaN) for a single split.
    spread = results.std(ddof=1) if n_splits > 1 else 0.0
    print(f"\n=== Summary ({n_splits} splits) ===")
    print("Balanced Acc mean:", round(results.mean(), 4))
    print("Balanced Acc std :", round(spread, 4))

    return results

scores = run_decision_tree_3splits(X, y, base_seed=SEED)


X shape: (2205, 153)
y shape: (2205,)

--- Split 1/3 | random_state=42 ---
Balanced Acc: 0.9164
              precision    recall  f1-score   support

          73     0.9452    0.9583    0.9517        72
          80     0.8590    0.9306    0.8933        72
          90     0.8267    0.8611    0.8435        72
         100     0.9581    0.9156    0.9364       225

    accuracy                         0.9161       441
   macro avg     0.8972    0.9164    0.9062       441
weighted avg     0.9184    0.9161    0.9167       441


--- Split 2/3 | random_state=43 ---
Balanced Acc: 0.9022
              precision    recall  f1-score   support

          73     0.9333    0.9722    0.9524        72
          80     0.8472    0.8472    0.8472        72
          90     0.8356    0.8472    0.8414        72
         100     0.9593    0.9422    0.9507       225

    accuracy                         0.9161       441
   macro avg     0.8939    0.9022    0.8979       441
weighted avg     0.9166    0.91

In [10]:
# Per-split balanced accuracies, entered by hand — presumably copied from the
# scores printed by the decision-tree run (rounded to 4 decimals).
results = np.array([0.9164, 0.9022, 0.9103])

results_df = pd.Series(results, name="balanced_accuracy").to_frame()

print("Per-split results:")
print(results_df)

# Collapse the splits into a one-row summary: mean and sample std (ddof=1).
summary_df = pd.DataFrame(
    [[
        results_df["balanced_accuracy"].mean(),
        results_df["balanced_accuracy"].std(ddof=1),
    ]],
    columns=["balanced_accuracy_mean", "balanced_accuracy_std"],
)

print("\nSummary:")
print(summary_df)


Per-split results:
   balanced_accuracy
0             0.9164
1             0.9022
2             0.9103

Summary:
   balanced_accuracy_mean  balanced_accuracy_std
0                0.909633               0.007123
