In [None]:
import os
import pandas as pd

# Dataset location; DATA_FILE is resolved relative to DATA_DIR.
DATA_DIR = r"C:\Users\***"
DATA_FILE = "merged_all.xlsx"

# Show the tabular files that are present so a wrong DATA_FILE is easy to spot.
print("Files in DATA_DIR:")
tabular_files = [
    name for name in os.listdir(DATA_DIR)
    if name.lower().endswith((".xlsx", ".xls", ".csv"))
]
for name in tabular_files:
    print("  -", name)

path = os.path.join(DATA_DIR, DATA_FILE)

# Excel extensions go through read_excel; anything else is parsed as CSV.
is_excel = DATA_FILE.lower().endswith((".xlsx", ".xls"))
reader = pd.read_excel if is_excel else pd.read_csv
df = reader(path)

print("Loaded df.shape =", df.shape)
print(df.head(2))
print("\nColumns sample:", list(df.columns)[:15])

from sklearn.model_selection import StratifiedKFold, PredefinedSplit

RANDOM_STATE = 2025
TARGET_COL = "Level_OA"

# Fail early, with a hint, when the label column is missing from the sheet.
if TARGET_COL not in df.columns:
    raise ValueError(
        f"Target column '{TARGET_COL}' not found. "
        f"Available columns (first 30): {df.columns.tolist()[:30]} ..."
    )

# Assign every row to exactly one validation fold, stratified on the target,
# and store the assignment in df["fold_id"] so all models share the same folds.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=RANDOM_STATE)
df["fold_id"] = -1
fold_col_pos = df.columns.get_loc("fold_id")
for fold, (_, val_idx) in enumerate(skf.split(df, df[TARGET_COL])):
    # skf.split yields *positional* row indices, so write through .iloc;
    # label-based .loc is only equivalent when the index is a default RangeIndex.
    df.iloc[val_idx, fold_col_pos] = fold

df["fold_id"] = df["fold_id"].astype(int)
print("\nfold_id counts:\n", df["fold_id"].value_counts().sort_index())

# Replays the stored fold assignment inside scikit-learn CV helpers.
cv = PredefinedSplit(test_fold=df["fold_id"].values)

# ==============================
# Feature-set definitions for the X / Y / Z ablation.
#
# Columns that are never model inputs. Names that do not exist in df are
# discarded by the filter below, so a misspelt name would silently leave the
# real column in the feature set.
# NOTE(review): this list originally named "PP_OA" while the second cell of
# this notebook excludes "P_OA"; both spellings are excluded here. Confirm
# which one is the real column name.
DROP_COLS = ["Name", TARGET_COL, "fold_id", "PP_OA", "P_OA"]
DROP_COLS = [c for c in DROP_COLS if c in df.columns]
drop_set = set(DROP_COLS)
feature_cols_xyz = [c for c in df.columns if c not in drop_set]

Y_COLS = ["G"]
# "P_PPAR" was listed twice; each Z column is now named once.
Z_COLS = ["P_PPAR", "P_PI3K", "P_ROS", "P_LPS"]

for c in (Y_COLS + Z_COLS):
    if c not in df.columns:
        raise ValueError(
            f"Column '{c}' is not in df. Please update Y_COLS/Z_COLS "
            f"to match the real column names in your dataset."
        )

# Build the exclusion sets once instead of once per column.
y_set = set(Y_COLS)
z_set = set(Z_COLS)
yz_set = y_set | z_set

# Each list keeps the column order of df.
feature_cols_x = [c for c in feature_cols_xyz if c not in yz_set]
feature_cols_xy = [c for c in feature_cols_xyz if c not in z_set]
feature_cols_xz = [c for c in feature_cols_xyz if c not in y_set]

print("\n#features:")
print("  X-only:", len(feature_cols_x))
print("  X+Y:", len(feature_cols_xy))
print("  X+Z:", len(feature_cols_xz))
print("  X+Y+Z:", len(feature_cols_xyz))

import numpy as np
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_validate

# Gradient-boosted classifier shared by every feature set. The estimator
# itself runs single-threaded (n_jobs=1); eval_feature_set requests
# parallelism at the cross-validation level instead.
lgbm = LGBMClassifier(
    n_estimators=300,
    learning_rate=0.05,
    num_leaves=31,
    n_jobs=1,
    random_state=RANDOM_STATE,
)

# Scaling lives inside the pipeline so it is re-fitted on each training split.
pipe = Pipeline(steps=[("scaler", StandardScaler()), ("clf", lgbm)])

# Short keys -> scikit-learn scorer names; cross_validate reports "test_<key>".
scoring = dict(acc="accuracy", f1="f1_macro", auc="roc_auc_ovr")

def eval_feature_set(cols, name):
    """Cross-validate the module-level ``pipe`` on one subset of columns.

    Reads the module-level ``df``, ``TARGET_COL``, ``pipe``, ``cv`` and
    ``scoring``.

    Args:
        cols: column names of ``df`` used as model inputs.
        name: label stored under ``"FeatureSet"`` in the result.

    Returns:
        dict with ``FeatureSet``, ``n_features`` and, for accuracy, macro-F1
        and macro-AUC, the mean and standard deviation (``ndarray.std``
        defaults) of the per-fold test scores.
    """
    fold_scores = cross_validate(
        pipe,
        df[cols].values,
        df[TARGET_COL].values,
        cv=cv,
        scoring=scoring,
        n_jobs=-1,
        return_train_score=False,
    )

    summary = {"FeatureSet": name, "n_features": len(cols)}
    # (result-key prefix, cross_validate output key), in output column order.
    for label, key in (("Accuracy", "test_acc"),
                       ("MacroF1", "test_f1"),
                       ("MacroAUC", "test_auc")):
        per_fold = fold_scores[key]
        summary[f"{label}_mean"] = per_fold.mean()
        summary[f"{label}_std"] = per_fold.std()
    return summary

# Evaluate the four ablation arms in a fixed order; one result row per arm.
ablation_arms = [
    (feature_cols_x, "X-only"),
    (feature_cols_xy, "X+Y"),
    (feature_cols_xz, "X+Z"),
    (feature_cols_xyz, "X+Y+Z"),
]
rows = [eval_feature_set(arm_cols, arm_name) for arm_cols, arm_name in ablation_arms]

res_df = pd.DataFrame(rows)

def pm(m, s, nd=3):
    """Format a mean and a spread as ``"<mean>±<spread>"``.

    Args:
        m: central value.
        s: spread value (e.g. a standard deviation).
        nd: number of decimals used for both numbers.

    Returns:
        The two fixed-point numbers joined by ``±``.
    """
    spec = f".{nd}f"
    return "±".join(format(value, spec) for value in (m, s))

# Human-readable table: one "mean±std" string per metric and feature set.
pretty_columns = {
    "FeatureSet": res_df["FeatureSet"],
    "n_features": res_df["n_features"],
}
# (displayed header, column-name stem in res_df), in output column order.
for header, stem in (("Accuracy", "Accuracy"),
                     ("Macro-F1", "MacroF1"),
                     ("Macro-AUC", "MacroAUC")):
    means = res_df[f"{stem}_mean"]
    stds = res_df[f"{stem}_std"]
    pretty_columns[header] = [pm(mu, sd) for mu, sd in zip(means, stds)]
pretty = pd.DataFrame(pretty_columns)

print("\n=== LightGBM 10-fold (fixed folds) ===")
print(pretty)

# The summary is written next to the input data.
out_path = os.path.join(DATA_DIR, "lgbm_ablation_results.xlsx")
pretty.to_excel(out_path, index=False)
print("\nSaved:", out_path)


In [None]:
import os
import time
import random
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, label_binarize
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score

from scipy.special import softmax
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from lightgbm import LGBMClassifier


# One seed for Python's `random`, NumPy's legacy global RNG and PyTorch.
SEED = 2025
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Input / output workbooks; the output name carries a timestamp taken when
# this cell runs.
PROJECT_DIR = r"C:\PY3\pythonProject"
PATH_MERGED = PROJECT_DIR + r"\merged_all.xlsx"

PATH_IN_7565 = PROJECT_DIR + r"\7565.xlsx"
_stamp = time.strftime("%Y%m%d_%H%M%S")
PATH_OUT_7565 = rf"{PROJECT_DIR}\7565_filled_{_stamp}.xlsx"

# Column names and cross-validation size shared by the rest of the cell.
LABEL_COL = "Level_OA"
NAME_COL = "Name"
FOLD_COL = "fold_id"
N_SPLITS = 10

def nan_inf_to_nan(X: np.ndarray) -> np.ndarray:
    """Return a float copy of ``X`` in which every non-finite entry is NaN.

    The input array is not modified.
    """
    as_float = X.astype(float)
    return np.where(np.isfinite(as_float), as_float, np.nan)


def mean_impute_fit_transform(X_train_raw: np.ndarray):
    """Learn per-column means on the training matrix and fill its gaps with them.

    NaN and +/-inf both count as missing. A column with no finite value at
    all gets a mean of 0.0.

    Args:
        X_train_raw: 2-D training matrix (rows are samples).

    Returns:
        ``(X_filled, col_mean)`` - the imputed matrix and the learned column
        means, both as float32.
    """
    cleaned = nan_inf_to_nan(X_train_raw)

    # nanmean is NaN for a column without any finite entry; fall back to 0.0.
    means = np.nanmean(cleaned, axis=0)
    means = np.where(np.isfinite(means), means, 0.0)

    # Overwrite each missing cell with the mean of the column it sits in.
    missing_rows, missing_cols = np.where(np.isnan(cleaned))
    cleaned[missing_rows, missing_cols] = means[missing_cols]
    return cleaned.astype(np.float32), means.astype(np.float32)


def mean_impute_transform(X_raw: np.ndarray, col_mean: np.ndarray):
    """Fill missing cells of ``X_raw`` with previously learned column means.

    NaN and +/-inf both count as missing.

    Args:
        X_raw: 2-D matrix to impute (rows are samples).
        col_mean: one fill value per column of ``X_raw``.

    Returns:
        The imputed matrix as float32; ``X_raw`` is left untouched.
    """
    cleaned = nan_inf_to_nan(X_raw)
    gaps = np.isnan(cleaned)
    # Broadcast the per-column means across rows and pick them where a gap is.
    filled = np.where(gaps, np.asarray(col_mean)[np.newaxis, :], cleaned)
    return filled.astype(np.float32)
# Load the merged table and store the label column as integers.
df = pd.read_excel(PATH_MERGED)
assert LABEL_COL in df.columns, f"缺列：{LABEL_COL}"
df[LABEL_COL] = df[LABEL_COL].astype(int)

# Assign every row to exactly one validation fold, stratified on the label.
skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
df[FOLD_COL] = -1
fold_col_pos = df.columns.get_loc(FOLD_COL)
for fold, (_, val_idx) in enumerate(skf.split(df, df[LABEL_COL])):
    # skf.split yields *positional* row indices, so write through .iloc;
    # label-based .loc is only equivalent when the index is a default RangeIndex.
    df.iloc[val_idx, fold_col_pos] = fold

print("Number of rows:", len(df))
print("Fold distribution:", df[FOLD_COL].value_counts().to_dict())


# Baseline: LightGBM on every column except identifiers / label / fold id.
baseline_drop = {NAME_COL, LABEL_COL, FOLD_COL, "P_OA"}  # same exclusions as the earlier run
baseline_features = [c for c in df.columns if c not in baseline_drop]
# NOTE(review): unlike X_cols further down, this list is not restricted to
# numeric columns; a text column would make the mean imputer fail. Confirm
# the sheet holds only numeric features besides NAME_COL.

# The rest of this cell treats +/-inf as missing; do the same here, because
# SimpleImputer only accepts NaN as the missing marker and rejects infinities.
X_base = df[baseline_features].replace([np.inf, -np.inf], np.nan)
y_base = df[LABEL_COL].values

lgbm = LGBMClassifier(
    num_leaves=31,
    learning_rate=0.05,
    n_estimators=300,
    random_state=SEED,
    verbose=-1
)

# The imputer sits inside the pipeline so its means come from each training split only.
pipe = Pipeline([
    ("imputer", SimpleImputer(strategy="mean")),
    ("model", lgbm)
])

# Same splitter settings (n_splits, shuffle, seed) as the one that filled FOLD_COL.
cv = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)
cv_results = cross_validate(
    pipe, X_base, y_base, cv=cv,
    scoring=["accuracy", "f1_macro", "roc_auc_ovr"],
    return_train_score=False
)
print("Accuracy:", cv_results["test_accuracy"].mean())
print("Macro-F1:", cv_results["test_f1_macro"].mean())
print("AUC:", cv_results["test_roc_auc_ovr"].mean())

# Columns that the "+Y" and "+Z" feature modes add on top of X_cols.
need_cols = ["G", "P_PI3K", "P_PPAR", "P_ROS", "P_LPS"]
absent = [col for col in need_cols if col not in df.columns]
assert not absent, f"缺列：{absent[0]}"

# X_cols = every remaining numeric column, in the column order of df.
exclude_for_X = {NAME_COL, LABEL_COL, FOLD_COL, "P_OA", *need_cols}
X_cols = []
for col in df.columns:
    if col in exclude_for_X:
        continue
    if np.issubdtype(df[col].dtype, np.number):
        X_cols.append(col)
print(f"结构描述符个数: {len(X_cols)}")


def feature_columns(mode: str):
    """Return the list of input columns for one ablation mode.

    Args:
        mode: one of ``"X-only"``, ``"X+Y"``, ``"X+Z"``, ``"X+Y+Z"``.

    Returns:
        The module-level ``X_cols`` list itself for ``"X-only"``; otherwise a
        new list made of ``X_cols`` followed by the extra columns of the mode.

    Raises:
        ValueError: ``mode`` is not one of the four known names.
    """
    extras_by_mode = {
        "X-only": [],
        "X+Y": ["G"],
        "X+Z": ["P_PI3K", "P_PPAR", "P_ROS", "P_LPS"],
        "X+Y+Z": ["G", "P_PI3K", "P_PPAR", "P_ROS", "P_LPS"],
    }
    if mode not in extras_by_mode:
        raise ValueError(mode)

    extras = extras_by_mode[mode]
    return X_cols + extras if extras else X_cols


class TinyTabTransformer(nn.Module):
    """Small Transformer-encoder classifier over one flat feature vector per sample.

    The whole feature vector is projected to ``d_model`` and handed to the
    encoder as a sequence of length one, then layer-normalised and classified
    by a two-layer MLP head.

    Args:
        in_dim: number of input features.
        n_classes: number of output logits.
        d_model: width of the projected representation.
        n_heads: attention heads per encoder layer.
        n_layers: number of stacked encoder layers.
        dropout: dropout rate used in the encoder layers and in the head.
    """

    def __init__(self, in_dim, n_classes=3, d_model=64, n_heads=4, n_layers=2, dropout=0.35):
        super().__init__()
        hidden = d_model // 2

        self.proj = nn.Linear(in_dim, d_model)
        self.encoder = nn.TransformerEncoder(
            nn.TransformerEncoderLayer(
                d_model=d_model,
                nhead=n_heads,
                dim_feedforward=2 * d_model,
                dropout=dropout,
                activation="gelu",
                batch_first=True,
            ),
            num_layers=n_layers,
        )
        self.norm = nn.LayerNorm(d_model)
        self.head = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, n_classes),
        )

    def forward(self, x):
        """Map a ``(batch, in_dim)`` tensor to ``(batch, n_classes)`` logits."""
        # Insert a length-1 sequence axis: (batch, d_model) -> (batch, 1, d_model).
        tokens = self.proj(x)[:, None, :]
        encoded = self.encoder(tokens)
        # Drop the sequence axis again before normalising.
        pooled = self.norm(encoded[:, 0, :])
        return self.head(pooled)


def train_one_fold(train_X, train_y, val_X, val_y,
                   epochs=500, batch_size=16, lr=1e-3, weight_decay=1e-4,
                   d_model=64, n_heads=4, n_layers=2, dropout=0.35, patience=50):
    """Train a TinyTabTransformer on one CV fold and score it on the held-out part.

    Labels must be the consecutive integers ``1..K`` in ``train_y``; they are
    shifted to ``0..K-1`` for the loss and for all metrics.

    Training uses Adam with inverse-frequency class weights in the
    cross-entropy loss. After every epoch the validation macro-F1 is computed;
    the best-scoring weights are kept and training stops once ``patience``
    consecutive epochs bring no improvement.

    NOTE(review): the checkpoint is selected on the same validation fold whose
    metrics are returned, so the reported scores include that selection effect.

    Args:
        train_X, val_X: 2-D float feature matrices (rows are samples).
        train_y, val_y: 1-D integer label arrays with values in ``1..K``.
        epochs: maximum number of passes over the training data.
        batch_size: mini-batch size for both loaders.
        lr, weight_decay: Adam hyper-parameters.
        d_model, n_heads, n_layers, dropout: forwarded to TinyTabTransformer.
        patience: early-stopping patience in epochs.

    Returns:
        ``(acc, f1, auc)`` on the validation data for the selected weights.
        ``auc`` is NaN when ``roc_auc_score`` rejects the fold with a
        ``ValueError``.

    Raises:
        ValueError: the classes found in ``train_y`` are not exactly ``1..K``.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    classes, counts = np.unique(train_y, return_counts=True)
    n_classes = len(classes)
    # The "- 1" shift below and the weight order both rely on labels 1..K;
    # say so explicitly instead of failing later with an opaque KeyError.
    if not np.array_equal(classes, np.arange(1, n_classes + 1)):
        raise ValueError(
            f"train_y must contain exactly the labels 1..{n_classes}, "
            f"got {classes.tolist()}"
        )

    model = TinyTabTransformer(
        in_dim=train_X.shape[1], n_classes=n_classes,
        d_model=d_model, n_heads=n_heads, n_layers=n_layers, dropout=dropout
    ).to(device)

    # Inverse-frequency weights (total / class count), ordered like `classes`,
    # i.e. position i belongs to shifted label i.
    weights = torch.tensor(counts.sum() / counts, dtype=torch.float32).to(device)

    criterion = nn.CrossEntropyLoss(weight=weights)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)

    train_ds = TensorDataset(torch.tensor(train_X, dtype=torch.float32),
                             torch.tensor(train_y - 1, dtype=torch.long))
    val_ds = TensorDataset(torch.tensor(val_X, dtype=torch.float32),
                           torch.tensor(val_y - 1, dtype=torch.long))

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    best_f1, best_state, no_improve = -1.0, None, 0

    for _ in range(epochs):
        model.train()
        for bx, by in train_loader:
            bx, by = bx.to(device), by.to(device)
            logits = model(bx)
            loss = criterion(logits, by)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # Validation pass used only for checkpoint selection / early stopping.
        model.eval()
        all_pred, all_true = [], []
        with torch.no_grad():
            for bx, by in val_loader:
                bx = bx.to(device)
                logits = model(bx)
                prob = torch.softmax(logits, dim=1).cpu().numpy()
                all_pred.append(prob.argmax(axis=1))
                all_true.append(by.numpy())

        y_true = np.concatenate(all_true)
        y_pred = np.concatenate(all_pred)
        f1 = f1_score(y_true, y_pred, average="macro")

        if f1 > best_f1:
            best_f1 = f1
            # Keep a CPU copy so later optimizer steps cannot alter it.
            best_state = {k: v.cpu().clone() for k, v in model.state_dict().items()}
            no_improve = 0
        else:
            no_improve += 1
            if no_improve >= patience:
                break

    # best_state is still None when no epoch ran (epochs <= 0); in that case
    # the freshly initialised model is scored as-is.
    if best_state is not None:
        model.load_state_dict(best_state)
    model.eval()
    with torch.no_grad():
        val_logits = model(torch.tensor(val_X, dtype=torch.float32).to(device)).cpu().numpy()
        val_prob = softmax(val_logits, axis=1)
        val_pred = val_prob.argmax(axis=1)

    y_true0 = (val_y - 1)
    acc = accuracy_score(y_true0, val_pred)
    f1 = f1_score(y_true0, val_pred, average="macro")

    try:
        # One indicator column per class the model outputs, so the label
        # matrix lines up with the columns of val_prob.
        auc = roc_auc_score(label_binarize(y_true0, classes=list(range(n_classes))),
                            val_prob, average="macro", multi_class="ovr")
    except ValueError:
        # Only the metric's own input rejection is tolerated; anything else
        # (including KeyboardInterrupt) propagates.
        auc = np.nan

    return acc, f1, auc


# Run the four feature modes over the stored folds and collect per-mode
# mean / std of accuracy, macro-F1 and AUC.
modes = ["X-only", "X+Y", "X+Z", "X+Y+Z"]
results = {}

for mode in modes:
    cols = feature_columns(mode)
    print(f"\n=== Mode: {mode} | Number of features: {len(cols)} ===")

    per_fold = []
    for fold in range(N_SPLITS):
        in_val = df[FOLD_COL] == fold
        in_train = ~in_val

        X_train_raw = df.loc[in_train, cols].values
        y_train = df.loc[in_train, LABEL_COL].values
        X_val_raw = df.loc[in_val, cols].values
        y_val = df.loc[in_val, LABEL_COL].values

        # Imputation means and scaling statistics come from the training part only.
        X_train_imp, col_mean = mean_impute_fit_transform(X_train_raw)
        X_val_imp = mean_impute_transform(X_val_raw, col_mean)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train_imp).astype(np.float32)
        X_val = scaler.transform(X_val_imp).astype(np.float32)

        per_fold.append(train_one_fold(X_train, y_train, X_val, y_val))

    # Regroup the (acc, f1, auc) triples into one list per metric.
    acc_list, f1_list, auc_list = (list(metric) for metric in zip(*per_fold))

    # AUC may be NaN for some folds, hence the nan-aware reductions.
    summary = {
        "acc_mean": np.mean(acc_list),
        "acc_std": np.std(acc_list),
        "f1_mean": np.mean(f1_list),
        "f1_std": np.std(f1_list),
        "auc_mean": np.nanmean(auc_list),
        "auc_std": np.nanstd(auc_list),
    }
    results[mode] = summary

    print(f"[{mode}] 10-fold 结果："
          f"Acc={summary['acc_mean']:.3f}±{summary['acc_std']:.3f} | "
          f"F1={summary['f1_mean']:.3f}±{summary['f1_std']:.3f} | "
          f"AUC={summary['auc_mean']:.3f}±{summary['auc_std']:.3f}")

print(pd.DataFrame(results).T)

# Optional step: fill missing numeric values in the large 7565 table.
# Skipped when the input workbook is not present.
if not os.path.exists(PATH_IN_7565):
    print(f"[Skipped] File not found: {PATH_IN_7565}")
else:
    df_big = pd.read_excel(PATH_IN_7565)

    num_cols = [c for c in df_big.columns if np.issubdtype(df_big[c].dtype, np.number)]
    # Only inf / -inf are treated as missing; zeros are kept as real values.
    df_big[num_cols] = df_big[num_cols].replace([np.inf, -np.inf], np.nan)

    # Fill with the column means of this table itself (filling with
    # training-set means instead would need separate code).
    # Columns without a single observed value are left as they are: the mean
    # imputer drops such columns from its output by default, which would make
    # the column-wise assignment below fail on a shape mismatch.
    fillable_cols = [c for c in num_cols if df_big[c].notna().any()]
    if fillable_cols:
        imputer = SimpleImputer(strategy="mean")
        df_big[fillable_cols] = imputer.fit_transform(df_big[fillable_cols])

    os.makedirs(os.path.dirname(PATH_OUT_7565), exist_ok=True)
    df_big.to_excel(PATH_OUT_7565, index=False)
    print(f"[OK] 已输出: {PATH_OUT_7565}")
