# In [None]:
# Runtime environment knobs are written to os.environ BEFORE torch is imported
# just below (and before numpy/pandas/sklearn further down).
# NOTE(review): presumably the native numeric libraries read these once when
# they are loaded, so this block has to stay above those imports — confirm.
import os
# NOTE(review): looks like the usual workaround for the "duplicate OpenMP
# runtime" abort on some Windows/conda installs — confirm it is still needed.
os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
# Request a single thread from each backend named by the variable.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["NUMEXPR_NUM_THREADS"] = "1"
# NOTE(review): verify that torch actually honours TORCH_NUM_THREADS; the
# explicit set_num_threads() call below is what this script relies on.
os.environ["TORCH_NUM_THREADS"] = "1"
# NOTE(review): presumably silences torch C++ log output below ERROR — confirm.
os.environ["TORCH_CPP_LOG_LEVEL"] = "ERROR"

import torch
# Pin torch to one intra-op thread and one inter-op thread.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

import numpy as np
import pandas as pd
import random, joblib

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.metrics import f1_score

# One seed shared by the Python, NumPy and PyTorch random number generators.
SEED = 2025
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(SEED)

# Project root on the author's machine; every other path hangs off it.
BASE = r"C:\Users\81005\Desktop\CYH\3-PFAS"

# Labelled training workbook and the unlabelled workbook to be scored.
PATH_TRAIN = os.path.join(BASE, r"deep_learning_validation\merged_all.xlsx")
PATH_NEW = os.path.join(BASE, r"large_scale_prediction\7565_filled.xlsx")

# All artefacts and reports are written here; create it if it is missing.
OUT_DIR = os.path.join(BASE, r"large_scale_prediction")
os.makedirs(OUT_DIR, exist_ok=True)

# Load the labelled training workbook and the unlabelled workbook to score.
df_tr = pd.read_excel(PATH_TRAIN)
if "Level_OA" not in df_tr.columns:
    # Explicit raise instead of `assert`: assertions are stripped under
    # `python -O`, which would let a missing label column slip through here.
    raise KeyError(f"Training data {PATH_TRAIN!r} has no 'Level_OA' label column")
df_tr["Level_OA"] = df_tr["Level_OA"].astype(int)
df_new = pd.read_excel(PATH_NEW)

# Columns of the training sheet that must never be used as model inputs
# (identifier, label, fold id and the other listed non-feature columns).
EXCLUDE = {"Name","Level_OA","fold_id","G","P_PI3K","P_PPAR","P_ROS","P_LPS","P_OA"}

# Numeric, non-excluded training columns, in training-sheet order.
num_cols_tr = [
    c for c in df_tr.columns
    if c not in EXCLUDE and np.issubdtype(df_tr[c].dtype, np.number)
]
# Numeric columns of the new sheet.
num_cols_new = [c for c in df_new.columns if np.issubdtype(df_new[c].dtype, np.number)]

# Features = numeric columns present in BOTH sheets, keeping training order.
_new_numeric = set(num_cols_new)
COLS_FINAL = [c for c in num_cols_tr if c in _new_numeric]
if not COLS_FINAL:
    # Same message as before, but raised unconditionally (not via `assert`).
    raise ValueError("No common structural feature columns between training data and new data")

# Raw feature matrices; +/-inf is mapped to NaN so it is treated as a
# missing value by whatever imputation runs on these arrays afterwards.
X_all_raw = df_tr[COLS_FINAL].replace([np.inf, -np.inf], np.nan).values
y_all = df_tr["Level_OA"].values
X_new_raw = df_new[COLS_FINAL].replace([np.inf, -np.inf], np.nan).values

# Shift labels so the smallest level becomes class index 0.
y_min = int(y_all.min())
y_all0 = (y_all - y_min).astype(int)

# Number of output classes = largest zero-based label + 1.
# Counting unique values (the previous approach) under-counts when the label
# values have gaps (e.g. levels {1, 3}), which leaves targets outside the
# valid range [0, n_classes) for the classification loss.
n_classes = int(y_all0.max()) + 1

# Single stratified 90/10 hold-out split of the labelled data.
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.1, random_state=SEED)
tr_idx, val_idx = next(sss.split(X_all_raw, y_all0))
X_tr_raw = X_all_raw[tr_idx]
X_val_raw = X_all_raw[val_idx]
y_tr = y_all0[tr_idx]
y_val = y_all0[val_idx]

# Median imputation; the medians are learned from the training rows only.
imputer = SimpleImputer(strategy="median")
X_tr_imp = imputer.fit_transform(X_tr_raw)
X_val_imp = imputer.transform(X_val_raw)

# Standardisation; mean/variance are likewise learned from training rows only.
scaler = StandardScaler()
X_tr = scaler.fit_transform(X_tr_imp)
X_val = scaler.transform(X_val_imp)

# The same fitted imputer + scaler are applied to the full labelled set and
# to the new, unlabelled set.
X_all_std = scaler.transform(imputer.transform(X_all_raw))
X_new_std = scaler.transform(imputer.transform(X_new_raw))

# Persist the fitted preprocessing objects and the feature column order.
for _artifact, _fname in (
    (imputer, "imputer.joblib"),
    (scaler, "scaler.joblib"),
    (COLS_FINAL, "cols_final.joblib"),
):
    joblib.dump(_artifact, os.path.join(OUT_DIR, _fname))

class TinyTabTransformer(nn.Module):
    """Small Transformer-encoder classifier for one feature vector per sample.

    The input vector is linearly embedded, run through the encoder as a
    sequence of length one, layer-normalised and mapped to class logits.

    Args:
        in_dim: Number of input features.
        n_classes: Number of output logits.
        d_model: Embedding width used by the encoder.
        n_heads: Attention heads per encoder layer.
        n_layers: Number of stacked encoder layers.
        dropout: Dropout rate used in the encoder layers and in the head.
    """

    def __init__(self, in_dim, n_classes=3, d_model=64, n_heads=4, n_layers=2, dropout=0.35):
        super().__init__()
        hidden = d_model // 2

        # Feature vector -> d_model embedding.
        self.proj = nn.Linear(in_dim, d_model)

        layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=n_heads,
            dim_feedforward=2 * d_model,
            dropout=dropout,
            activation="gelu",
            batch_first=True,
        )
        self.encoder = nn.TransformerEncoder(layer, num_layers=n_layers)
        self.norm = nn.LayerNorm(d_model)

        # Two-layer MLP producing the class logits.
        self.head = nn.Sequential(
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, n_classes),
        )

    def forward(self, x):
        """Return unnormalised class logits for the feature batch ``x``."""
        # Insert a length-1 axis at dim 1 so the encoder sees one token.
        tokens = self.proj(x).unsqueeze(1)
        encoded = self.encoder(tokens)
        # Drop the length-1 axis again before normalising.
        pooled = encoded.squeeze(1)
        return self.head(self.norm(pooled))

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model = TinyTabTransformer(
    in_dim=X_tr.shape[1],
    n_classes=n_classes,
    d_model=64,
    n_heads=4,
    n_layers=2,
    dropout=0.35,
)
model = model.to(device)

# Wrap the standardised split as float32 features / int64 class indices.
train_ds = TensorDataset(
    torch.tensor(X_tr, dtype=torch.float32),
    torch.tensor(y_tr, dtype=torch.long),
)
val_ds = TensorDataset(
    torch.tensor(X_val, dtype=torch.float32),
    torch.tensor(y_val, dtype=torch.long),
)

# Both loaders load in the main process (no workers, no pinned memory).
_loader_opts = dict(num_workers=0, pin_memory=False, persistent_workers=False)
train_loader = DataLoader(train_ds, batch_size=16, shuffle=True, **_loader_opts)
val_loader = DataLoader(val_ds, batch_size=64, shuffle=False, **_loader_opts)

# Inverse-frequency class weights, rescaled so that their mean is 1.
# Counts are floored at 1 so an empty class cannot cause a division by zero.
cnt_tr = np.maximum(np.bincount(y_tr, minlength=n_classes).astype(np.float32), 1.0)
w_tr = len(y_tr) / cnt_tr
w_tr = torch.tensor(w_tr / w_tr.mean(), dtype=torch.float32, device=device)

criterion = nn.CrossEntropyLoss(weight=w_tr)
optim = torch.optim.Adam(model.parameters(), lr=1e-3, weight_decay=1e-4)

# Early-stopping bookkeeping: best validation macro-F1 so far, a CPU copy of
# the matching weights, epochs since the last improvement, and the budget.
best_f1 = -1.0
best_state = None
no_improve = 0
patience = 50

# Train for at most 500 epochs; after every epoch score the hold-out split
# with macro-F1 and keep a copy of the best weights. Training stops once
# `patience` consecutive epochs bring no improvement.
for ep in range(500):
    model.train()
    for bx, by in train_loader:
        bx, by = bx.to(device), by.to(device)
        logits = model(bx)
        loss = criterion(logits, by)
        optim.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the optimiser step.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim.step()

    # Validation pass: eval mode and no gradient tracking.
    model.eval()
    with torch.no_grad():
        pv, yv = [], []  # per-batch predicted / true class indices
        for bx, by in val_loader:
            bx = bx.to(device)
            logit = model(bx)
            pv.append(logit.argmax(1).cpu().numpy())
            yv.append(by.numpy())

    f1 = f1_score(np.concatenate(yv), np.concatenate(pv), average="macro")
    if f1 > best_f1:
        best_f1 = f1
        # Snapshot the weights on CPU; clone() detaches the copy from the
        # live parameters, which keep changing in later epochs.
        best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        no_improve = 0
    else:
        no_improve += 1
        if no_improve >= patience:
            break

# Restore the weights of the best validation epoch.
model.load_state_dict(best_state)

# Short fine-tuning pass on ALL labelled rows (training + hold-out), starting
# from the weights currently held by `model`. After this step the hold-out
# rows have been trained on, so they no longer give an unbiased score.

# Inverse-frequency class weights over the full labelled set, mean-normalised.
cnt_all = np.maximum(np.bincount(y_all0, minlength=n_classes).astype(np.float32), 1.0)
w_all = len(y_all0) / cnt_all
w_all = torch.tensor(w_all / w_all.mean(), dtype=torch.float32, device=device)

criterion_final = nn.CrossEntropyLoss(weight=w_all)
# Fresh Adam optimiser with a smaller learning rate (5e-4) for this pass.
optim_final = torch.optim.Adam(model.parameters(), lr=5e-4, weight_decay=1e-4)

full_ds = TensorDataset(
    torch.tensor(X_all_std, dtype=torch.float32),
    torch.tensor(y_all0, dtype=torch.long),
)
full_loader = DataLoader(
    full_ds,
    batch_size=32,
    shuffle=True,
    num_workers=0,
    pin_memory=False,
    persistent_workers=False,
)

# Fixed 10 epochs, no validation and no early stopping.
for ep in range(10):
    model.train()
    for bx, by in full_loader:
        bx = bx.to(device)
        by = by.to(device)
        optim_final.zero_grad()
        loss = criterion_final(model(bx), by)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optim_final.step()

# Applicability domain (AD): leverage-style distance in PCA score space.
#
# The labelled set is projected onto the first A principal components; each
# sample's distance is t^T * pinv(cov(T_train)) * t for its score vector t.
# The 95th percentile of the training distances is the cut-off h_star, and a
# new sample counts as "inside the AD" when its distance is <= h_star.

# Number of components: 5, but never more than the data can support.
# PCA raises when n_components exceeds min(n_samples, n_features), which
# happens when only a few feature columns are shared by both workbooks.
A = min(5, X_all_std.shape[0], X_all_std.shape[1])
pca = PCA(n_components=A, random_state=SEED).fit(X_all_std)
T_train = pca.transform(X_all_std)

# np.cov returns a 0-d array when there is a single component; atleast_2d
# keeps the matrix shape that pinv and the einsum below rely on.
cov_T = np.atleast_2d(np.cov(T_train, rowvar=False))
# Pseudo-inverse so a singular covariance matrix does not raise.
inv_cov_T = np.linalg.pinv(cov_T)

# Row-wise quadratic form: h_i = sum_jk T[i, j] * inv_cov[j, k] * T[i, k].
h_train = np.einsum('ij,jk,ik->i', T_train, inv_cov_T, T_train)
h_star = np.quantile(h_train, 0.95)

T_new = pca.transform(X_new_std)
h_new = np.einsum('ij,jk,ik->i', T_new, inv_cov_T, T_new)
ad_in = (h_new <= h_star)

# Score the whole new set in one forward pass (eval mode, no gradients).
model.eval()
_x_new = torch.tensor(X_new_std, dtype=torch.float32, device=device)
with torch.no_grad():
    logits_new = model(_x_new).cpu()

# Class probabilities, zero-based predicted class, and the predicted level
# mapped back to the original label scale via the y_min offset.
prob = logits_new.softmax(dim=1).numpy()
pred0 = np.argmax(prob, axis=1)
pred = y_min + pred0

# Probability columns are named after the ORIGINAL label values, i.e. column
# i of `prob` belongs to level y_min + i. Previously the names always ran
# 1..n_classes, which mislabelled the columns whenever y_min != 1 because
# Pred_Level is shifted by y_min.
prob_cols = [f"Prob_Level_{y_min + i}" for i in range(n_classes)]
out = pd.concat(
    [
        df_new.reset_index(drop=True),
        pd.DataFrame({"Pred_Level": pred, "AD_in": ad_in}),
        pd.DataFrame(prob, columns=prob_cols),
    ],
    axis=1,
)
# Confidence of the prediction = largest class probability.
out["max_prob"] = out[prob_cols].max(axis=1)

out.to_excel(os.path.join(OUT_DIR, "predictions_Xonly.xlsx"), index=False)
print("[OK] 保存：predictions_Xonly.xlsx")


def _rank_level(frame, level):
    """Return the rows of *frame* predicted as *level*, most confident first.

    If the model has no probability column for *level* (fewer than three
    classes, or labels that do not start at 1) no row can carry that
    prediction, so the empty selection is returned unsorted instead of
    raising KeyError on the missing column.
    """
    subset = frame[frame["Pred_Level"] == level]
    col = f"Prob_Level_{level}"
    if col not in subset.columns:
        return subset
    return subset.sort_values(col, ascending=False)


# (level, tag used in the output file names)
_RISK_LEVELS = ((1, "low"), (2, "mid"), (3, "high"))

# Full ranked lists per predicted level, regardless of AD or confidence.
risk_low = _rank_level(out, 1)
risk_mid = _rank_level(out, 2)
risk_high = _rank_level(out, 3)
for (_level, _tag), _frame in zip(_RISK_LEVELS, (risk_low, risk_mid, risk_high)):
    _frame.to_excel(os.path.join(OUT_DIR, f"risk_{_tag}_L{_level}_all.xlsx"), index=False)
print("[OK]save：L1/L2/L3 ")

# "Core" subset: inside the applicability domain AND max probability >= 0.80.
core = out[(out["AD_in"]) & (out["max_prob"] >= 0.80)].copy()
for _level, _tag in _RISK_LEVELS:
    _rank_level(core, _level).to_excel(
        os.path.join(OUT_DIR, f"risk_{_tag}_L{_level}_core.xlsx"), index=False
    )
print("[OK] save：L1/L2/L3 ")

# Summary report: applicability-domain coverage and confidence breakdown,
# printed to stdout and written to a text file and an Excel workbook.
SUMMARY_TXT  = os.path.join(OUT_DIR, "AD_summary.txt")
SUMMARY_XLSX = os.path.join(OUT_DIR, "AD_breakdown.xlsx")

# Confidence bucket from the top class probability:
# >= 0.80 -> "HighConf", >= 0.60 -> "Borderline", otherwise "Empty".
out["ICP_flag"] = np.select(
    [out["max_prob"] >= 0.80, out["max_prob"] >= 0.60],
    ["HighConf", "Borderline"],
    default="Empty",
)

in_ad = out["AD_in"]
high_conf = out["max_prob"] >= 0.80

n_total          = len(out)
cov_leverage     = float(in_ad.mean())
cov_highconf_all = float(high_conf.mean())
# Share of high-confidence samples AMONG the in-domain samples. This was
# previously divided by the total sample count, which yields the joint
# proportion and contradicts the "within AD_in" / "highconf_in_AD" labels.
n_in_ad = int(in_ad.sum())
cov_highconf_in = (
    float((in_ad & high_conf).sum() / n_in_ad) if n_in_ad else float("nan")
)

icp_share = out["ICP_flag"].value_counts(normalize=True)

print("Number of samples:", n_total)
print("Leverage h*:", float(h_star))
print("Leverage coverage:", round(cov_leverage, 3))
print("HighConf (≥0.80) overall proportion:", round(cov_highconf_all, 3))
print("HighConf (≥0.80) proportion within AD_in:", round(cov_highconf_in, 3))
print("ICP_flag distribution:\n", icp_share.round(3))

with open(SUMMARY_TXT, "w", encoding="utf-8") as f:
    f.write(f"n_total={n_total}\n")
    f.write(f"h_star={float(h_star):.6f}\n")
    f.write(f"cov_leverage={cov_leverage:.3f}\n")
    f.write(f"highconf_all={cov_highconf_all:.3f}\n")
    f.write(f"highconf_in_AD={cov_highconf_in:.3f}\n")
    f.write("ICP_flag:\n")
    for k, v in icp_share.items():
        f.write(f"  {k}: {v:.3f}\n")

overall_df = pd.DataFrame({
    "metric": ["n_total", "h_star", "cov_leverage", "highconf_all", "highconf_in_AD"],
    "value": [n_total, float(h_star), cov_leverage, cov_highconf_all, cov_highconf_in],
})
by_icp = out["ICP_flag"].value_counts().to_frame("count")
by_icp["proportion"] = by_icp["count"] / n_total

# Three sheets: headline metrics, per-flag counts, and the first 20 rows.
with pd.ExcelWriter(SUMMARY_XLSX) as w:
    overall_df.to_excel(w, sheet_name="overall", index=False)
    by_icp.to_excel(w, sheet_name="by_ICP_flag")
    out[["AD_in", "ICP_flag", "max_prob"] + prob_cols].head(20)\
        .to_excel(w, sheet_name="preview_top20", index=False)

print("[OK] save：AD_summary.txt / AD_breakdown.xlsx")