1) Загрузка X/y + группы

In [None]:
# === BUILD X/y FROM FILES (supports new "forest_1.biom" and old "(2.5).biom") ===
from pathlib import Path
import re, numpy as np, pandas as pd
import h5py

DATA_DIR = Path(".")   # root folder that is searched for .biom / .tsv files
ALLOW_TSV = False      # switch to True (and implement read_tsv_genus()) once TSV inputs are used

# === Canonical classes: short label -> MGnify biome ontology string ===
CLASS_CANON = {
    "forest":       "root:Environmental:Terrestrial:Soil:Forest soil",
    "wetland":      "root:Environmental:Terrestrial:Soil:Wetlands",
    "grassland":    "root:Environmental:Terrestrial:Soil:Grasslands",
    "desert":       "root:Environmental:Terrestrial:Soil:Desert",
    "agricultural": "root:Environmental:Terrestrial:Soil:Agricultural",
}
# Only these short labels survive normalisation.
ALLOWED = set(CLASS_CANON)

# Synonyms and legacy labels, each mapped onto one canonical short label
NAME_NORMALIZE = {
    "tropical_forest": "forest",
    "peat_bog": "wetland",
    "temperate_agri": "agricultural",
    "grassland_cropland": "grassland",
    "swamp": "wetland",
    "meadow": "grassland",
    "cropland": "agricultural",
    "agri": "agricultural",
    "farm": "agricultural",
}

# Policy for "arctic"/"tundra" labels coming from old data sets:
# "drop" discards them, "map_to_grassland" folds them into the grassland class.
LEGACY_ARCTIC_POLICY = "drop"   # or "map_to_grassland"
if LEGACY_ARCTIC_POLICY == "map_to_grassland":
    NAME_NORMALIZE.update({"arctic": "grassland", "tundra": "grassland"})

def normalize_label(raw: str):
    """Map a raw label onto ``(short_label, ontology_str)``.

    The label is stripped, lower-cased and passed through ``NAME_NORMALIZE``.
    ``(None, None)`` is returned when the label must be discarded: ``raw`` is
    ``None``, the label is "arctic" under the "drop" policy, or the result is
    not one of the canonical classes in ``ALLOWED``.
    """
    discarded = (None, None)
    if raw is None:
        return discarded
    key = raw.strip().lower()
    canonical = NAME_NORMALIZE.get(key, key)
    arctic_dropped = canonical == "arctic" and LEGACY_ARCTIC_POLICY == "drop"
    if arctic_dropped or canonical not in ALLOWED:
        return discarded
    return canonical, CLASS_CANON[canonical]

# --- file-name parsers ---
def parse_label_from_new(name: str):
    """Parse a file name of the new scheme ``<label>_<tag>.<ext>``.

    Examples: ``forest_1.biom``, ``wetland-07.tsv``, ``tropical_forest_3.biom``.

    The label part may itself contain ``_`` or ``-`` so that multi-word
    synonyms listed in ``NAME_NORMALIZE`` (e.g. ``tropical_forest``,
    ``peat_bog``) are recognised; the tag is the last alphanumeric run before
    the extension, separated from the label by ``_`` or ``-``.

    Returns:
        ``(label, tag)`` where ``label`` is the canonical short label, or
        ``None`` when the raw label is not recognised by ``normalize_label``
        (the tag is still returned in that case).
        ``(None, None)`` when the name does not follow the scheme at all.
    """
    # The label group is greedy, so the regex engine backtracks to the LAST
    # separator: everything before it is the label, everything after the tag.
    m = re.match(r"^([A-Za-z][A-Za-z0-9_\-]*)[_\-]([A-Za-z0-9]+)\.(biom|tsv)$", name)
    if not m:
        return None, None
    raw, tag = m.group(1), m.group(2)
    lab, _ = normalize_label(raw)
    return lab, tag

def parse_label_from_old(name: str):
    """Parse a file name of the legacy scheme ``... (N[.tag]).biom``.

    ``N`` (the part of the parenthesised token before the first dot) is looked
    up in a digit -> biome table and then normalised; whatever follows the
    first dot becomes the tag.

    Returns:
        ``(label, tag)``; ``label`` is ``None`` when the digit is unmapped or
        the biome is rejected by ``normalize_label``; ``tag`` is ``None`` when
        the token contains no dot. ``(None, None)`` when the name does not end
        in ``(<digits/dots>).biom``.
    """
    found = re.search(r"\(([\d\.]+)\)\.biom$", name)
    if found is None:
        return None, None
    # Leading digit -> legacy biome name; "5" (volcanic) is intentionally not mapped.
    legacy_biomes = {
        "1": "desert",
        "2": "arctic",
        "3": "forest",
        "4": "wetland",
        "6": "agricultural",
    }
    digit, dot, rest = found.group(1).partition(".")
    label, _ = normalize_label(legacy_biomes.get(digit))
    return label, (rest if dot else None)

def clean_stem(stem: str) -> str:
    """Turn a file stem into an identifier-like sample key.

    Copy markers ("— копия" / "копия") are removed, whitespace runs become a
    single ``_``, and every run of characters outside ``A-Za-z0-9_.()-`` is
    replaced by ``_``.
    """
    text = stem.strip()
    # The longer marker goes first so that its dash is removed together with the word.
    for marker in ("— копия", "копия"):
        text = text.replace(marker, "")
    text = re.sub(r"\s+", "_", text)
    return re.sub(r"[^A-Za-z0-9_.()-]+", "_", text)

# --- read BIOM (HDF5) -> genus-level profile ---
def _decode_text(value) -> str:
    """Return *value* as ``str``; byte strings read from HDF5 are decoded."""
    return value.decode() if isinstance(value, (bytes, np.bytes_)) else str(value)


def _taxonomy_to_genus(lineage: str) -> str:
    """Pick a genus-level name out of a ``;``-separated lineage string.

    The first non-empty ``g__`` entry wins. Otherwise the deepest entry that is
    non-empty after removing its rank prefix is used, and "Unclassified" is
    returned when nothing is left.
    """
    ranks = [part.strip() for part in str(lineage).split(";")]
    for rank in ranks:
        if rank.startswith("g__"):
            name = rank[3:].strip()
            if name:
                return name
    for rank in reversed(ranks):
        # "[a-z]+" (not a single letter) so that multi-letter rank prefixes
        # such as "sk__" are stripped as well.
        name = re.sub(r"^[a-z]+__", "", rank).strip()
        if name:
            return name
    return "Unclassified"


def read_biom_hdf5_genus(path: Path) -> pd.Series:
    """Read an HDF5 BIOM table and return one genus-level relative-abundance profile.

    The sparse matrix stored under ``observation/matrix`` is walked row by row
    (one row per observation); values are summed per genus for every sample and
    each sample is scaled to sum to 1 (samples whose total is 0 are left as
    they are). A single-sample table yields that sample's profile; for several
    samples the per-sample profiles are averaged, with a genus missing from a
    sample counted as 0.

    Args:
        path: location of the ``.biom`` (HDF5) file.

    Returns:
        ``pd.Series`` indexed by genus name, sorted by descending abundance.

    Raises:
        ValueError: the table contains no samples.
    """
    # Pull everything that is needed into memory, then release the file handle.
    with h5py.File(path, "r") as f:
        n_obs = f["observation/ids"].shape[0]
        n_samples = f["sample/ids"].shape[0]
        data = f["observation/matrix/data"][...]
        indices = f["observation/matrix/indices"][...]
        indptr = f["observation/matrix/indptr"][...]
        tax_ds = f.get("observation/metadata/taxonomy")
        raw_tax = tax_ds[...] if tax_ds is not None else None

    if n_samples == 0:
        raise ValueError(f"{path}: BIOM table contains no samples")

    # One lineage string per observation; per-rank arrays are joined with "; ".
    if raw_tax is None:
        lineages = [""] * n_obs
    else:
        lineages = [
            "; ".join(_decode_text(e) for e in item)
            if isinstance(item, (np.ndarray, list))
            else _decode_text(item)
            for item in raw_tax
        ]
    genus = [_taxonomy_to_genus(t) for t in lineages]

    # indptr[i]:indptr[i + 1] delimits the stored entries of observation i;
    # `indices` holds the sample (column) index of each stored value.
    per_sample = [{} for _ in range(n_samples)]
    for i in range(n_obs):
        start, stop = indptr[i], indptr[i + 1]
        if start == stop:
            continue
        name = genus[i]
        for col, val in zip(indices[start:stop], data[start:stop]):
            bucket = per_sample[int(col)]
            bucket[name] = bucket.get(name, 0.0) + float(val)

    profiles = []
    for counts in per_sample:
        profile = pd.Series(counts, dtype=float)
        total = profile.sum()
        profiles.append(profile / total if total > 0 else profile)

    if n_samples == 1:
        return profiles[0].sort_values(ascending=False)
    mean_profile = pd.concat(profiles, axis=1).fillna(0.0).mean(axis=1)
    return mean_profile.sort_values(ascending=False)

def read_tsv_genus(path: Path) -> pd.Series:
    """Placeholder for a TSV reader; always raises until a parser is written.

    Raises:
        NotImplementedError: on every call.
    """
    message = "Включи ALLOW_TSV и напиши разбор TSV, если он понадобится."
    raise NotImplementedError(message)

# --- assembly: scan DATA_DIR, build one genus profile per file, export X / y ---
records, labels, labels_onto, logs = {}, {}, {}, []
files = list(DATA_DIR.rglob("*.biom"))
if ALLOW_TSV:
    files.extend(DATA_DIR.rglob("*.tsv"))

for p in sorted(files):
    name = p.name
    # Try the new naming scheme first, then fall back to the legacy one.
    label, tag = parse_label_from_new(name)
    if label is None:
        label, tag = parse_label_from_old(name)
    if label is None:
        logs.append(f"SKIP (no canonical label): {name}")
        continue
    try:
        reader = read_biom_hdf5_genus if p.suffix == ".biom" else read_tsv_genus
        vec = reader(p)
        vec = vec.head(2000)  # soft cut of the long tail
        base = clean_stem(p.stem)
        # Make the sample key unique: a repeated stem gets "__2", "__3", ...
        key, i = base, 1
        while key in records:
            i += 1
            key = f"{base}__{i}"
        records[key] = vec
        labels[key] = label
        labels_onto[key] = CLASS_CANON[label]
        logs.append(f"OK: {name} → {key} → {len(vec)} genera, label={label}")
    except Exception as e:
        # Best effort: one unreadable file is logged and must not stop the build.
        logs.append(f"FAIL: {name} → {e}")

# Rows = samples, columns = genera; a genus absent from a sample becomes 0.
X = pd.DataFrame(records).T.fillna(0.0)
y = pd.Series(labels, name="label")
y_onto = pd.Series(labels_onto, name="label_ontology")

print("Shape:", X.shape)
print("Class counts:\n", y.value_counts())
print("\n".join(logs[:10]), "\n...", f"({len(logs)} lines)")

for table, target in (
    (X, "X_genus_matrix.csv"),
    (y, "y_labels.csv"),
    (y_onto, "y_labels_ontology.csv"),
):
    table.to_csv(target)
Path("build_log.txt").write_text("\n".join(logs), encoding="utf-8")

print("\nSaved: X_genus_matrix.csv, y_labels.csv, y_labels_ontology.csv, build_log.txt")


2) Финальные пайплайны (с CLR и фильтрацией)

In [4]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np

# NumPy-based CLR (accepts a DataFrame as well as an ndarray)
def clr_np(X, pseudo=1e-6):
    """Centred log-ratio transform applied to every row of a 2-D input.

    ``pseudo`` is added to all entries before the logarithm so that zeros stay
    finite; each row then has its own mean log value subtracted (the log of the
    row's geometric mean).
    """
    log_vals = np.log(np.asarray(X, dtype=float) + pseudo)
    row_centre = log_vals.mean(axis=1, keepdims=True)
    return log_vals - row_centre

from sklearn.base import clone

# Template for the variance filter. Pipeline fits its steps in place, so every
# pipeline receives its OWN unfitted copy: sharing one instance would let a fit
# of one pipeline overwrite the feature mask used by the other.
var_filter = VarianceThreshold(threshold=1e-6)

# Logistic regression on CLR-transformed abundances (elastic-net penalty).
pipe_lr = Pipeline([
    ("var", clone(var_filter)),
    ("clr", FunctionTransformer(clr_np, validate=False)),
    ("sc",  StandardScaler(with_mean=False)),
    ("lr",  LogisticRegression(
        penalty="elasticnet", solver="saga",
        l1_ratio=0.5, C=1.0,
        class_weight="balanced",
        max_iter=6000, n_jobs=-1, random_state=42
    )),
])

# Random forest on the variance-filtered abundances (no CLR, no scaling).
pipe_rf = Pipeline([
    ("var", clone(var_filter)),
    ("rf", RandomForestClassifier(
        n_estimators=900, max_depth=None, max_features="sqrt",
        min_samples_leaf=1, class_weight="balanced",
        random_state=42, n_jobs=-1
    )),
])



3) Честное сравнение на GroupKFold (основная метрика)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import numpy as np
import pandas as pd

# Reload X / y from the exported CSV files when either one is not in memory
# (for example after a kernel restart).
if not {"X", "y"} <= globals().keys():
    X = pd.read_csv("X_genus_matrix.csv", index_col=0)
    y = pd.read_csv("y_labels.csv", index_col=0)["label"]

# NumPy-based CLR
def clr_np(X, pseudo=1e-6):
    """Row-wise centred log-ratio transform of a 2-D input.

    ``pseudo`` is added before taking logs (keeps zeros finite); the mean log
    value of each row is then subtracted from that row.
    """
    shifted = np.asarray(X, dtype=float) + pseudo
    log_vals = np.log(shifted)
    return log_vals - np.mean(log_vals, axis=1, keepdims=True)

from sklearn.base import clone

# Template for the variance filter. Pipeline fits its steps in place, so every
# pipeline receives its OWN unfitted copy: sharing one instance would let a fit
# of one pipeline overwrite the feature mask used by the other.
var_filter = VarianceThreshold(threshold=1e-6)

# Logistic regression on CLR-transformed abundances (elastic-net penalty).
pipe_lr = Pipeline([
    ("var", clone(var_filter)),
    ("clr", FunctionTransformer(clr_np, validate=False)),
    ("sc",  StandardScaler(with_mean=False)),
    ("lr",  LogisticRegression(
        penalty="elasticnet", solver="saga",
        l1_ratio=0.5, C=1.0,
        class_weight="balanced",
        max_iter=6000, n_jobs=-1, random_state=42
    )),
])

# Random forest on the variance-filtered abundances (no CLR, no scaling).
pipe_rf = Pipeline([
    ("var", clone(var_filter)),
    ("rf", RandomForestClassifier(
        n_estimators=900, max_depth=None, max_features="sqrt",
        min_samples_leaf=1, class_weight="balanced",
        random_state=42, n_jobs=-1
    )),
])


In [None]:
import re, numpy as np, pandas as pd
from sklearn.model_selection import GroupKFold, cross_val_score

# When the sample id carries no MGYS/MGYA/ERR/SRR accession, the group is the
# sample id itself (i.e. every such sample forms its own group).
def extract_group(s):
    """Return the first MGYS/MGYA/ERR/SRR accession found in ``s``, else ``s`` unchanged."""
    hit = re.search(r"(MGYS\d+|MGYA\d+|ERR\d+|SRR\d+)", s)
    if hit is None:
        return s
    return hit.group(1)

# One group id per row of X; at most 5 folds, fewer when there are fewer groups.
groups = list(map(extract_group, X.index))
cv = GroupKFold(n_splits=min(5, len(set(groups))))


def gkf_score(model):
    """Cross-validate ``model`` on the group folds; return (mean, std) of balanced accuracy."""
    fold_scores = cross_val_score(
        model, X, y,
        cv=cv.split(X, y, groups),
        scoring="balanced_accuracy",
        n_jobs=-1,
    )
    return fold_scores.mean(), fold_scores.std()


s_lr = gkf_score(pipe_lr)
s_rf = gkf_score(pipe_rf)
print("LR+CLR:", s_lr, "RF:", s_rf)

# Pick the pipeline with the higher mean score; a tie goes to LR+CLR.
if s_lr[0] >= s_rf[0]:
    best_model, best_name = pipe_lr, "LR+CLR"
else:
    best_model, best_name = pipe_rf, "RF"
print("Chosen:", best_name)


LR+CLR: (np.float64(0.96), np.float64(0.07999999999999999)) RF: (np.float64(0.96), np.float64(0.07999999999999999))
Chosen: LR+CLR


4) Обучить лучший на всём X и сохранить

In [14]:
import joblib

# Refit the selected pipeline on all samples before exporting it.
best_model.fit(X, y)

# NOTE(review): joblib relies on pickle, which stores plain functions by
# reference; if the chosen pipeline contains clr_np, the loading side
# presumably needs the same function importable under the same name - confirm.
bundle = {
    "pipeline": best_model,
    "feature_names": X.columns.tolist(),
    "classes": sorted(y.unique().tolist()),
}
joblib.dump(bundle, "soil_biome_classifier.joblib")

print("Saved: soil_biome_classifier.joblib")


Saved: soil_biome_classifier.joblib
