In [None]:
from Bio import Phylo
import re
import pandas as pd


def normalize_label(s: str) -> str:
    """Return a canonical form of a species label for matching.

    The label is trimmed, parenthesised annotations are removed, everything
    from the first ``|`` onwards is dropped, underscores become spaces,
    runs of whitespace collapse to a single space, and the result is
    lower-cased.

    Args:
        s: Raw label, or ``None``.

    Returns:
        The normalized label; ``""`` when ``s`` is ``None``.
    """
    if s is None:
        return ""
    # Drop parenthesised annotations, e.g. "Homo sapiens (taxid 9606)".
    text = re.sub(r"\s*\([^)]*\)", "", s.strip())
    # Keep only the part before the first pipe, e.g. "Homo_sapiens|9606".
    text, _, _ = text.partition("|")
    text = text.replace("_", " ")
    # Collapse whitespace runs, trim, and lower-case.
    return re.sub(r"\s+", " ", text).strip().lower()

def find_leaf_by_name(tree, target_name="Homo sapiens"):
    """Find the terminal clade whose label matches ``target_name``.

    Labels are compared after ``normalize_label``. An exact match on the
    normalized label wins immediately. Otherwise the first leaf whose
    normalized label contains every whitespace-separated token of the
    normalized target (substring test) is returned.

    Args:
        tree: Object exposing ``find_clades(terminal=True)``.
        target_name: Species name to look for.

    Returns:
        The matching clade, or ``None`` when nothing matches or when the
        target normalizes to an empty string.
    """
    target_norm = normalize_label(target_name)
    target_tokens = target_norm.split()
    if not target_tokens:
        # all() over an empty token list is vacuously true, which would
        # make every leaf a "partial match" and return an arbitrary leaf.
        return None

    first_partial = None
    for clade in tree.find_clades(terminal=True):
        label = getattr(clade, "name", None) or getattr(clade, "taxon", None)
        if label is None:
            continue
        norm = normalize_label(str(label))
        if not norm:
            continue
        # An exact match takes priority over any partial candidate.
        if norm == target_norm:
            return clade
        # Fallback: the label contains all target tokens (tolerates
        # prefixes, suffixes and extra annotations); keep the first one.
        if first_partial is None and all(tok in norm for tok in target_tokens):
            first_partial = clade
    return first_partial

def load_tree_and_get_evolution(newick_path: str, target_name="Homo sapiens"):
    """Read a Newick tree and measure the distance to one leaf.

    Args:
        newick_path: Path of the Newick file to parse.
        target_name: Species name passed to ``find_leaf_by_name``.

    Returns:
        ``(evolution, clade)`` where ``evolution`` is
        ``tree.distance(clade)`` (with a single target Biopython measures
        from the tree root) and ``clade`` is the matched leaf.

    Raises:
        ValueError: No leaf matches ``target_name``; the message lists up
            to 20 normalized tip labels to help correct the name.
    """
    tree = Phylo.read(newick_path, "newick")
    clade = find_leaf_by_name(tree, target_name=target_name)

    if clade is not None:
        # Sum of branch lengths along the path from the root to the leaf.
        return tree.distance(clade), clade

    # Not found: show a sample of normalized tip labels in the error.
    normalized_tips = sorted(
        {normalize_label(getattr(leaf, "name", "") or "") for leaf in tree.get_terminals()}
    )
    preview = ", ".join(normalized_tips[:20])
    ellipsis = " ..." if len(normalized_tips) > 20 else ""
    raise ValueError(
        f"Could not find leaf for '{target_name}'. "
        f"Try checking exact label. Example tips (normalized) include: "
        + preview + ellipsis
    )

from pathlib import Path

from tqdm.auto import tqdm

# ===== Usage example =====
# For every species in the TSV, record the root-to-leaf distance and the
# matched leaf label (None for both when no leaf matches).
path = "../data/phylogenetic/41564_2016_BFnmicrobiol201648_MOESM209_ESM.txt"  # path to the Newick file
df_species = pd.read_csv('../data/treeoflife.species.tsv', sep='\t')
species_names = df_species['Compact_name'].tolist()

# Parse the tree once; re-reading the file for every species repeats the
# same parse for each row of the table.
tree = Phylo.read(path, "newick")

evolution_list, clade_list = [], []
for name in tqdm(species_names):
    # pandas represents empty cells as NaN (a float). Such values, and names
    # that normalize to nothing, cannot be matched and are recorded as None.
    usable = isinstance(name, str) and bool(normalize_label(name))
    clade = find_leaf_by_name(tree, target_name=name) if usable else None
    if clade is None:
        evolution_list.append(None)
        clade_list.append(None)
        continue
    evolution_list.append(tree.distance(clade))
    clade_list.append(clade.name)
df_species['Evolution'] = evolution_list
df_species['Clade'] = clade_list

# Create the output directory first so to_csv does not fail on a fresh checkout.
out_tsv = 'results/treeoflife.species.with_evolution.tsv'
Path(out_tsv).parent.mkdir(parents=True, exist_ok=True)
df_species.to_csv(out_tsv, sep='\t', index=False)

num_evo_not_none = sum(1 for evo in evolution_list if evo is not None)
print(f"Successfully found evolution for {num_evo_not_none} out of {len(species_names)} species.")

In [None]:
from Bio import Phylo

def get_distance_to_leaf(tree_file_path, species_name):
    """Return the total branch length from the root to a named leaf.

    A leaf matches when its name equals ``species_name`` exactly or equals
    ``species_name`` with spaces replaced by underscores (Newick labels
    often use underscores in place of spaces). The first match is used.

    Args:
        tree_file_path (str): Path to a Newick-format tree file.
        species_name (str): Leaf label to look for.

    Returns:
        The value of ``tree.distance(leaf)`` for the matched leaf, or
        ``None`` when the leaf is not found, the file is missing, or any
        other error occurs (a message is printed in each of those cases).
    """
    try:
        tree = Phylo.read(tree_file_path, "newick")

        # First terminal whose name matches either spelling of the species.
        target_leaf = next(
            (
                leaf
                for leaf in tree.get_terminals()
                if leaf.name == species_name
                or leaf.name == species_name.replace(' ', '_')
            ),
            None,
        )

        if not target_leaf:
            print(f"错误：在树中找不到名为 '{species_name}' 的物种。")
            return None

        # Distance from the tree root to the matched leaf.
        return tree.distance(target_leaf)

    except FileNotFoundError:
        print(f"错误：找不到文件 '{tree_file_path}'。请检查路径是否正确。")
        return None
    except Exception as e:
        print(f"处理文件时发生错误: {e}")
        return None

# --- Usage example ---

# Define the tree file path and the target species.
# Replace the path below with the location of your own Newick file.
file_path = "../data/phylogenetic/41564_2016_BFnmicrobiol201648_MOESM209_ESM.txt"
target_species = "Eukaryota_Opisthokonta_Holozoa_Metazoa_Animalia_Craniata_Mammalia_Homo_sapiens_human"

# Call the function; it returns None when the species or the file is not found.
total_branch_length = get_distance_to_leaf(file_path, target_species)

# Print the result only when a distance was obtained.
if total_branch_length is not None:
    print(f"从根节点到 '{target_species}' 的总分支长度为: {total_branch_length} (单位：核苷酸替换/位点)")

In [None]:
import csv
import re
from pathlib import Path
from typing import Optional, Tuple, Dict

from Bio import Phylo

# Capitalised genus token, e.g. Homo, Apis, Tribolium.
GENUS_RE = re.compile(r"^[A-Z][a-z]+$")
# Lower-case species epithet (digits and hyphens allowed after the first
# letter), e.g. sapiens, mellifera, castaneum.
SPECIES_RE = re.compile(r"^[a-z][a-z0-9\-]*$")

def extract_genus_species_from_tree_label(label: str) -> Optional[str]:
    """Extract ``'Genus species'`` from a long underscore-separated leaf label.

    The label is split on ``_`` and adjacent token pairs are scanned from
    right to left; the first pair matching ``GENUS_RE`` / ``SPECIES_RE`` is
    returned, so trailing lower-case tokens such as a common name are
    skipped. A trailing pair that itself looks like "Capitalised lower"
    is returned as-is.

    Args:
        label: Leaf label, e.g. ``'..._Mammalia_Homo_sapiens_human'``.

    Returns:
        ``'Genus species'``, or ``None`` for an empty label or when no
        adjacent pair matches.
    """
    if not label:
        return None
    parts = label.split('_')
    # Adjacent (token, next token) pairs, rightmost pair first.
    adjacent_pairs = list(zip(parts, parts[1:]))
    for genus, epithet in reversed(adjacent_pairs):
        if GENUS_RE.match(genus) and SPECIES_RE.match(epithet):
            return f"{genus} {epithet}"
    return None

def extract_genus_species_from_compact(name: str) -> Optional[str]:
    """Extract ``'Genus species'`` from a TSV ``Compact_name`` value.

    Only the first two whitespace-separated tokens are used, so
    ``'Synechococcus elongatus PCC7942'`` yields
    ``'Synechococcus elongatus'``. Anything in the second token from the
    first character that is neither a word character nor a hyphen is
    discarded (``'castaneum,'`` becomes ``'castaneum'``).

    Args:
        name: Compact species name.

    Returns:
        ``'Genus species'``, or ``None`` when the name is empty, has fewer
        than two tokens, the first token does not match ``GENUS_RE``, the
        second token is an open-nomenclature marker (``sp``/``cf``), or it
        does not match ``SPECIES_RE``.
    """
    if not name:
        return None
    # str.split() with no argument trims and splits on whitespace runs.
    toks = name.split()
    if len(toks) < 2:
        return None
    g, s = toks[0], toks[1]
    if not GENUS_RE.match(g):
        return None
    # Cut the epithet at the first non-word, non-hyphen character. This is
    # done before the marker check so that "sp.", "sp.," and "cf.x" all
    # reduce to the bare marker instead of slipping through as an epithet.
    s = re.sub(r"[^\w\-].*$", "", s)
    # Undetermined species cannot be matched to a tree leaf by name.
    if s.lower() in {"sp", "cf"}:
        return None
    if not s or not SPECIES_RE.match(s):
        return None
    return f"{g} {s}"

def build_gs_to_leaf(tree) -> Dict[str, object]:
    """Index the tree's terminals by their extracted ``'Genus species'``.

    Leaves whose label yields no genus/species pair are skipped. When
    several leaves map to the same key, the first one encountered is kept.

    Args:
        tree: Object exposing ``get_terminals()``.

    Returns:
        Mapping from ``'Genus species'`` to the corresponding leaf.
    """
    index: Dict[str, object] = {}
    for terminal in tree.get_terminals():
        key = extract_genus_species_from_tree_label(terminal.name or "")
        if not key:
            continue
        # setdefault leaves an existing entry untouched, so the first leaf wins.
        index.setdefault(key, terminal)
    return index

def compute_evolution_for_compact(tree, gs2leaf: Dict[str, object], compact_name: str) -> Tuple[str, str]:
    """Look up a compact species name and return its distance and leaf label.

    Args:
        tree: Tree object exposing ``distance(leaf)``.
        gs2leaf: Mapping from ``'Genus species'`` to a leaf.
        compact_name: Species name taken from the TSV.

    Returns:
        ``(evolution, node_id)`` as strings. On success ``evolution`` is the
        formatted result of ``tree.distance(leaf)`` and ``node_id`` is the
        leaf's own name (falling back to the ``'Genus species'`` key when
        the leaf has no name). When no genus/species can be extracted, no
        leaf is indexed under it, or the distance computation raises, the
        result is ``('NA', 'Species Not Found')``.
    """
    not_found = ("NA", "Species Not Found")

    gs = extract_genus_species_from_compact(compact_name)
    leaf = gs2leaf.get(gs) if gs else None
    if not leaf:
        return not_found

    try:
        # Total branch length from the root to the leaf.
        dist = tree.distance(leaf)
    except Exception:
        return not_found

    # Prefer the tree's original leaf label so the row stays traceable.
    node_id = getattr(leaf, "name", None) or gs
    return f"{dist}", node_id

def main(
    tree_path: str,
    species_tsv_in: str,
    out_tsv_path: str,
    species_id_col: str = "Species_ID",
    compact_col: str = "Compact_name",
):
    """Annotate a species TSV with root-to-leaf distances from a Newick tree.

    Args:
        tree_path: Path to the Newick tree file.
        species_tsv_in: Tab-separated input file with a header row.
        out_tsv_path: Output TSV path; parent directories are created.
        species_id_col: Input column holding the species identifier.
        compact_col: Input column holding the compact species name.

    The output has the columns ``Species_ID``, ``Compact_name``,
    ``Evolution`` and ``node_ID``, one row per input row.
    """
    # Parse the tree once and index its leaves by 'Genus species'.
    tree = Phylo.read(tree_path, "newick")
    gs2leaf = build_gs_to_leaf(tree)

    # Read the TSV and resolve each row against the leaf index.
    rows_out = []
    with open(species_tsv_in, newline="", encoding="utf-8") as fin:
        reader = csv.DictReader(fin, delimiter="\t")
        for row in reader:
            # DictReader fills fields missing from a short row with None, so
            # a default passed to .get() is not enough to guarantee a string.
            species_id = (row.get(species_id_col) or "").strip()
            compact = (row.get(compact_col) or "").strip()
            evolution, node_id = compute_evolution_for_compact(tree, gs2leaf, compact)
            rows_out.append({
                "Species_ID": species_id,
                "Compact_name": compact,
                "Evolution": evolution,
                "node_ID": node_id,
            })

    # Write the annotated table.
    out_fields = ["Species_ID", "Compact_name", "Evolution", "node_ID"]
    Path(out_tsv_path).parent.mkdir(parents=True, exist_ok=True)
    with open(out_tsv_path, "w", newline="", encoding="utf-8") as fout:
        writer = csv.DictWriter(fout, fieldnames=out_fields, delimiter="\t")
        writer.writeheader()
        writer.writerows(rows_out)

if __name__ == "__main__":
    # Default locations; change these to match your own layout.
    # NOTE(review): the tree path is relative to the parent directory
    # ("../data/...") while both TSV paths are relative to the current
    # directory ("data/...") — confirm this difference is intentional.
    tree_file = "../data/phylogenetic/41564_2016_BFnmicrobiol201648_MOESM209_ESM.txt"
    species_tsv = "data/treeoflife.species.tsv"
    out_tsv = "data/treeoflife.evolution.tsv"
    main(tree_file, species_tsv, out_tsv)


In [1]:
from Bio import Phylo

# Path of the input Newick file
tree_file = "../data/phylogenetic/41564_2016_BFnmicrobiol201648_MOESM209_ESM.txt"

# Parse the phylogenetic tree
tree = Phylo.read(tree_file, "newick")

# Collect all terminal nodes (leaves); each one is reported as a species below
terminals = tree.get_terminals()

# Report how many leaves the tree contains
print(f"Tree contains {len(terminals)} species.")


Tree contains 1871 species.
