# Genome Annotation Integration

Integrate the gene annotations of the high-quality genomes with the manually curated spidroin GFF files.

<zh>使用高质量基因组的注释信息，与人工修改好的蛛丝蛋白 GFF 文件进行整合。</zh>

In [1]:
import os
from pybedtools import BedTool


from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    EXTERNAL_DATA_DIR,
    PROCESSED_DATA_DIR
)
from spider_silkome_module import (
    run_shell_command_with_check,
    extract_positions_from_gff,
    positions_export
)

from spider_silkome_module import (
    GeneralGFF,
)

2025-10-14 19:12:15.890 | INFO     | spider_silkome_module.config:<module>:11 - PROJ_ROOT path is: /home/gyk/project/spider_silkome


<zh>定义相关文件</zh>

In [2]:
# Input locations: the GFF files from the manual-curation step and the
# per-species genome files.
fixed_miniprot_gff_path = f"{PROCESSED_DATA_DIR}/02.manual_curation"
spider_genome_path = f"{RAW_DATA_DIR}/spider_genome"

# Output locations: final results of this step and its intermediate files.
final_output_dir = f"{PROCESSED_DATA_DIR}/03.genome_annotation_integration"
interim_dir = f"{INTERIM_DATA_DIR}/02.manual_curation"

# Make sure both output directories exist; already-existing ones are kept.
for _out_dir in (final_output_dir, interim_dir):
    os.makedirs(_out_dir, exist_ok=True)

In [7]:
# Maximum distance (bp) allowed between a miniprot boundary and the matching
# boundary of the annotated gene for the pair to be treated as confident.
BOUNDARY_TOLERANCE = 500


def _merge_with_annotated_gene(record, marker):
    """Return a 9-column GFF line giving the spidroin the annotated gene's span.

    Args:
        record: One tab-split line of a ``-wa -wb`` intersect. Fields 0-8 are
            the spidroin entry, fields 9-17 the overlapping annotation entry.
        marker: The tag in the spidroin attribute column (``"no_start"`` or
            ``"no_end"``) that is replaced by ``gene_id=<id>``.

    Returns:
        The tab-joined line (without trailing newline): spidroin columns 1-3,
        the gene's start/end (fields 12-13), spidroin columns 6-8, and the
        spidroin attributes with ``marker`` replaced.
    """
    # The id is whatever follows the last '=' in the gene's attribute column.
    gene_id = record[17].split("=")[-1]
    return "\t".join(
        record[:3]
        + record[12:14]
        + record[5:8]
        + [record[8].replace(marker, f"gene_id={gene_id}")]
    )


fixed_miniprot_gffs = [f for f in os.listdir(fixed_miniprot_gff_path) if f.endswith(".gff")]
for fixed_miniprot_gff in fixed_miniprot_gffs:
    # The species name is the part of the file name before the first dot.
    spider = fixed_miniprot_gff.split(".")[0]
    fixed_miniprot_gff_file = f"{fixed_miniprot_gff_path}/{fixed_miniprot_gff}"
    genome_gff_file = f"{spider_genome_path}/{spider}.gff"

    # Extract the lines matching 'gene' from the genome annotation and sort
    # them by sequence name (version sort) and start coordinate.
    # NOTE(review): the pattern matches 'gene' anywhere on a line (including
    # the attribute column), not only the feature-type column -- confirm this
    # is the intended selection.
    gene_grep_cmd = f"grep -E 'gene' {genome_gff_file} | sort -k1,1V -k4,4n > {interim_dir}/{spider}_gene.gff"
    run_shell_command_with_check(gene_grep_cmd, f"{interim_dir}/{spider}_gene.gff")

    # Split the curated spidroin GFF on the "no_start"/"no_end" tags into two
    # files (named *_SE_Sp.gff and *_no_SE_Sp.gff).
    spridroin_gff = GeneralGFF.parse_gff_file(fixed_miniprot_gff_file)
    GeneralGFF.split_by_custom_str(
        spridroin_gff,
        ["no_start", "no_end"],
        f"{interim_dir}/{spider}_SE_Sp.gff",
        f"{interim_dir}/{spider}_no_SE_Sp.gff"
    )

    # Intersect both spidroin sets with the genome gene annotation, keeping
    # the spidroin entry (-wa) and the overlapping gene entry (-wb).
    gene_gff = BedTool(f"{interim_dir}/{spider}_gene.gff")
    se_gff = BedTool(f"{interim_dir}/{spider}_SE_Sp.gff")
    no_se_gff = BedTool(f"{interim_dir}/{spider}_no_SE_Sp.gff")
    se_intersect = se_gff.intersect(gene_gff, wa=True, wb=True)
    no_se_intersect = no_se_gff.intersect(gene_gff, wa=True, wb=True)
    se_intersect.saveas(f"{interim_dir}/{spider}_SE_Sp_INTERSECT.gff")
    no_se_intersect.saveas(f"{interim_dir}/{spider}_no_SE_Sp_INTERSECT.gff")

    # Confident matches are keyed by (seqid, gene start, gene end) so each
    # annotated gene is reported once (first match wins).
    spidroin_with_gnome_gffs_confident = {}
    spidroin_with_gnome_gffs_no_confidence = []
    spidroin_with_gnome_gffs_unconfirmed = []
    for gene in se_intersect:
        # Strip the line terminator before splitting so the last field (the
        # gene attributes) does not carry a newline into the outputs.
        record = str(gene).rstrip("\r\n").split("\t")
        attributes = record[8]
        missing_start = attributes.endswith("no_start")
        missing_end = attributes.endswith("no_end")
        sp_start, sp_end = int(record[3]), int(record[4])
        gene_start, gene_end = int(record[12]), int(record[13])
        confident_gene_key = (record[0], record[12], record[13])

        if missing_start and gene_end - BOUNDARY_TOLERANCE <= sp_end <= gene_end:
            # Spidroin end lies within the tolerance before the gene end.
            spidroin_with_gnome_gffs_confident.setdefault(
                confident_gene_key, _merge_with_annotated_gene(record, "no_start")
            )
        elif missing_start and gene_start - BOUNDARY_TOLERANCE < sp_end < gene_end + BOUNDARY_TOLERANCE:
            spidroin_with_gnome_gffs_no_confidence.append(record[:9])
            spidroin_with_gnome_gffs_no_confidence.append(record[9:])
        elif missing_end and gene_start <= sp_start <= gene_start + BOUNDARY_TOLERANCE:
            # Mirror of the no_start test: spidroin start lies within the
            # tolerance after the gene start.  (The previous form
            # `gene_start >= sp_start >= gene_start + 500` could never hold.)
            spidroin_with_gnome_gffs_confident.setdefault(
                confident_gene_key, _merge_with_annotated_gene(record, "no_end")
            )
        elif missing_end and gene_start + BOUNDARY_TOLERANCE < sp_start < gene_end - BOUNDARY_TOLERANCE:
            # NOTE(review): this window is narrower than the no_start one
            # above (which extends the gene by the tolerance on both sides);
            # confirm the asymmetry is intended.
            spidroin_with_gnome_gffs_no_confidence.append(record[:9])
            spidroin_with_gnome_gffs_no_confidence.append(record[9:])
        else:
            spidroin_with_gnome_gffs_unconfirmed.append(record[:9])
            spidroin_with_gnome_gffs_unconfirmed.append(record[9:])

    # Write the three result sets, one GFF entry per line.  For the last two
    # files each intersect hit yields two lines: the spidroin entry followed
    # by the overlapping gene entry.
    with open(f"{interim_dir}/{spider}_spidroin_with_gnome.gff", "w") as f:
        for gene in spidroin_with_gnome_gffs_confident.values():
            f.write(gene + "\n")
    with open(f"{interim_dir}/{spider}_spidroin_with_gnome_no_confidence.gff", "w") as f:
        for gene in spidroin_with_gnome_gffs_no_confidence:
            f.write("\t".join(gene) + "\n")
    with open(f"{interim_dir}/{spider}_spidroin_with_gnome_unconfirmed.gff", "w") as f:
        for gene in spidroin_with_gnome_gffs_unconfirmed:
            f.write("\t".join(gene) + "\n")


2025-10-14 19:20:27.083 | INFO     | spider_silkome_module.features:run_shell_command_with_check:46 - The output file already exists, skipping execution: /home/gyk/project/spider_silkome/data/interim/02.manual_curation/Trichonephila_clavata_gene.gff


手动整理 miniprot 和基因组注释 intersect 结果:
- 有时候 GFF 的注释结果比较准确，能够准确地与 miniprot 的结果相匹配，比如 no_start 条目的 end 位置和基因组注释的一模一样，那么就可以确认这个位置就是对应的 Spidroin 基因。
- 有时候 GFF 的注释结果和 miniprot 的结果差别太大，那么需要先保留下来，然后结合 DRS 的数据，看是否能够找到对应的 Spidroin 基因。
