# Miniprot mapping
Import the required Python packages and the project's data-directory constants.

In [None]:
import os
import pandas as pd
import subprocess
import time
from pathlib import Path

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    EXTERNAL_DATA_DIR,
    PROCESSED_DATA_DIR
)
from spider_silkome_module.utils import run_cmd

## Configure

Define the paths to the input data and the output directories.

In [19]:
# Run identifier; used as the sub-directory name for the cd-hit and miniprot outputs defined below.
task_name = "miniprot_mapping_20260127"

# Inputs: the directory holding the spider genomes and the spidroin protein FASTA.
spider_genome_path = RAW_DATA_DIR / "spider_genome"
spidroin_fasta_file = EXTERNAL_DATA_DIR / "spider-silkome-database.v1.prot.fixed.renamed.fasta"

# De-duplication (cd-hit): representative sequences and the per-cluster shortest sequences.
dedup_output_dir = INTERIM_DATA_DIR / task_name / "cdhit"
spidroin_fasta_file_rep, shortest_seq_file = (
    dedup_output_dir / "cdhit_rep_seq.fa",
    dedup_output_dir / "cdhit_shortest_seq.fa",
)

# miniprot alignment results.
miniprot_output_path = PROCESSED_DATA_DIR / task_name / "miniprot_output"



Perform redundancy reduction on the N/C terminal sequences of Spidroin proteins using cd-hit.

<zh>使用 cd-hit 对 Spidroin 蛋白的 N/C 端序列进行去冗余。</zh>

In [17]:

# cd-hit writes its results into `dedup_output_dir`; create it up front so the cell also works
# on a fresh checkout where the interim directory does not exist yet.
# NOTE(review): `run_cmd` may already create parent directories — confirm; this call is idempotent either way.
Path(dedup_output_dir).mkdir(parents=True, exist_ok=True)

# Cluster the spidroin sequences with cd-hit:
#   -c 0.9  sequence identity threshold
#   -T 0    use all available CPU threads
#   -M 0    no memory limit
#   -d 0    keep the sequence name up to the first whitespace in the .clstr file
cdhit_cmd = f"pixi run cd-hit -i {spidroin_fasta_file} -o {spidroin_fasta_file_rep} -c 0.9 -T 0 -M 0 -d 0"

# Read the cd-hit cluster file (<output>.clstr) and write the shortest sequence of each cluster
# (see the "shortest = ..." lines the helper module logs).
shortest_seq_cmd = f"pixi run python -m spider_silkome_module.extract_shortest_seq --clstr-path {spidroin_fasta_file_rep}.clstr --fasta-path {spidroin_fasta_file} --output-path {shortest_seq_file}"

# The second argument lists the expected output files; the cd-hit step is skipped when its
# output already exists. `force=True` presumably re-runs the step regardless — verify in `run_cmd`.
run_cmd(cdhit_cmd, [spidroin_fasta_file_rep])
run_cmd(shortest_seq_cmd, [shortest_seq_file], force=True)

⏭️ /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_rep_seq.fa exists, skip


2026-01-27 10:51:07.499 | INFO     | spider_silkome_module.config:<module>:11 - PROJ_ROOT path is: /home/gyk/project/spider_silkome


[32m2026-01-27 10:51:07.725[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m61[0m - [1mParsing cluster file: /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_rep_seq.fa.clstr[0m
[32m2026-01-27 10:51:07.752[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m63[0m - [1mFound 6985 clusters[0m
[32m2026-01-27 10:51:07.752[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_shortest_seq_ids[0m:[36m48[0m - [34m[1mCluster 0: shortest = 5826_MaSp2_CTD (1854aa)[0m
[32m2026-01-27 10:51:07.752[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_shortest_seq_ids[0m:[36m48[0m - [34m[1mCluster 1: shortest = 2571_Flag_CTD (448aa)[0m
[32m2026-01-27 10:51:07.752[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_shortest_seq_ids[0m:[36m48[0m - [34m[1mCluster 2: shortest = 1737_MiSp_CTD (1674aa)[0m
[32m2026-01-27 10:51:07.752[0m | [34m[1mDEBUG   [0m | [36m__main__[0m:[36mget_shortest_seq_ids[

Extracting sequences: 100%|██████████| 6985/6985 [00:00<00:00, 938922.97it/s]


Use miniprot to align the de-duplicated N-terminal and C-terminal spidroin protein sequences to each spider genome.

In [21]:
# Genome indexing is not run in this cell. The disabled index command is kept here for reference:
#   miniprot -t70 -d {gnome_mpi_path}/{spider}.mpi {spider_genome_path}/{spider_genome}
#   run_cmd(index_cmd, [gnome_mpi_path])

# Assemble the batch-alignment command from its individual options.
miniprot_cmd = " ".join([
    "pixi run python -m spider_silkome_module.run_miniprot",
    f"--input-path {spider_genome_path}",
    f"--protein-fasta {shortest_seq_file}",
    f"--output-path {miniprot_output_path}",
    "--threads 70",
    "--outc 0.8",
    "--force",
])

run_cmd(miniprot_cmd, [miniprot_output_path])

2026-01-27 12:35:50.046 | INFO     | spider_silkome_module.config:<module>:11 - PROJ_ROOT path is: /home/gyk/project/spider_silkome
Processing genomes:   0%|          | 0/10 [00:00<?, ?it/s]

[32m2026-01-27 12:35:50.077[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m76[0m - [1mInput path: /home/gyk/project/spider_silkome/data/raw/spider_genome[0m
[32m2026-01-27 12:35:50.078[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m77[0m - [1mProtein FASTA: /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa[0m
[32m2026-01-27 12:35:50.079[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m84[0m - [1mFound 10 genome(s) to process[0m
[32m2026-01-27 12:35:50.080[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Songthela_sp: /home/gyk/project/spider_silkome/data/raw/spider_genome/049.Songthela_sp/Songthela_sp.mpi[0m


[M::mp_idx_restore@18.123*0.56] loaded the index
[M::mp_idx_print_stat] 2398163 distinct k-mers; mean occ of infrequent k-mers: 507.90; 2150 frequent k-mers accounting for 105152995 occurrences
[M::worker_pipeline::27.231*22.50] mapped 4066 sequences
[M::worker_pipeline::29.724*25.90] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/049.Songthela_sp/Songthela_sp.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 30.217 sec; CPU: 770.238 sec; Peak RSS: 14.456 GB
Processing genomes:  10%|█         | 1/10 [00:30<04:32, 30.28s/it]

[32m2026-01-27 12:36:20.357[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Songthela_sp/Songthela_sp.gff[0m
[32m2026-01-27 12:36:20.358[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Araneus_ventricosus: /home/gyk/project/spider_silkome/data/raw/spider_genome/064.Araneus_ventricosus/Araneus_ventricosus.mpi[0m


[M::mp_idx_restore@10.371*0.60] loaded the index
[M::mp_idx_print_stat] 2371156 distinct k-mers; mean occ of infrequent k-mers: 368.91; 368 frequent k-mers accounting for 10236796 occurrences
[M::worker_pipeline::16.531*24.23] mapped 4066 sequences
[M::worker_pipeline::18.639*28.92] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/064.Araneus_ventricosus/Araneus_ventricosus.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 18.983 sec; CPU: 539.287 sec; Peak RSS: 9.306 GB
Processing genomes:  20%|██        | 2/10 [00:49<03:09, 23.70s/it]

[32m2026-01-27 12:36:39.453[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Araneus_ventricosus/Araneus_ventricosus.gff[0m
[32m2026-01-27 12:36:39.453[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Pardosa_pseudoannulata: /home/gyk/project/spider_silkome/data/raw/spider_genome/106.Pardosa_pseudoannulata/Pardosa_pseudoannulata.mpi[0m


[M::mp_idx_restore@11.353*0.56] loaded the index
[M::mp_idx_print_stat] 2345173 distinct k-mers; mean occ of infrequent k-mers: 312.51; 841 frequent k-mers accounting for 53172699 occurrences
[M::worker_pipeline::15.822*17.63] mapped 4066 sequences
[M::worker_pipeline::18.000*22.22] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/106.Pardosa_pseudoannulata/Pardosa_pseudoannulata.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 18.282 sec; CPU: 400.292 sec; Peak RSS: 7.499 GB
Processing genomes:  30%|███       | 3/10 [01:07<02:29, 21.29s/it]

[32m2026-01-27 12:36:57.870[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Pardosa_pseudoannulata/Pardosa_pseudoannulata.gff[0m
[32m2026-01-27 12:36:57.870[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Heteropoda_venatoria: /home/gyk/project/spider_silkome/data/raw/spider_genome/079.Heteropoda_venatoria/Heteropoda_venatoria.mpi[0m


[M::mp_idx_restore@29.988*0.57] loaded the index
[M::mp_idx_print_stat] 2400099 distinct k-mers; mean occ of infrequent k-mers: 814.51; 10205 frequent k-mers accounting for 299685481 occurrences
[M::worker_pipeline::53.287*29.21] mapped 4066 sequences
[M::worker_pipeline::55.964*30.35] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/079.Heteropoda_venatoria/Heteropoda_venatoria.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 56.918 sec; CPU: 1699.393 sec; Peak RSS: 23.398 GB
Processing genomes:  40%|████      | 4/10 [02:04<03:32, 35.37s/it]

[32m2026-01-27 12:37:54.827[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Heteropoda_venatoria/Heteropoda_venatoria.gff[0m
[32m2026-01-27 12:37:54.828[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Scorpiops_zhui: /home/gyk/project/spider_silkome/data/raw/spider_genome/045.Scorpiops_zhui/Scorpiops_zhui.mpi[0m


[M::mp_idx_restore@33.461*0.60] loaded the index
[M::mp_idx_print_stat] 2404995 distinct k-mers; mean occ of infrequent k-mers: 949.21; 14376 frequent k-mers accounting for 503454547 occurrences
[M::worker_pipeline::57.122*27.42] mapped 4066 sequences
[M::worker_pipeline::60.667*28.98] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/045.Scorpiops_zhui/Scorpiops_zhui.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 61.947 sec; CPU: 1759.277 sec; Peak RSS: 27.552 GB
Processing genomes:  50%|█████     | 5/10 [03:06<03:44, 44.96s/it]

[32m2026-01-27 12:38:56.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Scorpiops_zhui/Scorpiops_zhui.gff[0m
[32m2026-01-27 12:38:56.803[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Pholcus_sp: /home/gyk/project/spider_silkome/data/raw/spider_genome/034.Pholcus_sp/Pholcus_sp.mpi[0m


[M::mp_idx_restore@5.317*0.57] loaded the index
[M::mp_idx_print_stat] 2319710 distinct k-mers; mean occ of infrequent k-mers: 169.35; 102 frequent k-mers accounting for 2936720 occurrences
[M::worker_pipeline::7.588*18.71] mapped 4066 sequences
[M::worker_pipeline::8.456*23.71] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/034.Pholcus_sp/Pholcus_sp.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 8.646 sec; CPU: 200.635 sec; Peak RSS: 3.865 GB
Processing genomes:  60%|██████    | 6/10 [03:15<02:10, 32.65s/it]

[32m2026-01-27 12:39:05.543[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Pholcus_sp/Pholcus_sp.gff[0m
[32m2026-01-27 12:39:05.543[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Evarcha_sp: /home/gyk/project/spider_silkome/data/raw/spider_genome/013.Evarcha_sp/Evarcha_sp.mpi[0m


[M::mp_idx_restore@32.889*0.63] loaded the index
[M::mp_idx_print_stat] 2379530 distinct k-mers; mean occ of infrequent k-mers: 827.53; 15638 frequent k-mers accounting for 523531217 occurrences
[M::worker_pipeline::58.625*29.51] mapped 4066 sequences
[M::worker_pipeline::62.517*31.32] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/013.Evarcha_sp/Evarcha_sp.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 63.519 sec; CPU: 1958.761 sec; Peak RSS: 23.942 GB
Processing genomes:  70%|███████   | 7/10 [04:19<02:08, 42.75s/it]

[32m2026-01-27 12:40:09.088[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Evarcha_sp/Evarcha_sp.gff[0m
[32m2026-01-27 12:40:09.088[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Pandercetes_sp: /home/gyk/project/spider_silkome/data/raw/spider_genome/031.Pandercetes_sp/Pandercetes_sp.mpi[0m


[M::mp_idx_restore@29.770*0.58] loaded the index
[M::mp_idx_print_stat] 2400014 distinct k-mers; mean occ of infrequent k-mers: 740.24; 6594 frequent k-mers accounting for 200110267 occurrences
[M::worker_pipeline::58.322*33.03] mapped 4066 sequences
[M::worker_pipeline::63.691*35.44] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/031.Pandercetes_sp/Pandercetes_sp.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 64.597 sec; CPU: 2258.179 sec; Peak RSS: 21.445 GB
Processing genomes:  80%|████████  | 8/10 [05:23<01:39, 49.72s/it]

[32m2026-01-27 12:41:13.746[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Pandercetes_sp/Pandercetes_sp.gff[0m
[32m2026-01-27 12:41:13.747[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Hippasa_lycosina: /home/gyk/project/spider_silkome/data/raw/spider_genome/017.Hippasa_lycosina/Hippasa_lycosina.mpi[0m


[M::mp_idx_restore@11.240*0.60] loaded the index
[M::mp_idx_print_stat] 2347132 distinct k-mers; mean occ of infrequent k-mers: 329.05; 587 frequent k-mers accounting for 22261028 occurrences
[M::worker_pipeline::19.345*27.52] mapped 4066 sequences
[M::worker_pipeline::21.064*30.57] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/017.Hippasa_lycosina/Hippasa_lycosina.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 21.467 sec; CPU: 644.427 sec; Peak RSS: 7.830 GB
Processing genomes:  90%|█████████ | 9/10 [05:45<00:40, 40.93s/it]

[32m2026-01-27 12:41:35.351[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Hippasa_lycosina/Hippasa_lycosina.gff[0m
[32m2026-01-27 12:41:35.352[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m89[0m - [1mProcessing Trichonephila_clavata: /home/gyk/project/spider_silkome/data/raw/spider_genome/119.Trichonephila_clavata/Trichonephila_clavata.mpi[0m


[M::mp_idx_restore@3.893*1.00] loaded the index
[M::mp_idx_print_stat] 2364518 distinct k-mers; mean occ of infrequent k-mers: 391.92; 573 frequent k-mers accounting for 15240950 occurrences
[M::worker_pipeline::15.088*48.93] mapped 4066 sequences
[M::worker_pipeline::16.882*50.57] mapped 2919 sequences
[M::main] Version: 0.18-r281
[M::main] CMD: miniprot -S --gff-delim . --outc 0.8 -t 70 -I --gff-only /home/gyk/project/spider_silkome/data/raw/spider_genome/119.Trichonephila_clavata/Trichonephila_clavata.mpi /home/gyk/project/spider_silkome/data/interim/miniprot_mapping_20260127/cdhit/cdhit_shortest_seq.fa
[M::main] Real time: 17.346 sec; CPU: 854.098 sec; Peak RSS: 9.686 GB
Processing genomes: 100%|██████████| 10/10 [06:02<00:00, 36.28s/it]


[32m2026-01-27 12:41:52.857[0m | [1mINFO    [0m | [36m__main__[0m:[36mmain[0m:[36m99[0m - [1mOutput: /home/gyk/project/spider_silkome/data/processed/miniprot_mapping_20260127/miniprot_output/Trichonephila_clavata/Trichonephila_clavata.gff[0m
[32m2026-01-27 12:41:52.858[0m | [32m[1mSUCCESS [0m | [36m__main__[0m:[36mmain[0m:[36m101[0m - [32m[1mCompleted miniprot alignment for 10 genome(s)[0m


In [None]:
from pybedtools import BedTool

# Inspect the merged mRNA intervals of a single species.
# NOTE(review): the miniprot step above logs its output as <species>/<species>.gff, while this
# cell reads from <species>_all/ — confirm that the .mRNA.gff file is produced somewhere.
spider = "Trichonephila_clavata"
output_dir = "/".join([str(miniprot_output_path), spider + "_all"])
mRNA_gff = "/".join([output_dir, spider + ".mRNA.gff"])

# s=True restricts merging to features on the same strand.
merged_mRNA_gff = BedTool(mRNA_gff).merge(s=True)
for gene in merged_mRNA_gff:
    print(gene)


Define the gene boundaries for each type of spidroin. This step should eventually run inside the `for spider_genome in spider_genomes:` loop; it is kept separate here so that it can be tested conveniently.

<zh>处理每一种蛛丝蛋白的基因边界界定，这一部分应该作为`for spider_genome in spider_genomes:`循环下的一部分，这里为了方便，独立出来进行测试。</zh>

In [None]:
# NOTE(review): `Attributes`, `GFFData`, `extract_positions_from_gff` and `positions_export` are not
# imported in the visible part of this notebook — confirm where they are defined and import them
# in the top cell so that "Restart Kernel -> Run All" works.

# Spidroin types used for this test; should be the same as the spidroins used in the
# `for spider_genome in spider_genomes:` loop.
spidroins = ['MaSp3', 'Pflag', 'MiSp', 'MaSp2', 'AcSp', 'CySp',
'MaSp3B', 'Spidroin', 'Putative_spidroin', 'CrSp', 'AgSp2', 'MaSp1',
'Ampullate_spidroin', 'MaSp', 'Flag', 'AgSp1', 'MaSp2B', 'PySp']

# Species under test and the directory holding its per-spidroin mRNA GFF files.
# Both are loop-invariant, and `spider` is also needed after the loop.
spider = "Trichonephila_clavata"
output_dir = f"{miniprot_output_path}/{spider}_all"


def _parse_gff_attributes(attr_field):
    """Parse a GFF column-9 string (``key=value;key=value``) into a new dict.

    Entries without ``=`` are ignored; only the first ``=`` splits key from value.
    """
    parsed = {}
    for attr in attr_field.split(';'):
        if '=' in attr:
            key, value = attr.split('=', 1)
            parsed[key] = value
    return parsed


all_gff_records = []
csv_frames = []  # per-spidroin tables, concatenated once after the loop
for spidroin in spidroins:
    spidroin_gff = f"{output_dir}/{spider}.mRNA.{spidroin}.gff"
    spidroin_gff_data = []
    with open(spidroin_gff, 'r') as f:
        for line in f:
            # Skip comment/pragma lines and blank lines (a blank line has no column 9).
            if line.startswith('#') or not line.strip():
                continue

            fields = line.strip().split('\t')

            # Build the attribute dict afresh for every record: a record that lacks a key now
            # raises KeyError instead of silently inheriting the previous record's value.
            attr_dict = _parse_gff_attributes(fields[8])

            # Create attributes object
            attr_obj = Attributes(
                ID=attr_dict['ID'],
                Rank=int(attr_dict['Rank']),
                Identity=float(attr_dict['Identity']),
                Positive=float(attr_dict['Positive']),
                Target=attr_dict['Target'].split('|')
            )

            # Create gff_data object and add to list
            spidroin_gff_data.append(GFFData(
                seqid=fields[0],
                source=fields[1],
                type=fields[2],
                start=int(fields[3]),
                end=int(fields[4]),
                score=float(fields[5]),
                strand=fields[6],
                frame=fields[7],
                attributes=attr_obj
            ))

    # Sort by Positive in descending order
    spidroin_gff_data.sort(key=lambda x: x.attributes.Positive, reverse=True)
    # Extract positions
    positions = extract_positions_from_gff(spidroin_gff_data, positive_threshold=0.75)

    # Derive the output names by swapping only the trailing ".gff" suffix, so a ".gff"
    # occurring elsewhere in the path is left untouched.
    gff_stem = spidroin_gff[:-len('.gff')]

    # Export CSV data
    csv_output = gff_stem + '.csv'
    df = positions_export(positions, csv_output, format='csv')
    if df is not None:
        csv_frames.append(df)

    # Export GFF data
    gff_output = gff_stem + '.combined.gff'
    gff_records = positions_export(
        positions,
        gff_output,
        format='gff',
        spidroin=spidroin,
        min_length=1000,  # Set minimum gene length threshold
        max_length=100000,  # Set maximum gene length threshold
        extension_length=10000  # Set length to extend when start or end is missing
    )

    all_gff_records.extend(gff_records)

# Concatenate once instead of growing a DataFrame inside the loop.
all_csv = pd.concat(csv_frames, ignore_index=True) if csv_frames else pd.DataFrame()

# Convert to DataFrame and save
final_output_dir = f"{PROCESSED_DATA_DIR}/01.miniprot_mapping"
os.makedirs(final_output_dir, exist_ok=True)
if all_gff_records:
    df_combined = pd.DataFrame(all_gff_records)
    # Sort by the first number found in seqid (chromosome), then by start position.
    # seqids without any digit become NaN and are placed last.
    df_combined['seqid_sort'] = df_combined['seqid'].str.extract(r'(\d+)', expand=False).astype(float)
    df_combined = df_combined.sort_values(['seqid_sort', 'start']).drop('seqid_sort', axis=1)
    # TODO: add a function that handles similar (near-duplicate) loci in df_combined.
    # Write to GFF file
    with open(f"{final_output_dir}/{spider}.gff", 'w') as f:
        f.write("##gff-version 3\n")
        df_combined.to_csv(f, sep='\t', header=False, index=False)
else:
    print(f"No GFF records to combine for {spider}")

if not all_csv.empty:
    all_csv.to_csv(f"{final_output_dir}/{spider}.csv", index=False)

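Filter each GFF file, keeping only the mRNA records whose `Positive` value is at least `positive_threshold` (0.75); all non-mRNA lines are kept unchanged.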

In [None]:
# NOTE(review): `run_shell_command_with_check` is not imported in the visible part of this
# notebook (the import cell only brings in `run_cmd`) — confirm where it comes from.
positive_threshold = 0.75
spider_genomes = [name for name in os.listdir(spider_genome_path) if name.endswith(".fa.gz")]

# awk program: print an mRNA line only when its Positive= value reaches the threshold and pass
# every other line through unchanged. The three-argument match() is a gawk extension.
awk_program = (
    '$3=="mRNA"{match($9,/Positive=([0-9.]+)/,m); '
    f'if(m[1]>={positive_threshold}) print; next}}1'
)

miniprot_interim_dir = f"{INTERIM_DATA_DIR}/miniprot"
for spider_genome in spider_genomes:
    spider = spider_genome.replace(".fa.gz", "")

    # Use <spider>.gff when it exists, otherwise fall back to <spider>.fa.mRNA.gff.
    primary_gff = f"{miniprot_interim_dir}/{spider}_all/{spider}.gff"
    fallback_gff = f"{miniprot_interim_dir}/{spider}_all/{spider}.fa.mRNA.gff"
    gff_file = primary_gff if os.path.exists(primary_gff) else fallback_gff

    filtered_gff_file = f"{miniprot_interim_dir}/filtered_gff/{spider}.gff"
    os.makedirs(os.path.dirname(filtered_gff_file), exist_ok=True)

    # Filter with awk, then sort by seqid (version sort), start (numeric) and attributes.
    cmd = f"awk -F'\t' '{awk_program}' {gff_file} | sort -k1,1V -k4,4n -k9,9 > {filtered_gff_file}"
    run_shell_command_with_check(cmd, filtered_gff_file, force=True)
