# Miniprot mapping
Setting up the required Python packages

In [1]:
import os
import pandas as pd
import subprocess
import time

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    EXTERNAL_DATA_DIR,
    PROCESSED_DATA_DIR
)
from spider_silkome_module import (
    Attributes,
    GFFData
)
from spider_silkome_module import (
    extract_positions_from_gff,
    positions_export
)

[32m2025-10-11 10:33:06.607[0m | [1mINFO    [0m | [36mspider_silkome_module.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/gyk/project/spider_silkome[0m


Definition of the path for relevant data

In [2]:
spider_species_file = f"{EXTERNAL_DATA_DIR}/organisms.csv"
mechanical_properties_file = f"{EXTERNAL_DATA_DIR}/mechanical_properties.csv"
spidroin_fasta_file = f"{EXTERNAL_DATA_DIR}/spider-silkome-database.v1.prot.fixed.fasta"
spider_genome_path = f"{RAW_DATA_DIR}/spider_genome"
spidroin_path = f"{INTERIM_DATA_DIR}/spidroin"

Perform redundancy reduction on the N/C terminal sequences of Spidroin proteins using MMseq2.

<zh>使用 MMseq2 对 Spidroin 蛋白的 N/C 端序列进行去冗余。</zh>

In [3]:
mmseqs_output_dir=f"{INTERIM_DATA_DIR}/mmseqs"
spidroin_fasta_file_rep = f"{mmseqs_output_dir}/{spidroin_fasta_file.split('/')[-1].replace('.fasta', '')}_rep_seq.fasta"
if not os.path.exists(spidroin_fasta_file_rep):
    os.makedirs(mmseqs_output_dir, exist_ok=True)
    cmd = f"pixi run --environment mmseqs mmseqs easy-cluster {spidroin_fasta_file} {mmseqs_output_dir}/{spidroin_fasta_file.split('/')[-1].replace('.fasta', '')} {mmseqs_output_dir}/tmp --min-seq-id 0.9 -c 0.8 --cov-mode 1"
    subprocess.run(cmd, shell=True)
spidroin_fasta_file_rep_manually = spidroin_fasta_file_rep.replace('.fasta', '_manually.fasta')

Use miniprot to align the C-terminal and N-terminal sequences of the spidroin gene sequences to the genome of the new species.

In [4]:
spider_genomes = [f for f in os.listdir(spider_genome_path) if f.endswith(".fa.gz")]
gnome_mpi_path = f"{INTERIM_DATA_DIR}/genome_mpi"
miniprot_output_path = f"{INTERIM_DATA_DIR}/miniprot"
os.makedirs(gnome_mpi_path, exist_ok=True)
os.makedirs(miniprot_output_path, exist_ok=True)
for spider_genome in spider_genomes:
    spider = spider_genome.split(".")[0]
    index_cmd = f"miniprot -t70 -d {gnome_mpi_path}/{spider}.mpi {spider_genome_path}/{spider_genome}"
    if not os.path.exists(f"{gnome_mpi_path}/{spider}.mpi"):
        indext_start_time = time.time()
        subprocess.run(index_cmd, shell=True)
        indext_end_time = time.time()
        print(f"Indexing {spider} takes {indext_end_time - indext_start_time} seconds")
    else:
        print(f"Indexing {spider} already exists")

    output_dir = f"{miniprot_output_path}/{spider}_all"
    os.makedirs(output_dir, exist_ok=True)
    align_cmd = f"miniprot -t 70 -I --gff {gnome_mpi_path}/{spider}.mpi {spidroin_fasta_file_rep_manually} > {output_dir}/{spider}.gff"
    if not os.path.exists(f"{output_dir}/{spider}.gff"):
        align_start_time = time.time()
        subprocess.run(align_cmd, shell=True)
        align_end_time = time.time()
        print(f"Alignment {spider} takes {align_end_time - align_start_time} seconds")
    else:
        print(f"Alignment {spider} already exists")

    if not os.path.exists(f"{output_dir}/{spider}.mRNA.gff"):
        grep_cmd = f"grep 'mRNA' {output_dir}/{spider}.gff | sort -k1,1V -k4,4n > {output_dir}/{spider}.mRNA.gff"
        subprocess.run(grep_cmd, shell=True)
    else:
        print(f"mRNA gff file already exists")

    if os.path.exists(f"{output_dir}/{spider}.mRNA.gff"):
        mRNA_gff = pd.read_csv(f"{output_dir}/{spider}.mRNA.gff", sep='\t', header=None)
        gff_header = ["seqid", "source", "type", "start", "end", "score", "strand", "frame", "attribute"]
        mRNA_gff.columns = gff_header
        spidroins = list(set([row["attribute"].split(';')[-1].split('|')[-2] for index, row in mRNA_gff.iterrows()]))
        print(f"Total number of spidroins in {spider}: {len(spidroins)}\n{spidroins}")
        for spidroin in spidroins:
            gff_spidroin_output = f"{output_dir}/{spider}.mRNA.{spidroin}.gff"
            if not os.path.exists(gff_spidroin_output):
                grep_cmd = f"grep '|{spidroin}|' {output_dir}/{spider}.mRNA.gff > {gff_spidroin_output}"
                subprocess.run(grep_cmd, shell=True)
            else:
                print(f"{spider}.mRNA.{spidroin}.gff already exists")

Indexing Trichonephila_clavata already exists
Alignment Trichonephila_clavata already exists
mRNA gff file already exists
Total number of spidroins in Trichonephila_clavata: 18
['MaSp1', 'MaSp3B', 'MaSp3', 'CySp', 'AcSp', 'AgSp2', 'Putative_spidroin', 'MaSp2', 'CrSp', 'MaSp', 'PySp', 'Flag', 'MiSp', 'Ampullate_spidroin', 'Pflag', 'Spidroin', 'MaSp2B', 'AgSp1']
Indexing Araneus_ventricosus already exists
Alignment Araneus_ventricosus already exists
mRNA gff file already exists
Total number of spidroins in Araneus_ventricosus: 17
['MaSp1', 'MaSp3B', 'MaSp3', 'CySp', 'AcSp', 'AgSp2', 'Putative_spidroin', 'MaSp2', 'MaSp', 'PySp', 'Flag', 'MiSp', 'Ampullate_spidroin', 'Pflag', 'Spidroin', 'MaSp2B', 'AgSp1']
Araneus_ventricosus.mRNA.MaSp1.gff already exists
Araneus_ventricosus.mRNA.MaSp3B.gff already exists
Araneus_ventricosus.mRNA.MaSp3.gff already exists
Araneus_ventricosus.mRNA.CySp.gff already exists
Araneus_ventricosus.mRNA.AcSp.gff already exists
Araneus_ventricosus.mRNA.AgSp2.gff alre

In [None]:
from pybedtools import BedTool

spider = "Trichonephila_clavata"
output_dir = f"{miniprot_output_path}/{spider}_all"
mRNA_gff = f"{output_dir}/{spider}.mRNA.gff"

merged_mRNA_gff = BedTool(mRNA_gff).merge(s=True)
for gene in merged_mRNA_gff:
    print(gene)


Defining the gene boundaries for each type of Spidroin should be part of the `for spider_genome in spider_genomes:` loop. For convenience, this section has been separated for testing.

<zh>处理每一种蛛丝蛋白的基因边界界定，这一部分应该作为`for spider_genome in spider_genomes:`循环下的一部分，这里为了方便，独立出来进行测试。</zh>

In [None]:
spidroins = ['MaSp3', 'Pflag', 'MiSp', 'MaSp2', 'AcSp', 'CySp',
'MaSp3B', 'Spidroin', 'Putative_spidroin', 'CrSp', 'AgSp2', 'MaSp1',
'Ampullate_spidroin', 'MaSp', 'Flag', 'AgSp1', 'MaSp2B', 'PySp'] # Define the spidroins for test, should be the same as the spidroins used in the `for spider_genome in spider_genomes:` loop.
all_gff_records = []
all_csv = pd.DataFrame()
for spidroin in spidroins:
    # spidroin = "MiSp"
    spider = "Trichonephila_clavata"
    output_dir = f"{miniprot_output_path}/{spider}_all"
    spidroin_gff = f"{output_dir}/{spider}.mRNA.{spidroin}.gff"
    attr_dict = {}
    spidroin_gff_data = []
    with open(spidroin_gff, 'r') as f:
        for line in f:
            if line.startswith('#'):
                continue

            fields = line.strip().split('\t')

            # Analysis of the attributes field
            for attr in fields[8].split(';'):
                if '=' in attr:
                    key, value = attr.split('=', 1)
                    attr_dict[key] = value

            # Create attributes object
            attr_obj = Attributes(
                ID=attr_dict['ID'],
                Rank=int(attr_dict['Rank']),
                Identity=float(attr_dict['Identity']),
                Positive=float(attr_dict['Positive']),
                Target=attr_dict['Target'].split('|')
            )

            # Create gff_data object and add to list
            spidroin_gff_data.append(GFFData(
                seqid=fields[0],
                source=fields[1],
                type=fields[2],
                start=int(fields[3]),
                end=int(fields[4]),
                score=float(fields[5]),
                strand=fields[6],
                frame=fields[7],
                attributes=attr_obj
            ))

    # Sort by positive with descending order
    spidroin_gff_data.sort(key=lambda x: x.attributes.Positive, reverse=True)
    # Extract positions
    positions = extract_positions_from_gff(spidroin_gff_data, positive_threshold=0.75)
    # Export data
    # Export CSV data
    csv_output = spidroin_gff.replace('.gff', '.csv')
    df = positions_export(positions, csv_output, format='csv')
    all_csv = pd.concat([all_csv, df], ignore_index=True)

    # Export GFF data
    gff_output = spidroin_gff.replace('.gff', '.combined.gff')
    gff_records = positions_export(
        positions,
        gff_output,
        format='gff',
        spidroin=spidroin,
        min_length=1000, # Set minimum gene length threshold
        max_length=100000, # Set maximum gene length threshold
        extension_length=10000 # Set length to extend when start or end is missing
)

    all_gff_records.extend(gff_records)

# Convert to DataFrame and save
final_output_dir = f"{PROCESSED_DATA_DIR}/01.miniprot_mapping"
os.makedirs(final_output_dir, exist_ok=True)
if all_gff_records:
    df_combined = pd.DataFrame(all_gff_records)
    # Sort by seqid (chromosome) and start position with natural sorting
    df_combined['seqid_sort'] = df_combined['seqid'].str.extract(r'(\d+)').astype(float)
    df_combined = df_combined.sort_values(['seqid_sort', 'start']).drop('seqid_sort', axis=1)
    # //todo: 添加一个函数，能够处理 df_comined 中基因座相似的情况。
    # Write to GFF file
    with open(f"{final_output_dir}/{spider}.gff", 'w') as f:
        f.write("##gff-version 3\n")
        df_combined.to_csv(f, sep='\t', header=False, index=False)
else:
    print(f"No GFF records to combine for {spider}")

if not all_csv.empty:
    all_csv.to_csv(f"{final_output_dir}/{spider}.csv", index=False)

[32m2025-10-11 10:40:14.608[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m180[0m - [1mCSV saved to /home/gyk/project/spider_silkome/data/interim/miniprot/Trichonephila_clavata_all/Trichonephila_clavata.mRNA.MaSp3.csv[0m
[32m2025-10-11 10:40:14.608[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m181[0m - [1mTotal combinations: 0[0m
[32m2025-10-11 10:40:14.610[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_gff[0m:[36m326[0m - [1mGFF saved to /home/gyk/project/spider_silkome/data/interim/miniprot/Trichonephila_clavata_all/Trichonephila_clavata.mRNA.MaSp3.combined.gff[0m
[32m2025-10-11 10:40:14.610[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_gff[0m:[36m327[0m - [1mTotal genes predicted: 0[0m
[32m2025-10-11 10:40:14.612[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m180[