# Miniprot mapping


## Configure

Import the project data directories and the `run_cmd` helper used to launch external commands.


In [None]:
from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    EXTERNAL_DATA_DIR,
    PROCESSED_DATA_DIR
)
from spider_silkome_module.utils import run_cmd

Define the paths of the input data and of the output directories used in this notebook.

In [None]:
# Identifier of this run; all intermediate and final outputs are grouped under it.
task_name = "miniprot_mapping_20260209"

# Per-task roots for intermediate and processed results.
task_interim_dir = INTERIM_DATA_DIR / task_name
task_processed_dir = PROCESSED_DATA_DIR / task_name

# Inputs: the spidroin protein FASTA and the directory passed to the mapping step
# as its genome input path.
spidroin_fasta_file = EXTERNAL_DATA_DIR / "spider-silkome-database.v1.prot.fixed.renamed.fasta"
spider_genome_path = RAW_DATA_DIR / "01.ref_gff"

# dedup: cd-hit working directory, its representative sequences, and the
# shortest-sequence FASTA derived from the clusters.
dedup_output_dir = task_interim_dir / "cdhit"
spidroin_fasta_file_rep = dedup_output_dir / "cdhit_rep_seq.fa"
shortest_seq_file = dedup_output_dir / "cdhit_shortest_seq.fa"

# miniprot: genome index directory and alignment output directory.
# NOTE: the "gnome" spelling is kept because later cells reference this name.
gnome_mpi_path = task_interim_dir / "gnome_mpi"
miniprot_output_path = task_processed_dir / "miniprot_output"



## cd-hit
Reduce redundancy among the N-/C-terminal sequences of the spidroin proteins with cd-hit, then run the `extract_shortest_seq` step on the resulting clusters.

In [None]:
# Make sure the cd-hit output directory exists before anything is written into it.
# (Idempotent; protects a fresh checkout in case run_cmd does not create it.)
dedup_output_dir.mkdir(parents=True, exist_ok=True)

# cd-hit: cluster the spidroin protein sequences.
#   -c 0.9  sequence identity threshold
#   -T 0    use all available CPU threads
#   -M 0    no memory limit
#   -d 0    keep the sequence name up to the first whitespace in the .clstr file
# The second argument lists the file this step is expected to produce.
run_cmd(
    f"pixi run cd-hit -i {spidroin_fasta_file} -o {spidroin_fasta_file_rep} -c 0.9 -T 0 -M 0 -d 0",
    [spidroin_fasta_file_rep]
)

# extract shortest seq: reads the cluster file written by cd-hit next to its
# output ("<output>.clstr") together with the original FASTA, and writes
# shortest_seq_file.
run_cmd(
    f"pixi run python -m spider_silkome_module.extract_shortest_seq \
        --clstr-path {spidroin_fasta_file_rep}.clstr \
        --fasta-path {spidroin_fasta_file} \
        --output-path {shortest_seq_file}",
    [shortest_seq_file]
)

## miniprot

Use miniprot to align the N-terminal and C-terminal spidroin protein sequences to the genomes of the new species.

In [None]:
# Number of threads handed to miniprot for both indexing and alignment.
miniprot_threads = 70

# File-name endings treated as genome FASTA files.
# NOTE(review): assumed naming convention — adjust if the genomes in
# spider_genome_path use other extensions.
genome_fasta_suffixes = (".fa", ".fasta", ".fna", ".fa.gz", ".fasta.gz", ".fna.gz")

# Collect the genomes to index. The original cell interpolated `spider` and
# `spider_genome` without ever defining them, which raised NameError on a
# fresh kernel; they are now derived from the files found on disk.
genome_fasta_files = sorted(
    path
    for path in spider_genome_path.iterdir()
    if path.is_file() and path.name.endswith(genome_fasta_suffixes)
)
if not genome_fasta_files:
    raise FileNotFoundError(
        f"No genome FASTA files ({', '.join(genome_fasta_suffixes)}) found in {spider_genome_path}"
    )

# miniprot writes the index to the given file path, so the directory must exist.
gnome_mpi_path.mkdir(parents=True, exist_ok=True)

# genome index: one .mpi file per genome, named after the genome file without
# its FASTA suffix.
for spider_genome in genome_fasta_files:
    spider = next(
        spider_genome.name[: -len(suffix)]
        for suffix in genome_fasta_suffixes
        if spider_genome.name.endswith(suffix)
    )
    genome_index_file = gnome_mpi_path / f"{spider}.mpi"
    # The expected output is the individual index file rather than the whole
    # directory, so that each genome is tracked separately.
    run_cmd(
        f"pixi run miniprot -t{miniprot_threads} -d {genome_index_file} {spider_genome}",
        [genome_index_file]
    )


# miniprot: align the shortest representative spidroin sequences against the
# genomes under spider_genome_path.
# NOTE(review): this command is not given gnome_mpi_path — confirm whether
# run_miniprot makes use of the indexes built above.
# NOTE(review): --outc 0.8 is presumably forwarded to miniprot's --outc
# (minimum aligned fraction of the query) — confirm in run_miniprot.
run_cmd(
    f"pixi run python -m spider_silkome_module.run_miniprot \
        --input-path {spider_genome_path} \
        --protein-fasta {shortest_seq_file} \
        --output-path {miniprot_output_path} \
        --threads {miniprot_threads} --outc 0.8 --force",
    [miniprot_output_path]
)