# Automated Spidroin Annotation

This project is based on the GitHub repository [Spidroins](https://github.com/amandamarkee/spidroins). Its goal is to develop an automated pipeline for annotating spidroins in spider genomes.

In [1]:
import os
import json
import pandas as pd

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    PROCESSED_DATA_DIR,
    EXTERNAL_DATA_DIR,
    SCRIPTS_DIR
)
from spider_silkome_module import (
    run_shell_command_with_check,
)

[32m2025-11-28 14:10:10.927[0m | [1mINFO    [0m | [36mspider_silkome_module.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/gyk/project/spider_silkome[0m


## Configure the environment

In [4]:
# --- Run identity ---------------------------------------------------------
project_name = "automated_spidroin_annotation"
spider = "Trichonephila_clavata"

# Every intermediate artefact for this species is written below one folder.
spider_interim_dir = INTERIM_DATA_DIR / project_name / spider
os.makedirs(spider_interim_dir, exist_ok=True)

# --- Inputs ---------------------------------------------------------------
spider_genome_file = RAW_DATA_DIR / f"spider_genome/{spider}.fa.gz"
spidroin_fasta_file = EXTERNAL_DATA_DIR / "spider-silkome-database.v1.prot.fixed.fasta"
miniprot_to_coordinates_script = SCRIPTS_DIR / "miniprot_to_coordinates.py"

# --- miniprot index and alignment outputs ---------------------------------
# NOTE: `gnome_mpi` is the genome index path; the name is kept because the
# alignment cell refers to it.
gnome_mpi = spider_interim_dir / f"{spider}.mpi"
miniprot_out = spider_interim_dir / "miniprot_out.gff"
# with_suffix() swaps only the last extension:
#   miniprot_out.gff -> miniprot_out.mRNA.gff -> miniprot_out.mRNA.expanded.csv
miniprot_out_mRNA = miniprot_out.with_suffix(".mRNA.gff")
miniprot_out_mRNA_expanded = miniprot_out_mRNA.with_suffix(".expanded.csv")

# --- Outputs of miniprot_to_coordinates.py --------------------------------
miniprot_out_dedup = spider_interim_dir / "miniprot_out_dedup.tsv"
miniprot_out_paired = spider_interim_dir / "miniprot_out_paired.bed"
miniprot_out_unpaired = spider_interim_dir / "miniprot_out_unpaired.bed"

# --- Parameters forwarded to miniprot_to_coordinates.py -------------------
# Each value is passed through as the command-line flag of the same name
# (--bin-size, --pad, --min-distance, --max-distance, --min-positive,
# --ref-fasta). Units are presumably base pairs for the first four and a
# fraction for min_positive -- TODO confirm against the script.
bin_size = 500
pad = 1000
min_distance = 15000
max_distance = 90000
min_positive = 0.75
ref_fasta = spider_genome_file

## Step 1: Identifying and Sorting Spidroins

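In this step, we use miniprot to align the spidroin N/C-terminal protein sequences to the spider genome. The genome is indexed first, the alignments are written in GFF format, and the `mRNA` records are extracted and sorted by sequence ID and start coordinate. Finally, `miniprot_to_coordinates.py` converts these records into the deduplicated, paired and unpaired coordinate files.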

In [5]:
# Each call below hands run_shell_command_with_check a shell command plus the
# file that command is expected to produce. Per its log output, the helper
# skips execution when that file already exists.

# Single source of truth for the miniprot thread count (was hard-coded twice).
n_threads = 70

# 1) Build the miniprot index of the genome.
index_cmd = f"miniprot -t {n_threads} -d {gnome_mpi} {spider_genome_file}"
run_shell_command_with_check(index_cmd, gnome_mpi)

# 2) Align the spidroin proteins against the index and write GFF.
align_cmd = (
    f"miniprot -t {n_threads} -I --gff {gnome_mpi} {spidroin_fasta_file} "
    f"> {miniprot_out}"
)
run_shell_command_with_check(align_cmd, miniprot_out)

# 3) Keep only records whose feature type (GFF column 3) is exactly "mRNA",
#    then sort by sequence ID (version sort) and start coordinate (numeric).
#    A plain `grep 'mRNA'` would also keep any line that merely contains the
#    substring, e.g. inside an attribute or a "##PAF" line.
#    The variable name `grep_cmd` is kept for backward compatibility.
grep_cmd = (
    f"awk -F'\\t' '$3 == \"mRNA\"' {miniprot_out} "
    f"| sort -k1,1V -k4,4n > {miniprot_out_mRNA}"
)
run_shell_command_with_check(grep_cmd, miniprot_out_mRNA)

# 4) Convert the mRNA records into dedup / paired / unpaired coordinate files.
miniprot_to_coordinates_cmd = (
    f"python {miniprot_to_coordinates_script}"
    f" --miniprot-gff {miniprot_out_mRNA}"
    f" --out-dedup {miniprot_out_dedup}"
    f" --out-paired {miniprot_out_paired}"
    f" --out-unpaired {miniprot_out_unpaired}"
    f" --bin-size {bin_size}"
    f" --pad {pad}"
    f" --min-distance {min_distance}"
    f" --max-distance {max_distance}"
    f" --ref-fasta {ref_fasta}"
    f" --min-positive {min_positive}"
)
# force=True presumably re-runs the script even when the dedup file already
# exists, so parameter changes take effect -- TODO confirm in the helper.
run_shell_command_with_check(miniprot_to_coordinates_cmd, miniprot_out_dedup, force=True)

[32m2025-11-28 14:11:56.029[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m46[0m - [1mThe output file already exists, skipping execution: /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/Trichonephila_clavata.mpi[0m
[32m2025-11-28 14:11:56.030[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m46[0m - [1mThe output file already exists, skipping execution: /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/miniprot_out.gff[0m
[32m2025-11-28 14:11:56.030[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m46[0m - [1mThe output file already exists, skipping execution: /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/miniprot_out.mRNA.gff[0m
[32m2025-11-28 14:11:5

True

Organize and sort the output files of miniprot to facilitate the next step of processing.

<zh>对 miniprot 的输出文件进行整理和排序，方便下一步的整理</zh>

In [None]:
# Column names of the nine tab-separated GFF fields.
gff_header = ["seqid", "source", "type", "start", "end", "score", "strand", "frame", "attributes"]

# The filtered GFF has no header row. header=None stops pandas from using the
# first mRNA record as column labels (which would silently drop that record
# once the columns are renamed).
miniprot_out_mRNA_table = pd.read_csv(
    miniprot_out_mRNA,
    sep="\t",
    header=None,
    names=gff_header,
)

def parse_attributes(attr):
    """Parse a GFF attribute string into a ``{key: value}`` dict.

    Parameters
    ----------
    attr : str
        Semicolon-separated ``key=value`` pairs, e.g. ``"ID=MP1;Rank=1"``.

    Returns
    -------
    dict
        One entry per ``key=value`` field. Fields without ``=`` are ignored.
        A non-string input (such as NaN for a missing cell) yields ``{}``.
    """
    if not isinstance(attr, str):
        return {}
    # Split on the first "=" only, so values that themselves contain "="
    # stay intact instead of breaking the key/value unpacking.
    pairs = (field.split("=", 1) for field in attr.split(";") if "=" in field)
    return {key: value for key, value in pairs}

# Turn every attribute string into a dict, then build the attribute table in
# one go. Constructing a single DataFrame from the list of dicts avoids the
# per-row Series creation of `.apply(pd.Series)`, which is very slow on large
# tables. Keys missing from a record become NaN; columns appear in the order
# the keys are first seen.
attr_records = miniprot_out_mRNA_table["attributes"].apply(parse_attributes).tolist()
df_attrs = pd.DataFrame(attr_records, index=miniprot_out_mRNA_table.index)

# Replace the raw attributes column by its expanded key/value columns.
miniprot_out_mRNA_table_expanded = (
    miniprot_out_mRNA_table
    .drop(columns=["attributes"])
    .join(df_attrs)
)

# Persist the expanded table for the next processing step.
miniprot_out_mRNA_table_expanded.to_csv(miniprot_out_mRNA_expanded, index=False)