# Automated Spidroin Annotation

The goal of this project is to develop an automated pipeline for spidroin annotation in spider genomes.

In [None]:
import os
import subprocess

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    PROCESSED_DATA_DIR,
    EXTERNAL_DATA_DIR,
    REFERENCES_DIR,
    SCRIPTS_DIR,
)
from spider_silkome_module import (
    run_shell_command_with_check,
)

## Configure the environment

In [None]:
project_name = "automated_spidroin_annotation"

# Species to analyse; each name is also used as the per-species directory name
spiders = [
    "Trichonephila_clavata",
    "Araneus_ventricosus",
    "Evarcha_sp",
    "Heteropoda_venatoria",
    "Hippasa_lycosina",
    "Pandercetes_sp",
    "Pardosa_pseudoannulata",
    "Pholcus_sp",
    "Scorpiops_zhui",
    "Songthela_sp",
]

# Locations shared by every step of the pipeline
spider_genome_dir = RAW_DATA_DIR / "spider_genome"
hmmer_model_dir = REFERENCES_DIR / "2025_Schoneberg_data" / "hmmer_nucl_profile_trimmed"
nhmmer_search_base = INTERIM_DATA_DIR / project_name / "nhmmer_search"
spidroin_analysis_output = PROCESSED_DATA_DIR / project_name / "spidroin_analysis"

# Extrinsic configuration file handed to Augustus in Step 2
augustus_config_file = EXTERNAL_DATA_DIR / "extrinsic.cfg"

# Ensure one nhmmer output directory per species, plus the analysis output directory
for spider in spiders:
    (nhmmer_search_base / spider).mkdir(parents=True, exist_ok=True)
spidroin_analysis_output.mkdir(parents=True, exist_ok=True)

print(f"Configured {len(spiders)} species: {spiders}")

## Step 1: Identifying and Sorting Spidroins

In this step, we use nhmmer to search each spider genome with the HMM profiles of the spidroin N/C-terminal domains. The HMM profiles were downloaded from this [paper](https://doi.org/10.1111/1755-0998.14038) and the accompanying [Zenodo link](https://doi.org/10.5281/zenodo.13711380).

In [None]:
# Step 1.1: Press HMM models (only once)
# hmmpress writes its binary index files next to the model itself, named
# "<model file>.h3m" (plus .h3i/.h3f/.h3p). The expected-output path must
# therefore be the model's full path with ".h3m" appended, not a bare
# "<name>.h3m" resolved against the current working directory.
# NOTE(review): assumes the second argument of run_shell_command_with_check is
# the output path used to decide whether the command has already produced its
# result -- confirm against the helper's implementation.
for model in hmmer_model_dir.glob("*TD.hmm"):
    pressed_index = model.with_name(f"{model.name}.h3m")
    hmmer_press_cmd = f"pixi run hmmpress {model}"
    run_shell_command_with_check(hmmer_press_cmd, pressed_index)

In [None]:
# Step 1.2: Search every species' genome with each terminal-domain HMM using nhmmer
banner = "=" * 60
for spider in spiders:
    print(f"\n{banner}")
    print(f"Processing: {spider}")
    print(banner)

    spider_genome_file = spider_genome_dir / f"{spider}.fa"
    nhmmer_output_dir = nhmmer_search_base / spider

    if spider_genome_file.exists():
        # One nhmmer run per model; the table goes to <model>.tbl and stdout to <model>.out
        for model in hmmer_model_dir.glob("*TD.hmm"):
            model_name = model.stem.split('.')[0]
            search_stdout = f"{nhmmer_output_dir}/{model_name}.out"
            nhmmer_cmd = f"pixi run nhmmer --cpu 70 --tblout {nhmmer_output_dir}/{model_name}.tbl {model} {spider_genome_file} > {search_stdout}"
            run_shell_command_with_check(nhmmer_cmd, search_stdout, force=True)

        print(f"Completed: {spider}")
    else:
        # No assembly for this species: report it and move on to the next one
        print(f"Warning: Genome file not found: {spider_genome_file}")

In [None]:
# Step 1.3: Run the spidroin analysis script once over all species
# The script is given the nhmmer base directory and is expected to pick up
# each species subdirectory on its own.
# The leading spaces inside each fragment are kept so the command string is
# character-for-character the same as the line-continued original.
spidroin_analysis_cmd = (
    f"pixi run python {SCRIPTS_DIR}/analyse_spidroins.py "
    f"    -i {nhmmer_search_base} "
    f"    -p {hmmer_model_dir} "
    f"    --assembly-dir {spider_genome_dir} "
    f"    -o {spidroin_analysis_output}"
)
print(spidroin_analysis_cmd)

analysis_summary_table = spidroin_analysis_output / "spidroins_total.tsv"
run_shell_command_with_check(spidroin_analysis_cmd, analysis_summary_table, force=True)

## Step 2: Spidroin Gene Prediction with Augustus

Extract candidate spidroin sequences and predict gene structures using Augustus.

In [None]:
from Bio import SeqIO

# Input: all spider spidroin sequences from spidroin analysis (fasta file)
spidroin_fasta_path = spidroin_analysis_output / "spidroin_sequences.fasta"
if not spidroin_fasta_path.exists():
    # Raise a real exception instead of calling exit(1): exit() is a helper
    # injected by the `site` module for interactive shells, and the old message
    # did not say which file was missing. The exception stops the cell and
    # reports the exact path that was expected.
    raise FileNotFoundError(
        f"No spidroin sequences file found: {spidroin_fasta_path}"
    )

# Load all sequences into a list so they can be iterated once per species
all_spidroin_sequences = list(SeqIO.parse(spidroin_fasta_path, "fasta"))

# Step 2: For each species, write its candidate spidroin sequences and a matching
# hints.gff, then predict gene structures on those sequences with Augustus
separator = "=" * 60
for spider in spiders:
    print(f"\n{separator}")
    print(f"Gene prediction for: {spider}")
    print(separator)

    spider_output_dir = INTERIM_DATA_DIR / project_name / spider
    os.makedirs(spider_output_dir, exist_ok=True)
    candidate_gene_seq = spider_output_dir / "spidroin_seqs.fa"
    hints_gff = spider_output_dir / "hints.gff"
    augustus_output = spider_output_dir / "augustus_output.gff"

    # Keep only the records whose ID begins with this species name
    spider_records = [
        record for record in all_spidroin_sequences if record.id.startswith(spider)
    ]
    if not spider_records:
        print(f"No spidroin sequences found for {spider}, skipping...")
        continue

    # One FASTA per species holding all of its candidate sequences
    SeqIO.write(spider_records, candidate_gene_seq, "fasta")

    # Two hints per sequence: a start hint on bases 1-3 and a stop hint on the last three bases
    hint_lines = []
    for record in spider_records:
        seq_length = len(record.seq)
        hint_lines.append(
            f"{record.id}\tmanual\tstart\t1\t3\t.\t+\t0\tgrp=1;pri=4;src=M\n"
        )
        hint_lines.append(
            f"{record.id}\tmanual\tstop\t{seq_length - 2}\t{seq_length}\t.\t+\t0\tgrp=1;pri=4;src=M\n"
        )
    with open(hints_gff, "w") as hints_handle:
        hints_handle.writelines(hint_lines)

    # Run Augustus for gene prediction; the leading spaces inside each fragment
    # are kept so the command string matches the line-continued original exactly
    augustus_cmd = (
        "pixi run augustus --strand=forward --singlestrand=true "
        f"        --extrinsicCfgFile={augustus_config_file} "
        "        --alternatives-from-evidence=true --gff3=on --uniqueGeneId=true "
        f"        --genemodel=exactlyone --hintsfile={hints_gff} "
        f"        --UTR=off --species=parasteatoda {candidate_gene_seq} > {augustus_output}"
    )

    run_shell_command_with_check(augustus_cmd, augustus_output)
    print(f"Completed gene prediction for: {spider}")

## Repeat analysis

In [None]:
# Protein sequence extraction from the Augustus predictions
for spider in spiders:
    spider_output_dir = INTERIM_DATA_DIR / project_name / spider
    augustus_output = spider_output_dir / "augustus_output.gff"
    candidate_gene_seq = spider_output_dir / "spidroin_seqs.fa"

    # Step 2 skips species without candidate spidroin sequences, so no Augustus
    # GFF exists for them. Skip those here as well instead of running
    # getAnnoFasta.pl on a missing file.
    if not augustus_output.exists():
        print(f"Warning: Augustus output not found: {augustus_output}, skipping...")
        continue

    # Extract protein sequences using getAnnoFasta.pl; the expected result is
    # the GFF path with its extension replaced by ".aa"
    protein_extraction_cmd = f"pixi run getAnnoFasta.pl --seqfile={candidate_gene_seq} {augustus_output}"
    print(protein_extraction_cmd)
    run_shell_command_with_check(protein_extraction_cmd, augustus_output.with_suffix('.aa'))
