# Automated Spidroin Annotation

The goal of this project is to develop an automated pipeline for spidroin annotation in spider genomes.

In [2]:
import os
import subprocess

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    PROCESSED_DATA_DIR,
    EXTERNAL_DATA_DIR,
    REFERENCES_DIR,
    SCRIPTS_DIR,
)
from spider_silkome_module import (
    run_shell_command_with_check,
)

[32m2025-12-02 09:50:54.420[0m | [1mINFO    [0m | [36mspider_silkome_module.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/gyk/project/spider_silkome[0m


## Configure the environment

In [3]:
# Name of this analysis; used as a sub-folder under the interim/processed data roots
project_name = "automated_spidroin_annotation"

# Species to analyze (each entry is used as "<name>.fa" and as a directory name later on)
spiders = [
    "Trichonephila_clavata",
    "Araneus_ventricosus",
    "Evarcha_sp",
    "Heteropoda_venatoria",
    "Hippasa_lycosina",
    "Pandercetes_sp",
    "Pardosa_pseudoannulata",
    "Pholcus_sp",
    "Scorpiops_zhui",
    "Songthela_sp",
]

# Input locations
spider_genome_dir = RAW_DATA_DIR / "spider_genome"
hmmer_model_dir = REFERENCES_DIR / "2025_Schoneberg_data" / "hmmer_nucl_profile_trimmed"
augustus_config_file = EXTERNAL_DATA_DIR / "extrinsic.cfg"  # passed to Augustus in Step 2

# Output locations
interim_project_dir = INTERIM_DATA_DIR / project_name
nhmmer_search_base = interim_project_dir / "nhmmer_search"
spidroin_analysis_output = PROCESSED_DATA_DIR / project_name / "spidroin_analysis"

# One nhmmer result folder per species, plus the shared analysis folder
for spider in spiders:
    spider_nhmmer_dir = nhmmer_search_base / spider
    spider_nhmmer_dir.mkdir(parents=True, exist_ok=True)
spidroin_analysis_output.mkdir(parents=True, exist_ok=True)

print(f"Configured {len(spiders)} species: {spiders}")

Configured 10 species: ['Trichonephila_clavata', 'Araneus_ventricosus', 'Evarcha_sp', 'Heteropoda_venatoria', 'Hippasa_lycosina', 'Pandercetes_sp', 'Pardosa_pseudoannulata', 'Pholcus_sp', 'Scorpiops_zhui', 'Songthela_sp']


## Step 1: Identifying and Sorting Spidroins

In this step, we use nhmmer to search each spider genome with HMM profiles of the spidroin N-/C-terminal domains. The HMMER models were downloaded from this [paper](https://doi.org/10.1111/1755-0998.14038) and its [Zenodo archive](https://doi.org/10.5281/zenodo.13711380).

In [None]:
# Step 1.1: Press HMM models (only once)
for model in hmmer_model_dir.glob("*TD.hmm"):
    # hmmpress writes its binary index next to the input profile as
    # "<hmmfile>.h3m" (plus .h3i/.h3f/.h3p), e.g. "MiSp_NTD.hmm.h3m".
    # Point the output check at that real path instead of a bare
    # "<model_name>.h3m" resolved against the current working directory.
    pressed_index = model.with_name(f"{model.name}.h3m")
    hmmer_press_cmd = f"pixi run hmmpress {model}"
    # NOTE(review): assumes the second argument is the file the helper checks
    # for after (and presumably before) running the command -- confirm.
    run_shell_command_with_check(hmmer_press_cmd, pressed_index)

In [3]:
# Step 1.2: Run nhmmer search for all species
for spider in spiders:
    divider = "=" * 60
    print(f"\n{divider}")
    print(f"Processing: {spider}")
    print(divider)

    spider_genome_file = spider_genome_dir / f"{spider}.fa"
    nhmmer_output_dir = nhmmer_search_base / spider

    if spider_genome_file.exists():
        # Search the genome once per terminal-domain profile ("*TD.hmm")
        for model in hmmer_model_dir.glob("*TD.hmm"):
            model_name = model.name.split('.')[0]
            tbl_file = f"{nhmmer_output_dir}/{model_name}.tbl"
            out_file = f"{nhmmer_output_dir}/{model_name}.out"
            nhmmer_cmd = (
                f"pixi run nhmmer --cpu 70 --tblout {tbl_file} "
                f"{model} {spider_genome_file} > {out_file}"
            )
            # force=True is passed through unchanged from the original call
            run_shell_command_with_check(nhmmer_cmd, out_file, force=True)

        print(f"Completed: {spider}")
    else:
        # Missing assembly: report it and move on to the next species
        print(f"Warning: Genome file not found: {spider_genome_file}")


Processing: Trichonephila_clavata
[32m2025-12-01 18:42:14.791[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: pixi run nhmmer --cpu 70 --tblout /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/nhmmer_search/Trichonephila_clavata/MiSp_NTD.tbl /home/gyk/project/spider_silkome/references/2025_Schoneberg_data/hmmer_nucl_profile_trimmed/MiSp_NTD.hmm /home/gyk/project/spider_silkome/data/raw/spider_genome/Trichonephila_clavata.fa > /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/nhmmer_search/Trichonephila_clavata/MiSp_NTD.out[0m
[32m2025-12-01 18:42:31.729[0m | [32m[1mSUCCESS [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m53[0m - [32m[1mCommand executed successfully, output file: /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/nhmmer_search/Trichonephila_clavata/MiSp_N

In [7]:
# Step 1.3: Analyze all species with spidroin analysis script
# The script will automatically detect all species subdirectories in nhmmer_search_base
# The command is assembled from adjacent f-strings; the spacing between the
# options is kept exactly as before so the logged command line is unchanged.
spidroin_analysis_cmd = (
    f"pixi run python {SCRIPTS_DIR}/analyse_spidroins.py "
    f"    -i {nhmmer_search_base} "
    f"    -p {hmmer_model_dir} "
    f"    --assembly-dir {spider_genome_dir} "
    f"    -o {spidroin_analysis_output}"
)
print(spidroin_analysis_cmd)

# The summary table is the file used to check that the run produced output
spidroins_total_tsv = spidroin_analysis_output / "spidroins_total.tsv"
run_shell_command_with_check(spidroin_analysis_cmd, spidroins_total_tsv, force=True)

pixi run python /home/gyk/project/spider_silkome/scripts/analyse_spidroins.py     -i /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/nhmmer_search     -p /home/gyk/project/spider_silkome/references/2025_Schoneberg_data/hmmer_nucl_profile_trimmed     --assembly-dir /home/gyk/project/spider_silkome/data/raw/spider_genome     -o /home/gyk/project/spider_silkome/data/processed/automated_spidroin_annotation/spidroin_analysis
[32m2025-12-02 10:22:19.123[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: pixi run python /home/gyk/project/spider_silkome/scripts/analyse_spidroins.py     -i /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/nhmmer_search     -p /home/gyk/project/spider_silkome/references/2025_Schoneberg_data/hmmer_nucl_profile_trimmed     --assembly-dir /home/gyk/project/spider_silkome/data/raw/spider_genome     -o /home/gyk/project/spider_

  summary_table = pd.concat([
  summary_table = pd.concat([
  summary_table = pd.concat([
  summary_table = pd.concat([
  summary_table = pd.concat([


Reading HMMER-Tables for Pandercetes_sp
Reading HMMER-Tables for Trichonephila_clavata


  summary_table = pd.concat([
  summary_table = pd.concat([
  summary_table = pd.concat([
  summary_table = pd.concat([


Reading HMMER-Tables for Pardosa_pseudoannulata
Reading HMMER-Tables for Pholcus_sp
Reading HMMER-Tables for Hippasa_lycosina


  summary_table = pd.concat([
  summary_table = pd.concat([


Reading HMMER-Tables for Heteropoda_venatoria
Reading HMMER-Tables for Araneus_ventricosus


  summary_table = pd.concat([


Reading HMMER-Tables for Songthela_sp
Finished reading HMMER-Tables
Writing summary table
Getting HMM profile lengths
Applying filters...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("

Found 330 spidroin hits after filtering
Removing Spidroin hits originating from unclassified spidroins


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("

Generating plots...


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("E-value", inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  duplicates.sort_values("

Number of hits without second best: 0
Joining Terminal Domains...


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctds["distance"] = abs(ctds["ali_from"] - ntd["ali_from"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctds["distance"] = abs(ctds["ali_from"] - ntd["ali_from"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ctds["distance"] = abs(ctds["ali_from"] - ntd["ali_from"])
A value is trying to be set 

Found 130 joined spidroin genes, 58 singletons
Extracting spidroin sequences...
Extracting spidroin sequences for Hippasa_lycosina
Assembly: /home/gyk/project/spider_silkome/data/raw/spider_genome/Hippasa_lycosina.fa
Chr03: 182883559 bp
Chr03: 182883559 bp
Chr03: 182883559 bp
Chr04: 180112200 bp
Chr04: 180112200 bp
Chr04: 180112200 bp
Chr04: 180112200 bp
Chr04: 180112200 bp
Chr06: 172433735 bp
Chr06: 172433735 bp
Chr06: 172433735 bp
Chr06: 172433735 bp
Extracting spidroin sequences for Pandercetes_sp
Assembly: /home/gyk/project/spider_silkome/data/raw/spider_genome/Pandercetes_sp.fa
Chr08: 227776469 bp
Chr13: 208702017 bp
Chr13: 208702017 bp
Chr13: 208702017 bp
Chr13: 208702017 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Chr15: 195731055 bp
Extracting spidroin sequences for Trichonephila_clavata
Assembly: /hom

True

## Step 2: Spidroin Gene Prediction with Augustus

Extract candidate spidroin sequences and predict gene structures using Augustus.

In [None]:
from Bio import SeqIO

# Input: all spider spidroin sequences from spidroin analysis (fasta file)
spidroin_fasta_path = spidroin_analysis_output / "spidroin_sequences.fasta"
if not spidroin_fasta_path.exists():
    # Raise instead of calling exit(): inside a notebook exit() ends the kernel
    # session rather than reporting an error, and the old message did not say
    # which file was missing.
    raise FileNotFoundError(f"No spidroin sequences file found: {spidroin_fasta_path}")

# Load all sequences into list (can iterate multiple times)
all_spidroin_sequences = list(SeqIO.parse(spidroin_fasta_path, "fasta"))

# Step 2: Extract spidroin sequences and make a hints.gff for the genes prediction with Augustus
for spider in spiders:
    print(f"\n{'='*60}")
    print(f"Gene prediction for: {spider}")
    print(f"{'='*60}")

    spider_output_dir = INTERIM_DATA_DIR / project_name / spider
    os.makedirs(spider_output_dir, exist_ok=True)
    candidate_gene_seq = spider_output_dir / "spidroin_seqs.fa"
    hints_gff = spider_output_dir / "hints.gff"

    # Filter sequences for current spider.
    # NOTE(review): this is a prefix match on the record id, so a species whose
    # name is a prefix of another species' name would also pick up that one's
    # records -- confirm the id format written by analyse_spidroins.py.
    spider_records = [r for r in all_spidroin_sequences if r.id.startswith(spider)]

    if not spider_records:
        print(f"No spidroin sequences found for {spider}, skipping...")
        continue

    # Write all sequences for this spider at once
    SeqIO.write(spider_records, candidate_gene_seq, "fasta")

    # One start hint (first three bases) and one stop hint (last three bases)
    # per candidate sequence; coordinates are 1-based and inclusive.
    with open(hints_gff, "w") as file:
        for record in spider_records:
            seqname = record.id
            seq_len = len(record.seq)
            start_hints = f"{seqname}\tmanual\tstart\t1\t3\t.\t+\t0\tgrp=1;pri=4;src=M"
            stop_hints = f"{seqname}\tmanual\tstop\t{seq_len-2}\t{seq_len}\t.\t+\t0\tgrp=1;pri=4;src=M"
            file.write(start_hints + "\n")
            file.write(stop_hints + "\n")

    # Run Augustus for gene prediction
    augustus_output = spider_output_dir / "augustus_output.gff"
    augustus_cmd = f"pixi run augustus --strand=forward --singlestrand=true \
        --extrinsicCfgFile={augustus_config_file} \
        --alternatives-from-evidence=true --gff3=on --uniqueGeneId=true \
        --genemodel=exactlyone --hintsfile={hints_gff} \
        --UTR=off --species=parasteatoda {candidate_gene_seq} > {augustus_output}"

    # No force=True here, unlike the nhmmer and analysis steps
    run_shell_command_with_check(augustus_cmd, augustus_output)
    print(f"Completed gene prediction for: {spider}")


Gene prediction for: Trichonephila_clavata
[32m2025-12-02 09:57:43.437[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: pixi run augustus --strand=forward --singlestrand=true         --extrinsicCfgFile=/home/gyk/project/spider_silkome/data/external/extrinsic.cfg         --alternatives-from-evidence=true --gff3=on --uniqueGeneId=true         --genemodel=exactlyone --hintsfile=/home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/hints.gff         --UTR=off --species=parasteatoda /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/spidroin_seqs.fa > /home/gyk/project/spider_silkome/data/interim/automated_spidroin_annotation/Trichonephila_clavata/augustus_output.gff[0m
[32m2025-12-02 09:58:06.229[0m | [32m[1mSUCCESS [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m5

## Results Summary

Check the output files for all analyzed species.

In [None]:
# Display results summary
import pandas as pd

banner = "=" * 60

print(banner)
print("ANALYSIS RESULTS SUMMARY")
print(banner)

# Each section below is reported only if its table exists in the analysis output folder.

# All hits: total count, species present, and per-species counts
summary_file = spidroin_analysis_output / "spidroins_total.tsv"
if summary_file.exists():
    df = pd.read_csv(summary_file, sep="\t")
    print(f"\nTotal HMMER hits: {len(df)}")
    print(f"Species analyzed: {df['Species'].unique().tolist()}")
    print("\nHits per species:")
    print(df.groupby('Species').size())

# Hits remaining after filtering
filtered_file = spidroin_analysis_output / "spidroins_filtered.tsv"
if filtered_file.exists():
    df_filtered = pd.read_csv(filtered_file, sep="\t")
    print(f"\nFiltered spidroin hits: {len(df_filtered)}")

# Joined terminal domains: number of genes and breakdown by spidroin type
joined_file = spidroin_analysis_output / "joined_domains.tsv"
if joined_file.exists():
    df_joined = pd.read_csv(joined_file, sep="\t")
    print(f"\nJoined spidroin genes: {len(df_joined)}")
    print("\nSpidroin types found:")
    print(df_joined['spidroin_type'].value_counts())

print(f"\n{banner}")
print("Output files location:")
print(f"  {spidroin_analysis_output}")
print(banner)