# DRS Mapping
Use IsoQuant software to align DRS data to the genome, calibrate it based on second-generation sequencing, and then obtain transcript boundary information.

<zh>使用 IsoQuant 软件把 DRS 数据比对到基因组上，并且根据二代测序进行校准，然后获取转录本边界信息。</zh>

In [1]:
import os

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    PROJ_ROOT,
    PROCESSED_DATA_DIR,
)
from spider_silkome_module import (
    run_shell_command_with_check,
)

[32m2025-11-13 09:36:59.837[0m | [1mINFO    [0m | [36mspider_silkome_module.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/gyk/project/spider_silkome[0m


## Trichonephila_clavata

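Some errors occurred when running DRS mapping with the original annotation, so the GFF file must be fixed with AGAT before running the next step. The cell below keeps only `gene`, `mRNA`, `exon`, and `CDS` features and then standardizes the filtered file with `agat_convert_sp_gxf2gxf.pl` (this note originally named `agat_sp_fix_features_locations_duplicated.pl`; the command actually run is the one in the cell).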

In [None]:

!awk -F'\t' '$3 ~ /^(gene|mRNA|exon|CDS)$/' /home/gyk/project/spider_silkome/data/raw/spider_genome/Trichonephila_clavata.gff > /home/gyk/project/spider_silkome/data/raw/spider_genome/Trichonephila_clavata_fixed.gff
!pixi run --environment agat agat_convert_sp_gxf2gxf.pl \
    --gff /home/gyk/project/spider_silkome/data/raw/spider_genome/Trichonephila_clavata_fixed.gff \
    --output /home/gyk/project/spider_silkome/data/raw/spider_genome/Trichonephila_clavata_fixed_agat.gff


Note: if the run fails with `ValueError: Duplicate ID xxxxxx`, fix the duplicated ID in the GFF file manually and run again.

In [None]:
# Run IsoQuant for Trichonephila_clavata using the AGAT-cleaned annotation.
genome_file = f"{RAW_DATA_DIR}/spider_genome/Trichonephila_clavata.fa"
# The cleaned annotation sits next to the genome with a "_fixed_agat.gff" suffix.
gene_annotation_file = genome_file.replace(".fa", "_fixed_agat.gff")
fastq_file = f"{RAW_DATA_DIR}/Tclav-F1_Ar-28-Trcl-f/pass.fq.gz"
# Use the same "<species>_isoquant" directory name as every other species: the
# GTF-to-GFF conversion cell reads Trichonephila_clavata_isoquant/OUT/OUT.transcript_models.gtf,
# which the previous output directory ("isoquant") would never produce.
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Trichonephila_clavata_isoquant"
isoquant_cmd = (
    f"isoquant.py --genedb {gene_annotation_file} "
    f"--reference {genome_file} "
    f"--fastq {fastq_file} "
    f"--data_type nanopore -o {isoquant_output_dir}"
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Araneus_ventricosus

In [2]:
# Araneus_ventricosus: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Araneus_ventricosus.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff")  # annotation sits next to the genome
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Arve-27-f1/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Araneus_ventricosus_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

[32m2025-11-07 17:09:37.282[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: isoquant.py --genedb /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Araneus_ventricosus.gff --reference /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Araneus_ventricosus.fa --fastq /home/gyk/project/spider_silkome/data/raw/BC202407614-ONT-DRS-10samples/Arve-27-f1/pass.fq.gz --data_type nanopore -o /home/gyk/project/spider_silkome/data/interim/03.DRS_mapping/Araneus_ventricosus_isoquant[0m
2025-11-07 17:09:39,614 - INFO - Running IsoQuant version 3.9.0
2025-11-07 17:09:39,651 - INFO - Novel unspliced transcripts will not be reported, set --report_novel_unspliced true to discover them
2025-11-07 17:09:39,652 - INFO -  === IsoQuant pipeline started === 
2025-11-07 17:09:39,652 - INFO - Python version: 3.10.18 | packaged by conda-forge | (main, Jun  4 2025, 14:45:41) [GCC 13.3.0]
2025-11-07 1

True

## IsoQuant batch run

In [None]:
# Batch IsoQuant run for every species whose annotation can be used as-is.
# Trichonephila_clavata and Pardosa_pseudoannulata are run manually in their own cells,
# because their GFF files are not in the correct format (e.g. duplicate IDs).
spiders = [
    "Araneus_ventricosus",
    "Evarcha_sp",
    "Heteropoda_venatoria",
    "Hippasa_lycosina",
    "Pandercetes_sp",
    "Pholcus_sp",
    "Scorpiops_zhui",
    "Songthela_sp",
]
for spider in spiders:
    genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/{spider}.fa"
    # The per-species cells of this notebook use a ".gff3" annotation for some species
    # (Evarcha_sp, Hippasa_lycosina, Pandercetes_sp, Pholcus_sp). Prefer ".gff" as before,
    # but fall back to ".gff3" when only that file exists.
    gene_annotation_file = genome_file.replace(".fa", ".gff")
    gff3_candidate = genome_file.replace(".fa", ".gff3")
    if not os.path.exists(gene_annotation_file) and os.path.exists(gff3_candidate):
        gene_annotation_file = gff3_candidate
    # NOTE(review): the per-species cells read FASTQ files from sample-named directories
    # (e.g. "Arve-27-f1"); confirm that species-named directories exist for every entry.
    fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/{spider}/pass.fq.gz"
    isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/{spider}_isoquant"
    isoquant_cmd = (
        f"isoquant.py --genedb {gene_annotation_file} "
        f"--reference {genome_file} "
        f"--fastq {fastq_file} "
        f"--data_type nanopore -o {isoquant_output_dir}"
    )
    run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

Convert the IsoQuant transcript-model GTF files to GFF files with `gffread`.

In [3]:
# Convert each species' IsoQuant transcript models (GTF) into a GFF file with gffread.
spiders = [
    "Trichonephila_clavata",
    "Araneus_ventricosus",
    "Evarcha_sp",
    "Heteropoda_venatoria",
    "Hippasa_lycosina",
    "Pandercetes_sp",
    "Pardosa_pseudoannulata",
    "Pholcus_sp",
    "Scorpiops_zhui",
    "Songthela_sp",
]
# Make sure the destination directory exists before any conversion writes into it.
os.makedirs(f"{PROCESSED_DATA_DIR}/DRS_mapping", exist_ok=True)
for spider in spiders:
    gtf_file = f"{INTERIM_DATA_DIR}/03.DRS_mapping/{spider}_isoquant/OUT/OUT.transcript_models.gtf"
    gff_file = f"{PROCESSED_DATA_DIR}/DRS_mapping/{spider}_isoquant.transcript_models.gff"
    cmd = " ".join(["pixi", "run", "gffread", gtf_file, "-o", gff_file])
    run_shell_command_with_check(cmd, gff_file)


[32m2025-11-13 09:39:06.722[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: pixi run gffread /home/gyk/project/spider_silkome/data/interim/03.DRS_mapping/Trichonephila_clavata_isoquant/OUT/OUT.transcript_models.gtf -o /home/gyk/project/spider_silkome/data/processed/DRS_mapping/Trichonephila_clavata_isoquant.transcript_models.gff[0m
[32m2025-11-13 09:39:08.346[0m | [32m[1mSUCCESS [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m53[0m - [32m[1mCommand executed successfully, output file: /home/gyk/project/spider_silkome/data/processed/DRS_mapping/Trichonephila_clavata_isoquant.transcript_models.gff[0m
[32m2025-11-13 09:39:08.346[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: pixi run gffread /home/gyk/project/spider_silkome/data/interim/03.DRS_mapping/Araneus_

## Evarcha_sp

In [None]:
# Evarcha_sp: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Evarcha_sp.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff3")  # this species uses a .gff3 annotation
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Evsp-Sa-11-Evhu-f1/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Evarcha_sp_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Heteropoda_venatoria

In [None]:
# Heteropoda_venatoria: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Heteropoda_venatoria.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff")  # annotation sits next to the genome
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Heve_Sp-15-Heve-f/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Heteropoda_venatoria_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Hippasa_lycosina

In [None]:
# Hippasa_lycosina: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Hippasa_lycosina.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff3")  # this species uses a .gff3 annotation
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Hily-LLy-37-Hily-m1/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Hippasa_lycosina_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Pandercetes_sp

In [None]:
# Pandercetes_sp: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Pandercetes_sp.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff3")  # this species uses a .gff3 annotation
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Pansp-LSp-07-Paba-f2/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Pandercetes_sp_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Pardosa_pseudoannulata

Fix the duplicate ID problem in the annotation before running IsoQuant.

<zh>修复 Duplicate ID 问题</zh>

In [20]:
# Rewrite the Pardosa_pseudoannulata annotation with the project's gff3_unique_cds_ids.sh script.
# `gff_fixed_file` is consumed by the IsoQuant cell that follows.
gff_file = f"{RAW_DATA_DIR}/spider_genome/genome/Pardosa_pseudoannulata.gff"
gff_fixed_file = f"{RAW_DATA_DIR}/spider_genome/genome/Pardosa_pseudoannulata.unique.gff"
cmd = " ".join(
    [
        f"{PROJ_ROOT}/scripts/gff3_unique_cds_ids.sh",
        "-i",
        gff_file,
        "-o",
        gff_fixed_file,
    ]
)
run_shell_command_with_check(cmd, gff_fixed_file, force=True)

[32m2025-11-09 14:49:20.663[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: /home/gyk/project/spider_silkome/scripts/gff3_unique_cds_ids.sh -i /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Pardosa_pseudoannulata.gff -o /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Pardosa_pseudoannulata.unique.gff[0m
[32m2025-11-09 14:49:28.910[0m | [32m[1mSUCCESS [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m53[0m - [32m[1mCommand executed successfully, output file: /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Pardosa_pseudoannulata.unique.gff[0m


True

In [23]:
# Pardosa_pseudoannulata: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Pardosa_pseudoannulata.fa"
# Use the ".unique.gff" annotation written by the previous cell (`gff_fixed_file`)
# rather than the original ".gff" next to the genome.
gene_annotation_file = gff_fixed_file
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Pardosa_pseudoannulata/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Pardosa_pseudoannulata_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

[32m2025-11-09 15:09:39.838[0m | [1mINFO    [0m | [36mspider_silkome_module.features[0m:[36mrun_shell_command_with_check[0m:[36m50[0m - [1mExecute command: isoquant.py --genedb /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Pardosa_pseudoannulata.unique.gff --reference /home/gyk/project/spider_silkome/data/raw/spider_genome/genome/Pardosa_pseudoannulata.fa --fastq /home/gyk/project/spider_silkome/data/raw/BC202407614-ONT-DRS-10samples/Pardosa_pseudoannulata/pass.fq.gz --data_type nanopore -o /home/gyk/project/spider_silkome/data/interim/03.DRS_mapping/Pardosa_pseudoannulata_isoquant[0m
2025-11-09 15:09:40,408 - INFO - Running IsoQuant version 3.9.0
2025-11-09 15:09:49,418 - INFO - Overwriting the previous run
2025-11-09 15:09:50,422 - INFO - Novel unspliced transcripts will not be reported, set --report_novel_unspliced true to discover them
2025-11-09 15:09:50,422 - INFO -  === IsoQuant pipeline started === 
2025-11-09 15:09:50,422 - INFO - Python version: 3

True

## Pholcus_sp

In [None]:
# Pholcus_sp: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Pholcus_sp.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff3")  # this species uses a .gff3 annotation
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Phsp-Pc-01-Phsp-f3/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Pholcus_sp_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Scorpiops_zhui

In [None]:
# Scorpiops_zhui: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Scorpiops_zhui.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff")  # annotation sits next to the genome
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Scz-Scorpiones-f2/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Scorpiops_zhui_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)

## Songthela_sp

In [None]:
# Songthela_sp: input and output locations for the IsoQuant run.
genome_file = f"{RAW_DATA_DIR}/spider_genome/genome/Songthela_sp.fa"
gene_annotation_file = genome_file.replace(".fa", ".gff")  # annotation sits next to the genome
fastq_file = f"{RAW_DATA_DIR}/BC202407614-ONT-DRS-10samples/Sosp-Lp-01-Sosp-f2/pass.fq.gz"
isoquant_output_dir = f"{INTERIM_DATA_DIR}/03.DRS_mapping/Songthela_sp_isoquant"

# Assemble the command line from its option groups (joined by single spaces).
isoquant_cmd = " ".join(
    [
        "isoquant.py",
        f"--genedb {gene_annotation_file}",
        f"--reference {genome_file}",
        f"--fastq {fastq_file}",
        "--data_type nanopore",
        f"-o {isoquant_output_dir}",
    ]
)
run_shell_command_with_check(isoquant_cmd, isoquant_output_dir, force=True)