# Spidroin curation
Setting up the required Python packages

In [1]:
import os
import pandas as pd
from Bio import SeqIO
import subprocess
import time
from dataclasses import dataclass
from collections import defaultdict

from spider_silkome_module import (
    RAW_DATA_DIR,
    INTERIM_DATA_DIR,
    EXTERNAL_DATA_DIR,
    PROCESSED_DATA_DIR
)
from spider_silkome_module import (
    Attributes,
    GFFData
)
from spider_silkome_module import (
    extract_positions_from_gff,
    positions_export
)

[32m2025-10-08 18:15:54.131[0m | [1mINFO    [0m | [36mspider_silkome_module.config[0m:[36m<module>[0m:[36m11[0m - [1mPROJ_ROOT path is: /home/gyk/project/spider_silkome[0m


Define the paths to the input data and working directories

In [2]:
# Inputs that live under EXTERNAL_DATA_DIR.
external_dir = str(EXTERNAL_DATA_DIR)
spider_species_file = external_dir + "/organisms.csv"
mechanical_properties_file = external_dir + "/mechanical_properties.csv"
# Protein FASTA used as the miniprot query in the alignment cell.
spidroin_fasta_file = external_dir + "/spider-silkome-database.v1.prot.fixed.fasta"

# Directory scanned for genome assemblies (*.fa.gz) in the alignment cell.
spider_genome_path = str(RAW_DATA_DIR) + "/spider_genome"
# Directory whose contents are listed at the start of the alignment cell.
spidroin_path = str(INTERIM_DATA_DIR) + "/spidroin"

Use miniprot to align the N-terminal and C-terminal sequences of known spidroin proteins to the genome of each new species.

In [3]:
# Inputs and working directories.
spidroin_files = os.listdir(spidroin_path)
spider_genomes = [f for f in os.listdir(spider_genome_path) if f.endswith(".fa.gz")]
# NOTE: "gnome" is a typo for "genome"; the variable name is kept so that any
# other cell referring to it keeps working.
gnome_mpi_path = f"{INTERIM_DATA_DIR}/genome_mpi"
miniprot_output_path = f"{INTERIM_DATA_DIR}/miniprot"
os.makedirs(gnome_mpi_path, exist_ok=True)
os.makedirs(miniprot_output_path, exist_ok=True)

# Number of threads handed to miniprot (-t).
MINIPROT_THREADS = 70


def _run_to_file(cmd, output_file):
    """Run ``cmd`` (an argument list) and store its stdout in ``output_file``.

    The output is written to ``<output_file>.tmp`` first and only renamed to
    ``output_file`` when the command exits successfully, so a failed or
    interrupted run never leaves a partial file that a later run would
    mistake for a finished result.

    Raises:
        subprocess.CalledProcessError: if the command exits with a non-zero status.
    """
    tmp_file = f"{output_file}.tmp"
    try:
        with open(tmp_file, "w") as handle:
            subprocess.run(cmd, stdout=handle, check=True)
        os.replace(tmp_file, output_file)
    finally:
        # Only still present when the command failed or was interrupted.
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def _filter_lines(input_file, output_file, keep):
    """Copy every line of ``input_file`` for which ``keep(line)`` is true to ``output_file``.

    Uses the same write-to-temp-then-rename scheme as ``_run_to_file``.
    """
    tmp_file = f"{output_file}.tmp"
    try:
        with open(input_file, "r") as src, open(tmp_file, "w") as dst:
            dst.writelines(line for line in src if keep(line))
        os.replace(tmp_file, output_file)
    finally:
        if os.path.exists(tmp_file):
            os.remove(tmp_file)


def _is_mrna_record(line):
    """Return True for GFF feature lines whose type column (3rd field) is ``mRNA``."""
    if line.startswith("#"):
        return False
    fields = line.split("\t")
    return len(fields) > 2 and fields[2] == "mRNA"


for spider_genome in spider_genomes:
    # Species name = file name up to the first dot.
    spider = spider_genome.split(".")[0]

    # Step 1: build the miniprot index for this genome (skipped when it already exists).
    index_file = f"{gnome_mpi_path}/{spider}.mpi"
    index_cmd = [
        "miniprot", "-t", str(MINIPROT_THREADS),
        "-d", index_file, f"{spider_genome_path}/{spider_genome}",
    ]
    if not os.path.exists(index_file):
        index_start_time = time.time()
        try:
            subprocess.run(index_cmd, check=True)
        except BaseException:
            # Do not leave a partial index behind; it would be reused on the next run.
            if os.path.exists(index_file):
                os.remove(index_file)
            raise
        index_end_time = time.time()
        print(f"Indexing {spider} takes {index_end_time - index_start_time} seconds")
    else:
        print(f"Indexing {spider} already exists")

    # Step 2: align the spidroin proteins against the indexed genome (GFF output).
    output_dir = f"{miniprot_output_path}/{spider}_all"
    os.makedirs(output_dir, exist_ok=True)
    gff_file = f"{output_dir}/{spider}.gff"
    align_cmd1 = [
        "miniprot", "-t", str(MINIPROT_THREADS), "-I", "--gff",
        index_file, spidroin_fasta_file,
    ]
    # NOTE(review): the --aln variant is prepared but never executed in this cell,
    # so no {spider}.aln file is produced. Run it explicitly if it is needed.
    align_cmd2 = [
        "miniprot", "-t", str(MINIPROT_THREADS), "-I", "--aln",
        index_file, spidroin_fasta_file,
    ]
    if not os.path.exists(gff_file):
        align_start_time = time.time()
        _run_to_file(align_cmd1, gff_file)
        align_end_time = time.time()
        print(f"Alignment {spider} takes {align_end_time - align_start_time} seconds")
    else:
        print(f"Alignment {spider} already exists")

    # Step 3: keep only the mRNA feature lines.
    mrna_gff_file = f"{output_dir}/{spider}.mRNA.gff"
    if not os.path.exists(mrna_gff_file):
        _filter_lines(gff_file, mrna_gff_file, _is_mrna_record)
    else:
        print(f"mRNA gff file already exists")

    # pd.read_csv raises EmptyDataError on an empty file, so bail out early.
    if os.path.getsize(mrna_gff_file) == 0:
        print(f"No mRNA records found for {spider}; skipping per-spidroin split")
        continue

    # Step 4: split the mRNA records into one file per spidroin type.
    mRNA_gff = pd.read_csv(mrna_gff_file, sep='\t', header=None)
    gff_header = ["seqid", "source", "type", "start", "end", "score", "strand", "frame", "attribute"]
    mRNA_gff.columns = gff_header
    # The spidroin type is the second-to-last '|'-separated token of the last
    # ';'-separated attribute. Sorted so the order is reproducible across runs.
    spidroins = sorted({
        attribute.split(';')[-1].split('|')[-2]
        for attribute in mRNA_gff["attribute"]
    })
    print(f"Total number of spidroins in {spider}: {len(spidroins)}\n{spidroins}")
    for spidroin in spidroins:
        gff_spidroin_output = f"{output_dir}/{spider}.mRNA.{spidroin}.gff"
        if not os.path.exists(gff_spidroin_output):
            # Match the '|'-delimited name so that e.g. MaSp does not also select MaSp1.
            spidroin_tag = f"|{spidroin}|"
            _filter_lines(
                mrna_gff_file,
                gff_spidroin_output,
                lambda line, tag=spidroin_tag: tag in line,
            )
        else:
            print(f"{spider}.mRNA.{spidroin}.gff already exists")

Indexing Trichonephila_clavata already exists
Alignment Trichonephila_clavata already exists
mRNA gff file already exists
Total number of spidroins in Trichonephila_clavata: 18
['AcSp', 'CySp', 'Pflag', 'MaSp3B', 'MaSp1', 'MiSp', 'Putative_spidroin', 'MaSp2', 'MaSp', 'Ampullate_spidroin', 'AgSp1', 'PySp', 'MaSp3', 'Flag', 'Spidroin', 'AgSp2', 'CrSp', 'MaSp2B']
Trichonephila_clavata.mRNA.AcSp.gff already exists
Trichonephila_clavata.mRNA.CySp.gff already exists
Trichonephila_clavata.mRNA.Pflag.gff already exists
Trichonephila_clavata.mRNA.MaSp3B.gff already exists
Trichonephila_clavata.mRNA.MaSp1.gff already exists
Trichonephila_clavata.mRNA.MiSp.gff already exists
Trichonephila_clavata.mRNA.Putative_spidroin.gff already exists
Trichonephila_clavata.mRNA.MaSp2.gff already exists
Trichonephila_clavata.mRNA.MaSp.gff already exists
Trichonephila_clavata.mRNA.Ampullate_spidroin.gff already exists
Trichonephila_clavata.mRNA.AgSp1.gff already exists
Trichonephila_clavata.mRNA.PySp.gff already

In [None]:
# Spidroin types to process for the species selected below.
spidroins = ['Pflag', 'MaSp1', 'AcSp', 'MaSp3B', 'Putative_spidroin',
'PySp', 'MaSp3', 'AgSp1', 'AgSp2', 'MaSp2B', 'MaSp', 'Ampullate_spidroin',
'Flag', 'CrSp', 'CySp', 'MiSp', 'MaSp2', 'Spidroin'
]
spider = "Trichonephila_clavata"
output_dir = f"{miniprot_output_path}/{spider}_all"
# Value passed to extract_positions_from_gff as positive_threshold.
POSITIVE_THRESHOLD = 0.85


def _parse_spidroin_gff(gff_file):
    """Parse a per-spidroin mRNA GFF file into a list of ``GFFData`` records.

    Comment lines (starting with '#') and blank lines are skipped. The ninth
    column is split into ``key=value`` attributes; ``ID``, ``Rank``,
    ``Identity``, ``Positive`` and ``Target`` are required.

    Raises:
        KeyError: if a record lacks one of the required attributes.
        ValueError: if a numeric column or attribute cannot be converted.
    """
    records = []
    with open(gff_file, 'r') as handle:
        for raw_line in handle:
            line = raw_line.strip()
            if not line or line.startswith('#'):
                continue

            fields = line.split('\t')

            # Fresh dict for every record, so attributes from a previous line
            # cannot leak into a record that lacks them.
            attr_dict = {}
            for attr in fields[8].split(';'):
                if '=' in attr:
                    key, value = attr.split('=', 1)
                    attr_dict[key] = value

            attr_obj = Attributes(
                ID=attr_dict['ID'],
                Rank=int(attr_dict['Rank']),
                Identity=float(attr_dict['Identity']),
                Positive=float(attr_dict['Positive']),
                Target=attr_dict['Target'].split('|')
            )

            records.append(GFFData(
                seqid=fields[0],
                source=fields[1],
                type=fields[2],
                start=int(fields[3]),
                end=int(fields[4]),
                score=float(fields[5]),
                strand=fields[6],
                frame=fields[7],
                attributes=attr_obj
            ))
    return records


all_gff_records = []
for spidroin in spidroins:
    spidroin_gff = f"{output_dir}/{spider}.mRNA.{spidroin}.gff"
    spidroin_gff_data = _parse_spidroin_gff(spidroin_gff)

    # Sort by Positive in descending order
    spidroin_gff_data.sort(key=lambda x: x.attributes.Positive, reverse=True)
    # Extract positions
    positions = extract_positions_from_gff(spidroin_gff_data, positive_threshold=POSITIVE_THRESHOLD)

    # Export CSV data
    csv_output = spidroin_gff.replace('.gff', '.csv')
    df = positions_export(positions, csv_output, format='csv')

    # Export GFF data
    gff_output = spidroin_gff.replace('.gff', '.combined.gff')
    gff_records = positions_export(positions, gff_output, format='gff', spidroin=spidroin)

    all_gff_records.extend(gff_records)

# Convert to DataFrame and save
if all_gff_records:
    df_combined = pd.DataFrame(all_gff_records)
    # Natural sort: order by the first number found in seqid, then by seqid itself
    # (so sequences sharing that number are not interleaved), then by start.
    seqid_number = df_combined['seqid'].str.extract(r'(\d+)', expand=False).astype(float)
    df_combined = (
        df_combined
        .assign(seqid_sort=seqid_number)
        .sort_values(['seqid_sort', 'seqid', 'start'])
        .drop('seqid_sort', axis=1)
    )
    # Write to GFF file
    os.makedirs(PROCESSED_DATA_DIR, exist_ok=True)
    with open(f"{PROCESSED_DATA_DIR}/{spider}_combined.gff", 'w') as f:
        f.write("##gff-version 3\n")
        df_combined.to_csv(f, sep='\t', header=False, index=False)
else:
    print(f"No GFF records to combine for {spider}")

[32m2025-10-08 18:30:49.993[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m139[0m - [1mCSV saved to /home/gyk/project/spider_silkome/data/interim/miniprot/Trichonephila_clavata_all/Trichonephila_clavata.mRNA.Pflag.csv[0m
[32m2025-10-08 18:30:49.994[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m140[0m - [1mTotal combinations: 0[0m
[32m2025-10-08 18:30:49.994[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_gff[0m:[36m226[0m - [1mGFF saved to /home/gyk/project/spider_silkome/data/interim/miniprot/Trichonephila_clavata_all/Trichonephila_clavata.mRNA.Pflag.combined.gff[0m
[32m2025-10-08 18:30:49.995[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_gff[0m:[36m227[0m - [1mTotal genes predicted: 0[0m
[32m2025-10-08 18:30:50.031[0m | [1mINFO    [0m | [36mspider_silkome_module.export[0m:[36m_export_to_csv[0m:[36m139[

Optimized version with quality filtering: use this if you want to weight or filter candidates based on alignment quality.

In [None]:
# Re-run position extraction on `spidroin_gff_data`, which is whatever the
# per-spidroin loop in the preceding code cell left behind, i.e. the records of
# the last spidroin that loop processed.
# NOTE(review): this cell depends on that leftover kernel state and fails with a
# NameError on a fresh kernel unless the loop cell has been run first.
positions = extract_positions_from_gff(spidroin_gff_data, positive_threshold=0.85)
# Bare expression so the notebook displays the extracted positions.
positions

核心逻辑说明
关键理解：

正向链 (+)：基因从5'到3'，N端在前（start），C端在后（end）
反向链 (-)：基因在反向互补链上，基因组坐标中start < end，但生物学意义上C端对应较小的坐标
位置记录规则：

|类型|链方向|记录位置|原因|
|---|---|---|---|
|CTD|+|end|C端在基因末尾，向后延伸|
|CTD|-|start|C端在基因起始（基因组坐标小），向前延伸|
|NTD|+|start|N端在基因起始，向前延伸|
|NTD|-|end|N端在基因末尾（基因组坐标大），向后延伸|

In [None]:
# Export the extracted positions.
# NOTE(review): `spidroin_gff`, `spidroin` and `positions` come from earlier cells.

# CSV output: source GFF path with '.gff' replaced by '.csv'
csv_output = spidroin_gff.replace('.gff', '.csv')
df = positions_export(positions, csv_output, format='csv')

# GFF output: source GFF path with '.gff' replaced by '.combined.gff'
gff_output = spidroin_gff.replace('.gff', '.combined.gff')
gff_records = positions_export(positions, gff_output, format='gff', spidroin=spidroin)

# Summary statistics
# Entries for which both `start` and `end` are truthy
complete_positions = [p for p in positions if p.start and p.end]
print(f"\n统计信息-{spidroin}:")
print(f"总共 {len(positions)} 个染色体-链组合")
print(f"有效组合（同时有start和end）: {len(complete_positions)}")
print(f"预测的基因数量: {len(gff_records)}")

代码说明：

1. CSV输出：
- 每行包含：染色体、链、类型（start/end）、位置、计数
- 跳过空的start和end
- 按染色体、链、位置排序
2. GFF输出：
- 只输出同时有start和end的记录
- 配对所有可能的start-end组合
- 过滤长度超过100kbp的基因
- 包含计数信息在attributes字段中
- GFF格式符合标准（version 3）
3. 配对逻辑：
- 正向链：start < end
- 反向链：start < end（基因组坐标，但生物学上C端在前）
- 计算基因长度并过滤