In [1]:
import re

In [2]:
# Extract each sequence together with its corresponding residue scores, for the condition of interest only, from the FASTA file.

#pattern_disorder_disorder = r"\|derived-binding_mode_disorder_to_disorder-mobi"
#pattern_disorder_order = r"\|derived-binding_mode_disorder_to_order-mobi"

def write_transition_sate_from_fasta(input_file, output_file, pattern_sequence=r"\|sequence\|", pattern_transition=r"\|derived-binding_mode_disorder_to_disorder-mobi"):
    '''
    Copies sequence records and their matching transition-state records from a FASTA file.

    A line matching ``pattern_sequence`` is treated as a sequence header: it is
    written to the output and the text before its first "|" becomes the current
    record ID. A later line in which that ID is immediately followed by
    ``pattern_transition`` is treated as the transition header of the same
    record and is written as well. After either kind of header, the next line
    is copied as that header's data line. Every other line is dropped.

    The line after a header is only copied as data when it does not start with
    ">". A record without a data line therefore no longer swallows the header
    that follows it; that header is examined like any other line.

    Args:
        input_file (str): Path to the input FASTA file.
        output_file (str): Path to the output file.
        pattern_sequence (str): Regex pattern to identify sequence headers.
        pattern_transition (str): Regex pattern to identify transition state headers.
    '''
    # Compile once instead of re-parsing the pattern for every input line.
    sequence_re = re.compile(pattern_sequence)
    # Regex for the transition header of the current record; None until a
    # sequence header with a non-empty ID has been seen.
    transition_re = None
    # True right after a header was written, i.e. the next line is its data.
    expect_data = False
    with open(input_file, "r") as infile, open(output_file, "w") as outfile:
        for line in infile:
            if expect_data:
                expect_data = False
                if not line.startswith(">"):
                    outfile.write(line.strip() + "\n")
                    continue
                # The record had no data line: fall through and treat this
                # line as the header it is.
            if sequence_re.search(line):
                header = line.strip()
                outfile.write(header + "\n")
                record_id = header.split("|")[0]
                # The ID is escaped so characters in it are matched literally.
                transition_re = (
                    re.compile(re.escape(record_id) + pattern_transition)
                    if record_id
                    else None
                )
                expect_data = True
            elif transition_re is not None and transition_re.search(line):
                outfile.write(line.strip() + "\n")
                expect_data = True

In [3]:
# Using the residue scores, extract each contiguous stretch of residues scored 1 from the FASTA file as a separate sequence.

def find_transition_regions_from_fasta(input_file, output_file,transition_state="Disorder-Disorder"):
    '''
    Identifies and extracts transition regions from a FASTA file containing sequences and scores.

    Header lines (starting with ">") that contain "sequence" set the current
    record header. A non-header line starting with "0" or "1" is read as a
    score line; any other non-header line is read as a sequence line. When a
    score line arrives and a header and a sequence are known, every run of
    consecutive residues whose score character is "1" is written as its own
    two-line record: "<header> <transition_state>_<n>" followed by the
    residues, with n counting the runs of that score line from 1.

    Residues and score characters are paired position by position. If the two
    lines differ in length, the unpaired tail of the longer one is ignored
    rather than raising an IndexError.

    Args:
        input_file (str): Path to the input FASTA file.
        output_file (str): Path to the output file to write the regions.
        transition_state (str): The type of transition state to label the regions.
    '''
    header = None
    sequence = None
    with open(input_file, "r") as source, open(output_file, "w") as output:
        for raw_line in source:
            line = raw_line.strip()
            if raw_line.startswith(">"):
                # Only sequence headers name a record; other headers keep the
                # current one.
                if "sequence" in raw_line:
                    header = line
                continue
            if not raw_line.startswith(("0", "1")):
                # A sequence line; regions are emitted once its score arrives.
                sequence = line
                continue
            score = line
            if not (score and sequence and header):
                continue
            # Blank out residues not scored "1" so that split() yields exactly
            # the contiguous scored stretches.
            masked = "".join(
                residue if flag == "1" else " "
                for residue, flag in zip(sequence, score)
            )
            for number, region in enumerate(masked.split(), start=1):
                output.write("{} {}_{}\n".format(header, transition_state, number))
                output.write(region + "\n")

In [4]:
# MobiDB dataset files, one per binding mode
mobidb_dd = "mobidb_search_2025-07-09T19-37-03_DD.fasta"
mobidb_do = "mobidb_search_2025-07-09T19-39-02_DO.fasta"

# Regex patterns that recognise sequence headers and transition-state headers
pattern_sequence = r"\|sequence\|"
pattern_disorder_disorder = r"\|derived-binding_mode_disorder_to_disorder-mobi"
pattern_disorder_order = r"\|derived-binding_mode_disorder_to_order-mobi"

# Run both steps for each dataset, Disorder-Disorder first, then Disorder-Order.
# Each entry: (input file, filtered intermediate file, regions output file,
#              transition header pattern, region label)
for _source, _filtered, _regions, _pattern, _label in (
    (mobidb_dd, "transition_mobidb_dd.fasta", "mobidb_dd.fasta", pattern_disorder_disorder, "Disorder-Disorder"),
    (mobidb_do, "transition_mobidb_do.fasta", "mobidb_do.fasta", pattern_disorder_order, "Disorder-Order"),
):
    # Step 1: keep only the sequence records and their matching transition records
    write_transition_sate_from_fasta(_source, _filtered, pattern_sequence=pattern_sequence, pattern_transition=_pattern)
    # Step 2: write each stretch of residues scored 1 as its own record
    find_transition_regions_from_fasta(_filtered, _regions, transition_state=_label)