In [None]:
# Import required libraries
# numpy: for random number generation and array operations
# pandas: for potential data manipulation (imported but not used in current code)
# re: for regular expression pattern matching in sequence analysis
import numpy as np
import pandas as pd
import re

In [None]:
# One-letter codes of the 20 canonical amino acids, listed alphabetically.
# generate_protein_sequence draws its random residues from this alphabet.
canonical_amino_acids = 'ACDEFGHIKLMNPQRSTVWY'

def get_modification_sites(sequence, mod_sites):
    """
    Locate every residue of a sequence that is a candidate modification site.

    Args:
        sequence (str): Protein or peptide sequence to scan
        mod_sites (str): Residues that count as modifiable (e.g., 'SC' for Serine and Cysteine)

    Returns:
        list: 0-based indices into `sequence`, in ascending order, of the matching residues
    """
    positions = []
    for index, residue in enumerate(sequence):
        if residue in mod_sites:
            positions.append(index)
    return positions

def parse_modification_from_peptide(peptide, modifications):
    """
    Extract modification positions and types from an annotated peptide sequence.

    The reported position is the number of residues that precede the annotation,
    i.e. the 1-based index of the modified residue when the annotation directly
    follows it (as in 'PEPS[Phospho]IDE' -> position 4). Characters belonging to
    earlier recognized annotations are not counted, so the position of a second
    or third modification is not shifted by the ones before it.

    Args:
        peptide (str): Peptide sequence with modifications in brackets (e.g., 'PEPS[Phospho]IDE')
        modifications (list): List of modification strings to search for (e.g., ['[Phospho]', '[DHAA]'])

    Returns:
        list: Tuples of (position, modification_string) for each modification found,
              in order of appearance. Empty if `modifications` is empty.
    """
    # An empty alternative would turn the pattern into one that matches the empty
    # string at every offset, so drop empty entries and bail out if nothing is left.
    searched = [mod for mod in modifications if mod]
    if not searched:
        return []

    mod_pattern = '|'.join(map(re.escape, searched))

    found = []
    annotation_chars = 0  # total length of the annotations already seen
    for match in re.finditer(mod_pattern, peptide):
        # Subtract preceding annotation text so the position refers to residues only.
        # NOTE: annotations that are not listed in `modifications` are not recognized
        # here and therefore still count towards the offset.
        found.append((match.start() - annotation_chars, match.group()))
        annotation_chars += len(match.group())
    return found

def add_mod(sequence, ind, mod):
    """
    Splice a modification string into a sequence directly after a given residue.

    Args:
        sequence (str): Original sequence
        ind (int): Index of the residue the modification is attached to; the
                   modification is placed immediately after this index
        mod (str): Modification string to insert (e.g., '[Phospho]')

    Returns:
        str: New sequence containing the modification
    """
    # Everything up to and including `ind` stays in front of the modification
    split_at = ind + 1
    head = sequence[:split_at]
    tail = sequence[split_at:]
    return head + mod + tail

def generate_protein_sequence(length, methionine_start=True, force_motif=None, count=1):
    """
    Generate a random protein sequence with optional constraints.

    The random part is drawn from `canonical_amino_acids` with np.random.choice,
    so results are reproducible after np.random.seed.

    Args:
        length (int): Number of residues to generate before any motif enforcement
        methionine_start (bool): Whether the sequence starts with methionine (M);
                                 the M counts towards `length`
        force_motif (str): Motif substring that must appear in the sequence
        count (int): Minimum number of times the motif must appear

    Returns:
        str: Generated protein sequence. When `force_motif` is given, enforce_motif
             may insert extra copies of the motif, so the result can be longer
             than `length`.

    Raises:
        ValueError: If `length` is too small to hold the requested start residue
                    (or is negative).
    """
    prefix = 'M' if methionine_start else ''

    # Only the residues after the optional start residue are random; previously
    # `length - 1` was used unconditionally, which made sequences without a
    # methionine start one residue too short.
    n_random = length - len(prefix)
    if n_random < 0:
        raise ValueError(f"length must be at least {len(prefix)}, got {length}")

    sequence = prefix + ''.join(np.random.choice(list(canonical_amino_acids), size=n_random))

    # Ensure the motif appears at least 'count' times if specified
    if force_motif:
        sequence = enforce_motif(sequence, force_motif, count)

    return sequence

def enforce_motif(sequence, motif, count=1):
    """
    Ensure a specific motif appears at least 'count' times in a sequence.
    Randomly inserts the motif (via add_mod) until the requirement is met, so the
    returned sequence may be longer than the input. Occurrences are counted with
    str.count, i.e. without overlaps.

    Args:
        sequence (str): Original sequence
        motif (str): Motif to enforce
        count (int): Minimum number of occurrences required

    Returns:
        str: Sequence with the motif appearing at least 'count' times

    Raises:
        ValueError: If `motif` is empty and the sequence does not already satisfy
                    `count`; inserting an empty string can never change the count.
    """
    if not motif:
        if sequence.count(motif) < count:
            raise ValueError("cannot enforce an empty motif")
        return sequence

    while sequence.count(motif) < count:
        # add_mod inserts *after* index `ind`, so the first residue is never displaced.
        # Clamp the exclusive upper bound to 1: when the motif is longer than the
        # sequence the unclamped bound is <= 0 and np.random.randint would raise.
        upper = max(len(sequence) - len(motif) + 1, 1)
        ind = np.random.randint(0, upper)
        sequence = add_mod(sequence, ind, motif)
    return sequence

def digest_protein(sequence, cleavage_site):
    """
    Simulate enzymatic digestion by cutting a protein after every cleavage residue.

    Args:
        sequence (str): Protein sequence to digest
        cleavage_site (str): Residues after which the chain is cut (e.g., 'KR' for trypsin)

    Returns:
        list: Peptide fragments in sequence order; each fragment ends with its
              cleavage residue, except possibly the last one
    """
    fragments = []
    fragment_start = 0

    # Single pass: close the current fragment whenever a cleavage residue is hit
    for index, residue in enumerate(sequence):
        if residue in cleavage_site:
            fragments.append(sequence[fragment_start:index + 1])
            fragment_start = index + 1

    # Whatever follows the last cut (if anything) is the C-terminal fragment
    remainder = sequence[fragment_start:]
    if remainder:
        fragments.append(remainder)
    return fragments

In [None]:
class Peptide:
    """
    Class representing a peptide with modifications, protein origin information, and intensity.

    Attributes:
        full_sequence (str): Peptide sequence with modifications in brackets (e.g., 'PEPS[Phospho]IDE')
        protein_id (str): Identifier of the parent protein
        start_pos (int): Starting position of peptide in the parent protein (1-indexed)
        end_pos (int): Ending position of peptide in the parent protein (1-indexed)
        modifications (list): List of modification strings to recognize (e.g., ['[Phospho]', '[DHAA]'])
        intensity (float): Measured intensity/abundance of the peptide
        base_sequence (str): Peptide sequence without bracketed annotations
        mod_sites (dict): {position: modification}, with positions as returned by
            parse_modification_from_peptide. For a peptide carrying a single
            annotation this is the offset of its opening bracket, which equals the
            1-indexed position of the modified residue within the peptide.
        protein_mod_sites (dict): {position: modification}, the same positions shifted
            by the peptide start (start_pos + position - 1) to give 1-indexed protein positions
    """

    def __init__(self, full_sequence, protein_id, start_pos, end_pos, modifications, intensity=0.0):
        """
        Initialize a Peptide object and parse its modifications.

        Args:
            full_sequence (str): Peptide sequence with modifications
            protein_id (str): Parent protein identifier
            start_pos (int): Start position in protein (1-indexed)
            end_pos (int): End position in protein (1-indexed)
            modifications (list): List of modification strings to recognize
            intensity (float): Peptide signal intensity
        """
        self.full_sequence: str = full_sequence
        self.protein_id: str = protein_id
        self.start_pos: int = start_pos
        self.end_pos: int = end_pos
        self.modifications: list[str] = modifications
        self.intensity: float = intensity
        # Derived attributes are computed once, at construction time
        self.get_base_sequence()
        self.parse_modifications()

    def get_base_sequence(self):
        """
        Extract the base peptide sequence by removing modification annotations.
        Stores the result in self.base_sequence.

        Every bracketed annotation is removed, whatever it contains, so names with
        spaces, signs or digits (e.g. '[+79.966]') are stripped as well as plain
        ones such as '[Phospho]'.
        """
        # [^\[\]]* matches any annotation body that contains no bracket itself
        self.base_sequence: str = re.sub(r'\[[^\[\]]*\]', '', self.full_sequence)

    def parse_modifications(self):
        """
        Parse modification positions from the full sequence.
        Creates two dictionaries:
        - mod_sites: positions as returned by parse_modification_from_peptide
          (relative to the peptide)
        - protein_mod_sites: those positions mapped onto the protein (1-indexed)
        """
        mods = parse_modification_from_peptide(self.full_sequence, self.modifications)
        # Modification sites relative to the peptide
        self.mod_sites: dict[int, str] = {pos: mod for pos, mod in mods}
        # start_pos is 1-indexed, so a position p within the peptide lands on
        # protein position start_pos + p - 1
        self.protein_mod_sites: dict[int, str] = {self.start_pos + pos - 1: mod for pos, mod in mods}

    def __str__(self):
        """
        Generate a tab-separated string representation of the peptide for file output.

        Returns:
            str: Tab-separated values: FullSequence, ProteinGroup, StartPos, EndPos, Modifications, Intensity.
                 The Modifications field is a comma-separated list of 'position:modification'
                 entries taken from mod_sites (empty when the peptide is unmodified).
        """
        return '\t'.join([self.full_sequence,
                          self.protein_id,
                          str(self.start_pos),
                          str(self.end_pos),
                          ','.join(f"{str(pos)}:{mod}" for pos, mod in self.mod_sites.items()),
                          str(self.intensity)])

In [None]:
# ==== Settings that control the synthetic peptide dataset ====

# --- Reproducibility ---
# Value passed to np.random.seed before any random draw
seed = 42

# --- Proteins ---
# How many synthetic proteins are created
num_proteins = 3

# Protein length bounds in residues, unpacked into np.random.randint
# (lower bound inclusive, upper bound exclusive)
range_protein_length = (50, 200)

# --- Digestion ---
# Residues after which the protein is cleaved
# 'KR' = trypsin, 'FWY' = chymotrypsin, 'DE' = GluC, etc.
cutsite = 'KR'

# Bounds for the required number of cut-site motif occurrences per protein,
# unpacked into np.random.randint (lower bound inclusive, upper bound exclusive)
range_cutsite_count = (1, 5)

# --- Modifications ---
# Residues eligible for modification (here Serine and Cysteine)
modified_residues = 'SC'

# Annotation strings that may be attached to an eligible residue
modifications = ['[Phospho]', '[DHAA]']

In [None]:
# Seed NumPy's global generator so that every run yields the same dataset
np.random.seed(seed)

# Generate synthetic protein sequences.
# For each protein two values are drawn, in this order:
#   - its length, from range_protein_length
#   - how many times the `cutsite` string must occur in it, from range_cutsite_count
# np.random.randint excludes the upper bound of each range.
proteins = {f"PROT{i}": generate_protein_sequence(np.random.randint(*range_protein_length),
                                      force_motif=cutsite,
                                      count=np.random.randint(*range_cutsite_count))
            for i in range(num_proteins)}

# Digest each protein using the specified cleavage site
# Keep only peptides that:
#   - are longer than 4 and shorter than 50 residues (i.e. 5 to 49)
#   - contain at least one Serine (S) residue
# Replicate each peptide 5 times to simulate repeated detection
peptides = {key: [pep for pep in digest_protein(protein, cutsite)
             if 4 < len(pep) < 50 and 'S' in pep] * 5
            for key, protein in proteins.items()}

# Display (total number of peptides, number of proteins).
# The total must be taken over the dict's values: iterating the dict itself
# yields the protein IDs, and counting their characters gives 3 IDs x 5
# characters = 15 regardless of how many peptides were produced.
sum(len(sublist) for sublist in peptides.values()), len(peptides)

(15, 3)

In [None]:
# Accumulates one Peptide object per (replicated) digested peptide
allPeptides = []

# Walk through every protein and every peptide derived from it
for prot_id, sublist in peptides.items():
    for pep in sublist:
        # 1-indexed start of each place the peptide occurs in its parent protein
        # (the same peptide can occur more than once)
        possible_positions = [hit.start() + 1
                              for hit in re.finditer(re.escape(pep), proteins[prot_id])]

        # Pick one occurrence at random; the end position is inclusive
        start_pos = np.random.choice(possible_positions)
        end_pos = start_pos + len(pep) - 1

        # Indices of the residues in this peptide that are eligible for modification
        mod_sites = get_modification_sites(pep, modified_residues)

        # Coin flip (50%) deciding whether this copy gets modified
        will_modify = np.random.rand() < 0.5

        # Random integer intensity from 1 to 4
        intensity = np.random.randint(1, 5)

        # Only when the coin flip succeeded and an eligible residue exists:
        # choose one site and one modification type at random and annotate the peptide
        if will_modify and mod_sites:
            ind = np.random.choice(mod_sites)
            mod = np.random.choice(modifications)
            pep = add_mod(pep, ind, mod)

        # Bundle sequence and metadata into a Peptide and keep it
        peptide_record = Peptide(
            full_sequence=pep,
            protein_id=prot_id,
            start_pos=int(start_pos),
            end_pos=int(end_pos),
            modifications=modifications,
            intensity=intensity,
        )
        allPeptides.append(peptide_record)

In [None]:
# Dump every generated peptide to a tab-separated file: one header line, then one row per peptide
# Columns: FullSequence, ProteinGroup, StartPos, EndPos, Modifications, Intensity
with open('synthetic_peptides.tsv', 'w') as f:
    # Column names
    f.write('FullSequence\tProteinGroup\tStartPos\tEndPos\tModifications\tIntensity\n')
    # print() writes str(peptide) followed by a newline, i.e. one row per peptide
    for peptide in allPeptides:
        print(peptide, file=f)