# core

> Core utilities for in-silico protein digestion and peptide analysis

This module provides the foundational functions for proteomics workflows, including FASTA file parsing, enzymatic digestion simulation, and peptide property calculations.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

ERROR:tornado.general:Uncaught exception in ZMQStream callback
Traceback (most recent call last):
  File "/Users/mtinti/miniforge3/envs/stella_seq/lib/python3.9/site-packages/traitlets/traitlets.py", line 632, in get
    value = obj._trait_values[self.name]
KeyError: '_control_lock'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/mtinti/miniforge3/envs/stella_seq/lib/python3.9/site-packages/zmq/eventloop/zmqstream.py", line 565, in _log_error
    f.result()
  File "/Users/mtinti/miniforge3/envs/stella_seq/lib/python3.9/site-packages/ipykernel/kernelbase.py", line 301, in dispatch_control
    async with self._control_lock:
  File "/Users/mtinti/miniforge3/envs/stella_seq/lib/python3.9/site-packages/traitlets/traitlets.py", line 687, in __get__
    return t.cast(G, self.get(obj, cls))  # the G should encode the Optional
  File "/Users/mtinti/miniforge3/envs/stella_seq/lib/python3.9/site-packages/traitlets/traitlets.p

In [None]:
#| export
from typing import Dict, Union
from pyteomics import parser
from pyteomics import mass
from pathlib import Path
from Bio import SeqIO
import pandas as pd

In [None]:
#| export
from pathlib import Path
import os

# Locate the repository root.
# GitHub Actions exposes the checkout directory via GITHUB_WORKSPACE; for
# local work, use the nearest directory (starting at the current working
# directory and walking upwards) that contains 'settings.ini', and fall back
# to the current working directory when none is found.
if 'GITHUB_WORKSPACE' not in os.environ:
    REPO_ROOT = next(
        (
            candidate
            for candidate in (Path.cwd(), *Path.cwd().parents)
            if (candidate / 'settings.ini').exists()
        ),
        Path.cwd(),
    )
else:
    REPO_ROOT = Path(os.environ['GITHUB_WORKSPACE'])

TEST_DATA = REPO_ROOT.joinpath('test_data')

print(f"Repo root: {REPO_ROOT}")
print(f"Test data dir: {TEST_DATA}")
print(f"Test data exists: {TEST_DATA.exists()}")

Repo root: /Users/mtinti/git_projects/protein_cutter
Test data dir: /Users/mtinti/git_projects/protein_cutter/test_data
Test data exists: True


In [None]:
#| export
def load_fasta(fasta_path: Union[str, Path]) -> Dict[str, str]:
    """
    Load a FASTA file and return a dictionary mapping protein IDs to sequences.

    Nothing is written to stdout; the current working directory is only
    reported inside error messages, where it helps to diagnose relative paths.

    Parameters
    ----------
    fasta_path : str or Path
        Path to the FASTA file to load.

    Returns
    -------
    Dict[str, str]
        Dictionary mapping protein IDs (record.id) to amino acid sequences
        (as strings). If the same ID occurs more than once, the last record
        with that ID is kept.

    Raises
    ------
    FileNotFoundError
        If the specified FASTA file does not exist.
    ValueError
        If no sequences are found in the file.

    Examples
    --------
    >>> proteins = load_fasta("proteins.fasta")
    >>> len(proteins)
    42
    >>> proteins['sp|P12345|EXAMPLE']
    'MKTAYIAKQRQISFVKSHFSRQLEERLGL...'
    """
    import os

    fasta_path = Path(fasta_path)
    # Relative paths are resolved against the working directory, so it is
    # appended to the error messages below.
    cwd = os.path.abspath(os.curdir)

    if not fasta_path.exists():
        raise FileNotFoundError(f"FASTA file not found: {fasta_path} {cwd}")

    with fasta_path.open('r') as handle:
        protein_dict = {
            record.id: str(record.seq)
            for record in SeqIO.parse(handle, "fasta")
        }

    if not protein_dict:
        raise ValueError(f"No sequences found in FASTA file: {fasta_path} {cwd}")

    return protein_dict

In [None]:
# Demo/test fixture for the cells below. Deliberately NOT exported: loading
# test data at import time would make importing the generated module fail
# wherever the repository's test_data directory is not available.
protein_dict = load_fasta(TEST_DATA / 'test_sequence.fa')

/Users/mtinti/git_projects/protein_cutter/nbs


In [None]:
#| hide
# The test protein must start with the expected N-terminal residues.
assert protein_dict['P15497'][:5] == 'MKAVV'

In [None]:
#| export
def digest(
    sequence: str,
    protein_id: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 1,
    charge_states: list = [1, 2, 3],
    mass_range: tuple = (800.0, 4000.0),
    min_pep_length: int = 5,
    max_pep_length: int = 35,
    sort_by_mass: bool = False,
) -> pd.DataFrame:
    """
    Cleave a protein in silico and annotate every resulting peptide.

    Each peptide is described by its position, its flanking residues, its
    masses and its m/z values; peptides outside the length or mass limits
    are dropped at the end.

    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    protein_id : str
        Protein identifier, copied into every row
    enzyme : str
        Enzyme name handed to the cleavage routine (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 1)
    charge_states : list
        Charge states for which an m/z column is produced (default: [1, 2, 3])
    mass_range : tuple
        (min, max) monoisotopic mass limits in Da, both inclusive
        (default: (800.0, 4000.0))
    min_pep_length : int
        Shortest peptide kept, inclusive (default: 5)
    max_pep_length : int
        Longest peptide kept, inclusive (default: 35)
    sort_by_mass : bool
        Order rows by ascending monoisotopic mass (default: False)

    Returns
    -------
    pd.DataFrame
        One row per retained peptide, with these columns in this order:
        - start_index : int - Index of the first residue within `sequence`
        - end_index : int - start_index plus the peptide length
        - pep_seq : str - Peptide sequence
        - protein_id : str - Protein identifier
        - pep_length : int - Peptide length
        - prev_aa : str - Residue before the peptide ('-' at the N-terminus)
        - next_aa : str - Residue after the peptide ('-' at the C-terminus)
        - extended_seq : str - prev_aa + peptide + next_aa
        - rep_extended_seq : str - Same, with the flanking residues in parentheses
        - mass_mono : float - Monoisotopic mass (Da)
        - mass_avg : float - Average mass (Da)
        - mz_{z} : float - m/z for each charge state in charge_states
        Rows keep the index they had before filtering.
    """
    def residue_before(start: int) -> str:
        # A peptide starting at position 0 has no preceding residue.
        return sequence[start - 1] if start > 0 else '-'

    def residue_after(end: int) -> str:
        # A peptide ending at the last residue has no following residue.
        return sequence[end] if end < len(sequence) else '-'

    fragments = parser.xcleave(sequence, enzyme, missed_cleavages=missed_cleavages)
    table = pd.DataFrame(fragments, columns=['start_index', 'pep_seq'])

    peptides = table['pep_seq']
    lengths = peptides.str.len()

    # Identity and position
    table['protein_id'] = protein_id
    table['pep_length'] = lengths
    table['end_index'] = table['start_index'] + lengths

    # Flanking residues and the two extended representations
    table['prev_aa'] = table['start_index'].apply(residue_before)
    table['next_aa'] = table['end_index'].apply(residue_after)
    table['extended_seq'] = table['prev_aa'] + peptides + table['next_aa']
    table['rep_extended_seq'] = (
        '(' + table['prev_aa'] + ')' + peptides + '(' + table['next_aa'] + ')'
    )

    # Masses
    table['mass_mono'] = peptides.apply(mass.fast_mass)
    table['mass_avg'] = peptides.apply(
        lambda pep: mass.calculate_mass(pep, average=True)
    )

    # One m/z column per requested charge state
    for z in charge_states:
        table[f'mz_{z}'] = peptides.apply(
            lambda pep: mass.calculate_mass(pep, charge=z)
        )

    # Fixed column order, followed by the m/z columns
    ordered = [
        'start_index', 'end_index', 'pep_seq', 'protein_id', 'pep_length',
        'prev_aa', 'next_aa', 'extended_seq', 'rep_extended_seq',
        'mass_mono', 'mass_avg',
    ]
    ordered.extend(f'mz_{z}' for z in charge_states)
    table = table[ordered]

    # Length and mass filters (all bounds inclusive)
    keep = (
        table['pep_length'].between(min_pep_length, max_pep_length)
        & table['mass_mono'].between(mass_range[0], mass_range[1])
    )
    table = table[keep]

    return table.sort_values('mass_mono') if sort_by_mass else table

In [None]:
# Tryptic digest of P15497 without missed cleavages, lightest peptides first.
df = digest(
    protein_dict['P15497'],
    'P15497',
    enzyme='trypsin',
    missed_cleavages=0,
    sort_by_mass=True,
)
df.head(10)

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
26,176,183,AHVETLR,P15497,7,R,Q,RAHVETLRQ,(R)AHVETLR(Q),824.450451,824.926094,825.457727,413.232502,275.824093
22,156,163,VQELQDK,P15497,7,K,L,KVQELQDKL,(K)VQELQDK(L),858.444697,858.93752,859.451973,430.229625,287.155509
19,141,150,VAPLGEEFR,P15497,9,K,E,KVAPLGEEFRE,(K)VAPLGEEFR(E),1016.529095,1017.137693,1017.536372,509.271824,339.850308
23,163,172,LSPLAQELR,P15497,9,K,D,KLSPLAQELRD,(K)LSPLAQELR(D),1025.586944,1026.189246,1026.594221,513.800749,342.869591
34,228,237,AKPVLEDLR,P15497,9,K,Q,KAKPVLEDLRQ,(K)AKPVLEDLR(Q),1039.602594,1040.215863,1040.609871,520.808574,347.541475
35,237,248,QGLLPVLESLK,P15497,11,R,V,RQGLLPVLESLKV,(R)QGLLPVLESLK(V),1195.717624,1196.438348,1196.724901,598.866089,399.579818
36,248,260,VSILAAIDEASK,P15497,12,K,K,KVSILAAIDEASKK,(K)VSILAAIDEASK(K),1215.671068,1216.383328,1216.678344,608.84281,406.230966
31,205,217,EGGGSLAEYHAK,P15497,12,K,A,KEGGGSLAEYHAKA,(K)EGGGSLAEYHAK(A),1217.567665,1218.274828,1218.574942,609.791109,406.863165
4,35,46,DFATVYVEAIK,P15497,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811
17,130,139,WHEEVEIYR,P15497,9,K,Q,KWHEEVEIYRQ,(K)WHEEVEIYR(Q),1259.593486,1260.356315,1260.600763,630.80402,420.871772


In [None]:
#| hide
# The peptide starting at index 176 must carry its flanking residues R and Q.
assert 'RAHVETLRQ' in df.loc[df['start_index'] == 176, 'extended_seq'].values

In [None]:
# Same digest settings applied to the 'P15497-2_KtoA_142' entry of the test FASTA.
df2 = digest(
    protein_dict['P15497-2_KtoA_142'],
    'P15497-2_KtoA_142',
    enzyme='trypsin',
    missed_cleavages=0,
)

In [None]:
# Preview the first rows of the digest computed for 'P15497-2_KtoA_142'.
df2.head()

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
1,2,19,AVVLTLAVLFLTGSQAR,P15497-2_KtoA_142,17,K,H,KAVVLTLAVLFLTGSQARH,(K)AVVLTLAVLFLTGSQAR(H),1758.040355,1759.101048,1759.047632,880.027454,587.020728
2,19,33,HFWQQDDPQSSWDR,P15497-2_KtoA_142,14,R,V,RHFWQQDDPQSSWDRV,(R)HFWQQDDPQSSWDR(V),1830.77101,1831.856733,1831.778286,916.392781,611.26428
4,35,46,DFATVYVEAIK,P15497-2_KtoA_142,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811
6,50,63,DYVAQFEASALGK,P15497-2_KtoA_142,13,R,Q,RDYVAQFEASALGKQ,(R)DYVAQFEASALGK(Q),1397.682695,1398.518785,1398.689972,699.848624,466.901508
8,68,82,LLDNWDTLASTLSK,P15497-2_KtoA_142,14,K,V,KLLDNWDTLASTLSKV,(K)LLDNWDTLASTLSK(V),1575.814438,1576.748491,1576.821714,788.914495,526.278756


In [None]:
#| hide
# The peptide starting at index 176 is expected in the second digest as well.
assert 'RAHVETLRQ' in df2.loc[df2['start_index'] == 176, 'extended_seq'].values

In [None]:
#| hide
# At least one peptide of the first digest must be missing from the second.
assert set(df['pep_seq']).difference(df2['pep_seq'])

In [None]:
#| hide
# Exactly one peptide occurs only in the second digest.
assert set(df2['pep_seq']).difference(df['pep_seq']) == {'QAVAPLGEEFR'}

In [None]:
#| hide
# Rows of df2 whose peptide sequence does not occur in df.
df2[~df2['pep_seq'].isin(df['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
18,139,150,QAVAPLGEEFR,P15497-2_KtoA_142,11,R,E,RQAVAPLGEEFRE,(R)QAVAPLGEEFR(E),1215.624786,1216.345134,1216.632063,608.81967,406.215539


In [None]:
#| hide
# Rows of df whose peptide sequence does not occur in df2.
df[~df['pep_seq'].isin(df2['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
19,141,150,VAPLGEEFR,P15497,9,K,E,KVAPLGEEFRE,(K)VAPLGEEFR(E),1016.529095,1017.137693,1017.536372,509.271824,339.850308


In [None]:
#| export
def digest_to_set(
    sequence: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 0,
    mass_range: tuple[float, float] = (800.0, 4000.0),
    min_pep_length: int = 5,
    max_pep_length: int = 35,
) -> set[str]:
    """
    Digest a protein and keep only the distinct peptide sequences.

    A slimmer counterpart of `digest()` for callers that need neither
    positions nor mass annotations: no DataFrame is built, and the
    monoisotopic mass is computed only for peptides that already satisfy
    the length limits.

    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    enzyme : str
        Enzyme name (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 0)
    mass_range : tuple[float, float]
        (min, max) monoisotopic mass limits in Da, both inclusive
        (default: (800.0, 4000.0))
    min_pep_length : int
        Shortest peptide kept, inclusive (default: 5)
    max_pep_length : int
        Longest peptide kept, inclusive (default: 35)

    Returns
    -------
    set[str]
        Peptide sequences that pass both the length and the mass filter

    Examples
    --------
    >>> peptides = digest_to_set("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVK")
    >>> len(peptides)
    2
    >>> "LGLIEVQAPILSR" in peptides
    True
    """
    accepted = set()

    fragments = parser.xcleave(sequence, enzyme, missed_cleavages=missed_cleavages)
    for _, fragment in fragments:
        # Cheap length test first; the mass is only needed for survivors.
        if not (min_pep_length <= len(fragment) <= max_pep_length):
            continue
        if mass_range[0] <= mass.fast_mass(fragment) <= mass_range[1]:
            accepted.add(fragment)

    return accepted

In [None]:
#| hide
# With no missed cleavages, both digest variants must yield the same peptides.
assert digest_to_set(protein_dict['P15497']) == set(
    digest(
        protein_dict['P15497'],
        'P15497',
        enzyme='trypsin',
        missed_cleavages=0,
        sort_by_mass=True,
    )['pep_seq']
)

In [None]:
#| hide
# The agreement must also hold when two missed cleavages are allowed.
assert digest_to_set(protein_dict['P15497'], missed_cleavages=2) == set(
    digest(
        protein_dict['P15497'],
        'P15497',
        enzyme='trypsin',
        missed_cleavages=2,
        sort_by_mass=True,
    )['pep_seq']
)

In [None]:
#| export
from typing import Union

def fasta_to_peptide_set(
    fasta_path: Union[str, Path],
    enzyme: str = 'trypsin',
    missed_cleavages: int = 0,
    mass_range: tuple[float, float] = (800.0, 4000.0),
    min_pep_length: int = 5,
    max_pep_length: int = 35,
    show_progress: bool = True,
) -> set[str]:
    """
    Digest every protein of a FASTA file into one set of unique peptides.

    Records are read and digested one at a time, so the protein sequences
    themselves are never collected; only the growing peptide set is held.
    Handy for building a canonical peptide reference to compare
    experimental data against.

    Parameters
    ----------
    fasta_path : str or Path
        Path to the FASTA file to parse
    enzyme : str
        Enzyme name (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 0)
    mass_range : tuple[float, float]
        (min, max) monoisotopic mass filter in Da (default: (800.0, 4000.0))
    min_pep_length : int
        Minimum peptide length to retain (default: 5)
    max_pep_length : int
        Maximum peptide length to retain (default: 35)
    show_progress : bool
        Wrap the record iterator in a tqdm progress bar (default: True)

    Returns
    -------
    set[str]
        All distinct peptide sequences obtained from the file

    Raises
    ------
    FileNotFoundError
        If the specified FASTA file does not exist.
    """
    source = Path(fasta_path)
    if not source.exists():
        raise FileNotFoundError(f"FASTA file not found: {source}")

    # Digest settings shared by every record.
    digest_options = dict(
        enzyme=enzyme,
        missed_cleavages=missed_cleavages,
        mass_range=mass_range,
        min_pep_length=min_pep_length,
        max_pep_length=max_pep_length,
    )

    unique_peptides = set()
    with source.open('r') as handle:
        entries = SeqIO.parse(handle, "fasta")
        if show_progress:
            # Imported lazily so tqdm is only required when a bar is requested.
            from tqdm import tqdm
            entries = tqdm(entries, desc="Digesting proteins")

        for entry in entries:
            unique_peptides |= digest_to_set(sequence=str(entry.seq), **digest_options)

    return unique_peptides

In [None]:
# Digest every record of the test FASTA with the default settings and collect
# the unique peptides.
peptide_set = fasta_to_peptide_set(TEST_DATA / 'test_sequence.fa')

Digesting proteins: 2it [00:00, 12446.01it/s]


In [None]:
#| hide
# Both of these peptides must be present in the combined set.
assert {'VAPLGEEFR', 'QAVAPLGEEFR'} <= peptide_set

In [None]:
#| export
import shutil
import tempfile

def annotate_peptides_inplace(
    file_path: Union[str, Path],
    canonical_peptides: set[str],
    sequence_col: str = 'PEP.StrippedSequence',
    new_col_name: str = 'is_canonical',
    sep: Union[str, None] = None,
    show_progress: bool = True,
) -> int:
    """
    Append a True/False membership column to a delimited peptide file, in place.

    The file is streamed line by line into a temporary file created in the
    same directory; once complete, the temporary file takes over the
    permission bits of the original and replaces it in a single rename.
    If anything fails, the temporary file is removed and the original is
    left untouched.

    Rows are split on `sep` verbatim (no quote handling), and every line
    after the header is treated as a data row.

    Parameters
    ----------
    file_path : str or Path
        Path to peptide file (will be modified in-place)
    canonical_peptides : set[str]
        Set of canonical peptide sequences
    sequence_col : str
        Column name containing peptide sequences (default: 'PEP.StrippedSequence')
    new_col_name : str
        Name for the new annotation column (default: 'is_canonical')
    sep : str or None
        Field separator. If None, ',' is used for a '.csv' suffix and a tab
        otherwise (default: None)
    show_progress : bool
        Show a tqdm progress bar (default: True)

    Returns
    -------
    int
        Number of data rows processed

    Raises
    ------
    FileNotFoundError
        If the input file does not exist.
    ValueError
        If sequence_col is not found or new_col_name already exists.
    """
    file_path = Path(file_path)

    if not file_path.exists():
        raise FileNotFoundError(f"File not found: {file_path}")

    # Auto-detect separator
    if sep is None:
        sep = ',' if file_path.suffix == '.csv' else '\t'

    # Validate the header before any temporary file is created
    with file_path.open('r') as f:
        header = f.readline().rstrip('\n\r')
    columns = header.split(sep)

    if new_col_name in columns:
        raise ValueError(f"Column '{new_col_name}' already exists in file")

    if sequence_col not in columns:
        raise ValueError(f"Column '{sequence_col}' not found. Available: {columns[:10]}...")

    seq_col_idx = columns.index(sequence_col)

    # Same directory as the target, so the final rename stays on one filesystem
    temp_fd, temp_name = tempfile.mkstemp(
        dir=file_path.parent,
        suffix='.tmp'
    )
    temp_path = Path(temp_name)

    try:
        peptide_count = 0

        # The descriptor is wrapped first so it is closed even if reopening
        # the input file fails.
        with open(temp_fd, 'w') as outfile, file_path.open('r') as infile:
            # Skip header (already read) and write new header
            infile.readline()
            outfile.write(f"{header}{sep}{new_col_name}\n")

            # Setup progress bar
            lines = infile
            if show_progress:
                from tqdm import tqdm
                lines = tqdm(infile, desc="Annotating peptides")

            for line in lines:
                line = line.rstrip('\n\r')
                peptide = line.split(sep)[seq_col_idx]
                is_canonical = peptide in canonical_peptides

                outfile.write(f"{line}{sep}{is_canonical}\n")
                peptide_count += 1

        # mkstemp creates the file readable by the owner only; carry over the
        # original permission bits so annotating does not change file access.
        shutil.copymode(file_path, temp_path)

        # Rename over the original (atomic on POSIX, overwrites on Windows)
        temp_path.replace(file_path)

    except BaseException:
        # Also covers KeyboardInterrupt: never leave a stray temp file behind
        temp_path.unlink(missing_ok=True)
        raise

    return peptide_count

In [None]:
# Reference peptides used to demonstrate the annotation below.
canonical = {'DASGPAMTEIGEQPWGR', 'DVAGAVEFWTDR'}

In [None]:
# Load the backup copy (.bk) of the tab-separated Spectronaut peptide export.
spc_out = pd.read_csv(TEST_DATA / 'test_spectronaut_pep_out.tsv.bk',sep='\t')

In [None]:
# Show the peptide sequence column, the default column checked by
# annotate_peptides_inplace.
spc_out['PEP.StrippedSequence']

0          DVAGAVEFWTDR
1     DASGPAMTEIGEQPWGR
2     DASGPAMTEIGEQPWGR
3           LTSAVPVLTAR
4       DPAGNCVHFVAEEQD
5       DPAGNCVHFVAEEQD
6    GLDELYAEWSEVVSTNFR
7          DFVEDDFAGVVR
8          DFVEDDFAGVVR
9              TEYKPTVR
Name: PEP.StrippedSequence, dtype: object

In [None]:
# Preview the table as loaded from the backup copy.
spc_out.head()

Unnamed: 0,PG.MolecularWeight,PG.ProteinAccessions,PG.Genes,PG.Organisms,PG.WBGene,PG.Locus,PG.Status,PEP.StrippedSequence,EG.PrecursorId,[1] 020_2025-DUN_DH-GB-2T1-A.raw.PG.IsSingleHit,...,[6] 020_2025-DUN_DH-GB-SLBP1+2-C.raw.EG.TotalQuantity (Settings),[7] 020_2025-DUN_DH-GB-SLBP1+2-TET-A.raw.EG.TotalQuantity (Settings),[8] 020_2025-DUN_DH-GB-SLBP1+2-TET-B.raw.EG.TotalQuantity (Settings),[9] 020_2025-DUN_DH-GB-SLBP1+2-TET-C.raw.EG.TotalQuantity (Settings),[10] 020_2025-DUN_DH-GB-SLBP1-A.raw.EG.TotalQuantity (Settings),[11] 020_2025-DUN_DH-GB-SLBP1-B-Rep-2.raw.EG.TotalQuantity (Settings),[12] 020_2025-DUN_DH-GB-SLBP1-C-Rep.raw.EG.TotalQuantity (Settings),[13] 020_2025-DUN_DH-GB-SLBP1-TET-A.raw.EG.TotalQuantity (Settings),[14] 020_2025-DUN_DH-GB-SLBP1-TET-B.raw.EG.TotalQuantity (Settings),[15] 020_2025-DUN_DH-GB-SLBP1-TET-C.raw.EG.TotalQuantity (Settings)
0,13796.3,Phleomycin,,Unknown,,,,DVAGAVEFWTDR,_DVAGAVEFWTDR_.2,False,...,29607.361328125,24625.78125,27913.017578125,33634.2265625,32040.609375,34004.15234375,31561.0546875,32193.07421875,33442.484375,30433.17578125
1,13796.3,Phleomycin,,Unknown,,,,DASGPAMTEIGEQPWGR,_DASGPAMTEIGEQPWGR_.2,False,...,16402.740234375,16614.609375,21239.26171875,26206.439453125,22974.56640625,27617.26953125,26588.005859375,26862.169921875,25289.5390625,21857.587890625
2,13796.3,Phleomycin,,Unknown,,,,DASGPAMTEIGEQPWGR,_DASGPAMTEIGEQPWGR_.3,False,...,1493.2821044921875,1043.6531982421875,2921.56005859375,3539.15234375,1961.1495361328125,3309.3720703125,3433.963134765625,3639.20458984375,3179.865478515625,2923.409423828125
3,13796.3,Phleomycin,,Unknown,,,,LTSAVPVLTAR,_LTSAVPVLTAR_.2,False,...,14565.984375,12778.6640625,15191.779296875,18907.958984375,17571.548828125,22430.3671875,21747.65625,20893.990234375,19671.70703125,15862.9970703125
4,13796.3,Phleomycin,,Unknown,,,,DPAGNCVHFVAEEQD,_DPAGNC[Carbamidomethyl (C)]VHFVAEEQD_.2,False,...,2699.176513671875,2801.34814453125,2980.920166015625,2851.24560546875,3146.41015625,3558.781982421875,4068.1572265625,4052.650390625,4167.5390625,3744.2060546875


In [None]:
# Add the 'is_canonical' column to the working copy in place; the return
# value is the number of data rows processed.
annotate_peptides_inplace(TEST_DATA / "test_spectronaut_pep_out.tsv", canonical)

Annotating peptides: 10it [00:00, 19572.11it/s]


10

In [None]:
# Reload the annotated file to inspect the appended 'is_canonical' column.
spc_out = pd.read_csv(TEST_DATA / 'test_spectronaut_pep_out.tsv',sep='\t')
spc_out.head()

Unnamed: 0,PG.MolecularWeight,PG.ProteinAccessions,PG.Genes,PG.Organisms,PG.WBGene,PG.Locus,PG.Status,PEP.StrippedSequence,EG.PrecursorId,[1] 020_2025-DUN_DH-GB-2T1-A.raw.PG.IsSingleHit,...,[7] 020_2025-DUN_DH-GB-SLBP1+2-TET-A.raw.EG.TotalQuantity (Settings),[8] 020_2025-DUN_DH-GB-SLBP1+2-TET-B.raw.EG.TotalQuantity (Settings),[9] 020_2025-DUN_DH-GB-SLBP1+2-TET-C.raw.EG.TotalQuantity (Settings),[10] 020_2025-DUN_DH-GB-SLBP1-A.raw.EG.TotalQuantity (Settings),[11] 020_2025-DUN_DH-GB-SLBP1-B-Rep-2.raw.EG.TotalQuantity (Settings),[12] 020_2025-DUN_DH-GB-SLBP1-C-Rep.raw.EG.TotalQuantity (Settings),[13] 020_2025-DUN_DH-GB-SLBP1-TET-A.raw.EG.TotalQuantity (Settings),[14] 020_2025-DUN_DH-GB-SLBP1-TET-B.raw.EG.TotalQuantity (Settings),[15] 020_2025-DUN_DH-GB-SLBP1-TET-C.raw.EG.TotalQuantity (Settings),is_canonical
0,13796.3,Phleomycin,,Unknown,,,,DVAGAVEFWTDR,_DVAGAVEFWTDR_.2,False,...,24625.78125,27913.017578125,33634.2265625,32040.609375,34004.15234375,31561.0546875,32193.07421875,33442.484375,30433.17578125,True
1,13796.3,Phleomycin,,Unknown,,,,DASGPAMTEIGEQPWGR,_DASGPAMTEIGEQPWGR_.2,False,...,16614.609375,21239.26171875,26206.439453125,22974.56640625,27617.26953125,26588.005859375,26862.169921875,25289.5390625,21857.587890625,True
2,13796.3,Phleomycin,,Unknown,,,,DASGPAMTEIGEQPWGR,_DASGPAMTEIGEQPWGR_.3,False,...,1043.6531982421875,2921.56005859375,3539.15234375,1961.1495361328125,3309.3720703125,3433.963134765625,3639.20458984375,3179.865478515625,2923.409423828125,True
3,13796.3,Phleomycin,,Unknown,,,,LTSAVPVLTAR,_LTSAVPVLTAR_.2,False,...,12778.6640625,15191.779296875,18907.958984375,17571.548828125,22430.3671875,21747.65625,20893.990234375,19671.70703125,15862.9970703125,False
4,13796.3,Phleomycin,,Unknown,,,,DPAGNCVHFVAEEQD,_DPAGNC[Carbamidomethyl (C)]VHFVAEEQD_.2,False,...,2801.34814453125,2980.920166015625,2851.24560546875,3146.41015625,3558.781982421875,4068.1572265625,4052.650390625,4167.5390625,3744.2060546875,False


In [None]:
# The file already carries 'is_canonical', so a second annotation pass must be
# rejected with ValueError. Only that exception is caught; anything else
# (including KeyboardInterrupt) propagates instead of being silently swallowed.
try:
    annotate_peptides_inplace(TEST_DATA / "test_spectronaut_pep_out.tsv", canonical)
except ValueError:
    print('shuld raise an error')
shuld raise an error


In [None]:
# Put the un-annotated backup back in place so the notebook can be re-run.
dst = TEST_DATA / "test_spectronaut_pep_out.tsv"
src = dst.with_name(dst.name + ".bk")

shutil.copy(src, dst)
print(f"Restored {dst.name} from backup")

Restored test_spectronaut_pep_out.tsv from backup


/Users/mtinti/git_projects/protein_cutter/nbs


69212

100%|█████████████████████████████████████████████████████████████████████████████████████████████████| 69212/69212 [02:18<00:00, 500.67it/s]


(667054, 9)


In [None]:
#| hide
# Write the cells marked for export into the package modules.
import nbdev

nbdev.nbdev_export()