# core

> Core utilities for in-silico protein digestion and peptide analysis

This module provides the foundational functions for proteomics workflows, including FASTA file parsing, enzymatic digestion simulation, and peptide property calculations.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from typing import Dict, Union
from pyteomics import parser
from pyteomics import mass
from pathlib import Path
from Bio import SeqIO
import pandas as pd

In [None]:
#| export
from pathlib import Path
import os

# Work out where the repository lives so the test data can be addressed
# regardless of the directory this code is executed from.
if 'GITHUB_WORKSPACE' not in os.environ:
    # Local development: use the closest directory - starting at the current
    # working directory and moving upwards - that contains settings.ini.
    # When no ancestor has one, stay at the current working directory.
    REPO_ROOT = next(
        (
            folder
            for folder in (Path.cwd(), *Path.cwd().parents)
            if (folder / 'settings.ini').exists()
        ),
        Path.cwd(),
    )
else:
    # In GitHub Actions the root is taken from this environment variable.
    REPO_ROOT = Path(os.environ['GITHUB_WORKSPACE'])

TEST_DATA = REPO_ROOT.joinpath('test_data')

# Report the resolved locations.
print(f"Repo root: {REPO_ROOT}")
print(f"Test data dir: {TEST_DATA}")
print(f"Test data exists: {TEST_DATA.exists()}")

Repo root: /Users/mtinti/git_projects/protein_cutter
Test data dir: /Users/mtinti/git_projects/protein_cutter/test_data
Test data exists: True


In [None]:
#| export
def load_fasta(fasta_path: Union[str, Path]) -> Dict[str, str]:
    """
    Load a FASTA file and return a dictionary mapping protein IDs to sequences.

    Parameters
    ----------
    fasta_path : str or Path
        Path to the FASTA file to load.

    Returns
    -------
    Dict[str, str]
        Dictionary mapping protein IDs (record.id) to amino acid sequences
        (as strings). If the same ID occurs more than once in the file, the
        last record with that ID is the one kept.

    Raises
    ------
    FileNotFoundError
        If the specified FASTA file does not exist. The message also contains
        the current working directory, which is what a relative path is
        resolved against.
    ValueError
        If no sequences are found in the file.

    Examples
    --------
    >>> proteins = load_fasta("proteins.fasta")
    >>> len(proteins)
    42
    >>> proteins['sp|P12345|EXAMPLE']
    'MKTAYIAKQRQISFVKSHFSRQLEERLGL...'
    """
    fasta_path = Path(fasta_path)
    if not fasta_path.exists():
        # The working directory is only looked up on the error paths; nothing
        # is printed during a normal load.
        raise FileNotFoundError(f"FASTA file not found: {fasta_path} {Path.cwd()}")

    with fasta_path.open('r') as handle:
        protein_dict = {
            record.id: str(record.seq)
            for record in SeqIO.parse(handle, "fasta")
        }

    if not protein_dict:
        raise ValueError(f"No sequences found in FASTA file: {fasta_path} {Path.cwd()}")

    return protein_dict

In [None]:
#| export
# Load the sequences from the test FASTA file; the example and test cells
# below look proteins up in this dictionary by ID.
# NOTE(review): this cell is marked `#| export`, so the load presumably also
# runs whenever the generated module is imported - confirm that is intended.
protein_dict = load_fasta(TEST_DATA / 'test_sequence.fa')

/Users/mtinti/git_projects/protein_cutter/nbs


In [None]:
#| hide
# Sanity check on the loaded data: P15497 must begin with these five residues.
assert protein_dict['P15497'].startswith('MKAVV')

In [None]:
#| export
def digest(
    sequence: str,
    protein_id: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 1,
    charge_states: Union[list, tuple] = (1, 2, 3),
    mass_range: tuple = (800.0, 4000.0),
    min_pep_length: int = 5,
    sort_by_mass: bool = False,
) -> pd.DataFrame:
    """
    Digest a protein and add flanking amino acids and masses for each peptide.

    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    protein_id : str
        Protein identifier (for the DataFrame)
    enzyme : str
        Enzyme name (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 1)
    charge_states : list or tuple of int
        Charge states for which an m/z column named ``mz_<z>`` is added
        (default: (1, 2, 3)). Repeated values are used once.
    mass_range : tuple
        (min, max) monoisotopic mass; peptides whose 'mass_mono' lies outside
        this inclusive range are dropped (default: (800.0, 4000.0))
    min_pep_length : int
        Minimum peptide length to retain (default: 5)
    sort_by_mass : bool
        If True, sort the rows by ascending 'mass_mono' (default: False)

    Returns
    -------
    pd.DataFrame
        One row per retained peptide, with columns:
        'start_index', 'end_index', 'pep_seq', 'protein_id', 'pep_length',
        'prev_aa', 'next_aa', 'extended_seq', 'rep_extended_seq',
        'mass_mono', 'mass_avg', followed by one 'mz_<z>' column per
        requested charge state (with the defaults: 'mz_1', 'mz_2', 'mz_3').
        'end_index' is 'start_index' plus the peptide length; 'prev_aa' and
        'next_aa' are '-' where the peptide touches a sequence end. Row
        labels are the positions in the unfiltered cleavage list.
    """
    # Order-preserving de-duplication, so a repeated charge cannot produce
    # duplicated column labels in the final selection.
    charges = list(dict.fromkeys(charge_states))

    # Digest the protein into (start position, peptide) pairs
    cleavage_results = parser.xcleave(
        sequence,
        enzyme,
        missed_cleavages=missed_cleavages
    )

    df = pd.DataFrame(
        cleavage_results,
        columns=['start_index', 'pep_seq']
    )

    # Add protein ID
    df['protein_id'] = protein_id

    # Peptide length and end index (start + length)
    df['pep_length'] = df['pep_seq'].str.len()
    df['end_index'] = df['start_index'] + df['pep_length']

    # Flanking amino acids; '-' marks the protein termini
    df['prev_aa'] = df['start_index'].apply(
        lambda idx: sequence[idx - 1] if idx > 0 else '-'
    )
    df['next_aa'] = df['end_index'].apply(
        lambda idx: sequence[idx] if idx < len(sequence) else '-'
    )

    # Extended sequence (prev-peptide-next), plain and with the flanks bracketed
    df['extended_seq'] = df['prev_aa'] + df['pep_seq'] + df['next_aa']
    df['rep_extended_seq'] = '(' + df['prev_aa'] + ')' + df['pep_seq'] + '(' + df['next_aa'] + ')'

    # Monoisotopic and average masses
    df['mass_mono'] = df['pep_seq'].apply(mass.fast_mass)
    df['mass_avg'] = df['pep_seq'].apply(
        lambda seq: mass.calculate_mass(seq, average=True)
    )

    # m/z for every requested charge state. The column names are collected
    # here so the final selection follows `charge_states` instead of assuming
    # that exactly 1, 2 and 3 were requested.
    mz_columns = []
    for z in charges:
        column = f'mz_{z}'
        df[column] = df['pep_seq'].apply(
            lambda seq, z=z: mass.calculate_mass(seq, charge=z)
        )
        mz_columns.append(column)

    df = df[[
        'start_index', 'end_index', 'pep_seq', 'protein_id', 'pep_length',
        'prev_aa', 'next_aa', 'extended_seq', 'rep_extended_seq',
        'mass_mono', 'mass_avg',
    ] + mz_columns]

    # Length and mass filters
    df = df[df['pep_seq'].str.len() >= min_pep_length]
    df = df[df['mass_mono'] >= mass_range[0]]
    df = df[df['mass_mono'] <= mass_range[1]]
    if sort_by_mass:
        df = df.sort_values('mass_mono')
    return df

In [None]:
# Digest the test protein P15497 with trypsin, allowing no missed cleavages.
# Rows are sorted by monoisotopic mass, so head(10) shows the ten lightest
# peptides that pass the default length and mass filters.
df = digest(
    sequence=protein_dict['P15497'],
    protein_id='P15497',
    enzyme='trypsin',
    missed_cleavages=0,
    sort_by_mass=True
  )
df.head(10)

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
26,176,183,AHVETLR,P15497,7,R,Q,RAHVETLRQ,(R)AHVETLR(Q),824.450451,824.926094,825.457727,413.232502,275.824093
22,156,163,VQELQDK,P15497,7,K,L,KVQELQDKL,(K)VQELQDK(L),858.444697,858.93752,859.451973,430.229625,287.155509
19,141,150,VAPLGEEFR,P15497,9,K,E,KVAPLGEEFRE,(K)VAPLGEEFR(E),1016.529095,1017.137693,1017.536372,509.271824,339.850308
23,163,172,LSPLAQELR,P15497,9,K,D,KLSPLAQELRD,(K)LSPLAQELR(D),1025.586944,1026.189246,1026.594221,513.800749,342.869591
34,228,237,AKPVLEDLR,P15497,9,K,Q,KAKPVLEDLRQ,(K)AKPVLEDLR(Q),1039.602594,1040.215863,1040.609871,520.808574,347.541475
35,237,248,QGLLPVLESLK,P15497,11,R,V,RQGLLPVLESLKV,(R)QGLLPVLESLK(V),1195.717624,1196.438348,1196.724901,598.866089,399.579818
36,248,260,VSILAAIDEASK,P15497,12,K,K,KVSILAAIDEASKK,(K)VSILAAIDEASK(K),1215.671068,1216.383328,1216.678344,608.84281,406.230966
31,205,217,EGGGSLAEYHAK,P15497,12,K,A,KEGGGSLAEYHAKA,(K)EGGGSLAEYHAK(A),1217.567665,1218.274828,1218.574942,609.791109,406.863165
4,35,46,DFATVYVEAIK,P15497,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811
17,130,139,WHEEVEIYR,P15497,9,K,Q,KWHEEVEIYRQ,(K)WHEEVEIYR(Q),1259.593486,1260.356315,1260.600763,630.80402,420.871772


In [None]:
#| hide
# The peptide starting at index 176 must carry its flanking residues
# (R before, Q after) in the extended sequence.
assert 'RAHVETLRQ' in df.loc[df['start_index'] == 176, 'extended_seq'].values

In [None]:
# Digest the 'P15497-2_KtoA_142' entry with trypsin and no missed cleavages
# (rows left unsorted).
# NOTE(review): judging by its ID this is presumably P15497 with a K->A
# substitution at position 142 - confirm against the test FASTA.
df2 = digest(
      sequence=protein_dict['P15497-2_KtoA_142'],
      protein_id='P15497-2_KtoA_142',
      enzyme='trypsin',
      missed_cleavages=0
  )

In [None]:
df2.head()

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
1,2,19,AVVLTLAVLFLTGSQAR,P15497-2_KtoA_142,17,K,H,KAVVLTLAVLFLTGSQARH,(K)AVVLTLAVLFLTGSQAR(H),1758.040355,1759.101048,1759.047632,880.027454,587.020728
2,19,33,HFWQQDDPQSSWDR,P15497-2_KtoA_142,14,R,V,RHFWQQDDPQSSWDRV,(R)HFWQQDDPQSSWDR(V),1830.77101,1831.856733,1831.778286,916.392781,611.26428
4,35,46,DFATVYVEAIK,P15497-2_KtoA_142,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811
6,50,63,DYVAQFEASALGK,P15497-2_KtoA_142,13,R,Q,RDYVAQFEASALGKQ,(R)DYVAQFEASALGK(Q),1397.682695,1398.518785,1398.689972,699.848624,466.901508
8,68,82,LLDNWDTLASTLSK,P15497-2_KtoA_142,14,K,V,KLLDNWDTLASTLSKV,(K)LLDNWDTLASTLSK(V),1575.814438,1576.748491,1576.821714,788.914495,526.278756


In [None]:
#| hide
# The peptide starting at index 176 of the second digest must show the same
# flanked sequence as in the P15497 digest.
assert 'RAHVETLRQ' in df2.loc[df2['start_index'] == 176, 'extended_seq'].values

In [None]:
#| hide
# The only peptide found in the P15497 digest but missing from the
# 'P15497-2_KtoA_142' digest should be VAPLGEEFR. Comparing against the exact
# set is stricter than a truthiness check, which would accept any non-empty
# difference.
assert set(df['pep_seq']) - set(df2['pep_seq']) == {'VAPLGEEFR'}

In [None]:
#| hide
# The only peptide found in the 'P15497-2_KtoA_142' digest but missing from
# the P15497 digest must be QAVAPLGEEFR.
assert set(df2['pep_seq']).difference(df['pep_seq']) == {'QAVAPLGEEFR'}

In [None]:
#| hide
df2[~df2['pep_seq'].isin(df['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
18,139,150,QAVAPLGEEFR,P15497-2_KtoA_142,11,R,E,RQAVAPLGEEFRE,(R)QAVAPLGEEFR(E),1215.624786,1216.345134,1216.632063,608.81967,406.215539


In [None]:
#| hide
df[~df['pep_seq'].isin(df2['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
19,141,150,VAPLGEEFR,P15497,9,K,E,KVAPLGEEFRE,(K)VAPLGEEFR(E),1016.529095,1017.137693,1017.536372,509.271824,339.850308


In [None]:
#| export
def digest_to_set(
    sequence: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 0,
    mass_range: tuple[float, float] = (800.0, 4000.0),
    min_pep_length: int = 5,
) -> set[str]:
    """
    Digest a protein and return just the unique peptide sequences.

    Use this instead of `digest()` when the positional and mass annotations
    are not needed: no DataFrame is built, only a set of strings.

    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    enzyme : str
        Enzyme name (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 0)
    mass_range : tuple[float, float]
        (min, max) monoisotopic mass in Da, both bounds inclusive
        (default: (800.0, 4000.0))
    min_pep_length : int
        Minimum peptide length to retain (default: 5)

    Returns
    -------
    set[str]
        Peptide sequences that satisfy both the length and the mass filter

    Examples
    --------
    >>> peptides = digest_to_set("MKTAYIAKQRQISFVKSHFSRQLEERLGLIEVQAPILSRVGDGTQDNLSGAEKAVQVKVK")
    >>> len(peptides)
    2
    >>> "LGLIEVQAPILSR" in peptides
    True
    """
    retained = set()
    fragments = parser.xcleave(
        sequence,
        enzyme,
        missed_cleavages=missed_cleavages
    )
    for _position, fragment in fragments:
        # Too-short fragments are rejected before any mass is computed.
        if len(fragment) < min_pep_length:
            continue
        if mass_range[0] <= mass.fast_mass(fragment) <= mass_range[1]:
            retained.add(fragment)
    return retained

In [None]:
#| hide
# With no missed cleavages, the lightweight digest must return exactly the
# peptides listed in the 'pep_seq' column of the full digest.
assert digest_to_set(protein_dict['P15497'], missed_cleavages=0) == set(
    digest(
        sequence=protein_dict['P15497'],
        protein_id='P15497',
        enzyme='trypsin',
        missed_cleavages=0,
        sort_by_mass=True,
    )['pep_seq']
)

In [None]:
#| hide
# Same cross-check with up to two missed cleavages: both digests must yield
# the same set of peptide sequences.
assert digest_to_set(protein_dict['P15497'], missed_cleavages=2) == set(
    digest(
        sequence=protein_dict['P15497'],
        protein_id='P15497',
        enzyme='trypsin',
        missed_cleavages=2,
        sort_by_mass=True,
    )['pep_seq']
)

In [None]:
#| hide
import nbdev; nbdev.nbdev_export()

In [None]:
#print(1)