# core

> Core utilities for in-silico protein digestion and peptide analysis

This module provides the foundational functions for proteomics workflows, including FASTA file parsing, enzymatic digestion simulation, and peptide property calculations.

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from typing import Dict, Union
from pyteomics import parser
from pyteomics import mass
from pathlib import Path
from Bio import SeqIO
import pandas as pd

In [None]:
#| export
from pathlib import Path
import os

# Resolve the repository root and the test-data directory beneath it.
if 'GITHUB_WORKSPACE' not in os.environ:
    # Local development: take the first directory, starting at the current
    # working directory and moving up through its parents, that contains
    # settings.ini. If none does, fall back to the working directory.
    REPO_ROOT = next(
        (
            candidate
            for candidate in (Path.cwd(), *Path.cwd().parents)
            if (candidate / 'settings.ini').exists()
        ),
        Path.cwd(),
    )
else:
    # GitHub Actions: the workspace directory is used as the root.
    REPO_ROOT = Path(os.environ['GITHUB_WORKSPACE'])

TEST_DATA = REPO_ROOT / 'test_data'

# Report the resolved locations.
print(f"Repo root: {REPO_ROOT}")
print(f"Test data dir: {TEST_DATA}")
print(f"Test data exists: {TEST_DATA.exists()}")

Repo root: /Users/mtinti/git_projects/protein_cutter
Test data dir: /Users/mtinti/git_projects/protein_cutter/test_data
Test data exists: True


In [None]:
#| export
def load_fasta(fasta_path: Union[str, Path]) -> Dict[str, str]:
    """
    Load a FASTA file and return a dictionary mapping protein IDs to sequences.
    
    Parameters
    ----------
    fasta_path : str or Path
        Path to the FASTA file to load.
    
    Returns
    -------
    Dict[str, str]
        Dictionary mapping protein IDs (record.id) to amino acid sequences (as strings).
        If several records share the same ID, the last one read replaces the
        earlier ones.
    
    Raises
    ------
    FileNotFoundError
        If the specified FASTA file does not exist.
    ValueError
        If no sequence records are found in the file.
    
    Examples
    --------
    >>> proteins = load_fasta("proteins.fasta")
    >>> len(proteins)
    42
    >>> proteins['sp|P12345|EXAMPLE']
    'MKTAYIAKQRQISFVKSHFSRQLEERLGL...'
    """
    fasta_path = Path(fasta_path)
    # The working directory is appended to error messages so that a failing
    # relative path can be diagnosed; it is not printed on success.
    cwd = Path.cwd()
    
    if not fasta_path.exists():
        raise FileNotFoundError(f"FASTA file not found: {fasta_path} {cwd}")
    
    with fasta_path.open('r') as handle:
        protein_dict = {
            record.id: str(record.seq)
            for record in SeqIO.parse(handle, "fasta")
        }
    
    if not protein_dict:
        raise ValueError(f"No sequences found in FASTA file: {fasta_path} {cwd}")
    
    return protein_dict

In [None]:
#| export
# Load the bundled test sequences used by the examples and checks below.
# NOTE(review): this cell carries `#| export`, so this load presumably also runs
# whenever the generated module is imported — confirm that is intended.
protein_dict = load_fasta(TEST_DATA / 'test_sequence.fa')

/Users/mtinti/git_projects/protein_cutter/nbs


In [None]:
#| hide
# The P15497 test sequence must begin with the residues MKAVV.
assert protein_dict['P15497'].startswith('MKAVV')

In [None]:
#| export
def digest(
    sequence: str,
    protein_id: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 1,
    charge_states: Union[list, tuple] = (1, 2, 3)
) -> pd.DataFrame:
    """
    Digest a protein and add flanking amino acids for each peptide.
    
    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    protein_id : str
        Protein identifier (for the DataFrame)
    enzyme : str
        Enzyme name (default: 'trypsin'), passed to ``parser.xcleave`` as the
        cleavage rule
    missed_cleavages : int
        Number of allowed missed cleavages (default: 1)
    charge_states : list or tuple of int
        Charge states for which an m/z column is produced (default: 1, 2, 3).
        Repeated values are used once.
    
    Returns
    -------
    pd.DataFrame
        DataFrame with columns: start_index, end_index, pep_seq, protein_id,
        pep_length, prev_aa, next_aa, extended_seq, rep_extended_seq,
        mass_mono, mass_avg, followed by one ``mz_<z>`` column per requested
        charge state, in the order given.
        
        ``start_index`` and ``end_index`` are used as Python slice bounds into
        ``sequence`` (``end_index`` = ``start_index`` + peptide length).
        ``prev_aa`` / ``next_aa`` are the residues flanking the peptide, or
        '-' at a protein terminus.
    """
    # Digest the protein; each result is a (start_index, peptide) pair
    cleavage_results = parser.xcleave(
        sequence,
        enzyme,
        missed_cleavages=missed_cleavages
    )
    
    df = pd.DataFrame(
        cleavage_results,
        columns=['start_index', 'pep_seq']
    )
    
    # Add protein ID
    df['protein_id'] = protein_id
    
    # Peptide length, reused to derive the end index
    df['pep_length'] = df['pep_seq'].str.len()
    df['end_index'] = df['start_index'] + df['pep_length']
    
    # Get flanking amino acids; '-' marks the protein N-/C-terminus
    df['prev_aa'] = df['start_index'].apply(
        lambda idx: sequence[idx - 1] if idx > 0 else '-'
    )
    
    df['next_aa'] = df['end_index'].apply(
        lambda idx: sequence[idx] if idx < len(sequence) else '-'
    )
    
    # Create extended sequence (prev-peptide-next), plain and with the
    # flanking residues wrapped in parentheses
    df['extended_seq'] = df['prev_aa'] + df['pep_seq'] + df['next_aa']
    df['rep_extended_seq'] = '(' + df['prev_aa'] + ')' + df['pep_seq'] + '(' + df['next_aa'] + ')'
    
    # Calculate masses
    df['mass_mono'] = df['pep_seq'].apply(mass.fast_mass)
    df['mass_avg'] = df['pep_seq'].apply(
        lambda seq: mass.calculate_mass(seq, average=True)
    )
    
    # Calculate m/z for each requested charge state. The column names are
    # collected here so the final selection follows charge_states instead of
    # assuming mz_1/mz_2/mz_3 exist.
    mz_columns = []
    for z in dict.fromkeys(charge_states):
        column = f'mz_{z}'
        df[column] = df['pep_seq'].apply(
            lambda seq, z=z: mass.calculate_mass(seq, charge=z)
        )
        mz_columns.append(column)
    
    df = df[[
        'start_index', 'end_index', 'pep_seq', 'protein_id', 'pep_length',
        'prev_aa', 'next_aa', 'extended_seq', 'rep_extended_seq',
        'mass_mono', 'mass_avg', *mz_columns
    ]]
    
    return df

In [None]:
# Example: tryptic digest of the P15497 test sequence with no missed cleavages.
df = digest(
      sequence=protein_dict['P15497'],
      protein_id='P15497',
      enzyme='trypsin',
      missed_cleavages=0
  )

# Preview the first rows of the peptide table.
df.head()

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
0,0,2,MK,P15497,2,-,A,-MKA,(-)MK(A),277.146013,277.383844,278.153289,139.580283,93.389281
1,2,19,AVVLTLAVLFLTGSQAR,P15497,17,K,H,KAVVLTLAVLFLTGSQARH,(K)AVVLTLAVLFLTGSQAR(H),1758.040355,1759.101048,1759.047632,880.027454,587.020728
2,19,33,HFWQQDDPQSSWDR,P15497,14,R,V,RHFWQQDDPQSSWDRV,(R)HFWQQDDPQSSWDR(V),1830.77101,1831.856733,1831.778286,916.392781,611.26428
3,33,35,VK,P15497,2,R,D,RVKD,(R)VK(D),245.173942,245.319057,246.181218,123.594247,82.731924
4,35,46,DFATVYVEAIK,P15497,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811


In [None]:
#| hide
# The peptide starting at index 176 must have the extended sequence RAHVETLRQ.
assert 'RAHVETLRQ' in df.loc[df['start_index'] == 176, 'extended_seq'].values

In [None]:
# Same digest settings applied to the 'P15497-2_KtoA_142' entry of the test
# FASTA, for comparison against the P15497 digest.
df2 = digest(
      sequence=protein_dict['P15497-2_KtoA_142'],
      protein_id='P15497-2_KtoA_142',
      enzyme='trypsin',
      missed_cleavages=0
  )

In [None]:
# Preview the first rows of the second peptide table.
df2.head()

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
0,0,2,MK,P15497-2_KtoA_142,2,-,A,-MKA,(-)MK(A),277.146013,277.383844,278.153289,139.580283,93.389281
1,2,19,AVVLTLAVLFLTGSQAR,P15497-2_KtoA_142,17,K,H,KAVVLTLAVLFLTGSQARH,(K)AVVLTLAVLFLTGSQAR(H),1758.040355,1759.101048,1759.047632,880.027454,587.020728
2,19,33,HFWQQDDPQSSWDR,P15497-2_KtoA_142,14,R,V,RHFWQQDDPQSSWDRV,(R)HFWQQDDPQSSWDR(V),1830.77101,1831.856733,1831.778286,916.392781,611.26428
3,33,35,VK,P15497-2_KtoA_142,2,R,D,RVKD,(R)VK(D),245.173942,245.319057,246.181218,123.594247,82.731924
4,35,46,DFATVYVEAIK,P15497-2_KtoA_142,11,K,D,KDFATVYVEAIKD,(K)DFATVYVEAIK(D),1254.649604,1255.417813,1255.656881,628.332079,419.223811


In [None]:
#| hide
# The peptide starting at index 176 of the second digest must also have the
# extended sequence RAHVETLRQ.
assert 'RAHVETLRQ' in df2.loc[df2['start_index'] == 176, 'extended_seq'].values

In [None]:
#| hide
# Exactly one peptide sequence occurs in df but not in df2.
assert set(df['pep_seq']).difference(df2['pep_seq']) == {'VAPLGEEFR'}

In [None]:
#| hide
# Exactly one peptide sequence occurs in df2 but not in df.
assert set(df2['pep_seq']).difference(df['pep_seq']) == {'QAVAPLGEEFR'}

In [None]:
# Rows of df2 whose peptide sequence does not occur in df.
df2[~df2['pep_seq'].isin(df['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
18,139,150,QAVAPLGEEFR,P15497-2_KtoA_142,11,R,E,RQAVAPLGEEFRE,(R)QAVAPLGEEFR(E),1215.624786,1216.345134,1216.632063,608.81967,406.215539


In [None]:
# Rows of df whose peptide sequence does not occur in df2.
df[~df['pep_seq'].isin(df2['pep_seq'])]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,rep_extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
19,141,150,VAPLGEEFR,P15497,9,K,E,KVAPLGEEFRE,(K)VAPLGEEFR(E),1016.529095,1017.137693,1017.536372,509.271824,339.850308


In [None]:
#| export

In [None]:
#| export
def foo():
    """Placeholder that performs no work and returns None."""
    return None

In [None]:
#| hide
# Run nbdev's export step for this notebook project.
import nbdev
nbdev.nbdev_export()