# core

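> Load protein sequences from FASTA files and digest them in silico into peptides annotated with flanking residues, masses and m/z values.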

In [None]:
#| default_exp core

In [None]:
#| hide
from nbdev.showdoc import *


In [None]:
#| export
from typing import Dict, Union
from pyteomics import parser
from pyteomics import mass
from pathlib import Path
from Bio import SeqIO
import pandas as pd

In [None]:
#| export
def load_fasta(fasta_path: Union[str, Path]) -> Dict[str, str]:
    """
    Load a FASTA file and return a dictionary mapping protein IDs to sequences.

    Parameters
    ----------
    fasta_path : str or Path
        Path to the FASTA file to load.

    Returns
    -------
    Dict[str, str]
        Dictionary mapping protein IDs (record.id) to amino acid sequences
        (as strings). If several records share the same ID, the last one
        read replaces the earlier ones.

    Raises
    ------
    FileNotFoundError
        If `fasta_path` does not point to an existing regular file
        (this includes the case where it points to a directory).
    ValueError
        If the parser yields no records (for example, an empty file).

    Examples
    --------
    >>> proteins = load_fasta("proteins.fasta")
    >>> len(proteins)
    42
    >>> proteins['sp|P12345|EXAMPLE']
    'MKTAYIAKQRQISFVKSHFSRQLEERLGL...'
    """
    fasta_path = Path(fasta_path)

    # is_file() rather than exists(): a directory "exists" but cannot be
    # opened, which would surface as IsADirectoryError/PermissionError
    # instead of the documented FileNotFoundError.
    if not fasta_path.is_file():
        raise FileNotFoundError(f"FASTA file not found: {fasta_path}")

    with fasta_path.open('r') as handle:
        protein_dict = {
            record.id: str(record.seq)
            for record in SeqIO.parse(handle, "fasta")
        }

    if not protein_dict:
        raise ValueError(f"No sequences found in FASTA file: {fasta_path}")

    return protein_dict

In [None]:
#| export
import os

# Record and display the absolute working directory of the running kernel.
# NOTE(review): this cell is tagged `#| export`, so it presumably ends up in
# the generated module and would print at import time - confirm that is intended.
path = os.getcwd()
print(path)

/Users/mtinti/git_projects/protein_cutter/nbs


In [None]:
#| export
# Load the test sequences used by the checks below. The path is relative to
# the current working directory (the nbs/ folder in the recorded run).
# NOTE(review): this cell is tagged `#| export`, so it presumably ends up in
# the generated module and would run at import time - confirm that is intended.
protein_dict = load_fasta('../test_data/test_sequence.fa')

In [None]:
#| export
# Sanity check: the P15497 test sequence must start with MKAVV.
assert protein_dict['P15497'][:5] == 'MKAVV'

In [None]:
#| export
def digest(
    sequence: str,
    protein_id: str,
    enzyme: str = 'trypsin',
    missed_cleavages: int = 1,
    charge_states: Union[list, None] = None
    ) -> pd.DataFrame:
    """
    Digest a protein and annotate each peptide with flanking residues and masses.

    Parameters
    ----------
    sequence : str
        Protein sequence to digest
    protein_id : str
        Protein identifier, copied into the `protein_id` column of every row
    enzyme : str
        Cleavage rule handed to `parser.xcleave` (default: 'trypsin')
    missed_cleavages : int
        Number of allowed missed cleavages (default: 1)
    charge_states : list of int, optional
        Charge states for which an `mz_<z>` column is computed.
        Defaults to [1, 2, 3] when omitted or None. Repeated values are
        collapsed; an empty list produces no m/z columns.

    Returns
    -------
    pd.DataFrame
        One row per peptide, with columns in this order: start_index,
        end_index, pep_seq, protein_id, pep_length, prev_aa, next_aa,
        extended_seq, mass_mono, mass_avg, followed by one `mz_<z>` column
        per requested charge state. `end_index` is `start_index` plus the
        peptide length (exclusive end). `prev_aa` / `next_aa` are '-' when
        the peptide touches the start / end of `sequence`.
    """
    # Resolve the default here instead of using a mutable default argument.
    if charge_states is None:
        charge_states = [1, 2, 3]
    # Drop repeated charges (keeping first-seen order) so the final column
    # selection never contains duplicate labels.
    charges = list(dict.fromkeys(charge_states))

    # Digest the protein; each result is a (start index, peptide) pair
    cleavage_results = parser.xcleave(
        sequence,
        enzyme,
        missed_cleavages=missed_cleavages
    )

    df = pd.DataFrame(
        cleavage_results,
        columns=['start_index', 'pep_seq']
    )

    # Add protein ID
    df['protein_id'] = protein_id

    # Peptide length, and the exclusive end index derived from it
    pep_length = df['pep_seq'].str.len()
    df['end_index'] = df['start_index'] + pep_length
    df['pep_length'] = pep_length

    # Flanking amino acids; '-' marks a protein terminus
    df['prev_aa'] = df['start_index'].apply(
        lambda idx: sequence[idx - 1] if idx > 0 else '-'
    )
    df['next_aa'] = df['end_index'].apply(
        lambda idx: sequence[idx] if idx < len(sequence) else '-'
    )

    # Create extended sequence (prev-peptide-next)
    df['extended_seq'] = df['prev_aa'] + df['pep_seq'] + df['next_aa']

    # Calculate masses
    df['mass_mono'] = df['pep_seq'].apply(mass.fast_mass)
    df['mass_avg'] = df['pep_seq'].apply(
        lambda seq: mass.calculate_mass(seq, average=True)
    )

    # Calculate m/z for each requested charge state
    mz_columns = []
    for z in charges:
        column = f'mz_{z}'
        # z is bound as a default so the lambda does not rely on the loop variable
        df[column] = df['pep_seq'].apply(
            lambda seq, z=z: mass.calculate_mass(seq, charge=z)
        )
        mz_columns.append(column)

    # Build the output column list from the charges actually computed.
    # Hard-coding mz_1/mz_2/mz_3 raised KeyError or dropped columns for
    # any non-default charge_states.
    ordered_columns = [
        'start_index', 'end_index', 'pep_seq', 'protein_id', 'pep_length',
        'prev_aa', 'next_aa', 'extended_seq', 'mass_mono', 'mass_avg',
    ] + mz_columns
    return df[ordered_columns]

In [None]:
# Digest the reference protein without missed cleavages.
df = digest(
    sequence=protein_dict['P15497'],
    protein_id='P15497',
    enzyme='trypsin',
    missed_cleavages=0,
)

# Display the peptide that starts at index 176.
df[df['start_index'] == 176]

Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
26,176,183,AHVETLR,P15497,7,R,Q,RAHVETLRQ,824.450451,824.926094,825.457727,413.232502,275.824093


In [None]:
# The peptide at index 176 must carry its flanking residues R...Q.
assert 'RAHVETLRQ' in df.loc[df['start_index'] == 176, 'extended_seq'].values

In [None]:
# Digest the K->A variant of the protein without missed cleavages.
df2 = digest(
    sequence=protein_dict['P15497-2_KtoA_142'],
    protein_id='P15497-2_KtoA_142',
    enzyme='trypsin',
    missed_cleavages=0,
)

assert 'RAHVETLRQ' in df2.loc[df2['start_index'] == 176, 'extended_seq'].values
# Filter df2 with its own mask. A mask built from `df` is matched to df2 by
# row label, and the two digests number their rows differently, so it
# selected an unrelated peptide (and triggered a pandas reindexing warning).
df2[df2['start_index'] == 176]

  df2[df.start_index==176]


Unnamed: 0,start_index,end_index,pep_seq,protein_id,pep_length,prev_aa,next_aa,extended_seq,mass_mono,mass_avg,mz_1,mz_2,mz_3
26,183,194,QQLAPYSDDLR,P15497-2_KtoA_142,11,R,Q,RQQLAPYSDDLRQ,1304.636079,1305.395346,1305.643356,653.325316,435.88597


In [None]:
# The only peptide lost in the variant digest is VAPLGEEFR.
assert set(df['pep_seq']).difference(df2['pep_seq']) == {'VAPLGEEFR'}

In [None]:
# The only peptide gained in the variant digest is QAVAPLGEEFR.
assert set(df2['pep_seq']).difference(df['pep_seq']) == {'QAVAPLGEEFR'}

In [None]:
#| export

In [None]:
#| export
def foo():
    """Placeholder that does nothing and returns None."""
    return None

In [None]:
#| hide
import nbdev

# Run nbdev's export step for this project.
nbdev.nbdev_export()