# annotation

> Pipeline to annotate peptides


In [None]:
#| default_exp annotation

In [None]:
#| hide
from nbdev.showdoc import *

In [None]:
#| export
from typing import Dict, Union
from pyteomics import parser
from pyteomics import mass
from pathlib import Path
from Bio import SeqIO
import pandas as pd
from protein_cutter.core import annotate_peptides_inplace
from protein_cutter.core import fasta_to_peptide_set

In [None]:
#| export
from pathlib import Path
import os

# Resolve the repository root, then derive the test-data directory from it.
_workspace = os.environ.get('GITHUB_WORKSPACE')
if _workspace is not None:
    # GitHub Actions exposes the checkout directory through this variable.
    REPO_ROOT = Path(_workspace)
else:
    # Local development: take the nearest directory, starting at the current
    # working directory and walking upwards, that contains settings.ini.
    # If none is found all the way up to the filesystem root, fall back to
    # the current working directory.
    _start = Path.cwd()
    REPO_ROOT = next(
        (
            _candidate
            for _candidate in (_start, *_start.parents)
            if (_candidate / 'settings.ini').exists()
        ),
        _start,
    )

TEST_DATA = REPO_ROOT / 'test_data'

for _report_line in (
    f"Repo root: {REPO_ROOT}",
    f"Test data dir: {TEST_DATA}",
    f"Test data exists: {TEST_DATA.exists()}",
):
    print(_report_line)

Repo root: /Users/mtinti/git_projects/protein_cutter
Test data dir: /Users/mtinti/git_projects/protein_cutter/test_data
Test data exists: True


In [None]:
#| export
import shutil
def annotation_pipline():
    """Run the peptide-annotation demo on the bundled test data.

    Steps:
      1. Build a peptide set from ``test_sequence.fa`` via
         ``fasta_to_peptide_set`` and print it.
      2. Add two extra peptide sequences to that set.
      3. Call ``annotate_peptides_inplace`` on
         ``test_spectronaut_pep_out.tsv`` (presumably rewriting the file on
         disk, which is why it is restored afterwards).
      4. Read the TSV back with pandas and print its first rows.
      5. Restore the TSV from ``test_spectronaut_pep_out.tsv.bk``.

    The restore step always runs once annotation has been attempted, even if
    annotation or the preview raises, so the test fixture is not left in a
    modified state.

    The misspelled function name is kept for backward compatibility.

    Raises:
        FileNotFoundError: if the ``.bk`` backup is missing. This is checked
            before the TSV is touched so the fixture is never modified
            without a way to restore it.
    """
    fasta_path = TEST_DATA / 'test_sequence.fa'
    report_path = TEST_DATA / "test_spectronaut_pep_out.tsv"
    backup_path = TEST_DATA / "test_spectronaut_pep_out.tsv.bk"

    # Fail early: without the backup the TSV could not be restored afterwards.
    if not backup_path.is_file():
        raise FileNotFoundError(
            f"Backup file not found, refusing to modify {report_path.name}: {backup_path}"
        )

    canonical_set = fasta_to_peptide_set(fasta_path,
                                         mass_range=(0.0, 4000000.0),
                                         min_pep_length=5
                                        )
    print(canonical_set)
    # Peptides added by hand on top of the ones derived from the FASTA file.
    canonical_set.update(set(['DASGPAMTEIGEQPWGR', 'DVAGAVEFWTDR']))

    try:
        annotate_peptides_inplace(report_path, canonical_set)

        spc_out = pd.read_csv(report_path, sep='\t')
        print(spc_out.head())
    finally:
        # Put the original fixture back regardless of success or failure.
        shutil.copy(backup_path, report_path)
        print(f"Restored {report_path.name} from backup")

# Run the demo pipeline against the bundled test data.
# NOTE(review): this cell is tagged `#| export`, so this call presumably ends
# up in the generated module and would run on import — confirm that is intended.
annotation_pipline()    

Digesting proteins: 2it [00:00, 3164.32it/s]


{'VAPLGEEFR', 'LSPLAQELR', 'EGGGSLAEYHAK', 'VQPYLDEFQK', 'QEMHK', 'QAVAPLGEEFR', 'AVVLTLAVLFLTGSQAR', 'ETASLR', 'DLEEVK', 'DYVAQFEASALGK', 'DFATVYVEAIK', 'QLNLK', 'ASEQLK', 'VSILAAIDEASK', 'ALGEK', 'WHEEVEIYR', 'HFWQQDDPQSSWDR', 'QGLLPVLESLK', 'AHVETLR', 'LLDNWDTLASTLSK', 'AKPVLEDLR', 'VQELQDK', 'EQLGPVTQEFWDNLEK', 'QQLAPYSDDLR', 'LEALK'}


Annotating peptides: 10it [00:00, 27467.61it/s]

   PG.MolecularWeight PG.ProteinAccessions  PG.Genes PG.Organisms  PG.WBGene  \
0             13796.3           Phleomycin       NaN      Unknown        NaN   
1             13796.3           Phleomycin       NaN      Unknown        NaN   
2             13796.3           Phleomycin       NaN      Unknown        NaN   
3             13796.3           Phleomycin       NaN      Unknown        NaN   
4             13796.3           Phleomycin       NaN      Unknown        NaN   

   PG.Locus  PG.Status PEP.StrippedSequence  \
0       NaN        NaN         DVAGAVEFWTDR   
1       NaN        NaN    DASGPAMTEIGEQPWGR   
2       NaN        NaN    DASGPAMTEIGEQPWGR   
3       NaN        NaN          LTSAVPVLTAR   
4       NaN        NaN      DPAGNCVHFVAEEQD   

                             EG.PrecursorId  \
0                          _DVAGAVEFWTDR_.2   
1                     _DASGPAMTEIGEQPWGR_.2   
2                     _DASGPAMTEIGEQPWGR_.3   
3                           _LTSAVPVLTAR_.2   
4




In [None]:
#| hide
# Trigger nbdev's export step (presumably writes the `#| export` cells to the
# module named by `#| default_exp` — see the nbdev docs).
import nbdev; nbdev.nbdev_export()