In [None]:
## Load any changes to local modules
%load_ext autoreload
%autoreload 2

import os
import sys

# Make the project root (three directories above the notebook's working
# directory) importable. os.getcwd() is used instead of the IPython-only
# `%pwd` magic so the cell is also valid plain Python.
pwd = os.getcwd()
project_dir = '{0}/../../../'.format(pwd)
# abspath() collapses the '..' segments and drops the trailing slash
module_path = os.path.abspath(project_dir)
if module_path not in sys.path:
    sys.path.append(module_path)

In [None]:
import re
import sqlite3
import struct

import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sqlalchemy
from IPython.display import display

from nbcpact import AnalyzeQuantCompare,Peptide,PeptideGroup,UcbreUtils,PeptidesFromPeptideListBuilder

%matplotlib inline


In [None]:
# Smoke test: build Peptide objects from the peptide-list CSV under test/data
# and print the attributes of the first one.
file_path = "../../../test/data/peptideList.csv"
peptide_generator = PeptidesFromPeptideListBuilder(peptide_list_file=file_path)
peptides = peptide_generator.generate_peptides()
pep = peptides[0]

print("ip2_peptide={0}, sequence={1}, mod_locs={2}, ptm_indices={3}, area_ratio={4}".format(pep.ip2_peptide, pep.sequence, pep.mod_locs, pep.ptm_indices, pep.area_ratio))
print("area_ratios={0}, annotation={1}, uniprot_ids={2}, run_counter={3}".format(pep.area_ratios, pep.annotation, pep.uniprot_ids, pep.run_counter))
print("decoy={0},unique1={1}".format(pep.decoy, pep.unique1))


# The bare string below is an expression statement, not a comment.
# NOTE(review): it looks like a copy of the Peptide constructor's keyword
# defaults kept for reference — confirm against nbcpact.Peptide.
"""
ip2_peptide=None, sequence=None, mod_locs=None, ptm_indices=None, area_ratio=None,
                 area_ratios=None, annotation=None, uniprot_ids=None, run_counter=None, decoy=None,
                 unique1=None
"""

In [466]:
from enum import Enum


class DataType(Enum):
    """Kinds of per-channel values that can be decoded from a pdResult BLOB column."""
    Float = 1
    Integer = 2


class PDReader:
    """Read PSM and peptide-group tables from a ``.pdResult`` SQLite file.

    Query results are cached per instance, so each table is read from the
    database at most once.
    """

    __psm_cys_mod_pattern = re.compile(r'C(\d+)\(isoTO\w*\)')

    def __init__(self, pd_result_file=None,
                 num_quant_channels=2,
                 quan_value_names=['QuanValueIsoTOPHeavy', 'QuanValueIsoTOPLight'],
                 pd_version='2.1'):
        """Open the result file and prepare an empty query cache.

        pd_result_file: path handed to ``sqlite3.connect``. Note that sqlite3
            silently creates an empty database when the path does not exist.
        num_quant_channels: stored on the instance; not read by the methods below.
        quan_value_names: PSM column names added to the ``get_target_psms`` SELECT.
            They are interpolated into the SQL text, so they must be trusted
            column identifiers.
        pd_version: stored on the instance; not read by the methods below.
        """
        self.__pd_result_file = pd_result_file
        self.__num_quant_channels = num_quant_channels
        # Copy: avoids aliasing the shared default list or the caller's list.
        self.__quan_value_names = list(quan_value_names)

        self.__connection = sqlite3.connect(pd_result_file)
        self.__pd_version = pd_version
        self.__data_cache = {}

    def close(self):
        """Close the underlying SQLite connection."""
        self.__connection.close()

    def __process_psm_modifications(self, modifications):
        """Return the integer positions of all ``C<pos>(isoTO...)`` entries in a PSM modification string."""
        return [int(pos) for pos in self.__psm_cys_mod_pattern.findall(modifications)]

    def __extract_values(self, binary_data, n=None, dataType=None):
        """Decode per-channel values from a pdResult BLOB.

        binary_data: bytes of the blob. Each channel occupies the value itself
            (8 bytes for Float, 4 bytes otherwise) plus one trailing byte that
            is skipped.
        n: number of channels to decode; when None it is derived from the blob length.
        dataType: ``DataType.Float`` for doubles (e.g. AbundanceRatios); any
            other value decodes 32-bit integers.

        Returns the single value when exactly one channel is decoded, otherwise
        a list of values.
        """
        # Explicit little-endian so decoding does not depend on the host byte order.
        if dataType == DataType.Float:
            fmt, width = "<d", 8
        else:
            fmt, width = "<i", 4
        stride = width + 1

        if n is None:
            n = len(binary_data) // stride

        # struct.unpack always returns a tuple; [0] takes the value itself.
        result = [struct.unpack(fmt, binary_data[stride * i:stride * i + width])[0]
                  for i in range(n)]

        if len(result) == 1:
            return result[0]
        return result

    def __get_found_raw_files(self, targetPeptideGroupsPeptideGroupID):
        """Return the SpectrumFileName values of all PSMs linked to one peptide group id."""
        psms = self.get_target_psms()
        in_group = psms['TargetPeptideGroupsPeptideGroupID'] == targetPeptideGroupsPeptideGroupID
        return psms[in_group]['SpectrumFileName'].values

    def get_target_psms(self):
        """Return the quantified PSMs joined to their peptide-group ids (cached).

        Only rows with a non-NULL QuanChannel are selected.
        """
        data_name = 'target_psms'

        if data_name not in self.__data_cache:
            ## Don't really need the quan_value_names {0} below
            sqlStr = """
                        SELECT
                        Sequence,
                        ModifiedSequence,
                        Modifications,
                        ParentProteinAccessions,
                        ParentProteinDescriptions,
                        SpectrumFileName,
                        QuanChannel,
                        TargetPeptideGroupsPeptideGroupID,
                        {0}
                        FROM TargetPsms t1, TargetPeptideGroupsTargetPsms t2 
                        WHERE t1.PeptideID = t2.TargetPsmsPeptideID AND QuanChannel IS NOT NULL
                    """.format(','.join(self.__quan_value_names))

            self.__data_cache[data_name] = pd.read_sql(sqlStr, self.__connection)

        return self.__data_cache[data_name]

    def get_target_peptides(self):
        """Return peptide groups that have an abundance ratio (cached).

        Adds a ``files`` column (raw files of the group's PSMs), decodes the
        ``AbundanceRatios`` blob into a float, adds ``Log2Ratio`` and drops the
        undecoded ``Abundances`` blob.
        """
        data_name = 'target_peptides'

        if data_name not in self.__data_cache:
            # NOTE(review): the query is capped with LIMIT 10 — presumably a
            # development shortcut; confirm before using the full table.
            sqlStr = """
                        SELECT
                        PeptideGroupID, 
                        Checked,
                        Confidence,
                        Sequence,
                        Modifications_all_positions,
                        Modifications_best_positions,
                        Contaminant,
                        QvalityPEP,
                        Qvalityqvalue,
                        ParentProteinGroupCount,
                        ParentProteinCount,
                        PsmCount,
                        MasterProteinAccessions,
                        MissedCleavages,
                        TheoreticalMass,
                        QuanInfo,
                        IonsScoreMascot,
                        ConfidenceMascot,
                        PercolatorqValueMascot,
                        PercolatorPEPMascot,
                        AbundanceRatios,
                        Abundances
                        FROM TargetPeptideGroups tpg
                        WHERE AbundanceRatios IS NOT NULL LIMIT 10;
                    """

            df = pd.read_sql(sqlStr, self.__connection)

            # TODO: mod_locs, ip2_peptide, run_counter and ptm_indices are not
            # derived here yet.
            df['files'] = df['PeptideGroupID'].apply(self.__get_found_raw_files)

            df['AbundanceRatios'] = df['AbundanceRatios'].apply(self.__extract_values, n=1, dataType=DataType.Float)
            df['Log2Ratio'] = df['AbundanceRatios'].apply(np.log2)

            df = df.drop(['Abundances'], axis=1)

            self.__data_cache[data_name] = df

        return self.__data_cache[data_name]


# Open the PD 2.1 result file and load both tables; `psms` and `pepGroup` are
# notebook-level names that later cells read.
# NOTE(review): absolute, site-specific path — the cell only runs where this mount exists.
file_path = '/usca/asperapoc/NB-CPACT/NB-CPACT-NIBR/NIBR_loaded_UCB_EN80/Analysis/PD2.1/KEA_isoTOP_DN_All.pdResult'
proteomeDiscovererReader = PDReader(pd_result_file=file_path, num_quant_channels=2)

psms = proteomeDiscovererReader.get_target_psms()
pepGroup = proteomeDiscovererReader.get_target_peptides()


# Preview the first rows of each table
display(psms.head())

display(pepGroup.head())


Unnamed: 0,Sequence,ModifiedSequence,Modifications,ParentProteinAccessions,ParentProteinDescriptions,SpectrumFileName,QuanChannel,TargetPeptideGroupsPeptideGroupID,QuanValueIsoTOPHeavy,QuanValueIsoTOPLight
0,DNLTLWTSDSAGEECDAAEGAEN,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Light,621,41102434.5,50112580.0
1,DNLTLWTSDSAGEECDAAEGAEN,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Light,621,15886787.75,19675040.0
2,DNLTLWTSDSAGEECDAAEGAEN,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTOR0),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Heavy,622,15886787.75,19675040.0
3,AAAPAPEEEMDECEQALAAEPK,AAAPAPEEEMDEcEQALAAEPK,C13(isoTO),P26641,GN=EEF1G Elongation factor 1-gamma OS=Homo sap...,KEA_EN801_1.raw,IsoTOP Light,799,91376038.5,144562400.0
4,AAAPAPEEEMDECEQALAAEPK,AAAPAPEEEMDEcEQALAAEPK,C13(isoTOR0),P26641,GN=EEF1G Elongation factor 1-gamma OS=Homo sap...,KEA_EN801_1.raw,IsoTOP Heavy,800,91376038.5,144562400.0


Unnamed: 0,PeptideGroupID,Checked,Confidence,Sequence,Modifications_all_positions,Modifications_best_positions,Contaminant,QvalityPEP,Qvalityqvalue,ParentProteinGroupCount,...,MissedCleavages,TheoreticalMass,QuanInfo,IonsScoreMascot,ConfidenceMascot,PercolatorqValueMascot,PercolatorPEPMascot,AbundanceRatios,files,Log2Ratio
0,378,0,2,LCSLMGTVFLLR,1×Oxidation [M5]; 1×isoTOP TEV heavy [C2],1×Oxidation [M5]; 1×isoTOP TEV heavy [C2],1,0.249299,0.012863,1,...,0,1896.059015,9,9.37,3,0.006328,0.1336,0.01,[KEA_EN802_4.raw],-6.643856
1,391,0,3,CMVQFVGR,1×isoTOP TEV Light [C1],1×isoTOP TEV Light [C1],1,0.034045,0.001524,1,...,0,1460.76132,9,23.22,3,0.000643,0.008399,1.651905,[KEA_EN801_4.raw],0.724131
2,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,0,2919.269734,9,26.79,3,0.0,6.798e-08,1.051228,"[KEA_EN801_1.raw, KEA_EN801_1.raw, KEA_EN803_2...",0.072076
3,622,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV heavy [C15],1×isoTOP TEV heavy [C15],1,0.000842,0.0,1,...,0,2925.283543,9,19.67,3,0.0,9.023e-05,1.366417,"[KEA_EN801_1.raw, KEA_EN803_2.raw, KEA_EN803_3...",0.450398
4,770,0,3,EDLNCQEEEDPMNK,1×isoTOP TEV Light [C5],1×isoTOP TEV Light [C5],1,0.000685,0.0,1,...,0,2214.975407,9,39.94,3,0.0,7.002e-05,1.10819,"[KEA_EN803_3.raw, KEA_EN803_3.raw]",0.148205


In [458]:

        
        

# Scratch cell: join the first three peptide groups to their PSMs.
#display(pepGroup['AbundanceRatios'].apply(__extract_values, dataType=DataType.Float))
df=pepGroup.head(3)
# No `on=` is passed, so pandas joins on the column names both frames share.
# NOTE(review): judging by the SELECT lists in PDReader that is only `Sequence`,
# which would pair a peptide group with PSMs of other groups that have the same
# sequence — confirm whether PeptideGroupID / TargetPeptideGroupsPeptideGroupID
# was the intended key.
df_with_psm = df.merge(psms, how='left')

# NOTE(review): none of the cells shown define a module-level `__extract_values`
# or a `DataType.String` member, and PDReader.get_target_peptides() does not
# select a `FoundinSamples` column — this line presumably relied on earlier
# kernel state; verify before re-running.
display(df['FoundinSamples'].apply(__extract_values, dataType=DataType.String))
#help(struct)

#pepGroup.head(1)

# Last expression of the cell: shows the merged frame
df_with_psm  

0    [(0,), (4,)]
1    [(4,), (1,)]
2    [(4,), (1,)]
Name: FoundinSamples, dtype: object

Unnamed: 0,PeptideGroupID,Checked,Confidence,Sequence,Modifications_all_positions,Modifications_best_positions,Contaminant,QvalityPEP,Qvalityqvalue,ParentProteinGroupCount,...,ModifiedSequence,Modifications,ParentProteinAccessions,ParentProteinDescriptions,SpectrumFileName,QuanChannel,TargetPeptideGroupsPeptideGroupID,QuanValueIsoTOPHeavy,QuanValueIsoTOPLight,ModLocations
0,378,0,2,LCSLMGTVFLLR,1×Oxidation [M5]; 1×isoTOP TEV heavy [C2],1×Oxidation [M5]; 1×isoTOP TEV heavy [C2],1,0.249299,0.012863,1,...,LcSLmGTVFLLR,C2(isoTOR0); M5(Oxidation),Q96LT4,GN=SAMD8 Sphingomyelin synthase-related protei...,KEA_EN802_4.raw,IsoTOP Heavy,378,401655.5,,[2]
1,391,0,3,CMVQFVGR,1×isoTOP TEV Light [C1],1×isoTOP TEV Light [C1],1,0.034045,0.001524,1,...,cMVQFVGR,C1(isoTO),Q9NZJ7,GN=MTCH1 Mitochondrial carrier homolog 1 OS=Ho...,KEA_EN801_4.raw,IsoTOP Light,391,427946.1,706926.4,[1]
2,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Light,621,41102430.0,50112580.0,[15]
3,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Light,621,15886790.0,19675040.0,[15]
4,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTOR0),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN801_1.raw,IsoTOP Heavy,622,15886790.0,19675040.0,[15]
5,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN803_2.raw,IsoTOP Light,621,28741930.0,40787110.0,[15]
6,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTOR0),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN803_2.raw,IsoTOP Heavy,622,28741930.0,40787110.0,[15]
7,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTO),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN803_2.raw,IsoTOP Light,621,9404085.0,2981306.0,[15]
8,621,0,3,DNLTLWTSDSAGEECDAAEGAEN,1×isoTOP TEV Light [C15],1×isoTOP TEV Light [C15],1,3e-06,0.0,1,...,DNLTLWTSDSAGEEcDAAEGAEN,C15(isoTOR0),P27348,GN=YWHAQ 14-3-3 protein theta OS=Homo sapiens ...,KEA_EN803_3.raw,IsoTOP Heavy,622,3541362.0,5358270.0,[15]


In [404]:
class PeptideFromPD2_1Generator:
    """Build peptide-level tables from a Proteome Discoverer 2.1 study directory.

    The study directory is expected to contain ``KEA_isoTOP_DN_All.pdResult``
    (read through sqlite3) and, for ``get_target_peptides``,
    ``KEA_isoTOP_DN_High_Med_All.xlsx``. The quantified PSMs are loaded once at
    construction time and reused when run counters are computed.
    """

    __psm_cys_mod_pattern = re.compile(r'C(\d+)\(isoTO\w*\)')
    # The dot before "raw" is escaped so it only matches a literal '.'
    __exp_pattern = r'(KEA_EN80\d)_\d\.raw'
    # The capture stops at the first ']' so that bracket groups of modifications
    # listed after the isoTOP entry are not swallowed.
    __peptide_mod_pattern = re.compile(r'.*\d.isoTOP TEV [heavyLight]+ \[([^\]]+)\].*')
    __peptide_cys_mod_pattern = re.compile(r'C(\d+)')

    def __init__(self, pd_study_dir="/usca/asperapoc/NB-CPACT/NB-CPACT-NIBR/NIBR_loaded_UCB_EN80/Analysis/PD2.1/"):
        """Open the study's pdResult file, load its PSMs and look up the isoTOP label masses.

        pd_study_dir: directory holding the pdResult (and xlsx) files.

        Raises ValueError when the 'IsoTOP heavy' / 'IsoTOP light' rows cannot
        be found exactly once in FoundModifications.
        """
        self.__pd_study_dir = pd_study_dir
        # Per-instance cache, so two generators on different studies never share query results.
        self.__data_cache = {}
        # TODO: Look for the latest pdResult file
        file_path = '{0}/{1}'.format(pd_study_dir, 'KEA_isoTOP_DN_All.pdResult')
        self.__connection = sqlite3.connect(file_path)
        self.__target_psms = self.get_target_psms()
        modifications = self.get_found_modifications()

        self.__isoTopHeavyMass = self.__format_modification_mass(modifications, 'IsoTOP heavy')
        self.__isoTopLightMass = self.__format_modification_mass(modifications, 'IsoTOP light')

    @staticmethod
    def __format_modification_mass(modifications, name):
        """Return DeltaMonoisotopicMass of the modification called ``name``, formatted with two decimals."""
        masses = modifications.loc[modifications['Name'] == name, 'DeltaMonoisotopicMass']
        if len(masses) != 1:
            raise ValueError(
                "expected exactly one FoundModifications row named {0!r}, found {1}".format(name, len(masses)))
        return "{0:.2f}".format(float(masses.iloc[0]))

    def close(self):
        """Close the underlying SQLite connection."""
        self.__connection.close()

    def __process_psm_modifications(self, modifications):
        """Return the integer positions of all ``C<pos>(isoTO...)`` entries in a PSM modification string."""
        return [int(pos) for pos in self.__psm_cys_mod_pattern.findall(modifications)]

    def get_found_modifications(self):
        """Return the whole FoundModifications table as a DataFrame."""
        sqlStr = "SELECT * FROM FoundModifications"
        return pd.read_sql(sqlStr, self.__connection)

    def get_target_psms(self):
        """Return PSMs of quan channels 1 and 2 with helper columns.

        Added columns: ``mod_locs`` (list of modified cysteine positions),
        ``str_mod_locs`` (its string form, usable as a comparison key) and
        ``EXPERIMENT`` (experiment prefix extracted from SpectrumFileName).
        """
        sqlStr = """
        SELECT
        Sequence,
        ModifiedSequence,
        Modifications,
        ParentProteinAccessions,
        ParentProteinDescriptions,
        SpectrumFileName,
        QuanChannel,
        QuanValueIsoTOPLight,
        QuanValueIsoTOPHeavy
        FROM TargetPsms
        WHERE (QuanChannelID = 1 OR QuanChannelID = 2)
        """
        df = pd.read_sql(sqlStr, self.__connection)

        df['mod_locs'] = df.Modifications.apply(self.__process_psm_modifications)
        df['str_mod_locs'] = df['mod_locs'].astype(str)
        # expand=False returns a Series (one capture group), not a one-column DataFrame
        df['EXPERIMENT'] = df.SpectrumFileName.str.extract(self.__exp_pattern, expand=False)

        return df

    def __process_modifications(self, modifications):
        """Return the cysteine positions listed in the isoTOP entry of a peptide-group modification string.

        An empty list is returned when the value is not a string or carries no
        isoTOP entry, so the result can always be iterated and its string form
        ('[]') matches what ``get_target_psms`` produces for unmodified PSMs.
        """
        if not isinstance(modifications, str):
            return []
        match = self.__peptide_mod_pattern.match(modifications)
        if not match:
            return []
        return [int(pos) for pos in self.__peptide_cys_mod_pattern.findall(match.group(1))]

    def __create_run_counter(self, row, experiments=('KEA_EN801', 'KEA_EN802', 'KEA_EN803')):
        """Return, as a string, one boolean per experiment telling whether a matching PSM exists.

        A PSM matches when both its Sequence and its ``str_mod_locs`` equal the row's.
        """
        sequence = row['Sequence']
        mod_locs = str(row['str_mod_locs'])
        targetPsmsDF = self.__target_psms

        psms = targetPsmsDF[(targetPsmsDF.Sequence == sequence) & (targetPsmsDF['str_mod_locs'] == mod_locs)]
        psm_exps = list(psms['EXPERIMENT'])
        run_counter = [experiment in psm_exps for experiment in experiments]

        return str(run_counter)

    def __get_global_mod_position(self, row):
        """Translate the row's peptide-local positions into protein positions.

        Returns one comma-joined string of ``C<position>`` labels per parent
        protein of the peptide group; the position is ``nan`` when the peptide
        sequence is not found in that protein sequence.
        """
        data_cache_name = 'target_prot_sequences'

        if data_cache_name not in self.__data_cache:
            sqlStr = """
                        SELECT
                        t1.PeptideGroupID,t3.Sequence
                        FROM TargetPeptideGroups t1,
                        TargetPeptideGroupsTargetProteins t2,
                        TargetProteins t3
                        WHERE t1.PeptideGroupID = t2.TargetPeptideGroupsPeptideGroupID
                        AND t2.TargetProteinsUniqueSequenceID = t3.UniqueSequenceID
                    """

            self.__data_cache[data_cache_name] = pd.read_sql(sqlStr, self.__connection)

        peptide_sequence = row['Sequence']
        local_positions = [int(numeric_string) for numeric_string in (row['mod_locs'] or [])]
        peptideGroupID = row['PeptideGroupID']

        df = self.__data_cache[data_cache_name]
        proteinSequences = df[df['PeptideGroupID'] == peptideGroupID]['Sequence']

        global_positions = []
        for proteinSequence in proteinSequences.values:
            # str.find gives the 0-based start of the peptide, or -1 when absent
            peptide_start = proteinSequence.find(peptide_sequence)
            pos_strings = []
            for local_position in local_positions:
                global_pos = np.nan if peptide_start == -1 else (local_position + peptide_start)
                pos_strings.append('C{0}'.format(global_pos))
            global_positions.append(','.join(pos_strings))

        return global_positions

    def __create_ip2peptide(self, row):
        """Return the sequence with ``(mass)`` spliced in after every modified cysteine.

        The mass is the heavy or light isoTOP mass depending on which label
        appears in ``Modifications_best_positions``; when neither appears the
        modification text itself is used.
        """
        modifications = row['Modifications_best_positions']
        sequence = row['Sequence']
        mod_locs = row['mod_locs']

        mass = modifications
        if 'isoTOP TEV heavy' in modifications:
            mass = self.__isoTopHeavyMass
        elif 'isoTOP TEV Light' in modifications:
            mass = self.__isoTopLightMass

        # Splice from the right-most site to the left: each insertion lengthens
        # the string, so going left-to-right would shift the remaining positions.
        for mod_loc in sorted(set(mod_locs or ()), reverse=True):
            sequence = '{0}C({1}){2}'.format(sequence[0:mod_loc-1], mass, sequence[mod_loc:])

        return sequence

    def __extract_values(self, binary_data, n=None, dataType=None):
        """Decode per-channel doubles from a pdResult BLOB.

        binary_data: bytes of the blob; each channel is an 8-byte double
            followed by one trailing byte that is skipped.
        n: number of channels to decode; when None it is derived from the blob length.
        dataType: ``DataType.Float`` to decode; for any other value the blob is
            returned unchanged.

        Returns the single value when exactly one channel is decoded, otherwise
        a list of values.
        """
        if dataType != DataType.Float:
            return binary_data

        if n is None:
            n = len(binary_data) // 9

        # Explicit little-endian so decoding does not depend on the host byte order.
        result = [struct.unpack("<d", binary_data[9 * i:9 * i + 8])[0] for i in range(n)]

        if len(result) == 1:
            return result[0]
        return result

    def get_target_peptides_from_pdresult(self):
        """Return peptide groups with an abundance ratio, enriched with derived columns.

        Added columns: ``mod_locs``, ``ip2_peptide``, ``str_mod_locs``,
        ``run_counter``, ``Log2Ratio`` and ``ptm_indices``; ``AbundanceRatios``
        is decoded from its blob into a float, and the ``Abundances`` and
        ``FoundinSamples`` blobs are dropped.
        """
        # NOTE(review): the query is capped with LIMIT 10 — presumably a
        # development shortcut; confirm before using the full table.
        sqlStr = """
            SELECT
            PeptideGroupID, 
            Checked,
            Confidence,
            Sequence,
            Modifications_all_positions,
            Modifications_best_positions,
            Contaminant,
            QvalityPEP,
            Qvalityqvalue,
            ParentProteinGroupCount,
            ParentProteinCount,
            PsmCount,
            MasterProteinAccessions,
            MissedCleavages,
            TheoreticalMass,
            QuanInfo,
            IonsScoreMascot,
            ConfidenceMascot,
            PercolatorqValueMascot,
            PercolatorPEPMascot,
            AbundanceRatios,
            Abundances,
            FoundinSamples
            FROM TargetPeptideGroups tpg
            WHERE AbundanceRatios IS NOT NULL LIMIT 10;
            """

        df = pd.read_sql(sqlStr, self.__connection)

        df['mod_locs'] = df['Modifications_best_positions'].apply(self.__process_modifications)

        df['ip2_peptide'] = df.apply(self.__create_ip2peptide, axis=1)

        df['str_mod_locs'] = df['mod_locs'].astype(str) # To help with merge
        df['run_counter'] = df.apply(self.__create_run_counter, axis=1)

        # The ratio is stored as a blob; it has to be decoded before log2 can be taken.
        df['AbundanceRatios'] = df['AbundanceRatios'].apply(self.__extract_values, n=1, dataType=DataType.Float)
        df['Log2Ratio'] = df['AbundanceRatios'].apply(np.log2)

        df['ptm_indices'] = df.apply(self.__get_global_mod_position, axis=1)

        df = df.drop(['Abundances', 'FoundinSamples'], axis=1)

        return df

    def get_target_peptides(self):
        """Return the peptides of the exported xlsx that have a log2 abundance ratio.

        Adds ``mod_locs``, ``str_mod_locs`` and ``run_counter`` columns.
        """
        # TODO: Look for any xlsx file
        file_path = "{0}/KEA_isoTOP_DN_High_Med_All.xlsx".format(self.__pd_study_dir)
        df = pd.read_excel(file_path)
        ## Clean up dataframe
        # .copy() so the column assignments below act on an independent frame
        df = df[~df['Abundance Ratio (log2): (IsoTOP Light) / (IsoTOP Heavy)'].isnull()].copy()
        df['mod_locs'] = df['Modifications'].apply(self.__process_modifications)
        # __create_run_counter compares on the string form of mod_locs
        df['str_mod_locs'] = df['mod_locs'].astype(str)
        df['run_counter'] = df.apply(self.__create_run_counter, axis=1)

        return df

# Instantiate with the default study directory; the constructor opens the
# pdResult file and loads the PSM table immediately.
pep_generator = PeptideFromPD2_1Generator()



## 

In [None]:
# Load the enriched peptide-group table; it is reused by the cells below.
peptide_from_pdresultsDF = pep_generator.get_target_peptides_from_pdresult()
# Transposed so the many columns are listed as rows
peptide_from_pdresultsDF.T

In [None]:
# Scratch check of the slicing used to splice a mass label after a residue.
sequence = 'SDFCSDFSDCSDFSDFSDF'
mod_loc, mass = 4, '123.00'

# Everything before position mod_loc, then 'C' plus the mass, then the tail
''.join((sequence[:mod_loc - 1], 'C', mass, sequence[mod_loc:]))

In [None]:
# Collect the per-peptide fields that the Peptide constructor cell below reads.
data = {}

## Transfer some values directly
data['uniprot_ids'] = peptide_from_pdresultsDF['MasterProteinAccessions']
data['area_ratio'] = peptide_from_pdresultsDF['AbundanceRatios']
data['sequence'] = peptide_from_pdresultsDF['Sequence']
data['mod_locs'] = peptide_from_pdresultsDF['mod_locs']
data['ptm_indices'] = peptide_from_pdresultsDF['ptm_indices']
data['run_counter'] = peptide_from_pdresultsDF['run_counter']
# Stored in `data` so it becomes a column of `df`; it was previously bound to
# a stray variable and never reached the frame.
data['ip2_peptide'] = peptide_from_pdresultsDF['ip2_peptide']

# Do later
## TODO: per-run area ratios are not derived yet
data['area_ratios'] = None
## Detailed protein annot
data['annotation'] = None
## The IP2 semicolon array that RunCounter is made from
data['unique1'] = None

## Decoy is always false as the SQL Takes care of that. 
data['decoy'] = False

# Kept for any cell that still reads the old module-level name
ip2_peptide = data['ip2_peptide']

df = pd.DataFrame(data=data)


In [None]:
def __init_peptide(self, row):
    """Build a Peptide from one row of the peptide input frame.

    self: unused placeholder, kept so the function can later be moved into a
        class as a method without changing its signature.
    row: mapping / pandas Series of peptide fields. Fields missing from the
        row are passed as None; ``unique1`` falls back to the ``UNIQUE_1`` key.
    """
    return Peptide(sequence=row.get('sequence'),
                   mod_locs=row.get('mod_locs'),
                   ptm_indices=row.get('ptm_indices'),
                   area_ratio=row.get('area_ratio'),
                   area_ratios=row.get('area_ratios'),
                   annotation=row.get('annotation'),
                   uniprot_ids=row.get('uniprot_ids'),
                   run_counter=row.get('run_counter'),
                   decoy=row.get('decoy'),
                   unique1=row.get('unique1', row.get('UNIQUE_1')),
                   ip2_peptide=row.get('ip2_peptide'))


# axis=1 hands each row (not each column) to the function; None fills `self`.
df.apply(lambda row: __init_peptide(None, row), axis=1)

In [None]:
# Look up a single peptide sequence.
# NOTE(review): `pepgroup_df` is not assigned in any cell shown here (the
# peptide-group frame loaded above is named `pepGroup`) — confirm which frame is meant.
pepgroup_df[pepgroup_df.Sequence == 'CGEEIAVQFVDMVK']

In [None]:
# Sample 18-byte blob (two 9-byte channel records); not used elsewhere in this cell.
x = b'\x06\x00\x00\x00c@sA\x01\xfe\xff\xffG\x8b\x7feA\x01'

from enum import Enum

class DataType(Enum):
    """Kinds of per-channel values that can be decoded from a pdResult BLOB column."""
    Float = 1
    # Kept in step with the definition next to PDReader, which this one shadows.
    Integer = 2

def extract_values(binary_data, n=None, dataType=None):
    """Decode per-channel values from a pdResult BLOB.

    binary_data: bytes of the blob. Each channel occupies the value itself
        (8 bytes for Float, 4 bytes for Integer) plus one trailing byte that
        is skipped.
    n: number of channels to decode; when None it is derived from the blob length.
    dataType: ``DataType.Float`` for doubles such as Abundances,
        ``DataType.Integer`` for 32-bit integers; for any other value the blob
        is returned unchanged.

    Returns the single value when exactly one channel is decoded, otherwise a
    list of values.
    """
    # Explicit little-endian so decoding does not depend on the host byte order.
    if dataType == DataType.Float:
        fmt, width = "<d", 8
    elif dataType == DataType.Integer:
        fmt, width = "<i", 4
    else:
        return binary_data

    stride = width + 1
    if n is None:
        n = len(binary_data) // stride

    result = [struct.unpack(fmt, binary_data[stride * i:stride * i + width])[0]
              for i in range(n)]

    if len(result) == 1:
        return result[0]
    return result
    
# Decode the blob columns of a single peptide group straight from the pdResult file.
# NOTE(review): absolute, site-specific path; the connection opened here is
# never closed in this cell.
file_path = "/usca/asperapoc/NB-CPACT/NB-CPACT-NIBR/NIBR_loaded_UCB_EN80/Analysis/PD2.1/KEA_isoTOP_DN_All.pdResult"
pd_connection = sqlite3.connect(file_path)

sqlStr = """
            SELECT Sequence, Abundances, AbundanceRatios, FoundInSamples FROM TargetPeptideGroups tpg 
            WHERE AbundanceRatios IS NOT NULL AND Sequence = 'CGEEIAVQFVDMVK'
        """
targetPeptideGroupsDF = pd.read_sql(sqlStr, pd_connection)
# One ratio per group -> a float, then log2
print(targetPeptideGroupsDF.AbundanceRatios.apply(extract_values, n=1, dataType=DataType.Float).apply(np.log2))
# Two channels -> a list of two floats per group
print(targetPeptideGroupsDF.Abundances.apply(extract_values, n=2, dataType=DataType.Float))
# No dataType is passed, so extract_values hands the blob back undecoded.
# NOTE(review): the query spells the column `FoundInSamples` while the attribute
# access uses `FoundinSamples` — confirm which spelling the returned frame carries.
print(targetPeptideGroupsDF.FoundinSamples.apply(extract_values, n=1))

In [None]:
# Show every column of one peptide sequence (transposed for readability).
# NOTE(review): `peptide_df` is not assigned in any cell shown here — confirm
# which frame is meant before re-running.
peptide_df[peptide_df.Sequence == 'CGEEIAVQFVDMVK'].T

In [None]:
def unpack_type(barray, n, t):
    """Extract per-channel values from a pdResult BLOB.

    barray: bytes holding the blob, or None / empty for a missing value.
        Each channel occupies the value itself (8 bytes for 'decimal',
        4 bytes for the integer types) plus one trailing byte that is skipped.
    n: number of channels
    t: type of data
        'decimal' for values in decimal format such as Abundances
        'integer_number' or 'integer_text' for values such as 'Found in'

    Returns a list of n values. A missing blob yields zeros (0.0 or 0); an
    unrecognised ``t`` yields an empty list.

    Raises struct.error when the blob is too short for n channels.
    """
    # Explicit little-endian so decoding does not depend on the host byte order.
    if t == 'decimal':
        fmt, width, missing = "<d", 8, 0.0
    elif t in ('integer_number', 'integer_text'):
        fmt, width, missing = "<i", 4, 0
    else:
        return []

    # take care of the missing values
    if not barray:
        return [missing] * n

    stride = width + 1
    needed = stride * (n - 1) + width
    if len(barray) < needed:
        raise struct.error(
            "blob of {0} bytes is too short for {1} channel(s) of type {2!r} "
            "({3} bytes needed)".format(len(barray), n, t, needed))

    # struct.unpack always returns a tuple; [0] takes the value itself so the
    # result has the same shape as the missing-value defaults.
    return [struct.unpack(fmt, barray[stride * i:stride * i + width])[0] for i in range(n)]


# 9-byte blob = exactly one decimal channel (8-byte double + 1 trailing byte),
# so only one channel can be requested; asking for more runs past the end of the blob.
barray = b'{\x14\xaeG\xe1z\x84?\x01'
unpack_type(barray, 1, 'decimal')