Check whether the peptide sequences from the PD and IP2 results overlap.

In [3]:
## Load any changes to local modules
%load_ext autoreload
%autoreload 2

import os
import sys

pwd = %pwd

# Make the directory two levels above the working directory importable so the
# local packages used below can be found.
module_path = os.path.abspath('{0}/../../'.format(pwd))
if module_path not in sys.path:
    sys.path.append(module_path)

# Directory holding the inputs for this study (note the trailing slash).
study_dir = '{}/{}'.format(module_path, 'results/03072018_proteomics_informatics_tc/')


In [4]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import re
import numpy as np
%matplotlib inline
from IPython.display import display

# Use fully-qualified option keys: the bare 'precision' pattern is ambiguous on
# newer pandas (it also matches 'styler.format.precision') and raises OptionError.
pd.set_option('display.precision', 5)
pd.set_option('display.max_colwidth', 50)



from nbcpact import AnalyzeQuantCompare,Peptide,PeptideGroup,UcbreUtils,PeptidesFromPeptideListBuilder
from dpro.protdisc import PDReaderFactory

from test.test_analyzequantCompare import * 

In [3]:
def load_ip2_peptide_list(file_path):
    """Load an IP2 peptide-list CSV, print decoy statistics, and return the forward hits.

    The CSV must provide the columns ``sequence``, ``protein`` and
    ``AREA_MEDIAN_RATIO_1``.

    Steps:
      * keep the sequence exactly as read in ``ip2_sequence`` and strip
        parenthesised decimal annotations such as ``(464.28596)`` from ``sequence``;
      * treat rows whose ``protein`` contains ``'Reverse'`` as decoys, print
        record- and sequence-level counts, and drop the decoys;
      * add ``RatioRank`` (ascending rank of ``AREA_MEDIAN_RATIO_1``) and draw a
        seaborn regression plot of ratio against rank;
      * print the sequences that occur in both the forward and the decoy set.

    Parameters
    ----------
    file_path : str
        Path of the peptide-list CSV file.

    Returns
    -------
    pandas.DataFrame
        The non-decoy rows, with the extra columns ``ip2_sequence`` and
        ``RatioRank``.
    """

    def _percent(part, whole):
        # An empty file would otherwise raise ZeroDivisionError.
        return (part / whole * 100) if whole else 0.0

    df = pd.read_csv(file_path)
    df['ip2_sequence'] = df.sequence

    # Raw string + explicit regex=True: newer pandas treats the pattern as a
    # literal by default, which would silently leave the annotations in place.
    df['sequence'] = df.sequence.str.replace(r'\(\d+\.\d+\)', '', regex=True)

    # na=False: a row without a protein entry is not a decoy, and the mask must
    # be purely boolean to be usable for indexing.
    is_decoy = df['protein'].str.contains('Reverse', na=False)

    num_records = df.index.size
    num_seqs = len(set(df['sequence']))

    decoy_df = df[is_decoy]
    num_decoy_records = decoy_df.index.size
    percent_decoy_records = _percent(num_decoy_records, num_records)
    num_decoy_seqs = len(set(decoy_df['sequence']))
    percent_decoy_seqs = _percent(num_decoy_seqs, num_seqs)

    # Copy so that adding RatioRank below writes to an independent frame
    # instead of a slice of the full table.
    df = df[~is_decoy].copy()
    num_forward_recs = df.index.size
    num_forward_seqs = len(set(df['sequence']))

    msg = 'Records: Tot {}, Forward {} , Decoy {} ({:.0f}%)'.format(num_records,
                                                               num_forward_recs,
                                                               num_decoy_records,
                                                               percent_decoy_records)
    print(msg)

    msg = 'Sequences: Tot {}, Forward {}, Decoy {} ({:.0f}%)'.format(num_seqs,
                                                                      num_forward_seqs,
                                                                      num_decoy_seqs,
                                                                      percent_decoy_seqs)
    print(msg)

    df['RatioRank'] = df.AREA_MEDIAN_RATIO_1.rank(ascending=True)

    sns.regplot(x="RatioRank", y="AREA_MEDIAN_RATIO_1", data=df)
    # Start a new current figure after the regression plot (kept from the
    # original; presumably so later plotting does not draw onto it).
    plt.figure()

    intersection = set(df.sequence) & set(decoy_df.sequence)
    print('Peptide sequences seen in both the decoy and forward sets {}'.format(intersection))

    return df
    




In [4]:
class Comparison:
    """One NIBR-vs-UCB comparison: its input/output paths, description and FDR.

    Parameters
    ----------
    nibr_input : path-like, optional
        Path handed to ``PDReaderFactory.createPDReader``. A reader is only
        created when this is a path to an existing file.
    ucb_input : optional
        Stored unchanged on the instance.
    ucb_output : optional
        Stored unchanged on the instance.
    desc : optional
        Description of the comparison (the notebook passes the table row).
        A copy is stored; ``None`` is stored as ``None``.
    fdr : float, optional
        FDR passed to the reader factory (default 0.01).

    Attributes
    ----------
    pdReader
        Reader returned by the factory, or ``None`` when ``nibr_input`` is
        missing or does not point at an existing file.
    """

    def __init__(self,
                 nibr_input=None,
                 ucb_input=None,
                 ucb_output=None,
                 desc=None,
                 fdr=0.01):
        self.nibr_input = nibr_input
        self.ucb_input = ucb_input
        self.ucb_output = ucb_output
        # desc is optional, so only copy it when one was actually supplied.
        self.desc = desc.copy() if desc is not None else None
        self.fdr = fdr
        self.pdReader = None

        # A missing cell read by pandas arrives as float NaN, which is truthy
        # and makes os.path.isfile raise TypeError; only test real paths.
        is_path = isinstance(nibr_input, (str, bytes, os.PathLike))
        if is_path and os.path.isfile(nibr_input):
            factory = PDReaderFactory()
            self.pdReader = factory.createPDReader(nibr_input,
                                                   include_non_quant=True,
                                                   fdr=self.fdr,
                                                   validate=True)
        
def _comparison_from_row(row):
    """Build a Comparison from one row of the comparisons table."""
    return Comparison(nibr_input=row['NIBR_INPUT'],
                      ucb_input=row['UCB_INPUT'],
                      ucb_output=row['UCB_OUTPUT'],
                      desc=row,
                      fdr=row['FDR'])


# Read the comparisons table for this study and turn every row into a
# Comparison object.
file_path = '{}/{}'.format(study_dir, 'comparisons.txt')
df = pd.read_table(file_path)
comparisons = df.apply(_comparison_from_row, axis=1)

# Show the description of every comparison that was loaded.
for comp in comparisons:
    display(comp.desc)

print('DONE')

INFO:dpro.protdisc:Data will be read from sqlite:////da/dmp/cb/jonesmic/chemgx/dpro_studies/2018/NomuraCompare/KN80_FullTryptic/KEA_EN80.pdResult
INFO:dpro.protdisc:Ignoring 510196 (95.98%) TargetPeptideGroups with a Qvalityqvalue > 0.01. Adjust the fdr param to retain
INFO:dpro.protdisc:Ignoring 759245 (92.38%) TargetPsms with a PercolatorqValue > 0.01. Adjust the fdr param to retain


NIBR_INPUT                                                                   None
UCB_INPUT                       /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB_OUTPUT                      /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB Enz                                                              Full Tryptic
NIBR Enz                                                             Full Tryptic
FDR                                                                          0.01
UCB Max Internal Cleavages                                                      1
NIBR Max Internal Cleavages2                                                    1
UCB Diff Mods / Peptide*                                                        2
NIBR Diff Mods / Peptide*2                                                      2
UCB_FASTA                       ip2_ip2_data_dnomura_database__UniProt_human_0...
NIBR_FASTA                      ip2_ip2_data_dnomura_database__UniProt_human_0...
Notes           

NIBR_INPUT                                                                   None
UCB_INPUT                       /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB_OUTPUT                      /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB Enz                                                              Full Tryptic
NIBR Enz                                                             Full Tryptic
FDR                                                                           NaN
UCB Max Internal Cleavages                                                      1
NIBR Max Internal Cleavages2                                                    1
UCB Diff Mods / Peptide*                                                        2
NIBR Diff Mods / Peptide*2                                                      2
UCB_FASTA                       ip2_ip2_data_dnomura_database__UniProt_human_0...
NIBR_FASTA                      ip2_ip2_data_dnomura_database__UniProt_human_0...
Notes           

NIBR_INPUT                                                                   None
UCB_INPUT                                                                    None
UCB_OUTPUT                                                                    NaN
UCB Enz                                                              Full Tryptic
NIBR Enz                                                             Full Tryptic
FDR                                                                          0.01
UCB Max Internal Cleavages                                                      1
NIBR Max Internal Cleavages2                                                    1
UCB Diff Mods / Peptide*                                                        2
NIBR Diff Mods / Peptide*2                                                      2
UCB_FASTA                       ip2_ip2_data_dnomura_database__UniProt_human_0...
NIBR_FASTA                       uniprot.HUMAN_CANON_SMORF.FCE.vJune21_2017.fasta
Notes           

NIBR_INPUT                      /da/dmp/cb/jonesmic/chemgx/dpro_studies/2018/N...
UCB_INPUT                       /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB_OUTPUT                      /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB Enz                                                              Full Tryptic
NIBR Enz                                                             Full Tryptic
FDR                                                                          0.01
UCB Max Internal Cleavages                                                      1
NIBR Max Internal Cleavages2                                                    2
UCB Diff Mods / Peptide*                                                        3
NIBR Diff Mods / Peptide*2                                                      2
UCB_FASTA                       ip2_ip2_data_dnomura_database__UniProt_human_0...
NIBR_FASTA                      ip2_ip2_data_dnomura_database__UniProt_human_0...
Notes           

DONE


In [6]:



for comp in comparisons:
    # Only comparisons for which a PD reader was created are processed.
    if not comp.pdReader:
        continue

    display(comp.desc)

    # Rebuild the results table from the UCB peptide list ...
    peptide_list_file = comp.ucb_input
    peptidesFromPeptideListBuilder = PeptidesFromPeptideListBuilder(peptide_list_file)
    analyzeQuantCompare = AnalyzeQuantCompare(peptide_generator=peptidesFromPeptideListBuilder)
    groups = analyzeQuantCompare.build_peptide_groups()
    generated_results_csvDF = analyzeQuantCompare.build_results_from_peptide_groups(groups=groups)

    # ... and compare it with the results file listed as UCB_OUTPUT.
    ucbResultsDF = UcbreUtils.read_results_verbose_csv(comp.ucb_output)

    compare_dataframes(
        ucbdf=ucbResultsDF,
        novdf=generated_results_csvDF,
        merge_cols=['Peptide', 'ratios'],
        identical_cols=['run_count', 'uniprot', 'annotations'],
        close_cols=['mean_group_ratio'],
        warn_cols=['ptm_index_from_ip2'],
    )

NIBR_INPUT                      /da/dmp/cb/jonesmic/chemgx/dpro_studies/2018/N...
UCB_INPUT                       /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB_OUTPUT                      /home/jonesmic/gBuild/jonesmic_github/proteomi...
UCB Enz                                                              Full Tryptic
NIBR Enz                                                             Full Tryptic
FDR                                                                          0.01
UCB Max Internal Cleavages                                                      1
NIBR Max Internal Cleavages2                                                    2
UCB Diff Mods / Peptide*                                                        3
NIBR Diff Mods / Peptide*2                                                      2
UCB_FASTA                       ip2_ip2_data_dnomura_database__UniProt_human_0...
NIBR_FASTA                      ip2_ip2_data_dnomura_database__UniProt_human_0...
Notes           

##################################DATA: BEGIN###################
KEY: annotation, LEN: 8562
KEY: file_AREA_MEDIAN_RATIO_1, LEN: 8562
KEY: UNIQUE_1, LEN: 8562
KEY: decoy, LEN: 8562
KEY: uniprot_ids, LEN: 8562
KEY: mod_locs, LEN: 8562
KEY: peptide, LEN: 8562
KEY: ip2_peptide, LEN: 8562
KEY: ptm_indices, LEN: 8562
KEY: area_ratios, LEN: 8562
KEY: run_data, LEN: 8562
KEY: run_counter, LEN: 8562
##################################DATA: END###################


AttributeError: Can only use .str accessor with string values, which use np.object_ dtype in pandas

In [15]:
df = pd.DataFrame(data={'Prot': ['p1', 'p2', 'p2', 'p3']})

# Demonstration: assigning a list whose length differs from the index raises
# ValueError. Catch and report it so the cell does not abort a full re-run.
try:
    df['STUFF'] = ['A', 'B']
except ValueError as err:
    print('ValueError: {}'.format(err))

ValueError: Length of values does not match length of index

In [None]:
# Load the PD result and the IP2 peptide list, then check their sequence overlap.
# NOTE(review): both paths are hard-coded absolute locations.
pd_file_path = '/da/dmp/cb/jonesmic/chemgx/data/isoTopAnalysis/jonesmic/pd/pd2.2/KEA_EN80_NomiraFastaSequest_FullTryptic/KEA_EN80.pdResult'
ip2_file_path = '/home/jonesmic/gBuild/jonesmic_github/proteomics-scripts/datanocommit/peptideList.csv'


print('Compare PD on Canonical+Alternative FASTA vs. IP2')
print('{} vs. {}'.format(pd_file_path, ip2_file_path))
print('load PD')
# NOTE(review): PDReader is not imported explicitly in the visible cells (only
# PDReaderFactory is); presumably it comes from the star import - confirm.
pdReader = PDReader(pd_result_file=pd_file_path, include_non_quant=True)
pdDF = pdReader.get_target_peptides(include_additional_data=True)
print('Load IP2')
# Prints decoy statistics and returns only the non-decoy rows.
ip2DF = load_ip2_peptide_list(ip2_file_path)


# NOTE(review): do_sequence_overlap is not defined in the visible cells - confirm where it comes from.
do_sequence_overlap(ip2DF=ip2DF, pdDF=pdDF)   

In [None]:
# Join on the peptide sequence: PD column 'Sequence' against the IP2 column 'sequence'.
mergeDF = pdDF.merge(ip2DF, left_on='Sequence', right_on='sequence')

In [None]:
# Flag every PD row whose sequence also occurs in the IP2 list.
pdDF['IN_IP2'] = pdDF['Sequence'].isin(ip2DF['sequence'])
# -log10 of the q-value; the 0.0001 offset keeps a q-value of 0 finite.
pdDF['NEG_LOG10_Q'] = pdDF['Qvalityqvalue'].map(lambda q: -np.log10(q + 0.0001))
print(pdDF.columns)
sns.violinplot(x='IN_IP2', y='NEG_LOG10_Q', data=pdDF)

# Summary statistics of the q-value, split by IP2 membership.
pdDF.groupby('IN_IP2')['Qvalityqvalue'].describe()
