Check whether the peptide sequences reported by IP2 and by PD overlap.

In [None]:
## Load any changes to local modules
%load_ext autoreload
%autoreload 2

import os
import sys

pwd = %pwd

module_path = os.path.abspath(os.path.join('{0}/../../'.format(pwd)))
#if module_path not in sys.path:
#    sys.path.append(module_path)

study_dir = '{}/{}'.format(module_path, 'results/03072018_proteomics_informatics_tc/')


In [None]:
import pandas as pd
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
import re
import numpy as np
%matplotlib inline
from IPython.display import display

# Use fully qualified option names. pandas resolves short names by pattern
# matching and raises OptionError when a pattern matches several options;
# in recent pandas releases the bare 'precision' pattern also matches
# 'styler.format.precision'.
pd.set_option('display.precision', 5)
pd.set_option('display.max_colwidth', 50)



#from nbcpact import AnalyzeQuantCompare,Peptide,PeptideGroup,UcbreUtils,PeptidesFromPeptideListBuilder
from dpro.protdisc import PDReaderFactory


In [None]:
def load_ip2_peptide_list(file_path):
    """Load an IP2 peptide list CSV and return its non-decoy rows.

    The file is read with ``pd.read_csv`` and must provide the columns
    ``sequence``, ``protein`` and ``AREA_MEDIAN_RATIO_1``.

    Processing steps:

    * the raw ``sequence`` is preserved in a new ``ip2_sequence`` column and
      parenthesised decimal annotations such as ``(15.9949)`` are stripped
      from ``sequence`` (presumably modification masses -- confirm);
    * rows whose ``protein`` contains ``'Reverse'`` are counted as decoys
      and removed; record and unique-sequence counts are printed;
    * a ``RatioRank`` column (ascending rank of ``AREA_MEDIAN_RATIO_1``) is
      added and plotted against the ratio on a fresh figure;
    * sequences present in both the decoy and forward sets are printed.

    Args:
        file_path: Location of the CSV file, passed straight to
            ``pd.read_csv``.

    Returns:
        DataFrame with the forward (non-decoy) rows, including the added
        ``ip2_sequence`` and ``RatioRank`` columns.
    """
    df = pd.read_csv(file_path)
    df['ip2_sequence'] = df['sequence']

    # Raw string + explicit regex=True: pandas >= 2.0 treats the pattern as a
    # literal by default, which would silently leave the annotations in place.
    df['sequence'] = df['sequence'].str.replace(r'\(\d+\.\d+\)', '', regex=True)

    num_records = df.index.size
    num_seqs = len(set(df['sequence']))

    # na=False keeps the mask strictly boolean; rows without a protein value
    # are treated as forward hits instead of breaking the boolean indexing.
    is_decoy = df['protein'].str.contains('Reverse', na=False)

    decoy_df = df[is_decoy]
    num_decoy_records = decoy_df.index.size
    num_decoy_seqs = len(set(decoy_df['sequence']))

    # Guard the percentages so that an empty file does not divide by zero.
    percent_decoy_records = (num_decoy_records / num_records * 100) if num_records else 0.0
    percent_decoy_seqs = (num_decoy_seqs / num_seqs * 100) if num_seqs else 0.0

    # Copy so that adding RatioRank below writes to an independent frame and
    # not to a slice of the full table (avoids SettingWithCopyWarning).
    df = df[~is_decoy].copy()
    num_forward_recs = df.index.size
    num_forward_seqs = len(set(df['sequence']))

    msg = 'Records: Tot {}, Forward {} , Decoy {} ({:.0f}%)'.format(num_records,
                                                               num_forward_recs,
                                                               num_decoy_records,
                                                               percent_decoy_records)
    print(msg)

    msg = 'Sequences: Tot {}, Forward {}, Decoy {} ({:.0f}%)'.format(num_seqs,
                                                                      num_forward_seqs,
                                                                      num_decoy_seqs,
                                                                      percent_decoy_seqs)
    print(msg)

    df['RatioRank'] = df['AREA_MEDIAN_RATIO_1'].rank(ascending=True)

    # Open the figure *before* plotting so every call draws on its own figure
    # and no empty figure is left behind after the last call.
    plt.figure()
    sns.regplot(x="RatioRank", y="AREA_MEDIAN_RATIO_1", data=df)

    intersection = set(df['sequence']) & set(decoy_df['sequence'])
    print('Peptide sequences seen in both the decoy and forward sets {}'.format(intersection))

    return df
    




In [None]:
## Find overlap
def do_sequence_overlap(ip2DF=None,  pdDF=None):
    """Print how many IP2 peptide sequences are also present in the PD data.

    Both frames are modified in place: ``pdDF`` gains a boolean ``in_ip2``
    column and ``ip2DF`` gains a boolean ``in_pd`` column.

    Three lines are printed:

    1. overlap between all IP2 sequences and all PD sequences;
    2. overlap between all IP2 sequences and the PD sequences whose
       ``ABUNDANCE_RATIOS`` value is not null;
    3. up to five overlapping sequences that have no PD ratio.

    Args:
        ip2DF: DataFrame with a ``sequence`` column.
        pdDF: DataFrame with ``Sequence`` and ``ABUNDANCE_RATIOS`` columns.

    Returns:
        None.
    """
    pdDF['in_ip2'] = pdDF['Sequence'].isin(ip2DF['sequence'])
    ip2DF['in_pd'] = ip2DF['sequence'].isin(pdDF['Sequence'])

    ip2_peps = set(ip2DF['sequence'])
    pd_peps = set(pdDF['Sequence'])
    pd_peps_with_ratios = set(pdDF.loc[pdDF['ABUNDANCE_RATIOS'].notnull(), 'Sequence'])

    def _percent_of_ip2(common):
        # Share of the IP2 peptides found in `common`; 0 when IP2 is empty.
        return len(common) / len(ip2_peps) * 100 if ip2_peps else 0.0

    intersection = ip2_peps & pd_peps
    msg = 'Num IP2 Peps {}, num PD Peps {}, num common peps {}, percent IP2 Covered in PD {:.0f}%'.format(
        len(ip2_peps),
        len(pd_peps),
        len(intersection),
        _percent_of_ip2(intersection))
    print(msg)

    # The percentage on this line must be based on the ratio-bearing overlap,
    # not on the full overlap reported on the previous line.
    intersection_with_ratios = ip2_peps & pd_peps_with_ratios
    msg = 'Num IP2 Peps {}, num PD Peps with ratios {}, num common peps {}, percent IP2 Covered in PD {:.0f}%'.format(
        len(ip2_peps),
        len(pd_peps_with_ratios),
        len(intersection_with_ratios),
        _percent_of_ip2(intersection_with_ratios))
    print(msg)

    # Preview the first five entries (set order is arbitrary); starting the
    # slice at 0 ensures a single missing peptide is still shown.
    overlapping_pd_peps_missing_ratios = intersection - intersection_with_ratios
    print('overlapping_pd_peps_missing_ratios {}...'.format(list(overlapping_pd_peps_missing_ratios)[:5]))
 

In [None]:
class Comparison:
    """Inputs of one comparison plus an optional PD reader for it.

    Attributes:
        nibr_input: Value given as ``nibr_input``; used as the path handed
            to the PD reader factory.
        ucb_input: Value given as ``ucb_input`` (stored unchanged).
        ucb_output: Value given as ``ucb_output`` (stored unchanged).
        desc: Copy of the ``desc`` argument, or None when none was given.
        fdr: FDR value forwarded to the PD reader.
        pdReader: Reader built by ``PDReaderFactory().createPDReader`` when
            ``nibr_input`` names an existing file, otherwise None.
    """

    def __init__(self,
                 nibr_input=None,
                 ucb_input=None,
                 ucb_output=None,
                 desc=None,
                 fdr=0.01):
        self.nibr_input = nibr_input
        self.ucb_input = ucb_input
        self.ucb_output = ucb_output
        # Copy so later changes to the caller's object do not leak in; the
        # default desc=None must not be dereferenced.
        self.desc = desc.copy() if desc is not None else None
        self.fdr = fdr
        self.pdReader = None

        # Empty table cells arrive as float NaN, which is truthy and makes
        # os.path.isfile raise TypeError; treat any non-path value as
        # "no input file".
        try:
            has_input_file = bool(nibr_input) and os.path.isfile(nibr_input)
        except TypeError:
            has_input_file = False

        if has_input_file:
            factory = PDReaderFactory()
            self.pdReader = factory.createPDReader(nibr_input,
                                                   include_non_quant=True,
                                                   fdr=self.fdr,
                                                   validate=True)
        
# Read the table listing the comparisons of this study.
file_path = '{}/{}'.format(study_dir, 'comparisons.txt')
df = pd.read_table(file_path)


def _comparison_from_row(row):
    """Build a Comparison from one row of the comparisons table."""
    return Comparison(nibr_input=row['NIBR_INPUT'],
                      ucb_input=row['UCB_INPUT'],
                      ucb_output=row['UCB_OUTPUT'],
                      desc=row,
                      fdr=row['FDR'])


# One Comparison per table row; the row itself is kept as its description.
comparisons = df.apply(_comparison_from_row, axis=1)

for comp in comparisons:
    display(comp.desc)

print('DONE')

In [None]:
# Run the sequence-overlap report for every comparison that has a PD reader.
for comp in comparisons:
    if not comp.pdReader:
        # No PD reader was created for this comparison; nothing to compare.
        continue

    display(comp.desc)

    print('get PD data')
    pdDF = comp.pdReader.get_target_peptides(include_additional_data=False)

    print('Load IP2')
    ip2DF = load_ip2_peptide_list(comp.ucb_input)

    do_sequence_overlap(ip2DF=ip2DF, pdDF=pdDF)

    # Two separator lines mark the end of this comparison's output.
    print('###################################################################')
    print('###################################################################')

In [None]:
# One-off comparison of a single PD result file against a single IP2 peptide
# list. Both paths are hard-coded absolute paths.
pd_file_path = '/da/dmp/cb/jonesmic/chemgx/data/isoTopAnalysis/jonesmic/pd/pd2.2/KEA_EN80_NomiraFastaSequest_FullTryptic/KEA_EN80.pdResult'
ip2_file_path = '/home/jonesmic/gBuild/jonesmic_github/proteomics-scripts/datanocommit/peptideList.csv'


print('Compare PD on Canonical+Alternative FASTA vs. IP2')
print('{} vs. {}'.format(pd_file_path, ip2_file_path))
print('load PD')
# NOTE(review): PDReader is not imported in the visible cells (only
# PDReaderFactory is) -- confirm where this name comes from; if it is not
# defined elsewhere this line raises NameError.
pdReader = PDReader(pd_result_file=pd_file_path, include_non_quant=True)
pdDF = pdReader.get_target_peptides(include_additional_data=True)
print('Load IP2')
ip2DF = load_ip2_peptide_list(ip2_file_path)


# Print the overlap statistics for the two frames loaded above.
do_sequence_overlap(ip2DF=ip2DF, pdDF=pdDF)   

In [None]:
# Join the PD and IP2 rows that share the same peptide sequence (default
# inner join).
mergeDF = pdDF.merge(ip2DF, left_on='Sequence', right_on='sequence')

In [None]:
# Flag the PD peptides whose sequence also occurs in the IP2 list.
pdDF['IN_IP2'] = pdDF['Sequence'].isin(ip2DF['sequence'])


def _neg_log10_q(q_value):
    """Return -log10 of a q-value after adding an offset of 0.0001."""
    return -np.log10(q_value + 0.0001)


pdDF['NEG_LOG10_Q'] = pdDF['Qvalityqvalue'].apply(_neg_log10_q)
print(pdDF.columns)
sns.violinplot(y='NEG_LOG10_Q', x='IN_IP2', data=pdDF)

# Last expression of the cell: q-value summary statistics per IN_IP2 group.
pdDF.groupby('IN_IP2')['Qvalityqvalue'].describe()
