In [1]:
# Ask the inline backend to render figures in the high-resolution 'retina' format.
%config InlineBackend.figure_format = 'retina'
# Show matplotlib figures inline, in the output area of the cell that drew them.
%matplotlib inline
import os
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as plt
from tqdm import tqdm
# Plot appearance: Dark2 palette, white background with a dotted grid, no
# top/right spines, outward ticks, and seaborn's 'paper' context.
sns.set_palette('Dark2')
sns.set_style({
    'axes.axisbelow': True,
    'axes.edgecolor': '.15',
    'axes.facecolor': 'white',
    'axes.grid': True,
    'axes.labelcolor': '.15',
    'axes.linewidth': 1.25,
    'figure.facecolor': 'white',
    'font.family': ['sans-serif'],
    'grid.color': '.15',
    'grid.linestyle': ':',
    'grid.alpha': .5,
    'image.cmap': 'Greys',
    'legend.frameon': False,
    'legend.numpoints': 1,
    'legend.scatterpoints': 1,
    'lines.solid_capstyle': 'round',
    'axes.spines.right': False,
    'axes.spines.top': False,
    'text.color': '.15',
    'xtick.top': False,
    'ytick.right': False,
    'xtick.color': '.15',
    'xtick.direction': 'out',
    'xtick.major.size': 6,
    'xtick.minor.size': 3,
    'ytick.color': '.15',
    'ytick.direction': 'out',
    'ytick.major.size': 6,
    'ytick.minor.size': 3,
})
sns.set_context('paper')

# Write fonts into PDF and PostScript output as Type 42 (TrueType) instead of
# matplotlib's default font type. Rationale, per the linked article:
# http://phyletica.org/matplotlib-fonts/
import matplotlib
matplotlib.rcParams['pdf.fonttype'] = 42
matplotlib.rcParams['ps.fonttype'] = 42

In [2]:
from snapanalysis.config import OUTPUT_DIRECTORY as MAIN_OUTPUT_DIRECTORY
# 'ptm-response' sub-directory underneath the project-wide output directory.
OUTPUT_DIRECTORY = os.path.join(MAIN_OUTPUT_DIRECTORY, 'ptm-response')

# exist_ok=True creates the directory only when it is missing, without the
# check-then-create race of a separate os.path.isdir() test. It still raises
# if the path exists but is not a directory, as the original code did.
os.makedirs(OUTPUT_DIRECTORY, exist_ok=True)

In [3]:
from snapanalysis.models.enrichment.generate import OUTPUT_FILE as ENRICHMENT_FILE
# Load the enrichment table stored under the '/enrichment_data' key of the HDF file.
enrichment_data = pd.read_hdf(ENRICHMENT_FILE, key='/enrichment_data')

In [4]:
# Reshape the forward/reverse unique-peptide columns into one long Series
# named 'unique_peptides': the two columns become a 'Direction' index level
# ('forward' / 'reverse'), which is then swapped with the level before it.
unique_peptides = (
    enrichment_data[['Unique peptides (forward)', 'Unique peptides (reverse)']]
    .rename_axis(columns='Direction')
    .rename(columns={'Unique peptides (forward)': 'forward',
                     'Unique peptides (reverse)': 'reverse'})
    .stack()
    .rename('unique_peptides')
    .swaplevel()
)

In [5]:
import importlib

In [6]:
import snapanalysis.models.ptm_response.main
import snapanalysis.models.ptm_response.predictor_graph
import snapanalysis.models.ptm_response.limma

# Reload the ptm_response modules (in the same order as before) so the
# `from ... import` statements that follow bind to freshly executed code.
for _ptm_response_module in (
    snapanalysis.models.ptm_response.main,
    snapanalysis.models.ptm_response.predictor_graph,
    snapanalysis.models.ptm_response.limma,
):
    importlib.reload(_ptm_response_module)

from snapanalysis.models.ptm_response.main import OUTPUT_FILE as PTM_RESPONSE_FILE
from snapanalysis.models.ptm_response.predictor_graph import longform_matrices_of_informative_nucleosomes, to_matrix_design_and_weights

from snapanalysis.models.ptm_response.limma import limma_fit
from snapanalysis.models.ptm_response.main import limma_camera_complexes



In [7]:
# Build the long-form matrices of informative nucleosomes. `lfms` is indexed
# by predictor name further down (e.g. lfms['H2A.Z']).
# NOTE(review): `network_df` is presumably the predictor network mentioned in
# the log output of this cell — confirm against predictor_graph.
lfms, network_df = longform_matrices_of_informative_nucleosomes()

2020-02-25 14:56:01,783 - snapanalysis.informative_nucleosome_graph - INFO - PTM predictive network generated: 53 nodes, 82 edges
2020-02-25 14:56:01,785 - snapanalysis.informative_nucleosome_graph - INFO - Found 3 non informative di-nucleosomes: ['H17', 'H33', 'H34']
2020-02-25 14:56:01,791 - snapanalysis.informative_nucleosome_graph - INFO - Found 1 not covered predictors: ['H3K23ac']
2020-02-25 14:56:01,799 - snapanalysis.informative_nucleosome_graph - INFO - The numbers of nucleosomes for each predictor are:
     DNA Methylation: 9 nucleosomes
            H4K20me2: 8 nucleosomes
             H4K16ac: 8 nucleosomes
         H3K9acK14ac: 8 nucleosomes
                H3ac: 6 nucleosomes
            H3K27me3: 6 nucleosomes
             H3K4me1: 5 nucleosomes
             H3K4me3: 5 nucleosomes
                H4ac: 5 nucleosomes
            H4K20me3: 4 nucleosomes
             H3K9me3: 4 nucleosomes
               H2A.Z: 3 nucleosomes
             H3K27ac: 2 nucleosomes
             H

In [21]:
# Predictor analysed in the rest of this notebook; must be a key of `lfms`.
ptm = 'H2A.Z'

# Long-form matrix for the selected predictor.
lfm = lfms[ptm]

In [22]:
# Convert the long-form matrix into the (matrix, design, weights) triple that
# is passed to limma_fit and limma_camera_complexes below.
# NOTE(review): min_unimputed=1 presumably sets the minimum number of
# non-imputed values required — confirm in predictor_graph.
matrix, design, weights = to_matrix_design_and_weights(lfm, min_unimputed=1)

In [23]:
# Inspect the weights table: per the output, rows are gene labels and columns
# are indexed by (edge, ptm, Direction).
weights

edge,H36-(self),H36-(self),H37-H26,H37-H26,H37-H26,H37-H26,H45-H43,H45-H43,H45-H43,H45-H43
ptm,True,True,False,False,True,True,False,False,True,True
Direction,forward,reverse,forward,reverse,forward,reverse,forward,reverse,forward,reverse
Gene label,Unnamed: 1_level_3,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3
A0A087X222,2,2,1,1,2,2,1,1,1,1
A0A0C4DGP2,2,2,2,3,2,2,3,3,3,3
A0A0C4DGP5,1,1,1,1,1,1,1,1,1,1
AAAS,4,4,3,3,3,3,1,1,1,1
AATF,7,4,2,2,3,4,6,8,6,6
ABCF2,3,3,3,3,3,3,1,1,1,1
ABRAXAS1,9,9,10,9,8,10,2,1,2,1
ACD,14,15,12,14,15,15,1,1,1,1
ACIN1 (1),1,1,1,1,1,2,5,5,5,4
ACIN1 (2),2,3,2,2,2,2,3,3,3,3


In [24]:
# Fit the limma model and test the 'ptm' coefficient; `ans['stats']` is read
# in the next cell. The second return value is not used in the visible cells.
# It is bound to an ordinary throwaway name rather than `__`: assigning to
# `_`, `__` or `___` in the notebook namespace makes IPython stop updating its
# output-history shortcuts for the rest of the session.
ans, _limma_fit_extra = limma_fit(matrix, design, weights, t_test_coef='ptm', fdr_threshold=0.01, fc_threshold=1.0)

In [25]:
# Per-protein statistics table from the limma fit (indexed by gene label, see
# the output of the next cell).
stats = ans['stats']

In [26]:
# Preview the first rows of the limma statistics table.
stats.head()

Unnamed: 0_level_0,logFC,CI.L,CI.R,AveExpr,t,P.Value,adj.P.Val,B,df_total,moderated_t_stdev,logFC_variance,confint_half_width,neg_log10_p,neg_log10_p_adjust,significant,significant_and_large_fc
Gene label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
TOP2B (2),-2.044945,-2.262295,-1.827595,0.177833,-21.41916,8.637891e-09,5e-06,11.045544,8.640923,0.095473,0.01186,0.21735,8.063592,5.33281,True,True
CTDSPL2,-2.796458,-3.075129,-2.517788,-1.318725,-22.845354,4.992383e-09,5e-06,9.954215,8.640923,0.122408,0.019496,0.278671,8.301692,5.33281,True,True
KANSL3,2.081084,1.862801,2.299366,1.247745,21.704575,7.718872e-09,5e-06,9.916961,8.640923,0.095882,0.011962,0.218283,8.112446,5.33281,True,True
TOP2B (1),-2.068321,-2.338737,-1.797904,0.135653,-17.412682,4.982978e-08,2e-05,8.77545,8.640923,0.118782,0.018358,0.270416,7.302511,4.696668,True,True
ZMYM3,0.496272,0.427452,0.565092,0.902379,16.4162,8.158507e-08,2.2e-05,8.768914,8.642937,0.030231,0.001189,0.06882,7.088389,4.658637,True,False


In [27]:
# NOTE(review): wildcard import. `predictor_sorted_uri`, used further down, is
# not defined or imported anywhere else in the visible cells, so it presumably
# comes from here — confirm, and consider importing the needed names explicitly.
from helpers import *

In [28]:
# For each direction of change, print the URI that predictor_sorted_uri builds
# from the sorted labels of proteins with adj.P.Val <= 0.01 and a positive
# ('Up') or negative ('Down') logFC.
significant_stats = stats[stats['adj.P.Val'] <= 0.01]

for direction, fold_change_filter in (('Up', 'logFC > 0'), ('Down', 'logFC < 0')):
    print(direction)
    proteins = significant_stats.query(fold_change_filter).index
    print(predictor_sorted_uri(ptm, sorted(proteins)))
    print()


Up
http://ife-snap-data/proteins?pdorder=H36,H26,H37,H43,H45&k=p:ACTR5&k=p:ASH2L&k=p:ASXL2&k=p:BAP1&k=p:BBX&k=p:BEND3&k=p:BLM&k=p:CASZ1&k=p:CDYL&k=p:CUX1%20%281%29&k=p:DIDO1&k=p:DPY30&k=p:E4F1&k=p:EHMT2&k=p:ELF4&k=p:FOXC1&k=p:GTF3C1&k=p:GTF3C2&k=p:GTF3C4&k=p:H1FX&k=p:HCFC1%20%281%29&k=p:HCFC2&k=p:INO80B&k=p:INTS1&k=p:INTS13&k=p:INTS14&k=p:INTS2&k=p:KANSL1&k=p:KANSL2&k=p:KANSL3&k=p:KLF16&k=p:KMT2A&k=p:KMT2B&k=p:KMT2C&k=p:KMT2D&k=p:LMNB2&k=p:MCRS1&k=p:MEN1&k=p:MTA1&k=p:NCOA6&k=p:NFIB&k=p:NFRKB&k=p:NSD1&k=p:OGT&k=p:PAX6&k=p:PAXIP1&k=p:PHF20&k=p:PRR12&k=p:PWWP2A&k=p:RBBP5&k=p:RBPJ&k=p:RFWD3&k=p:RMI1&k=p:RMI2&k=p:RNF40&k=p:RPA1&k=p:RPA2&k=p:RPA3&k=p:SP1&k=p:SPEN&k=p:TFPT&k=p:TOP3A&k=p:WIZ%20%281%29&k=p:YY1&k=p:ZBTB40&k=p:ZMYM3&k=p:ZNF148&k=p:ZNF319&k=p:ZNF367&k=p:ZNF644&k=p:ZNF839&showsimilar=false&noclusterproteins=true

Down
http://ife-snap-data/proteins?pdorder=H36,H26,H37,H43,H45&k=p:ACTL6A&k=p:ACTR6&k=p:ACT%5BA1%2CA2%2CC1%2CG2%5D&k=p:BCOR&k=p:BRMS1L&k=p:CDCA2&k=p:CENPF&k=p:CETN2&k=p:CT

In [29]:
# Proteins with adj.P.Val <= 0.01 and positive logFC, largest logFC first.
is_significant = stats['adj.P.Val'] <= 0.01
is_upregulated = stats['logFC'] > 0
stats.loc[is_significant & is_upregulated].sort_values(by='logFC', ascending=False)

Unnamed: 0_level_0,logFC,CI.L,CI.R,AveExpr,t,P.Value,adj.P.Val,B,df_total,moderated_t_stdev,logFC_variance,confint_half_width,neg_log10_p,neg_log10_p_adjust,significant,significant_and_large_fc
Gene label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
KANSL1,2.737683,1.703789,3.771576,1.216416,6.028206,2.305185e-04,0.005240,0.723217,8.640923,0.454145,0.268362,1.033893,3.637294,2.280649,True,True
KANSL2,2.367292,1.314503,3.420081,1.675544,5.119070,7.143055e-04,0.009839,-0.431097,8.640923,0.462446,0.278262,1.052789,3.146116,2.007031,True,True
PWWP2A,2.225045,1.809242,2.640849,0.853007,12.182365,9.761675e-07,0.000113,6.285193,8.640923,0.182645,0.043406,0.415803,6.010476,3.948700,True,True
KANSL3,2.081084,1.862801,2.299366,1.247745,21.704575,7.718872e-09,0.000005,9.916961,8.640923,0.095882,0.011962,0.218283,8.112446,5.332810,True,True
MCRS1,1.994129,1.495156,2.493103,2.065529,9.098217,1.028739e-05,0.000791,4.034268,8.640923,0.219178,0.062507,0.498974,4.987695,3.102011,True,True
BEND3,1.900086,1.277164,2.523008,0.175233,6.944170,8.213990e-05,0.002651,1.659947,8.640923,0.273623,0.097418,0.622922,4.085446,2.576512,True,True
PHF20,1.893625,1.529585,2.257665,1.596036,11.842021,1.231425e-06,0.000133,6.060497,8.640923,0.159907,0.033271,0.364040,5.909592,3.877780,True,True
DIDO1,1.700074,1.065513,2.334635,2.557505,6.099235,2.120074e-04,0.005185,-0.243806,8.640923,0.278736,0.101092,0.634561,3.673649,2.285289,True,True
ZNF839,1.500891,0.919942,2.081840,0.662474,5.881552,2.745777e-04,0.005517,0.866645,8.640923,0.255186,0.084732,0.580949,3.561335,2.258304,True,True
ZNF367,1.413719,0.883886,1.943553,0.441449,6.074412,2.182850e-04,0.005200,1.082222,8.640923,0.232734,0.070477,0.529834,3.660976,2.284015,True,True


In [30]:
# Inspect the 'Peptides (forward)' column, indexed by (Gene label, Pull-Down ID).
# Note this is a different column from the 'Unique peptides (...)' columns
# reshaped into `unique_peptides` earlier.
enrichment_data['Peptides (forward)']

Gene label  Pull-Down ID
A0A087X222  H01              4
            H01M             0
            H02              4
            H03              5
            H03M             0
            H04              5
            H04M             0
            H05              3
            H06              4
            H07              5
            H07M             0
            H08              5
            H08M             0
            H09              5
            H10              4
            H11              6
            H12              4
            H13              3
            H14              3
            H15              4
            H16              6
            H17              5
            H18              3
            H19              4
            H20              4
            H21              4
            H22              6
            H23              6
            H24              5
            H25              2
                            ..
ZZZ3        H2

In [31]:
# Complex-level test on the same matrix/design/weights used for the limma fit,
# reusing the per-protein `stats` table.
# NOTE(review): min_size / max_size presumably bound the number of proteins
# per complex — confirm in snapanalysis.models.ptm_response.main.
camera_results = limma_camera_complexes(
    matrix, design, weights,
    min_size=5,
    max_size=20,
    limma_stats=stats,
    coef='ptm',
)

# Order complexes by ascending FDR.
df = camera_results.sort_values(by='FDR')

In [32]:
# Show the 20 complexes with the lowest FDR (df is sorted by FDR above).
df.head(20)

Unnamed: 0_level_0,NGenes,Direction,PValue,FDR,mean_logFC,mean_CI.L,mean_CI.R,mean_AveExpr,mean_t,mean_P.Value,...,mean_logFC_variance,mean_confint_half_width,mean_neg_log10_p,mean_neg_log10_p_adjust,mean_significant,mean_significant_and_large_fc,mean_proteins,empirical_median,empirical_median_ci_left,empirical_median_ci_right
Complex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
SRCAP,9.0,Down,1.727014e-11,1.364341e-09,-0.997962,-1.34975,-0.64617,0.787369,-6.38087,0.000217032,...,0.0337359,0.351791,3.80139,2.38413,1.0,0.333333,ACTL6A/ACTR6/DMAP1/RUVBL1/RUVBL2/SRCAP/VPS72/Y...,-0.920221,-1.125635,-0.725621
NSL,10.0,Up,9.559792e-09,3.776118e-07,1.42051,0.93118,1.90985,1.24343,7.89,0.0145868,...,0.0914991,0.489334,4.01344,2.54971,0.7,0.5,HCFC1 (1)/HCFC1 (2)/KANSL1/KANSL2/KANSL3/KAT8/...,1.526035,1.151564,1.929783
MLL1/2,11.0,Up,3.377348e-08,8.893683e-07,0.462225,0.24745,0.676999,0.999912,5.91117,0.0133165,...,0.0251498,0.214774,3.43214,2.14986,0.727273,0.0,ASH2L/DPY30/HCFC1 (1)/HCFC1 (2)/HCFC2/KAT8/KMT...,0.353778,0.274647,0.438087
MLL3/4,10.0,Up,2.304261e-07,4.550915e-06,0.424937,0.233024,0.616851,0.655355,5.37678,0.00260167,...,0.0106431,0.191913,3.23562,2.02853,0.7,0.0,ASH2L/DPY30/KDM6A/KMT2C/KMT2D/NCOA6/PAGR1/PAXI...,0.420563,0.342444,0.501999
ncPRC1.1,11.0,Down,4.846916e-07,7.658128e-06,-0.277117,-0.470624,-0.0836106,0.246525,-4.89407,0.224191,...,0.0108279,0.193507,2.6299,1.67371,0.363636,0.0,BCOR/BCORL1 (1)/BCORL1 (2)/KDM2B/PCGF1/RING1/R...,-0.315147,-0.401127,-0.218448
INO80 (exclusive subunits),11.0,Up,1.375821e-06,1.811497e-05,1.23202,0.525866,1.93817,2.60162,4.85341,0.0241735,...,0.150446,0.70615,2.8667,1.78163,0.454545,0.454545,ACTR5/ACTR8/INO80/INO80B/INO80C/INO80D/INO80E/...,1.24204,1.022596,1.464438
Integrator (exclusive subunits),13.0,Up,2.264093e-06,2.555191e-05,0.253521,0.0745363,0.432505,0.456586,4.43688,0.0653745,...,0.0106574,0.178984,2.58245,1.61162,0.307692,0.0,INTS1/INTS10/INTS11/INTS12/INTS13/INTS14/INTS2...,0.268385,0.214449,0.317275
APC/C,13.0,Down,8.066254e-06,7.965426e-05,-0.206577,-0.407672,-0.00548294,-0.106962,-2.42072,0.0559976,...,0.0119219,0.201094,1.52971,0.891053,0.0,0.0,ANAPC1/ANAPC10/ANAPC11/ANAPC13/ANAPC15/ANAPC16...,-0.20036,-0.260789,-0.141357
Integrator,14.0,Up,1.22369e-05,0.0001074127,0.231145,0.0530299,0.40926,0.444262,4.06175,0.0919128,...,0.0103952,0.178115,2.42368,1.51027,0.285714,0.0,INTS1/INTS10/INTS11/INTS12/INTS13/INTS14/INTS2...,0.259491,0.206178,0.308092
MLL3/4 (exclusive subunits),6.0,Up,6.42545e-05,0.0005076105,0.499758,0.265729,0.733787,0.395148,5.29122,0.00370724,...,0.014921,0.234029,3.16941,1.98009,0.666667,0.0,KDM6A/KMT2C/KMT2D/NCOA6/PAGR1/PAXIP1,0.498886,0.403326,0.595716


In [33]:
# Look up the result row for the 'ORC' complex (df is indexed by complex name).
df.loc['ORC']

NGenes                                                  5
Direction                                            Down
PValue                                         0.00293452
FDR                                             0.0126371
mean_logFC                                      -0.813233
mean_CI.L                                        -1.60236
mean_CI.R                                      -0.0241055
mean_AveExpr                                      1.85922
mean_t                                           -2.44185
mean_P.Value                                    0.0441356
mean_adj.P.Val                                   0.161202
mean_B                                           -4.47202
mean_df_total                                     8.64092
mean_moderated_t_stdev                            0.34663
mean_logFC_variance                              0.218301
mean_confint_half_width                          0.789128
mean_neg_log10_p                                  1.41682
mean_neg_log10