# DIRAC Analysis of LC M001 Liver Proteomics — DIRAC RCI Correlations of GOBP Modules

***by Kengo Watanabe***  

This Jupyter Notebook (with Python 3 kernel) assessed the pairwise correlations of module rank conservation index (RCI) among the sample groups. The differential rank conservation (DIRAC; Eddy, J.A. et al. PLoS Comput. Biol. 2010) metrics were calculated in Code01_LC-M001-proteomics_DIRAC-GOBP.ipynb.  

Input files:  
- DIRAC RCI data: 230214_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP_RankConservationIndex-BS-combined.tsv  
- Statistical test summary of the RCI comparisons: 230214_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP_inter-group-comparison_RCI.tsv  

Output figures and tables:  
- Supplementary Figure 1b  

Original notebook (memo for my future tracing):  
- dalek:\[JupyterLab HOME\]/230206_LC-M001-proteomics-DIRAC-ver7/230511_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP-correlation.ipynb  

In [1]:
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#For Arial font
#!conda install -c conda-forge -y mscorefonts
##-> The below was also needed in matplotlib 3.4.2
#import shutil
#import matplotlib
#shutil.rmtree(matplotlib.get_cachedir())
import warnings
warnings.filterwarnings('ignore')
from IPython.display import display
import time
#For exporting .pdf file with editable text
import matplotlib
matplotlib.rcParams['pdf.fonttype']=42
matplotlib.rcParams['ps.fonttype']=42

import sys
from decimal import Decimal, ROUND_HALF_UP
from statsmodels.stats import multitest as multi

!conda list

# packages in environment at /opt/conda/envs/arivale-py3:
#
# Name                    Version                   Build  Channel
_libgcc_mutex             0.1                 conda_forge    conda-forge
_openmp_mutex             4.5                       1_gnu    conda-forge
analytics                 0.1                      pypi_0    pypi
argon2-cffi               21.1.0           py39h3811e60_0    conda-forge
arivale-data-interface    0.1.0                    pypi_0    pypi
async_generator           1.10                       py_0    conda-forge
atk-1.0                   2.36.0               h3371d22_4    conda-forge
attrs                     21.2.0             pyhd8ed1ab_0    conda-forge
backcall                  0.2.0              pyh9f0ad1d_0    conda-forge
backports                 1.0                        py_2    conda-forge
backports.functools_lru_cache 1.6.4              pyhd8ed1ab_0    conda-forge
biopython                 1.79             py39h3811e60_0    conda-forge
bleach 

In [None]:
#Custom function for the P-value conversion
def convert_pval(pval):
    """Convert a P-value to its annotation text for a figure.

    The value is shown with two significant digits (rounded half up):
    plain decimal text down to the order of 0.01 and mathtext scientific
    notation below that. A P-value of exactly zero (underflow below the
    system float minimum) is reported as smaller than that minimum.

    Input: P-value derived from a statistical test (float)
    Output: P-value annotation (string) prefixed with '=' or '<'.
            If the input is a string, or is not within [0, 1] (including NaN),
            an explanatory message is printed and returned instead.
    """
    #Relies on the notebook-level imports:
    #import sys
    #from decimal import Decimal, ROUND_HALF_UP

    #Check the input
    if type(pval) is str:
        text = 'Check the input object type!'
        print(text)
        return text
    #The negated chained comparison is also True for NaN, which is not a valid P-value
    if not (0<=pval<=1):
        text = 'Check whether the input value was P-value!'
        print(text)
        return text

    #Exactly zero happens when the P-value is smaller than the system float minimum
    below_limit = pval==0.0
    if below_limit:
        pval = sys.float_info.min

    if pval==1.0:
        pval_text = '1.0'
    else:
        #Take the exponent and significand directly from the decimal representation
        #(a single rounding step; no text parsing that breaks when the value rounds up to 1)
        value = Decimal(str(pval))
        exponent = -value.adjusted()#Positive integer because 0 < pval < 1
        significand = value.scaleb(exponent).quantize(Decimal('0.1'), rounding=ROUND_HALF_UP)
        if significand==Decimal('10.0'):#Rounding carried over to the next order of magnitude
            significand = Decimal('1.0')
            exponent -= 1
        significand = str(significand)
        if exponent>2:
            pval_text = significand+r'$\times$'+'10'+r'$^{{-{0}}}$'.format(exponent)##Font is different in r'$ $' but \mathrm{\mathsf{{0}} doesn't work...
        elif exponent>0:
            pval_text = '0.'+'0'*(exponent-1)+significand.replace('.', '')
        else:#Value rounded up to 1.0
            pval_text = significand

    if below_limit:
        text = '<'+pval_text
    else:
        text = '='+pval_text

    return text

## 1. Prepare RCI

In [None]:
#Load the combined DIRAC result table(s) from the export directory
fileDir = './ExportData/'
ipynbName = '230214_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP_'
metricFileD = {'RCI':'RankConservationIndex-BS-combined.tsv'}
metricTableD = {}
for metric, fileName in metricFileD.items():
    metricDF = pd.read_csv(f'{fileDir}{ipynbName}{fileName}', sep='\t')
    metricTableD[metric] = metricDF
    
    #Quick overview of what was loaded
    print(metric+' dataframe:', metricDF.shape)
    print(' - Unique modules:', metricDF['ModuleID'].nunique(dropna=False))
    print(' - Unique templates:', metricDF['Template'].nunique(dropna=False))
    display(metricDF)
    print('')

#Table used by the downstream cells
rciDF = metricTableD['RCI']

## 2. Extract RCI (the mean of RMSs under each group's own phenotype consensus)

In [None]:
#For each phenotype column, keep only the rows whose template is that same phenotype,
#and collect them into one table (rows: modules, columns: phenotypes)
phenotypeL = rciDF.columns.drop(['ModuleID', 'Template']).tolist()
moduleIndex = pd.Index(rciDF['ModuleID'].unique(), name='ModuleID')
rciDF_kk = pd.DataFrame(index=moduleIndex)
rciByModuleDF = rciDF.set_index('ModuleID')
for phenotype in phenotypeL:
    isOwnTemplate = rciByModuleDF['Template']==phenotype
    ownRciS = rciByModuleDF.loc[isOwnTemplate, phenotype]
    #Left join on the module index keeps every module and its original order
    rciDF_kk = rciDF_kk.join(ownRciS, how='left')
##Sort
groupOrderL = ['Ctrl', 'Aca', '17aE2', 'Rapa']
rciDF_kk = rciDF_kk[groupOrderL]
display(rciDF_kk)
display(rciDF_kk.describe())

## 3. Test Spearman's correlations

In [None]:
#Table of RCI values (rows: modules, columns: sample groups)
tempDF = rciDF_kk

#Calculate correlation matrix and extract lower triangle matrix
tempDF1 = tempDF.corr(method='spearman')
print('Spearman\'s rho:')
display(tempDF1)
##Use the builtin bool: the np.bool alias was deprecated in NumPy 1.20 and removed in 1.24
lowerMask = np.tril(np.ones(tempDF1.shape), k=-1).astype(bool)
tempDF1 = tempDF1.where(lowerMask, other=np.nan)
tempDF1 = tempDF1.rename_axis('Variable1')
tempDF1 = tempDF1.reset_index().melt(var_name='Variable2', value_name='Spearman_rho', id_vars=['Variable1'])
tempDF1 = tempDF1.dropna()

#Statistical tests (one per unique pair of groups)
tempL1 = []#Comparison labels
tempL2 = []#Test records
for xvar, yvar in zip(tempDF1['Variable2'], tempDF1['Variable1']):
    ##Use pairwise-complete observations so that N, rho, and P-value stay consistent
    ##with DataFrame.corr above (identical to the raw columns when there are no NaNs)
    pairDF = tempDF[[xvar, yvar]].dropna()
    #Spearman's correlation
    rho, pval = stats.spearmanr(pairDF[xvar], pairDF[yvar])
    size = len(pairDF)
    dof = size - 2
    tempL1.append(xvar+'-vs-'+yvar)
    tempL2.append([xvar, yvar, size, dof, rho, pval])
##Build the summary at once (keeps numeric dtypes instead of growing an object-dtype table row by row)
tempDF2 = pd.DataFrame(tempL2, columns=['Xvar', 'Yvar', 'N', 'DoF', 'Rho', 'Pval'],
                       index=pd.Index(tempL1, name='ComparisonLabel'))
##P-value adjustment by using Benjamini–Hochberg method
tempDF2['AdjPval'] = multi.multipletests(tempDF2['Pval'], alpha=0.05, method='fdr_bh',
                                         is_sorted=False, returnsorted=False)[1]
display(tempDF2)
##Save
fileDir = './ExportData/'
ipynbName = '230511_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP-correlation_'
fileName = 'test-summary.tsv'
tempDF2.to_csv(fileDir+ipynbName+fileName, sep='\t', index=True)

#Summary used by the visualization cell
statDF = tempDF2

## 4. Categorize modules for visualization

> It would be better to highlight the changed vs. unchanged modules.  
> → Label a module as "changed" when its adjusted ANOVA P-value is < 0.05.  

In [None]:
#Import the statistical test summary
fileDir = './ExportData/'
ipynbName = '230214_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP_'
fileName = 'inter-group-comparison_RCI.tsv'
tempDF = pd.read_csv(fileDir+ipynbName+fileName, sep='\t').set_index('ModuleID')

#Categorize modules: 'Yes' if the adjusted ANOVA P-value is below 0.05, otherwise 'No'
##Vectorized comparison instead of a per-row lookup; a missing P-value compares False -> 'No'
variable = 'ANOVA_AdjPval'
tempDF['Changed?'] = tempDF[variable].lt(0.05).map({True:'Yes', False:'No'})

#Clean (just for display)
tempDF = tempDF.sort_values(by=variable, ascending=True)
tempL1 = [col_n for col_n in tempDF.columns if '_RMSmean' in col_n]
tempL2 = [col_n for col_n in tempDF.columns if 'Pval' in col_n]
tempDF = tempDF[['ModuleName', 'Changed?']+tempL1+tempL2]

display(tempDF)
display(tempDF['Changed?'].value_counts())

#Module table used by the visualization cell
moduleDF = tempDF

## 5. Visualize correlations

In [None]:
#Draw a lower-triangle pair plot of the RCI values between sample groups,
#colored by the 'Changed?' module category and annotated with the correlation statistics.
#Inputs come from the earlier cells: rciDF_kk (RCI table), statDF (correlation test summary),
#moduleDF (module table with the 'Changed?' column), and convert_pval (P-value formatter).
tempDF1 = rciDF_kk
tempDF2 = statDF
tempDF3 = moduleDF
#Group code -> label shown in the figure
tempD1 = {'Ctrl':'Control', 'Aca':'Acarbose',
          '17aE2':'17'+r'$\alpha$'+'-Estradiol', 'Rapa':'Rapamycin'}
#'Changed?' category -> color (the key order is also used as the hue order)
tempD2 = {'No':'tab:blue', 'Yes':'tab:orange'}

#Rename labels
##The comparison labels are rebuilt from the renamed group labels
##so that they can be looked up with the renamed column names below
tempDF1 = tempDF1.rename(columns=tempD1)
tempDF2 = tempDF2.reset_index()
tempDF2['ComparisonLabel'] = tempDF2['Xvar'].map(tempD1)+'-vs-'+tempDF2['Yvar'].map(tempD1)
tempDF2 = tempDF2.set_index('ComparisonLabel')

#Check correlation matrix
tempDF = tempDF1.corr(method='spearman')
print('Spearman\'s rho:')
display(tempDF)

#Check summary of correlation tests
display(tempDF2)

#Add module category
##Left join on the module index: a module absent from tempDF3 gets NaN in 'Changed?'
tempDF = pd.merge(tempDF1, tempDF3['Changed?'], left_index=True, right_index=True, how='left')

#Visualization
sns.set(style='ticks', font='Arial', context='talk')
p = sns.PairGrid(tempDF, hue='Changed?', hue_order=tempD2.keys(), palette=tempD2,
                 height=2.5, aspect=1, layout_pad=0.0)
p.map_lower(sns.scatterplot, edgecolor='black', alpha=0.5, s=25)
##NOTE(review): sns.distplot is deprecated in recent seaborn releases -- confirm against the installed version
p.map_diag(sns.distplot, axlabel=False, kde_kws={'alpha':0.8}, hist_kws={'edgecolor':'white', 'alpha':0.5})
##Hide the panels above the diagonal
for i, j in zip(*np.triu_indices_from(p.axes, 1)):
    p.axes[i, j].set_visible(False)
##Same axis limits and ticks for the diagonal and lower-triangle panels
for i, j in zip(*np.tril_indices_from(p.axes, 0)):
    p.axes[i, j].set(xlim=(0.49, 1.01), xticks=np.arange(0.5, 1.01, 0.1),
                     ylim=(0.49, 1.01), yticks=np.arange(0.5, 1.01, 0.1))
##Grid and statistics annotation for the panels strictly below the diagonal
for i, j in zip(*np.tril_indices_from(p.axes, -1)):
    p.axes[i, j].grid(axis='both', linestyle='--', color='gray', alpha=0.3)
    #Annotate correlation statistics
    ##Column position j gives the x variable and row position i gives the y variable
    xvar = tempDF1.columns.tolist()[j]
    yvar = tempDF1.columns.tolist()[i]
    rho = tempDF2['Rho'].loc[xvar+'-vs-'+yvar]
    ##Round rho half up to three decimal places
    rho_text = str(Decimal(str(rho)).quantize(Decimal('0.001'), rounding=ROUND_HALF_UP))
    pval = tempDF2['AdjPval'].loc[xvar+'-vs-'+yvar]
    pval_text = convert_pval(pval).replace('=', '= ').replace('<', '< ')#Add a white space
    text = 'Spearman\'s '+r'$\rho$'+' = '+rho_text+'\n'+r'$P_{\mathrm{\mathsf{adj}}}$'+' '+pval_text
    p.axes[i, j].annotate(text, xy=(0.025, 1.0), xycoords='axes fraction',
                          horizontalalignment='left', verticalalignment='top',
                          multialignment='left', fontsize='x-small', color='k')
##plt.legend() acts on pyplot's current axes
##NOTE(review): the bbox_to_anchor offsets look hand-tuned for this grid size -- recheck if the number of groups changes
pl = plt.legend(bbox_to_anchor=(0.8, 4.1), loc='upper right', title='Changed by any intervention?')
##Add sample size in legend
for row_i in range(len(pl.get_texts())):
    changed = pl.get_texts()[row_i].get_text()
    ##Number of rows (modules) in the plotted table that carry this category
    count = len(tempDF.loc[tempDF['Changed?']==changed])
    pl.get_texts()[row_i].set_text(changed+' ('+r'$n$'+' = '+f'{count:,}'+' modules)')
##Save
fileDir = './ExportFigures/'
ipynbName = '230511_LC-M001-proteomics-DIRAC-ver7-2_DIRAC-GOBP-correlation_'
fileName = 'RCI-correlations.pdf'
plt.gcf().savefig(fileDir+ipynbName+fileName, dpi=300, bbox_inches='tight', pad_inches=0.04, transparent=True)
plt.show()

# — End of notebook —